wip: dicom indexer

This commit is contained in:
bpinsard 2024-01-25 11:40:52 -05:00
parent 54881b3166
commit 9b28f8f8fb
1 changed files with 120 additions and 48 deletions

View File

@ -1,16 +1,17 @@
import os import os
import dicom import pydicom as dicom
import argparse import argparse
import pathlib import pathlib
import urllib.parse import urllib.parse
import datalad.api as dlad import datalad.api as dlad
import shutil import shutil
import gitlab
GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab") GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab")
def sort_series(path: str) -> None: def sort_series(path: pathlib.Path) -> None:
"""Sort series in separate folder """Sort series in separate folder
Parameters Parameters
@ -19,7 +20,7 @@ def sort_series(path: str) -> None:
path to dicoms path to dicoms
""" """
files = glob.glob(os.path.join(path, "*")) files = path.glob(os.path.join(path, "*"))
for f in files: for f in files:
if not os.path.isfile(f): if not os.path.isfile(f):
continue continue
@ -40,6 +41,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
p.add_argument( p.add_argument(
"--gitlab-url", "--gitlab-url",
type=str, type=str,
default=os.environ.get("GITLAB_SERVER", None),
help="http(s) url to the gitlab server where to push repos", help="http(s) url to the gitlab server where to push repos",
) )
p.add_argument( p.add_argument(
@ -48,6 +50,12 @@ def _build_arg_parser() -> argparse.ArgumentParser:
type=str, type=str,
help="string with placeholder for dicom tags", help="string with placeholder for dicom tags",
) )
p.add_argument(
"--session-name-tag",
default="PatientName",
type=str,
help="dicom tags that contains the name of the session",
)
p.add_argument("--storage-remote", help="url to the datalad remote") p.add_argument("--storage-remote", help="url to the datalad remote")
p.add_argument( p.add_argument(
"--sort-series", "--sort-series",
@ -62,6 +70,12 @@ def _build_arg_parser() -> argparse.ArgumentParser:
action="store_true", action="store_true",
help="use fake dates for datalad dataset", help="use fake dates for datalad dataset",
) )
p.add_argument(
"--p7z-opts",
type=str,
default="-mx5 -ms=off",
help="option for 7z generated archives",
)
return p return p
@ -71,32 +85,39 @@ def main() -> None:
input = urllib.parse.urlparse(args.input) input = urllib.parse.urlparse(args.input)
output_remote = urllib.parse.urlparse(args.storage_remote) output_remote = urllib.parse.urlparse(args.storage_remote)
logger.info(f"input data: {input}") gitlab_url = urllib.parse.urlparse(args.gitlab_url)
process( with index_dicoms(
input, input,
output_remote,
sort_series=p.sort_series, sort_series=p.sort_series,
fake_dates=p.fake_dates, fake_dates=p.fake_dates,
) p7z_opts=p.p7z_opts,
gitlab_group_template=args.gitlab_group_template,
) as dicom_session_ds:
session_metas = extract_session_metas(dicom_session_ds)
if not input.scheme or input.scheme == "file" or args.force_export:
export_data(dicom_session_ds, output_remote, session_metas)
setup_gitlab_remote(
dicom_session_ds,
gitlab_url=gitlab_url,
dicom_session_name=args.session_name_tag,
session_metas=session_meta,
)
def process( def index_dicoms(
input: urllib.parse.ParseResult, input: urllib.parse.ParseResult,
output_remote: urllib.parse.ParseResult,
sort_series: bool, sort_series: bool,
fake_dates: bool, fake_dates: bool,
p7z_opts: str, p7z_opts: str,
gitlab_url: urllib.parse.ParseResult, ) -> dlad.Dataset:
gitlab_group_template: str,
force_export: bool = False,
) -> None:
"""Process incoming dicoms into datalad repo""" """Process incoming dicoms into datalad repo"""
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates)
do_export = force_export
if not input.scheme or input.scheme == "file": if not input.scheme or input.scheme == "file":
dest = import_local_data( dest = import_local_data(
dicom_session_ds, dicom_session_ds,
@ -104,7 +125,6 @@ def process(
sort_series=sort_series, sort_series=sort_series,
p7z_opts=p7z_opts, p7z_opts=p7z_opts,
) )
do_export = True
elif input.scheme in ["http", "https", "s3"]: elif input.scheme in ["http", "https", "s3"]:
dest = import_remote_data(dicom_session_ds, input_url) dest = import_remote_data(dicom_session_ds, input_url)
@ -118,28 +138,33 @@ def process(
dicom_session_ds.save(message="index dicoms from archive") # dicom_session_ds.save(message="index dicoms from archive") #
# optimize git index after large import # optimize git index after large import
dicom_session_ds.repo.gc() # aggressive by default dicom_session_ds.repo.gc() # aggressive by default
yield dicom_session_ds
session_metas = extract_session_metas(dicom_session_ds)
if do_export: def export_data(
if output_remote.scheme == "ria": dicom_session_ds: dlad.Dataset,
export_to_ria(dicom_session_ds, output_remote, session_metas) output_remote: urllib.parse.ParseResult,
elif output_remote.scheme == "s3": session_metas: dict,
export_to_s3(dicom_session_ds, output_remote, session_metas) ):
if output_remote.scheme == "ria":
setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas) export_to_ria(dicom_session_ds, output_remote, session_metas)
elif output_remote.scheme == "s3":
export_to_s3(dicom_session_ds, output_remote, session_metas)
def setup_gitlab_repos( def setup_gitlab_repos(
dicom_session_ds: dlad.Dataset, dicom_session_ds: dlad.Dataset,
gitlab_url: urllib.parse.ParseResult, gitlab_url: urllib.parse.ParseResult,
gitlab_group_path: str,
session_metas: dict, session_metas: dict,
): ) -> None:
gitlab_conn = connect_gitlab() gitlab_conn = connect_gitlab(gitlab_url)
gitlab_group_path = gitlab_group_template.format(session_metas) gitlab_group_path = gitlab_group_template.format(session_metas)
dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"]) dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"])
dicom_session_path = "/".join([dicom_sourcedata_path, ["StudyInstanceUID"]]) dicom_session_path = "/".join(
[dicom_sourcedata_path, session_metas["StudyInstanceUID"]]
)
dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) dicom_study_path = "/".join([dicom_sourcedata_path, "study"])
dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path)
@ -159,6 +184,7 @@ def setup_gitlab_repos(
} }
) )
## add the session to the dicom study repo
dicom_study_repo = get_or_create_project(gl, dicom_study_path) dicom_study_repo = get_or_create_project(gl, dicom_study_path)
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
dicom_study_ds = datalad.api.install( dicom_study_ds = datalad.api.install(
@ -170,28 +196,69 @@ def setup_gitlab_repos(
dicom_study_ds.create(force=True) dicom_study_ds.create(force=True)
dicom_study_ds.push(to="origin") dicom_study_ds.push(to="origin")
# add default study DS structure. # add default study DS structure.
init_dicom_study(dicom_study_ds, PI, study_name) init_dicom_study(dicom_study_ds, gitlab_group_path)
# initialize BIDS project # initialize BIDS project
init_bids(gl, PI, study_name, dicom_study_repo) init_bids(gl, dicom_study_repo, gitlab_group_path)
create_group(gl, [PI, study_name, "derivatives"]) # create subgroup for QC and derivatives repos
create_group(gl, [PI, study_name, "qc"]) create_group(gl, f"{gitlab_group_path}/derivatives")
create_group(gl, f"{gitlab_group_path}/qc")
dicom_study_ds.install( dicom_study_ds.install(
source=dicom_session_repo._attrs["ssh_url_to_repo"], source=dicom_session_repo._attrs["ssh_url_to_repo"],
path=session_meta["PatientName"], path=session_meta["PatientName"],
) )
dicom_study_ds.create_sibling_ria(
UNF_DICOMS_RIA_URL,
name=UNF_DICOMS_RIA_NAME,
alias=study_name,
existing="reconfigure",
)
# Push to gitlab + local ria-store # Push to gitlab + local ria-store
dicom_study_ds.push(to="origin") dicom_study_ds.push(to="origin")
dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME)
def init_bids(
gl: gitlab.Gitlab,
dicom_study_repo: dlad.Dataset,
gitlab_group_path: str,
) -> None:
bids_project_repo = create_project(gl, f"{gitlab_group_path}/bids")
with tempfile.TemporaryDirectory() as tmpdir:
bids_project_ds = datalad.api.install(
source=bids_project_repo._attrs["ssh_url_to_repo"],
path=tmpdir,
)
bids_project_ds.create(force=True)
shutil.copytree("repo_templates/bids", bids_project_ds.path, dirs_exist_ok=True)
bids_project_ds.save(path=".", message="init structure and pipelines")
bids_project_ds.install(
path="sourcedata/dicoms",
source=dicom_study_repo._attrs["ssh_url_to_repo"],
)
# TODO: setup sensitive / non-sensitive S3 buckets
bids_project_ds.push(to="origin")
# create dev branch and push for merge requests
bids_project_ds.gitrepo.checkout(BIDS_DEV_BRANCH, ["-b"])
bids_project_ds.push(to="origin")
bids_project_ds.protectedbranches.create(data={"name": "convert/*"})
bids_project_ds.protectedbranches.create(data={"name": "dev"})
def init_dicom_study(
dicom_study_ds: dlad.Dataset,
gitlab_group_path: str,
) -> None:
shutil.copytree(
"repo_templates/dicom_study", dicom_study_ds.path, dirs_exist_ok=True
)
env = {
"variables": {
"STUDY_PATH": gitlab_group_path,
"BIDS_PATH": f"{gitlab_group_path}/bids",
}
}
with open(os.path.join(dicom_study_ds.path, "ci-env.yml"), "w") as outfile:
yaml.dump(env, outfile, default_flow_style=False)
dicom_study_ds.save(path=".", message="init structure and pipelines")
dicom_study_ds.push(to="origin")
SESSION_META_KEYS = [ SESSION_META_KEYS = [
"StudyInstanceUID", "StudyInstanceUID",
"PatientID", "PatientID",
@ -202,7 +269,7 @@ SESSION_META_KEYS = [
] ]
def extract_session_metas(dicom_session_ds: dlad.Dataset): def extract_session_metas(dicom_session_ds: dlad.Dataset) -> dict:
all_files = dicom_session_ds.repo.find("*") all_files = dicom_session_ds.repo.find("*")
for f in all_files: for f in all_files:
try: try:
@ -273,25 +340,31 @@ def export_to_s3(
s3_url: urllib.parse.ParseResult, s3_url: urllib.parse.ParseResult,
session_metas: dict, session_metas: dict,
): ):
... ds.repo.initremote()
# git-annex initremote remotename ... # git-annex initremote remotename ...
# git-annex wanted remotename include=**.{7z,tar.gz,zip} # git-annex wanted remotename include=**.{7z,tar.gz,zip}
# datalad push --data auto --to remotename # datalad push --data auto --to remotename
def connect_gitlab(debug=False): def connect_gitlab(
gitlab_url: urllib.parse.ParseResult, debug: bool = False
) -> gitlab.Gitlab:
""" """
Connection to Gitlab Connection to Gitlab
""" """
gl = gitlab.Gitlab(GITLAB_SERVER, private_token=GITLAB_TOKEN) gl = gitlab.Gitlab(str(gitlab_url), private_token=GITLAB_TOKEN)
if debug: if debug:
gl.enable_debug() gl.enable_debug()
gl.auth() gl.auth()
return gl return gl
def get_or_create_gitlab_group(gl, group_list): def get_or_create_gitlab_group(
""" """ gl: gitlab.Gitlab,
group_path: str,
):
"""fetch or create a gitlab group"""
group_list = group.split("/")
found = False found = False
for keep_groups in reversed(range(len(group_list) + 1)): for keep_groups in reversed(range(len(group_list) + 1)):
tmp_repo_path = "/".join(group_list[0:keep_groups]) tmp_repo_path = "/".join(group_list[0:keep_groups])
@ -326,8 +399,9 @@ def get_or_create_gitlab_group(gl, group_list):
return g return g
def get_or_create_gitlab_project(gl, project_name): def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str):
""" """ """fetch or create a gitlab repo"""
project_name = project_path.split("/")
if len(project_name) == 1: if len(project_name) == 1:
# Check if exists # Check if exists
p = gl.projects.list(search=project_name[0]) p = gl.projects.list(search=project_name[0])
@ -337,13 +411,11 @@ def get_or_create_gitlab_project(gl, project_name):
else: else:
return p[0].id return p[0].id
repo_full_path = "/".join(project_name)
# Look for exact repo/project: # Look for exact repo/project:
p = gl.projects.list(search=project_name[-1]) p = gl.projects.list(search=project_name[-1])
if p: if p:
for curr_p in p: for curr_p in p:
if curr_p.path_with_namespace == repo_full_path: if curr_p.path_with_namespace == project_path:
return curr_p return curr_p
g = get_or_create_gitlab_group(gl, project_name[:-1]) g = get_or_create_gitlab_group(gl, project_name[:-1])