containers/docker/dicom_indexer/indexer/index_dicom.py

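"""dicom_indexer - indexes dicoms into datalad.

Imports a DICOM session from a local path or a remote archive into a DataLad
dataset, exports it to a storage remote (RIA store or S3), and publishes the
session and study repositories to a GitLab server.
"""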
import argparse
import glob
import logging
import os
import pathlib
import shutil
import subprocess
import tempfile
import urllib.parse

import datalad.api as dlad
import gitlab
import pydicom
from pydicom.errors import InvalidDicomError

logger = logging.getLogger(__name__)

GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab")
# The following settings are assumed to be provided through the environment.
GITLAB_SERVER = os.environ.get("GITLAB_SERVER")
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN")
GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME")
UNF_DICOMS_RIA_URL = os.environ.get("UNF_DICOMS_RIA_URL")
UNF_DICOMS_RIA_NAME = os.environ.get("UNF_DICOMS_RIA_NAME", "unf-dicoms-ria")
COMPRESSION_LEVEL = os.environ.get("COMPRESSION_LEVEL", "5")


def sort_dicom_series(path: str) -> None:
    """Sort DICOM files into a separate folder per series.

    Parameters
    ----------
    path : str
        Path to the folder containing the DICOM files.
    """
    files = glob.glob(os.path.join(path, "*"))
    for f in files:
        if not os.path.isfile(f):
            continue
        dic = pydicom.dcmread(f, stop_before_pixels=True)
        series_instance_uid = str(dic.SeriesInstanceUID)
        subpath = os.path.join(path, series_instance_uid)
        if not os.path.exists(subpath):
            os.mkdir(subpath)
        os.rename(f, os.path.join(subpath, os.path.basename(f)))


def _build_arg_parser() -> argparse.ArgumentParser:
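    """Build the command-line interface of the indexer."""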
    p = argparse.ArgumentParser(
        description="dicom_indexer - indexes dicoms into datalad"
    )
    p.add_argument("input", help="path/url of the dicoms.")
    p.add_argument(
        "--gitlab-url",
        type=str,
        help="http(s) url of the gitlab server where to push the repos",
    )
    p.add_argument(
        "--gitlab-group-template",
        default="{ReferringPhysicianName}/{StudyDescription}",
        type=str,
        help="string with placeholders for dicom tags; "
        "'^' separators in tag values are mapped to nested groups",
    )
    p.add_argument("--storage-remote", help="url of the datalad storage remote")
    p.add_argument(
        "--sort-series",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="sort dicom series into separate folders",
    )
    p.add_argument(
        "--fake-dates",
        action="store_true",
        help="use fake dates for the datalad dataset",
    )
    return p


def main() -> None:
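    """Entry point: parse the arguments and process the input session."""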
    parser = _build_arg_parser()
    args = parser.parse_args()
    input_url = urllib.parse.urlparse(args.input)
    output_remote = urllib.parse.urlparse(args.storage_remote)
    logger.info(f"input data: {input_url}")
    process(
        input_url,
        output_remote,
        sort_series=args.sort_series,
        fake_dates=args.fake_dates,
        gitlab_url=urllib.parse.urlparse(args.gitlab_url),
        gitlab_group_template=args.gitlab_group_template,
    )


def process(
    input_url: urllib.parse.ParseResult,
    output_remote: urllib.parse.ParseResult,
    sort_series: bool,
    fake_dates: bool,
    gitlab_url: urllib.parse.ParseResult,
    gitlab_group_template: str,
    p7z_opts: str = "-mx5",
    force_export: bool = False,
) -> None:
    """Process incoming dicoms into a datalad repo and publish it.

    The session is imported (from a local path or a remote archive), indexed,
    exported to the storage remote, and published to GitLab.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates)
        do_export = force_export
        if not input_url.scheme or input_url.scheme == "file":
            dest = import_local_data(
                dicom_session_ds,
                pathlib.Path(input_url.path),
                sort_series=sort_series,
                p7z_opts=p7z_opts,
            )
            do_export = True
        elif input_url.scheme in ["http", "https", "s3"]:
            dest = import_remote_data(dicom_session_ds, input_url)
        else:
            raise ValueError(f"unsupported input scheme: {input_url.scheme}")
        # index dicom files
        dicom_session_ds.add_archive_content(
            dest,
            strip_leading_dirs=True,
            commit=False,
        )
        # no commit message can be passed above, so commit now
        dicom_session_ds.save(message="index dicoms from archive")
        # optimize the git index after a large import
        dicom_session_ds.repo.gc()  # aggressive by default
        session_metas = extract_session_metas(dicom_session_ds)
        if do_export:
            if output_remote.scheme == "ria":
                export_to_ria(dicom_session_ds, output_remote, session_metas)
            elif output_remote.scheme == "s3":
                export_to_s3(dicom_session_ds, output_remote, session_metas)
        setup_gitlab_repos(
            dicom_session_ds, gitlab_url, session_metas, gitlab_group_template
        )


def setup_gitlab_repos(
    dicom_session_ds: dlad.Dataset,
    gitlab_url: urllib.parse.ParseResult,
    session_metas: dict,
    gitlab_group_template: str,
):
    """Publish the session dataset to GitLab and link it into the study dataset."""
    gl = connect_gitlab(gitlab_url.geturl())
    # "^" separates components of DICOM person names; map it to "/" to nest groups
    sanitized_metas = {k: str(v).replace("^", "/") for k, v in session_metas.items()}
    gitlab_group_path = gitlab_group_template.format(**sanitized_metas)
    dicom_sourcedata_path = "/".join([gitlab_group_path, "sourcedata/dicoms"])
    dicom_session_path = "/".join(
        [dicom_sourcedata_path, str(session_metas["StudyInstanceUID"])]
    )
    dicom_study_path = "/".join([dicom_sourcedata_path, "study"])
    dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path.split("/"))
    dicom_session_ds.siblings(
        action="configure",  # allows overwriting an existing config
        name=GITLAB_REMOTE_NAME,
        url=dicom_session_repo.ssh_url_to_repo,
    )
    dicom_session_ds.push(to=GITLAB_REMOTE_NAME)
    study_group = get_or_create_gitlab_group(gl, gitlab_group_path.split("/"))
    bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0]
    study_group.members.create(
        {
            "user_id": bot_user.id,
            "access_level": gitlab.const.AccessLevel.MAINTAINER,
        }
    )
    # assumption: the first two levels of the group path are the PI and study name
    PI, study_name = gitlab_group_path.split("/")[:2]
    dicom_study_repo = get_or_create_gitlab_project(gl, dicom_study_path.split("/"))
    with tempfile.TemporaryDirectory() as tmpdir:
        dicom_study_ds = dlad.install(
            source=dicom_study_repo.ssh_url_to_repo,
            path=tmpdir,
        )
        if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None:
            dicom_study_ds.create(force=True)
            dicom_study_ds.push(to="origin")
            # add default study DS structure (init_dicom_study/init_bids are
            # external helpers assumed to be importable in this container).
            init_dicom_study(dicom_study_ds, PI, study_name)
            # initialize BIDS project
            init_bids(gl, PI, study_name, dicom_study_repo)
            get_or_create_gitlab_group(gl, [PI, study_name, "derivatives"])
            get_or_create_gitlab_group(gl, [PI, study_name, "qc"])
        dicom_study_ds.install(
            source=dicom_session_repo.ssh_url_to_repo,
            path=str(session_metas["PatientName"]),
        )
        dicom_study_ds.create_sibling_ria(
            UNF_DICOMS_RIA_URL,
            name=UNF_DICOMS_RIA_NAME,
            alias=study_name,
            existing="reconfigure",
        )
        # push to gitlab + local ria-store
        dicom_study_ds.push(to="origin")
        dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME)
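

# DICOM tags extracted from the session and used to name the GitLab groups/repos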
SESSION_META_KEYS = [
    "StudyInstanceUID",
    "PatientID",
    "PatientName",
    "ReferringPhysicianName",
    "StudyDate",
    "StudyDescription",
]


def extract_session_metas(dicom_session_ds: dlad.Dataset):
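    """Extract session-level metadata from the first DICOM file in the dataset."""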
    all_files = dicom_session_ds.repo.get_files()
    for f in all_files:
        try:
            dic = pydicom.dcmread(
                os.path.join(dicom_session_ds.path, f), stop_before_pixels=True
            )
        except InvalidDicomError:  # skip non-dicom files
            continue
        # return at the first dicom found
        return {k: getattr(dic, k) for k in SESSION_META_KEYS}
    raise LookupError("no dicom file found in the dataset")


def import_local_data(
    dicom_session_ds: dlad.Dataset,
    input_path: pathlib.Path,
    sort_series: bool = True,
    p7z_opts: str = "-mx5",
):
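    """Archive a local DICOM folder (or add an existing archive) into the dataset."""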
    dest = pathlib.Path(dicom_session_ds.path) / input_path.name
    if input_path.is_dir():
        if sort_series:
            sort_dicom_series(str(input_path))
        dest = dest.with_name(dest.name + ".7z")
        # create a 7z archive of the input folder; p7z_opts tunes compression (e.g. -mx5)
        subprocess.run(
            ["7z", "u", str(dest), "."] + p7z_opts.split(),
            cwd=str(input_path),
            check=True,
        )
    elif input_path.is_file():
        try:  # try hard-linking to avoid copying
            os.link(str(input_path), str(dest))
        except OSError:  # fallback if hard-linking is not supported
            shutil.copyfile(str(input_path), str(dest))
    dicom_session_ds.save(str(dest), message="add dicoms archive")
    return dest


def import_remote_data(
    dicom_session_ds: dlad.Dataset, input_url: urllib.parse.ParseResult
):
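    """Register a remote DICOM archive URL into the session dataset."""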
    dest = pathlib.Path(input_url.path).name
    try:
        dicom_session_ds.repo.add_url_to_file(dest, input_url.geturl())
    except Exception:
        ...  # TODO: check how things can fail here and deal with it.
    return dest


def export_to_ria(
    ds: dlad.Dataset,
    ria_url: urllib.parse.ParseResult,
    session_metas: dict,
):
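    """Export the session dataset and its archive to a RIA store."""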
    ria_name = pathlib.Path(ria_url.path).name
    ds.create_sibling_ria(
        ria_url.geturl(),
        name=ria_name,
        alias=session_metas["PatientID"],
        existing="reconfigure",
    )
    ds.push(to=ria_name, data="nothing")
    ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"])
    archive_path = ria_sibling_path / "archives" / "archive.7z"
    ds.export_archive_ora(
        archive_path, opts=[f"-mx{COMPRESSION_LEVEL}"], missing_content="error"
    )
    ds.repo.fsck(remote=f"{ria_name}-storage", fast=True)  # index the archive content
    ds.push(to=ria_name, data="nothing")


def export_to_s3(
    ds: dlad.Dataset,
    s3_url: urllib.parse.ParseResult,
    session_metas: dict,
):
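    """Export the session dataset to an S3 special remote (not implemented yet)."""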
    ...
    # planned steps:
    # git-annex initremote remotename ...
    # git-annex wanted remotename include=**.{7z,tar.gz,zip}
    # datalad push --data auto --to remotename


def connect_gitlab(gitlab_url: str = None, debug: bool = False):
    """Open an authenticated connection to the GitLab server.

    Falls back to the GITLAB_SERVER environment setting when no url is given.
    """
    gl = gitlab.Gitlab(gitlab_url or GITLAB_SERVER, private_token=GITLAB_TOKEN)
    if debug:
        gl.enable_debug()
    gl.auth()
    return gl


def get_or_create_gitlab_group(gl, group_list):
    """Return the GitLab group at the given path components, creating missing levels."""
    found = False
    for keep_groups in reversed(range(len(group_list) + 1)):
        tmp_repo_path = "/".join(group_list[0:keep_groups])
        logging.warning(tmp_repo_path)
        gs = gl.groups.list(search=tmp_repo_path)
        for g in gs:
            if g.full_path == tmp_repo_path:
                found = True
                break
        if found:
            break
    for nb_groups in range(keep_groups, len(group_list)):
        if nb_groups == 0:
            msg = "Creating group {}".format(group_list[nb_groups])
            logging.warning(msg)
            logging.warning(len(msg) * "=")
            g = gl.groups.create(
                {"name": group_list[nb_groups], "path": group_list[nb_groups]}
            )
        else:
            msg = "Creating group {} from {}".format(group_list[nb_groups], g.name)
            logging.warning(msg)
            logging.warning(len(msg) * "=")
            g = gl.groups.create(
                {
                    "name": group_list[nb_groups],
                    "path": group_list[nb_groups],
                    "parent_id": g.id,
                }
            )
    return g


def get_or_create_gitlab_project(gl, project_name):
    """Return the GitLab project for the given list of path components.

    Creates the project (and any missing parent groups) if it does not exist.
    """
    if len(project_name) == 1:
        # check whether the project already exists
        p = gl.projects.list(search=project_name[0])
        if not p:
            return gl.projects.create(
                {"name": project_name[0], "path": project_name[0]}
            )
        return p[0]
    repo_full_path = "/".join(project_name)
    # look for the exact repo/project:
    p = gl.projects.list(search=project_name[-1])
    if p:
        for curr_p in p:
            if curr_p.path_with_namespace == repo_full_path:
                return curr_p
    g = get_or_create_gitlab_group(gl, project_name[:-1])
    p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id})
    return p
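

if __name__ == "__main__":
    # assumed entry point: the indexer container invokes this file directly
    main()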