From a579defbb011da87c47d0286d8b4ee227981ea3c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Tue, 23 Jan 2024 16:37:03 -0500 Subject: [PATCH 01/26] wip: flexible dicom indexer --- .gitlab-ci.yml | 9 + docker/dicom_indexer/Dockerfile | 10 + docker/dicom_indexer/indexer/index_dicom.py | 354 ++++++++++++++++++++ 3 files changed, 373 insertions(+) create mode 100644 docker/dicom_indexer/Dockerfile create mode 100644 docker/dicom_indexer/indexer/index_dicom.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d960585..cc95f01 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -79,3 +79,12 @@ build_pydeface: - changes: - docker/pydeface/**/* - .gitlab-ci.yml + +build_dicom_indexer: + extends: .build_tpl + variables: + IMAGE: $CI_REGISTRY_IMAGE/dicom_indexer + rules: + - changes: + - docker/dicom_indexer/**/* + - .gitlab-ci.yml diff --git a/docker/dicom_indexer/Dockerfile b/docker/dicom_indexer/Dockerfile new file mode 100644 index 0000000..21b30c0 --- /dev/null +++ b/docker/dicom_indexer/Dockerfile @@ -0,0 +1,10 @@ +FROM alpine:3.17.2 +RUN apk add --no-cache ca-certificates tzdata \ + python3 py3-pip git openssh-client git-annex curl bzip2 bash glab\ + && cp /usr/share/zoneinfo/UTC /etc/localtime \ + && apk del tzdata \ + && rm -rf /tmp/* /var/cache/apk/* + +RUN pip install --no-cache-dir datalad ssh_agent_setup python-gitlab pydicom + +WORKDIR /work diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py new file mode 100644 index 0000000..a33d5fa --- /dev/null +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -0,0 +1,354 @@ +import os +import dicom +import argparse +import pathlib +import urllib.parse +import datalad.api as dlad +import shutil + + +GITLAB_REMOTE_NAME = os.environ.get('GITLAB_REMOTE_NAME', 'gitlab') + +def sort_series(path: str) -> None: + """Sort series in separate folder + + Parameters + ---------- + path : str + path to dicoms + + """ + files = glob.glob(os.path.join(path, '*')) + for f in files: + if not os.path.isfile(f): + continue + dic = dicom.read_file(f, stop_before_pixels=True) + # series_number = dic.SeriesNumber + series_instance_uid = dic.SeriesInstanceUID + subpath = os.path.join(path, series_instance_uid) + if not os.path.exists(subpath): + os.mkdir(subpath) + os.rename(f, os.path.join(subpath, os.path.basename(f))) + + +def _build_arg_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + description="dicom_indexer - indexes dicoms into datalad") + p.add_argument( + 'input', nargs='+', + help='path/url of the dicom.') + p.add_argument() + p.add_argument( + 'gitlab_group_template', + default='{ReferringPhysicianName}/{StudyDescription.replace('^','/')}' + type=str) + p.add_argument( + '--storage-remote', + help='url to the datalad remote') + p.add_argument( + "--sort-series", + action="store_true", + type=bool, + default=True, + help="sort dicom series in separate folders", + ) + p.add_argument( + "--fake-dates", + type=bool, + action="store_true", + help="use fake dates for datalad dataset", + ) + return p + +def main() -> None: + + parser = _build_arg_parser() + args = parser.parse_args() + + input = urllib.parse.urlparse(args.input) + output_remote = urllib.parse.urlparse(args.storage_remote) + logger.info(f"input data: {input}") + + process( + input, + output_remote, + sort_series=p.sort_series, + fake_dates=p.fake_dates, + ) + +def process( + input:urllib.parse.ParseResult, + output_remote: urllib.parse.ParseResult, + sort_series: bool, + fake_dates: bool, + p7z_opts: str, + gitlab_url: 
urllib.parse.ParseResult, + gitlab_group_template: str, + force_export: bool=False, +) -> None: + """Process incoming dicoms into datalad repo + + """ + with tempfile.TemporaryDirectory() as tmpdirname: + dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) + + do_export = force_export + + if not input.scheme or input.scheme == 'file': + dest = import_local_data( + dicom_session_ds, + pathlib.Path(input.path), + sort_series=sort_series, + p7z_opts=p7z_opts, + ) + do_export = True + elif input.scheme in ['http', 'https', 's3']: + dest = import_remote_data(dicom_session_ds, input_url) + + # index dicoms files + dicom_session_ds.add_archive_content( + dest, + strip_leading_dirs=True, + commit=False, + ) + # cannot pass message above so commit now + dicom_session_ds.save(message='index dicoms from archive')# + # optimize git index after large import + dicom_session_ds.repo.gc() # aggressive by default + + session_metas = extract_session_metas(dicom_session_ds) + + if do_export: + if output_remote.scheme == 'ria': + export_to_ria(dicom_session_ds, output_remote, session_metas) + elif output_remote.scheme == 's3': + export_to_s3(dicom_session_ds, output_remote, session_metas) + + + setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas) + + + + + +def setup_gitlab_repos( + dicom_session_ds: dlad.Dataset, + gitlab_url: urllib.parse.ParseResult, + session_metas: dict, +): + gitlab_conn = connect_gitlab() + + gitlab_group_path = gitlab_group_template.format(session_metas) + dicom_sourcedata_path = '/'.join([dicom_session_path, 'sourcedata/dicoms']) + dicom_session_path = '/'.join([dicom_sourcedata_path, ['StudyInstanceUID']]) + dicom_study_path = '/'.join([dicom_sourcedata_path, 'study']) + + dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) + ds.siblings( + action='configure', # allow to overwrite existing config + name=GITLAB_REMOTE_NAME, + url=dicom_session_repo._attrs['ssh_url_to_repo'], + ) + ds.push(to=GITLAB_REMOTE_NAME) + + study_group = get_or_create_group(gl, gitlab_group_path) + bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0] + study_group.members.create({ + 'user_id': bot_user.id, + 'access_level': gitlab.const.AccessLevel.MAINTAINER, + }) + + + dicom_study_repo = get_or_create_project(gl, dicom_study_path) + with tempfile.TemporaryDirectory() as tmpdir: + dicom_study_ds = datalad.api.install( + source = dicom_study_repo._attrs['ssh_url_to_repo'], + path=tmpdir, + ) + + if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None: + dicom_study_ds.create(force=True) + dicom_study_ds.push(to='origin') + # add default study DS structure. 
+ init_dicom_study(dicom_study_ds, PI, study_name) + # initialize BIDS project + init_bids(gl, PI, study_name, dicom_study_repo) + create_group(gl, [PI, study_name, "derivatives"]) + create_group(gl, [PI, study_name, "qc"]) + + dicom_study_ds.install( + source=dicom_session_repo._attrs['ssh_url_to_repo'], + path=session_meta['PatientName'], + ) + dicom_study_ds.create_sibling_ria( + UNF_DICOMS_RIA_URL, + name=UNF_DICOMS_RIA_NAME, + alias=study_name, + existing='reconfigure') + + + # Push to gitlab + local ria-store + dicom_study_ds.push(to='origin') + dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) + + +SESSION_META_KEYS = [ + 'StudyInstanceUID', + 'PatientID', + 'PatientName', + 'ReferringPhysicianName', + 'StudyDate', + 'StudyDescription', +] + +def extract_session_metas(dicom_session_ds: dlad.Dataset): + all_files = dicom_session_ds.repo.find('*') + for f in all_files: + try: + dic = dicom.read_file(f, stop_before_pixels=True) + except Exception: # TODO: what exception occurs when non-dicom ? + continue + # return at first dicom found + return {k:getattr(dic, k) for k in SESSION_META_KEYS} + + +def import_local_data( + dicom_session_ds: dlad.Dataset, + input_path: pathlib.Path, + sort_series: bool=True, + p7z_opts: str='-mx5' +): + dest = input_path.basename() + + if input_path.is_dir(): + dest = dest + '.7z' + # create 7z archive with 1block/file parameters + subprocess.run( + ['7z', 'u', str(dest), '.'] + p7z_opts, + cwd=str(dicom_session_ds.path), + ) + elif input_path.is_file(): + dest = dicom_session_ds.path / dest + try: # try hard-linking to avoid copying + os.link(str(input_path), str(dest)) + except OSError: #fallback if hard-linking not supported + shutil.copyfile(str(input_path), str(dest)) + dicom_session_ds.save(dest, message='add dicoms archive') + return dest + + +def import_remote_data( + dicom_session_ds:dlad.Dataset, + input_url:urllib.parse.ParseResult): + + try: + dest = pathlib.Path(url.path).basename + dicom_session_ds.repo.add_url_to_file(dest, url) + except Exception: + ... #TODO: check how things can fail here and deal with it. + return dest + + + +def export_to_ria( + ds: dlad.Dataset, + ria_url:urllib.parse.ParseResult, + session_metas: dict, +): + ria_name = pathlib.Path(ria_url.path).basename + ds.create_sibling_ria( + ria_url, + name=ria_name, + alias=session_meta['PatientID'], + existing='reconfigure') + ds.push(to=ria_name, data='nothing') + ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]['url']) + archive_path = ria_sibling_path / 'archives' / 'archive.7z' + ds.export_archive_ora( + archive_path, + opts=[f'-mx{COMPRESSION_LEVEL}'], + missing_content='error') + ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) #index + ds.push(to=ria_name, data='nothing') + +def export_to_s3( + ds: dlad.Dataset, + s3_url:urllib.parse.ParseResult, + session_metas: dict, +): + ... + # git-annex initremote remotename ... 
+    # git-annex wanted remotename include=**.{7z,tar.gz,zip}
+    # datalad push --data auto --to remotename
+
+
+def connect_gitlab(debug=False):
+    """
+    Connection to Gitlab
+    """
+    gl = gitlab.Gitlab(GITLAB_SERVER, private_token=GITLAB_TOKEN)
+    if debug:
+        gl.enable_debug()
+    gl.auth()
+    return gl
+
+
+def get_or_create_gitlab_group(gl, group_list):
+    """
+    """
+    found = False
+    for keep_groups in reversed(range(len(group_list)+1)):
+        tmp_repo_path = '/'.join(group_list[0:keep_groups])
+        logging.warning(tmp_repo_path)
+        gs = gl.groups.list(search=tmp_repo_path)
+        for g in gs:
+            if g.full_path == tmp_repo_path:
+                found = True
+                break
+        if found:
+            break
+    for nb_groups in range(keep_groups, len(group_list)):
+        if nb_groups == 0:
+            msg = "Creating group {}".format(group_list[nb_groups])
+            logging.warning(msg)
+            logging.warning(len(msg) * "=")
+            g = gl.groups.create({'name': group_list[nb_groups],
+                                  'path': group_list[nb_groups]})
+        else:
+            msg = 'Creating group {} from {}'.format(group_list[nb_groups],
+                                                     g.name)
+            logging.warning(msg)
+            logging.warning(len(msg) * "=")
+            g = gl.groups.create({'name': group_list[nb_groups],
+                                  'path': group_list[nb_groups],
+                                  'parent_id': g.id})
+
+    return g
+
+
+def get_or_create_gitlab_project(gl, project_name):
+    """
+    """
+    if len(project_name) == 1:
+        # Check if exists
+        p = gl.projects.list(search=project_name[0])
+        if not p:
+            p = gl.projects.create({'name': project_name[0],
+                                    'path': project_name[0]})
+            return p.id
+        else:
+            return p[0].id
+
+    repo_full_path = '/'.join(project_name)
+
+    # Look for exact repo/project:
+    p = gl.projects.list(search=project_name[-1])
+    if p:
+        for curr_p in p:
+            if curr_p.path_with_namespace == repo_full_path:
+                return curr_p
+
+    g = get_or_create_gitlab_group(gl, project_name[:-1])
+    p = gl.projects.create({'name': project_name[-1],
+                            'namespace_id': g.id})
+    return p

From 54881b3166e9ab07a041d43111dbe50eb5d2b0c2 Mon Sep 17 00:00:00 2001
From: bpinsard
Date: Wed, 24 Jan 2024 09:15:36 -0500
Subject: [PATCH 02/26] wip: dicom indexer

---
 docker/dicom_indexer/indexer/index_dicom.py | 207 ++++++++++----------
 1 file changed, 102 insertions(+), 105 deletions(-)

diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py
index a33d5fa..2f22ca4 100644
--- a/docker/dicom_indexer/indexer/index_dicom.py
+++ b/docker/dicom_indexer/indexer/index_dicom.py
@@ -7,7 +7,8 @@ import datalad.api as dlad
 import shutil
 
 
-GITLAB_REMOTE_NAME = os.environ.get('GITLAB_REMOTE_NAME', 'gitlab')
+GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab")
+
 
 def sort_series(path: str) -> None:
     """Sort series in separate folder
@@ -18,7 +19,7 @@ def sort_series(path: str) -> None:
         path to dicoms
 
     """
-    files = glob.glob(os.path.join(path, '*'))
+    files = glob.glob(os.path.join(path, "*"))
     for f in files:
         if not os.path.isfile(f):
             continue
@@ -33,18 +34,21 @@ def sort_series(path: str) -> None:
 
 def _build_arg_parser() -> argparse.ArgumentParser:
     p = argparse.ArgumentParser(
-        description="dicom_indexer - indexes dicoms into datalad")
+        description="dicom_indexer - indexes dicoms into datalad"
+    )
+    p.add_argument("input", help="path/url of the dicom.")
     p.add_argument(
-        'input', nargs='+',
-        help='path/url of the dicom.')
-    p.add_argument()
+        "--gitlab-url",
+        type=str,
+        help="http(s) url to the gitlab server where to push repos",
+    )
     p.add_argument(
-        'gitlab_group_template',
-        default='{ReferringPhysicianName}/{StudyDescription.replace('^','/')}'
-        type=str)
-    p.add_argument(
-        '--storage-remote',
-        
help='url to the datalad remote') + "--gitlab-group-template", + default="{ReferringPhysicianName}/{StudyDescription.replace(" ^ "," / ")}", + type=str, + help="string with placeholder for dicom tags", + ) + p.add_argument("--storage-remote", help="url to the datalad remote") p.add_argument( "--sort-series", action="store_true", @@ -60,8 +64,8 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) return p -def main() -> None: +def main() -> None: parser = _build_arg_parser() args = parser.parse_args() @@ -76,25 +80,24 @@ def main() -> None: fake_dates=p.fake_dates, ) + def process( - input:urllib.parse.ParseResult, + input: urllib.parse.ParseResult, output_remote: urllib.parse.ParseResult, sort_series: bool, fake_dates: bool, p7z_opts: str, gitlab_url: urllib.parse.ParseResult, gitlab_group_template: str, - force_export: bool=False, + force_export: bool = False, ) -> None: - """Process incoming dicoms into datalad repo - - """ + """Process incoming dicoms into datalad repo""" with tempfile.TemporaryDirectory() as tmpdirname: dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) do_export = force_export - if not input.scheme or input.scheme == 'file': + if not input.scheme or input.scheme == "file": dest = import_local_data( dicom_session_ds, pathlib.Path(input.path), @@ -102,7 +105,7 @@ def process( p7z_opts=p7z_opts, ) do_export = True - elif input.scheme in ['http', 'https', 's3']: + elif input.scheme in ["http", "https", "s3"]: dest = import_remote_data(dicom_session_ds, input_url) # index dicoms files @@ -112,25 +115,21 @@ def process( commit=False, ) # cannot pass message above so commit now - dicom_session_ds.save(message='index dicoms from archive')# + dicom_session_ds.save(message="index dicoms from archive") # # optimize git index after large import - dicom_session_ds.repo.gc() # aggressive by default + dicom_session_ds.repo.gc() # aggressive by default session_metas = extract_session_metas(dicom_session_ds) if do_export: - if output_remote.scheme == 'ria': + if output_remote.scheme == "ria": export_to_ria(dicom_session_ds, output_remote, session_metas) - elif output_remote.scheme == 's3': + elif output_remote.scheme == "s3": export_to_s3(dicom_session_ds, output_remote, session_metas) - setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas) - - - def setup_gitlab_repos( dicom_session_ds: dlad.Dataset, gitlab_url: urllib.parse.ParseResult, @@ -139,36 +138,37 @@ def setup_gitlab_repos( gitlab_conn = connect_gitlab() gitlab_group_path = gitlab_group_template.format(session_metas) - dicom_sourcedata_path = '/'.join([dicom_session_path, 'sourcedata/dicoms']) - dicom_session_path = '/'.join([dicom_sourcedata_path, ['StudyInstanceUID']]) - dicom_study_path = '/'.join([dicom_sourcedata_path, 'study']) + dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"]) + dicom_session_path = "/".join([dicom_sourcedata_path, ["StudyInstanceUID"]]) + dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) ds.siblings( - action='configure', # allow to overwrite existing config + action="configure", # allow to overwrite existing config name=GITLAB_REMOTE_NAME, - url=dicom_session_repo._attrs['ssh_url_to_repo'], + url=dicom_session_repo._attrs["ssh_url_to_repo"], ) ds.push(to=GITLAB_REMOTE_NAME) study_group = get_or_create_group(gl, gitlab_group_path) bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0] - study_group.members.create({ - 'user_id': bot_user.id, - 
'access_level': gitlab.const.AccessLevel.MAINTAINER, - }) - + study_group.members.create( + { + "user_id": bot_user.id, + "access_level": gitlab.const.AccessLevel.MAINTAINER, + } + ) dicom_study_repo = get_or_create_project(gl, dicom_study_path) with tempfile.TemporaryDirectory() as tmpdir: dicom_study_ds = datalad.api.install( - source = dicom_study_repo._attrs['ssh_url_to_repo'], + source=dicom_study_repo._attrs["ssh_url_to_repo"], path=tmpdir, - ) + ) if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None: dicom_study_ds.create(force=True) - dicom_study_ds.push(to='origin') + dicom_study_ds.push(to="origin") # add default study DS structure. init_dicom_study(dicom_study_ds, PI, study_name) # initialize BIDS project @@ -177,103 +177,100 @@ def setup_gitlab_repos( create_group(gl, [PI, study_name, "qc"]) dicom_study_ds.install( - source=dicom_session_repo._attrs['ssh_url_to_repo'], - path=session_meta['PatientName'], - ) + source=dicom_session_repo._attrs["ssh_url_to_repo"], + path=session_meta["PatientName"], + ) dicom_study_ds.create_sibling_ria( UNF_DICOMS_RIA_URL, name=UNF_DICOMS_RIA_NAME, alias=study_name, - existing='reconfigure') - + existing="reconfigure", + ) # Push to gitlab + local ria-store - dicom_study_ds.push(to='origin') + dicom_study_ds.push(to="origin") dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) SESSION_META_KEYS = [ - 'StudyInstanceUID', - 'PatientID', - 'PatientName', - 'ReferringPhysicianName', - 'StudyDate', - 'StudyDescription', + "StudyInstanceUID", + "PatientID", + "PatientName", + "ReferringPhysicianName", + "StudyDate", + "StudyDescription", ] + def extract_session_metas(dicom_session_ds: dlad.Dataset): - all_files = dicom_session_ds.repo.find('*') + all_files = dicom_session_ds.repo.find("*") for f in all_files: try: dic = dicom.read_file(f, stop_before_pixels=True) - except Exception: # TODO: what exception occurs when non-dicom ? + except Exception: # TODO: what exception occurs when non-dicom ? continue # return at first dicom found - return {k:getattr(dic, k) for k in SESSION_META_KEYS} + return {k: getattr(dic, k) for k in SESSION_META_KEYS} def import_local_data( dicom_session_ds: dlad.Dataset, input_path: pathlib.Path, - sort_series: bool=True, - p7z_opts: str='-mx5' + sort_series: bool = True, + p7z_opts: str = "-mx5", ): dest = input_path.basename() if input_path.is_dir(): - dest = dest + '.7z' + dest = dest + ".7z" # create 7z archive with 1block/file parameters subprocess.run( - ['7z', 'u', str(dest), '.'] + p7z_opts, - cwd=str(dicom_session_ds.path), - ) + ["7z", "u", str(dest), "."] + p7z_opts, + cwd=str(dicom_session_ds.path), + ) elif input_path.is_file(): dest = dicom_session_ds.path / dest - try: # try hard-linking to avoid copying + try: # try hard-linking to avoid copying os.link(str(input_path), str(dest)) - except OSError: #fallback if hard-linking not supported + except OSError: # fallback if hard-linking not supported shutil.copyfile(str(input_path), str(dest)) - dicom_session_ds.save(dest, message='add dicoms archive') + dicom_session_ds.save(dest, message="add dicoms archive") return dest def import_remote_data( - dicom_session_ds:dlad.Dataset, - input_url:urllib.parse.ParseResult): - + dicom_session_ds: dlad.Dataset, input_url: urllib.parse.ParseResult +): try: dest = pathlib.Path(url.path).basename dicom_session_ds.repo.add_url_to_file(dest, url) except Exception: - ... #TODO: check how things can fail here and deal with it. + ... # TODO: check how things can fail here and deal with it. 
return dest - def export_to_ria( ds: dlad.Dataset, - ria_url:urllib.parse.ParseResult, + ria_url: urllib.parse.ParseResult, session_metas: dict, ): ria_name = pathlib.Path(ria_url.path).basename ds.create_sibling_ria( - ria_url, - name=ria_name, - alias=session_meta['PatientID'], - existing='reconfigure') - ds.push(to=ria_name, data='nothing') - ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]['url']) - archive_path = ria_sibling_path / 'archives' / 'archive.7z' + ria_url, name=ria_name, alias=session_meta["PatientID"], existing="reconfigure" + ) + ds.push(to=ria_name, data="nothing") + ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"]) + archive_path = ria_sibling_path / "archives" / "archive.7z" ds.export_archive_ora( - archive_path, - opts=[f'-mx{COMPRESSION_LEVEL}'], - missing_content='error') - ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) #index - ds.push(to=ria_name, data='nothing') + archive_path, opts=[f"-mx{COMPRESSION_LEVEL}"], missing_content="error" + ) + ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) # index + ds.push(to=ria_name, data="nothing") + def export_to_s3( ds: dlad.Dataset, - s3_url:urllib.parse.ParseResult, + s3_url: urllib.parse.ParseResult, session_metas: dict, ): ... @@ -294,17 +291,16 @@ def connect_gitlab(debug=False): def get_or_create_gitlab_group(gl, group_list): - """ - """ + """ """ found = False - for keep_groups in reversed(range(len(group_list)+1)): - tmp_repo_path = '/'.join(group_list[0:keep_groups]) + for keep_groups in reversed(range(len(group_list) + 1)): + tmp_repo_path = "/".join(group_list[0:keep_groups]) logging.warning(tmp_repo_path) gs = gl.groups.list(search=tmp_repo_path) for g in gs: - if g.full_path == tmp_repo_path: - found = True - break + if g.full_path == tmp_repo_path: + found = True + break if found: break for nb_groups in range(keep_groups, len(group_list)): @@ -312,34 +308,36 @@ def get_or_create_gitlab_group(gl, group_list): msg = "Creating group {}".format(group_list[nb_groups]) logging.warning(msg) logging.warning(len(msg) * "=") - g = gl.groups.create({'name': group_list[nb_groups], - 'path': group_list[nb_groups]}) + g = gl.groups.create( + {"name": group_list[nb_groups], "path": group_list[nb_groups]} + ) else: - msg = 'Creating group {} from {}'.format(group_list[nb_groups], - g.name) + msg = "Creating group {} from {}".format(group_list[nb_groups], g.name) logging.warning(msg) logging.warning(len(msg) * "=") - g = gl.groups.create({'name': group_list[nb_groups], - 'path': group_list[nb_groups], - 'parent_id': g.id}) + g = gl.groups.create( + { + "name": group_list[nb_groups], + "path": group_list[nb_groups], + "parent_id": g.id, + } + ) return g def get_or_create_gitlab_project(gl, project_name): - """ - """ + """ """ if len(project_name) == 1: # Check if exists p = gl.projects.list(search=project_name[0]) if not p: - p = gl.projects.create({'name': project_name[0], - 'path': project_name[0]}) + p = gl.projects.create({"name": project_name[0], "path": project_name[0]}) return p.id else: return p[0].id - repo_full_path = '/'.join(project_name) + repo_full_path = "/".join(project_name) # Look for exact repo/project: p = gl.projects.list(search=project_name[-1]) @@ -349,6 +347,5 @@ def get_or_create_gitlab_project(gl, project_name): return curr_p g = get_or_create_gitlab_group(gl, project_name[:-1]) - p = gl.projects.create({'name': project_name[-1], - 'namespace_id': g.id}) + p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id}) return p From 
9b28f8f8fbc97c28fb9cd0a8500bac693d3b2002 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 25 Jan 2024 11:40:52 -0500 Subject: [PATCH 03/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 168 ++++++++++++++------ 1 file changed, 120 insertions(+), 48 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 2f22ca4..106be67 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -1,16 +1,17 @@ import os -import dicom +import pydicom as dicom import argparse import pathlib import urllib.parse import datalad.api as dlad import shutil +import gitlab GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab") -def sort_series(path: str) -> None: +def sort_series(path: pathlib.Path) -> None: """Sort series in separate folder Parameters @@ -19,7 +20,7 @@ def sort_series(path: str) -> None: path to dicoms """ - files = glob.glob(os.path.join(path, "*")) + files = path.glob(os.path.join(path, "*")) for f in files: if not os.path.isfile(f): continue @@ -40,6 +41,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: p.add_argument( "--gitlab-url", type=str, + default=os.environ.get("GITLAB_SERVER", None), help="http(s) url to the gitlab server where to push repos", ) p.add_argument( @@ -48,6 +50,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: type=str, help="string with placeholder for dicom tags", ) + p.add_argument( + "--session-name-tag", + default="PatientName", + type=str, + help="dicom tags that contains the name of the session", + ) p.add_argument("--storage-remote", help="url to the datalad remote") p.add_argument( "--sort-series", @@ -62,6 +70,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: action="store_true", help="use fake dates for datalad dataset", ) + p.add_argument( + "--p7z-opts", + type=str, + default="-mx5 -ms=off", + help="option for 7z generated archives", + ) return p @@ -71,32 +85,39 @@ def main() -> None: input = urllib.parse.urlparse(args.input) output_remote = urllib.parse.urlparse(args.storage_remote) - logger.info(f"input data: {input}") + gitlab_url = urllib.parse.urlparse(args.gitlab_url) - process( + with index_dicoms( input, - output_remote, sort_series=p.sort_series, fake_dates=p.fake_dates, - ) + p7z_opts=p.p7z_opts, + gitlab_group_template=args.gitlab_group_template, + ) as dicom_session_ds: + session_metas = extract_session_metas(dicom_session_ds) + + if not input.scheme or input.scheme == "file" or args.force_export: + export_data(dicom_session_ds, output_remote, session_metas) + + setup_gitlab_remote( + dicom_session_ds, + gitlab_url=gitlab_url, + dicom_session_name=args.session_name_tag, + session_metas=session_meta, + ) -def process( +def index_dicoms( input: urllib.parse.ParseResult, - output_remote: urllib.parse.ParseResult, sort_series: bool, fake_dates: bool, p7z_opts: str, - gitlab_url: urllib.parse.ParseResult, - gitlab_group_template: str, - force_export: bool = False, -) -> None: +) -> dlad.Dataset: """Process incoming dicoms into datalad repo""" + with tempfile.TemporaryDirectory() as tmpdirname: dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) - do_export = force_export - if not input.scheme or input.scheme == "file": dest = import_local_data( dicom_session_ds, @@ -104,7 +125,6 @@ def process( sort_series=sort_series, p7z_opts=p7z_opts, ) - do_export = True elif input.scheme in ["http", "https", "s3"]: dest = import_remote_data(dicom_session_ds, input_url) @@ 
-118,28 +138,33 @@ def process( dicom_session_ds.save(message="index dicoms from archive") # # optimize git index after large import dicom_session_ds.repo.gc() # aggressive by default + yield dicom_session_ds - session_metas = extract_session_metas(dicom_session_ds) - if do_export: - if output_remote.scheme == "ria": - export_to_ria(dicom_session_ds, output_remote, session_metas) - elif output_remote.scheme == "s3": - export_to_s3(dicom_session_ds, output_remote, session_metas) - - setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas) +def export_data( + dicom_session_ds: dlad.Dataset, + output_remote: urllib.parse.ParseResult, + session_metas: dict, +): + if output_remote.scheme == "ria": + export_to_ria(dicom_session_ds, output_remote, session_metas) + elif output_remote.scheme == "s3": + export_to_s3(dicom_session_ds, output_remote, session_metas) def setup_gitlab_repos( dicom_session_ds: dlad.Dataset, gitlab_url: urllib.parse.ParseResult, + gitlab_group_path: str, session_metas: dict, -): - gitlab_conn = connect_gitlab() +) -> None: + gitlab_conn = connect_gitlab(gitlab_url) gitlab_group_path = gitlab_group_template.format(session_metas) dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"]) - dicom_session_path = "/".join([dicom_sourcedata_path, ["StudyInstanceUID"]]) + dicom_session_path = "/".join( + [dicom_sourcedata_path, session_metas["StudyInstanceUID"]] + ) dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) @@ -159,6 +184,7 @@ def setup_gitlab_repos( } ) + ## add the session to the dicom study repo dicom_study_repo = get_or_create_project(gl, dicom_study_path) with tempfile.TemporaryDirectory() as tmpdir: dicom_study_ds = datalad.api.install( @@ -170,28 +196,69 @@ def setup_gitlab_repos( dicom_study_ds.create(force=True) dicom_study_ds.push(to="origin") # add default study DS structure. 
- init_dicom_study(dicom_study_ds, PI, study_name) + init_dicom_study(dicom_study_ds, gitlab_group_path) # initialize BIDS project - init_bids(gl, PI, study_name, dicom_study_repo) - create_group(gl, [PI, study_name, "derivatives"]) - create_group(gl, [PI, study_name, "qc"]) + init_bids(gl, dicom_study_repo, gitlab_group_path) + # create subgroup for QC and derivatives repos + create_group(gl, f"{gitlab_group_path}/derivatives") + create_group(gl, f"{gitlab_group_path}/qc") dicom_study_ds.install( source=dicom_session_repo._attrs["ssh_url_to_repo"], path=session_meta["PatientName"], ) - dicom_study_ds.create_sibling_ria( - UNF_DICOMS_RIA_URL, - name=UNF_DICOMS_RIA_NAME, - alias=study_name, - existing="reconfigure", - ) # Push to gitlab + local ria-store dicom_study_ds.push(to="origin") dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) +def init_bids( + gl: gitlab.Gitlab, + dicom_study_repo: dlad.Dataset, + gitlab_group_path: str, +) -> None: + bids_project_repo = create_project(gl, f"{gitlab_group_path}/bids") + with tempfile.TemporaryDirectory() as tmpdir: + bids_project_ds = datalad.api.install( + source=bids_project_repo._attrs["ssh_url_to_repo"], + path=tmpdir, + ) + bids_project_ds.create(force=True) + shutil.copytree("repo_templates/bids", bids_project_ds.path, dirs_exist_ok=True) + bids_project_ds.save(path=".", message="init structure and pipelines") + bids_project_ds.install( + path="sourcedata/dicoms", + source=dicom_study_repo._attrs["ssh_url_to_repo"], + ) + # TODO: setup sensitive / non-sensitive S3 buckets + bids_project_ds.push(to="origin") + # create dev branch and push for merge requests + bids_project_ds.gitrepo.checkout(BIDS_DEV_BRANCH, ["-b"]) + bids_project_ds.push(to="origin") + bids_project_ds.protectedbranches.create(data={"name": "convert/*"}) + bids_project_ds.protectedbranches.create(data={"name": "dev"}) + + +def init_dicom_study( + dicom_study_ds: dlad.Dataset, + gitlab_group_path: str, +) -> None: + shutil.copytree( + "repo_templates/dicom_study", dicom_study_ds.path, dirs_exist_ok=True + ) + env = { + "variables": { + "STUDY_PATH": gitlab_group_path, + "BIDS_PATH": f"{gitlab_group_path}/bids", + } + } + with open(os.path.join(dicom_study_ds.path, "ci-env.yml"), "w") as outfile: + yaml.dump(env, outfile, default_flow_style=False) + dicom_study_ds.save(path=".", message="init structure and pipelines") + dicom_study_ds.push(to="origin") + + SESSION_META_KEYS = [ "StudyInstanceUID", "PatientID", @@ -202,7 +269,7 @@ SESSION_META_KEYS = [ ] -def extract_session_metas(dicom_session_ds: dlad.Dataset): +def extract_session_metas(dicom_session_ds: dlad.Dataset) -> dict: all_files = dicom_session_ds.repo.find("*") for f in all_files: try: @@ -273,25 +340,31 @@ def export_to_s3( s3_url: urllib.parse.ParseResult, session_metas: dict, ): - ... + ds.repo.initremote() # git-annex initremote remotename ... 
# git-annex wanted remotename include=**.{7z,tar.gz,zip} # datalad push --data auto --to remotename -def connect_gitlab(debug=False): +def connect_gitlab( + gitlab_url: urllib.parse.ParseResult, debug: bool = False +) -> gitlab.Gitlab: """ Connection to Gitlab """ - gl = gitlab.Gitlab(GITLAB_SERVER, private_token=GITLAB_TOKEN) + gl = gitlab.Gitlab(str(gitlab_url), private_token=GITLAB_TOKEN) if debug: gl.enable_debug() gl.auth() return gl -def get_or_create_gitlab_group(gl, group_list): - """ """ +def get_or_create_gitlab_group( + gl: gitlab.Gitlab, + group_path: str, +): + """fetch or create a gitlab group""" + group_list = group.split("/") found = False for keep_groups in reversed(range(len(group_list) + 1)): tmp_repo_path = "/".join(group_list[0:keep_groups]) @@ -326,8 +399,9 @@ def get_or_create_gitlab_group(gl, group_list): return g -def get_or_create_gitlab_project(gl, project_name): - """ """ +def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str): + """fetch or create a gitlab repo""" + project_name = project_path.split("/") if len(project_name) == 1: # Check if exists p = gl.projects.list(search=project_name[0]) @@ -337,13 +411,11 @@ def get_or_create_gitlab_project(gl, project_name): else: return p[0].id - repo_full_path = "/".join(project_name) - # Look for exact repo/project: p = gl.projects.list(search=project_name[-1]) if p: for curr_p in p: - if curr_p.path_with_namespace == repo_full_path: + if curr_p.path_with_namespace == project_path: return curr_p g = get_or_create_gitlab_group(gl, project_name[:-1]) From c845c4fd64efb599c53b46aea233ae4009a1b5d3 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 25 Jan 2024 13:35:38 -0500 Subject: [PATCH 04/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 56 +++++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 106be67..66625fb 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -6,11 +6,15 @@ import urllib.parse import datalad.api as dlad import shutil import gitlab +import tempfile +from contextlib import contextmanager GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab") +GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) +# TODO: rewrite for pathlib.Path input def sort_series(path: pathlib.Path) -> None: """Sort series in separate folder @@ -30,7 +34,7 @@ def sort_series(path: pathlib.Path) -> None: subpath = os.path.join(path, series_instance_uid) if not os.path.exists(subpath): os.mkdir(subpath) - os.rename(f, os.path.join(subpath, os.path.basename(f))) + os.rename(f, os.path.join(subpath, f.name)) def _build_arg_parser() -> argparse.ArgumentParser: @@ -46,7 +50,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) p.add_argument( "--gitlab-group-template", - default="{ReferringPhysicianName}/{StudyDescription.replace(" ^ "," / ")}", + default="{ReferringPhysicianName}/{StudyDescription.replace('^','/' )}", type=str, help="string with placeholder for dicom tags", ) @@ -59,14 +63,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: p.add_argument("--storage-remote", help="url to the datalad remote") p.add_argument( "--sort-series", - action="store_true", type=bool, default=True, help="sort dicom series in separate folders", ) p.add_argument( "--fake-dates", - type=bool, action="store_true", help="use fake dates for datalad dataset", ) @@ -89,24 +91,29 @@ def main() -> None: with 
index_dicoms( input, - sort_series=p.sort_series, - fake_dates=p.fake_dates, - p7z_opts=p.p7z_opts, - gitlab_group_template=args.gitlab_group_template, + sort_series=args.sort_series, + fake_dates=args.fake_dates, + p7z_opts=args.p7z_opts, ) as dicom_session_ds: session_metas = extract_session_metas(dicom_session_ds) - if not input.scheme or input.scheme == "file" or args.force_export: + if ( + not input.scheme + or input.scheme == "file" + or args.force_export + and output_remote + ): export_data(dicom_session_ds, output_remote, session_metas) - setup_gitlab_remote( + setup_gitlab_repos( dicom_session_ds, gitlab_url=gitlab_url, - dicom_session_name=args.session_name_tag, - session_metas=session_meta, + dicom_session_tag=args.session_name_tag, + session_metas=session_metas, ) +@contextmanager def index_dicoms( input: urllib.parse.ParseResult, sort_series: bool, @@ -129,8 +136,9 @@ def index_dicoms( dest = import_remote_data(dicom_session_ds, input_url) # index dicoms files - dicom_session_ds.add_archive_content( + dlad.add_archive_content( dest, + dataset=dicom_session_ds, strip_leading_dirs=True, commit=False, ) @@ -155,8 +163,8 @@ def export_data( def setup_gitlab_repos( dicom_session_ds: dlad.Dataset, gitlab_url: urllib.parse.ParseResult, - gitlab_group_path: str, session_metas: dict, + dicom_session_tag: str, ) -> None: gitlab_conn = connect_gitlab(gitlab_url) @@ -205,7 +213,7 @@ def setup_gitlab_repos( dicom_study_ds.install( source=dicom_session_repo._attrs["ssh_url_to_repo"], - path=session_meta["PatientName"], + path=session_metas.get(dicom_session_tag), ) # Push to gitlab + local ria-store @@ -286,17 +294,17 @@ def import_local_data( sort_series: bool = True, p7z_opts: str = "-mx5", ): - dest = input_path.basename() + dest = input_path.name if input_path.is_dir(): dest = dest + ".7z" # create 7z archive with 1block/file parameters subprocess.run( ["7z", "u", str(dest), "."] + p7z_opts, - cwd=str(dicom_session_ds.path), + cwd=dicom_session_ds.path, ) elif input_path.is_file(): - dest = dicom_session_ds.path / dest + dest = dicom_session_ds.pathobj / dest try: # try hard-linking to avoid copying os.link(str(input_path), str(dest)) except OSError: # fallback if hard-linking not supported @@ -309,7 +317,7 @@ def import_remote_data( dicom_session_ds: dlad.Dataset, input_url: urllib.parse.ParseResult ): try: - dest = pathlib.Path(url.path).basename + dest = pathlib.Path(url.path).name dicom_session_ds.repo.add_url_to_file(dest, url) except Exception: ... # TODO: check how things can fail here and deal with it. 
@@ -321,9 +329,9 @@ def export_to_ria( ria_url: urllib.parse.ParseResult, session_metas: dict, ): - ria_name = pathlib.Path(ria_url.path).basename + ria_name = pathlib.Path(ria_url.path).name ds.create_sibling_ria( - ria_url, name=ria_name, alias=session_meta["PatientID"], existing="reconfigure" + ria_url, name=ria_name, alias=session_metas["PatientID"], existing="reconfigure" ) ds.push(to=ria_name, data="nothing") ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"]) @@ -352,7 +360,7 @@ def connect_gitlab( """ Connection to Gitlab """ - gl = gitlab.Gitlab(str(gitlab_url), private_token=GITLAB_TOKEN) + gl = gitlab.Gitlab(gitlab_url.geturl(), private_token=GITLAB_TOKEN) if debug: gl.enable_debug() gl.auth() @@ -421,3 +429,7 @@ def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str): g = get_or_create_gitlab_group(gl, project_name[:-1]) p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id}) return p + + +if __name__ == "__main__": + main() From 7ff4659b342a04b0b69130987ad68195914ebf66 Mon Sep 17 00:00:00 2001 From: Milton Camacho Date: Thu, 25 Jan 2024 19:10:20 +0000 Subject: [PATCH 05/26] import subprocess --- docker/dicom_indexer/indexer/index_dicom.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 66625fb..22ae71d 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -7,6 +7,7 @@ import datalad.api as dlad import shutil import gitlab import tempfile +import subprocess from contextlib import contextmanager From f18e914311e3e1c67981da81c724a2f17c80bf5a Mon Sep 17 00:00:00 2001 From: Milton Camacho Date: Thu, 25 Jan 2024 19:17:40 +0000 Subject: [PATCH 06/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 22ae71d..4ac72fe 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -301,7 +301,7 @@ def import_local_data( dest = dest + ".7z" # create 7z archive with 1block/file parameters subprocess.run( - ["7z", "u", str(dest), "."] + p7z_opts, + ["7z", "u", str(dest), "."].append(p7z_opts), cwd=dicom_session_ds.path, ) elif input_path.is_file(): From 291c110261dc5bc4ba0115e9de7b2a8700815c87 Mon Sep 17 00:00:00 2001 From: Milton Camacho Date: Thu, 25 Jan 2024 19:28:58 +0000 Subject: [PATCH 07/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 4ac72fe..4ca51ac 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -301,7 +301,7 @@ def import_local_data( dest = dest + ".7z" # create 7z archive with 1block/file parameters subprocess.run( - ["7z", "u", str(dest), "."].append(p7z_opts), + ["7z", "u", str(dest), "."] + p7z_opts.split(), cwd=dicom_session_ds.path, ) elif input_path.is_file(): From 1c706dc5e55a96c877b0a5bcec4ff16c470603a8 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 25 Jan 2024 15:08:22 -0500 Subject: [PATCH 08/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 35 +++++++++++---------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py 
b/docker/dicom_indexer/indexer/index_dicom.py index 66625fb..038c221 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -7,6 +7,7 @@ import datalad.api as dlad import shutil import gitlab import tempfile +import logging from contextlib import contextmanager @@ -50,7 +51,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: ) p.add_argument( "--gitlab-group-template", - default="{ReferringPhysicianName}/{StudyDescription.replace('^','/' )}", + default="{ReferringPhysicianName}/{StudyDescription}", type=str, help="string with placeholder for dicom tags", ) @@ -110,6 +111,7 @@ def main() -> None: gitlab_url=gitlab_url, dicom_session_tag=args.session_name_tag, session_metas=session_metas, + gitlab_group_template=args.gitlab_group_template ) @@ -165,17 +167,18 @@ def setup_gitlab_repos( gitlab_url: urllib.parse.ParseResult, session_metas: dict, dicom_session_tag: str, + gitlab_group_template: str, ) -> None: gitlab_conn = connect_gitlab(gitlab_url) - gitlab_group_path = gitlab_group_template.format(session_metas) - dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"]) + gitlab_group_path = gitlab_group_template.format(**session_metas) + dicom_sourcedata_path = "/".join([gitlab_group_path, "sourcedata/dicoms"]) dicom_session_path = "/".join( [dicom_sourcedata_path, session_metas["StudyInstanceUID"]] ) dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) - dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) + dicom_session_repo = get_or_create_gitlab_project(gitlab_conn, dicom_session_path) ds.siblings( action="configure", # allow to overwrite existing config name=GITLAB_REMOTE_NAME, @@ -183,8 +186,8 @@ def setup_gitlab_repos( ) ds.push(to=GITLAB_REMOTE_NAME) - study_group = get_or_create_group(gl, gitlab_group_path) - bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0] + study_group = get_or_create_group(gitlab_conn, gitlab_group_path) + bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME)[0] study_group.members.create( { "user_id": bot_user.id, @@ -206,10 +209,10 @@ def setup_gitlab_repos( # add default study DS structure. init_dicom_study(dicom_study_ds, gitlab_group_path) # initialize BIDS project - init_bids(gl, dicom_study_repo, gitlab_group_path) + init_bids(gitlab_conn, dicom_study_repo, gitlab_group_path) # create subgroup for QC and derivatives repos - create_group(gl, f"{gitlab_group_path}/derivatives") - create_group(gl, f"{gitlab_group_path}/qc") + create_group(gitlab_conn, f"{gitlab_group_path}/derivatives") + create_group(gitlab_conn, f"{gitlab_group_path}/qc") dicom_study_ds.install( source=dicom_session_repo._attrs["ssh_url_to_repo"], @@ -278,15 +281,15 @@ SESSION_META_KEYS = [ def extract_session_metas(dicom_session_ds: dlad.Dataset) -> dict: - all_files = dicom_session_ds.repo.find("*") + all_files = dicom_session_ds.repo.get_files() for f in all_files: try: - dic = dicom.read_file(f, stop_before_pixels=True) - except Exception: # TODO: what exception occurs when non-dicom ? + dic = dicom.read_file(dicom_session_ds.pathobj/f, stop_before_pixels=True) + except Exception as e: # TODO: what exception occurs when non-dicom ? 
continue # return at first dicom found - return {k: getattr(dic, k) for k in SESSION_META_KEYS} - + return {k: str(getattr(dic, k)).replace('^','/') for k in SESSION_META_KEYS} + raise InputError('no dicom found') def import_local_data( dicom_session_ds: dlad.Dataset, @@ -372,7 +375,7 @@ def get_or_create_gitlab_group( group_path: str, ): """fetch or create a gitlab group""" - group_list = group.split("/") + group_list = group_path.split("/") found = False for keep_groups in reversed(range(len(group_list) + 1)): tmp_repo_path = "/".join(group_list[0:keep_groups]) @@ -426,7 +429,7 @@ def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str): if curr_p.path_with_namespace == project_path: return curr_p - g = get_or_create_gitlab_group(gl, project_name[:-1]) + g = get_or_create_gitlab_group(gl, '/'.join(project_name[:-1])) p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id}) return p From e5efdba82386ae13e2adb7ea4eacd30aaa16c57c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 25 Jan 2024 15:56:51 -0500 Subject: [PATCH 09/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 35 +++++++++++++-------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 5f6afad..7c56a1a 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -14,6 +14,7 @@ from contextlib import contextmanager GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab") GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) +GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME", None) # TODO: rewrite for pathlib.Path input @@ -87,6 +88,13 @@ def main() -> None: parser = _build_arg_parser() args = parser.parse_args() + if not GITLAB_REMOTE_NAME: + raise RuntimeError("missing GITLAB_REMOTE_NAME env var") + if not GITLAB_TOKEN: + raise RuntimeError("missing GITLAB_TOKEN env var") + if not GITLAB_BOT_USERNAME: + raise RuntimeError("missing GITLAB_BOT_USERNAME env var") + input = urllib.parse.urlparse(args.input) output_remote = urllib.parse.urlparse(args.storage_remote) gitlab_url = urllib.parse.urlparse(args.gitlab_url) @@ -112,7 +120,7 @@ def main() -> None: gitlab_url=gitlab_url, dicom_session_tag=args.session_name_tag, session_metas=session_metas, - gitlab_group_template=args.gitlab_group_template + gitlab_group_template=args.gitlab_group_template, ) @@ -156,7 +164,7 @@ def export_data( dicom_session_ds: dlad.Dataset, output_remote: urllib.parse.ParseResult, session_metas: dict, -): +) -> None: if output_remote.scheme == "ria": export_to_ria(dicom_session_ds, output_remote, session_metas) elif output_remote.scheme == "s3": @@ -180,14 +188,14 @@ def setup_gitlab_repos( dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) dicom_session_repo = get_or_create_gitlab_project(gitlab_conn, dicom_session_path) - ds.siblings( + dicom_session_ds.siblings( action="configure", # allow to overwrite existing config name=GITLAB_REMOTE_NAME, url=dicom_session_repo._attrs["ssh_url_to_repo"], ) - ds.push(to=GITLAB_REMOTE_NAME) + dicom_session_ds.push(to=GITLAB_REMOTE_NAME, force="gitpush") - study_group = get_or_create_group(gitlab_conn, gitlab_group_path) + study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME)[0] study_group.members.create( { @@ -197,7 +205,7 @@ def setup_gitlab_repos( ) ## add the session to the dicom study repo - 
dicom_study_repo = get_or_create_project(gl, dicom_study_path) + dicom_study_repo = get_or_create_gitlab_project(gitlab_conn, dicom_study_path) with tempfile.TemporaryDirectory() as tmpdir: dicom_study_ds = datalad.api.install( source=dicom_study_repo._attrs["ssh_url_to_repo"], @@ -212,8 +220,8 @@ def setup_gitlab_repos( # initialize BIDS project init_bids(gitlab_conn, dicom_study_repo, gitlab_group_path) # create subgroup for QC and derivatives repos - create_group(gitlab_conn, f"{gitlab_group_path}/derivatives") - create_group(gitlab_conn, f"{gitlab_group_path}/qc") + get_or_create_gitlab_group(gitlab_conn, f"{gitlab_group_path}/derivatives") + get_or_create_gitlab_group(gitlab_conn, f"{gitlab_group_path}/qc") dicom_study_ds.install( source=dicom_session_repo._attrs["ssh_url_to_repo"], @@ -230,7 +238,7 @@ def init_bids( dicom_study_repo: dlad.Dataset, gitlab_group_path: str, ) -> None: - bids_project_repo = create_project(gl, f"{gitlab_group_path}/bids") + bids_project_repo = get_or_create_gitlab_project(gl, f"{gitlab_group_path}/bids") with tempfile.TemporaryDirectory() as tmpdir: bids_project_ds = datalad.api.install( source=bids_project_repo._attrs["ssh_url_to_repo"], @@ -285,12 +293,13 @@ def extract_session_metas(dicom_session_ds: dlad.Dataset) -> dict: all_files = dicom_session_ds.repo.get_files() for f in all_files: try: - dic = dicom.read_file(dicom_session_ds.pathobj/f, stop_before_pixels=True) + dic = dicom.read_file(dicom_session_ds.pathobj / f, stop_before_pixels=True) except Exception as e: # TODO: what exception occurs when non-dicom ? continue # return at first dicom found - return {k: str(getattr(dic, k)).replace('^','/') for k in SESSION_META_KEYS} - raise InputError('no dicom found') + return {k: str(getattr(dic, k)).replace("^", "/") for k in SESSION_META_KEYS} + raise InputError("no dicom found") + def import_local_data( dicom_session_ds: dlad.Dataset, @@ -430,7 +439,7 @@ def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str): if curr_p.path_with_namespace == project_path: return curr_p - g = get_or_create_gitlab_group(gl, '/'.join(project_name[:-1])) + g = get_or_create_gitlab_group(gl, "/".join(project_name[:-1])) p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id}) return p From 84d5a18ba6e662099cf0c378ded49758fa5d840c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 25 Jan 2024 16:38:10 -0500 Subject: [PATCH 10/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 7c56a1a..f6082c4 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -9,6 +9,7 @@ import gitlab import tempfile import logging import subprocess +import yaml from contextlib import contextmanager @@ -207,7 +208,7 @@ def setup_gitlab_repos( ## add the session to the dicom study repo dicom_study_repo = get_or_create_gitlab_project(gitlab_conn, dicom_study_path) with tempfile.TemporaryDirectory() as tmpdir: - dicom_study_ds = datalad.api.install( + dicom_study_ds = dlad.install( source=dicom_study_repo._attrs["ssh_url_to_repo"], path=tmpdir, ) From 6f01d16af5b5eddc4bc9554a7a931a088179e5cf Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 26 Jan 2024 11:16:51 -0500 Subject: [PATCH 11/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 72 ++++++++++++++------- 1 file changed, 47 insertions(+), 25 deletions(-) diff 
--git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index f6082c4..a3b4c85 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -114,7 +114,12 @@ def main() -> None: or args.force_export and output_remote ): - export_data(dicom_session_ds, output_remote, session_metas) + export_data( + dicom_session_ds, + output_remote, + dicom_session_tag=args.session_name_tag, + session_metas=session_metas, + ) setup_gitlab_repos( dicom_session_ds, @@ -138,24 +143,24 @@ def index_dicoms( dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) if not input.scheme or input.scheme == "file": - dest = import_local_data( + archive = import_local_data( dicom_session_ds, pathlib.Path(input.path), sort_series=sort_series, p7z_opts=p7z_opts, ) elif input.scheme in ["http", "https", "s3"]: - dest = import_remote_data(dicom_session_ds, input_url) + archive = import_remote_data(dicom_session_ds, input_url) # index dicoms files dlad.add_archive_content( - dest, + archive, dataset=dicom_session_ds, strip_leading_dirs=True, commit=False, ) # cannot pass message above so commit now - dicom_session_ds.save(message="index dicoms from archive") # + dicom_session_ds.save(message=f"index dicoms from archive {archive}") # # optimize git index after large import dicom_session_ds.repo.gc() # aggressive by default yield dicom_session_ds @@ -164,10 +169,16 @@ def index_dicoms( def export_data( dicom_session_ds: dlad.Dataset, output_remote: urllib.parse.ParseResult, + dicom_session_tag: str, session_metas: dict, ) -> None: - if output_remote.scheme == "ria": - export_to_ria(dicom_session_ds, output_remote, session_metas) + if "ria" in output_remote.scheme: + export_to_ria( + dicom_session_ds, + output_remote, + dicom_session_tag=dicom_session_tag, + session_metas=session_metas, + ) elif output_remote.scheme == "s3": export_to_s3(dicom_session_ds, output_remote, session_metas) @@ -194,16 +205,18 @@ def setup_gitlab_repos( name=GITLAB_REMOTE_NAME, url=dicom_session_repo._attrs["ssh_url_to_repo"], ) + dicom_session_ds.repo.checkout("dev", ["-b"]) dicom_session_ds.push(to=GITLAB_REMOTE_NAME, force="gitpush") study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME)[0] - study_group.members.create( - { - "user_id": bot_user.id, - "access_level": gitlab.const.AccessLevel.MAINTAINER, - } - ) + if not any(m.id == bot_user.id for m in study_group.members.list()): + study_group.members.create( + { + "user_id": bot_user.id, + "access_level": gitlab.const.AccessLevel.MAINTAINER, + } + ) ## add the session to the dicom study repo dicom_study_repo = get_or_create_gitlab_project(gitlab_conn, dicom_study_path) @@ -229,9 +242,8 @@ def setup_gitlab_repos( path=session_metas.get(dicom_session_tag), ) - # Push to gitlab + local ria-store + # Push to gitlab dicom_study_ds.push(to="origin") - dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) def init_bids( @@ -257,8 +269,9 @@ def init_bids( # create dev branch and push for merge requests bids_project_ds.gitrepo.checkout(BIDS_DEV_BRANCH, ["-b"]) bids_project_ds.push(to="origin") - bids_project_ds.protectedbranches.create(data={"name": "convert/*"}) - bids_project_ds.protectedbranches.create(data={"name": "dev"}) + # set protected branches + bids_project_repo.protectedbranches.create(data={"name": "convert/*"}) + bids_project_repo.protectedbranches.create(data={"name": "dev"}) def init_dicom_study( @@ -341,20 +354,29 @@ 
def import_remote_data( def export_to_ria( ds: dlad.Dataset, ria_url: urllib.parse.ParseResult, + dicom_session_tag: str, session_metas: dict, + export_ria_archive: bool = False, ): ria_name = pathlib.Path(ria_url.path).name ds.create_sibling_ria( - ria_url, name=ria_name, alias=session_metas["PatientID"], existing="reconfigure" + ria_url.geturl(), + name=ria_name, + alias=session_metas[dicom_session_tag], + existing="reconfigure", + new_store_ok=True, ) ds.push(to=ria_name, data="nothing") - ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"]) - archive_path = ria_sibling_path / "archives" / "archive.7z" - ds.export_archive_ora( - archive_path, opts=[f"-mx{COMPRESSION_LEVEL}"], missing_content="error" - ) - ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) # index - ds.push(to=ria_name, data="nothing") + + # keep the old ria-archive before add-archive-content, not used for now + if export_ria_archive: + ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"]) + archive_path = ria_sibling_path / "archives" / "archive.7z" + ds.export_archive_ora( + archive_path, opts=[f"-mx{COMPRESSION_LEVEL}"], missing_content="error" + ) + ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) # index + ds.push(to=ria_name, data="nothing") def export_to_s3( From b53e89bb671ad04334adee75f693be2cd2e3e628 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 26 Jan 2024 11:56:49 -0500 Subject: [PATCH 12/26] fix folder archival --- docker/dicom_indexer/indexer/index_dicom.py | 24 +++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index a3b4c85..619c4fe 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -192,6 +192,7 @@ def setup_gitlab_repos( ) -> None: gitlab_conn = connect_gitlab(gitlab_url) + # generate gitlab group/repo paths gitlab_group_path = gitlab_group_template.format(**session_metas) dicom_sourcedata_path = "/".join([gitlab_group_path, "sourcedata/dicoms"]) dicom_session_path = "/".join( @@ -199,17 +200,23 @@ def setup_gitlab_repos( ) dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) + # create repo (should not exists unless rerun) dicom_session_repo = get_or_create_gitlab_project(gitlab_conn, dicom_session_path) dicom_session_ds.siblings( action="configure", # allow to overwrite existing config name=GITLAB_REMOTE_NAME, url=dicom_session_repo._attrs["ssh_url_to_repo"], ) - dicom_session_ds.repo.checkout("dev", ["-b"]) - dicom_session_ds.push(to=GITLAB_REMOTE_NAME, force="gitpush") + # and push + dicom_session_ds.push(to=GITLAB_REMOTE_NAME) + # add maint permissions for the dicom bot user on the study repos study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) - bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME)[0] + bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME).get(0, None) + if not bot_user: + raise RuntimeError( + f"bot_user: {GITLAB_BOT_USERNAME} does not exists in gitlab instance" + ) if not any(m.id == bot_user.id for m in study_group.members.list()): study_group.members.create( { @@ -243,7 +250,7 @@ def setup_gitlab_repos( ) # Push to gitlab - dicom_study_ds.push(to="origin") + dicom_study_ds.push(to="origin", force="gitpush") def init_bids( @@ -319,17 +326,16 @@ def import_local_data( dicom_session_ds: dlad.Dataset, input_path: pathlib.Path, sort_series: bool = True, - p7z_opts: str = "-mx5", + p7z_opts: str = "-mx5 
-ms=off", ): dest = input_path.name if input_path.is_dir(): dest = dest + ".7z" # create 7z archive with 1block/file parameters - subprocess.run( - ["7z", "u", str(dest), "."] + p7z_opts.split(), - cwd=dicom_session_ds.path, - ) + cmd = ["7z", "u", str(dest), str(input_path)] + p7z_opts.split() + print(cmd) + subprocess.run(cmd, cwd=dicom_session_ds.path) elif input_path.is_file(): dest = dicom_session_ds.pathobj / dest try: # try hard-linking to avoid copying From 7ef85cff24878dca6a18d647cb64cbb4b6ded79c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 26 Jan 2024 13:23:15 -0500 Subject: [PATCH 13/26] add s3 remote init: wip --- docker/dicom_indexer/indexer/index_dicom.py | 33 +++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index 619c4fe..ab16f8d 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -334,7 +334,6 @@ def import_local_data( dest = dest + ".7z" # create 7z archive with 1block/file parameters cmd = ["7z", "u", str(dest), str(input_path)] + p7z_opts.split() - print(cmd) subprocess.run(cmd, cwd=dicom_session_ds.path) elif input_path.is_file(): dest = dicom_session_ds.pathobj / dest @@ -363,6 +362,7 @@ def export_to_ria( dicom_session_tag: str, session_metas: dict, export_ria_archive: bool = False, + ria_archive_7zopts: str = "-mx5 -ms=off", ): ria_name = pathlib.Path(ria_url.path).name ds.create_sibling_ria( @@ -379,7 +379,9 @@ def export_to_ria( ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"]) archive_path = ria_sibling_path / "archives" / "archive.7z" ds.export_archive_ora( - archive_path, opts=[f"-mx{COMPRESSION_LEVEL}"], missing_content="error" + archive_path, + opts=ria_archive_7zopts.split(), + missing_content="error", ) ds.repo.fsck(remote=f"{ria_url}-storage", fast=True) # index ds.push(to=ria_name, data="nothing") @@ -390,10 +392,31 @@ def export_to_s3( s3_url: urllib.parse.ParseResult, session_metas: dict, ): - ds.repo.initremote() + # TODO: check if we can reuse a single bucket (or per study) with fileprefix # git-annex initremote remotename ... 
- # git-annex wanted remotename include=**.{7z,tar.gz,zip} - # datalad push --data auto --to remotename + remote_name = s3_url.hostname + bucket_name, path = pathlib.Path(s3_url.path).parts + ds.repo.initremote( + remote_name, + [ + "type=S3", + "encryption=none", + "autoenable=true", + f"host={s3_url.hostname}", + "port=443", + "protocol=https", + "chunk=1GiB", + f"bucket={bucket_name}", + "requeststyle=path", + f"fileprefix={'/'.join(path)}", + ], + ) + ds.repo.set_preferred_content( + remote_name, + "include=**.{7z,tar.gz,zip}", + ) + + ds.push(to=remote_name) def connect_gitlab( From 95eabd62f6d9e87ab9c614dac9c8c8fc2d3d71ba Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 26 Jan 2024 14:57:58 -0500 Subject: [PATCH 14/26] ironing out things: wip --- docker/dicom_indexer/indexer/index_dicom.py | 81 ++++++++++++++------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index ab16f8d..d1e872a 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -12,12 +12,24 @@ import subprocess import yaml from contextlib import contextmanager +DEBUG = bool(os.environ.get("DEBUG", False)) -GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab") +GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "origin") GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME", None) +S3_REMOTE_DEFAULT_PARAMETERS = [ + "type=S3", + "encryption=none", + "autoenable=true", + "port=443", + "protocol=https", + "chunk=1GiB", + "requeststyle=path", +] + + # TODO: rewrite for pathlib.Path input def sort_series(path: pathlib.Path) -> None: """Sort series in separate folder @@ -139,7 +151,7 @@ def index_dicoms( ) -> dlad.Dataset: """Process incoming dicoms into datalad repo""" - with tempfile.TemporaryDirectory() as tmpdirname: + with tempfile.TemporaryDirectory(delete=not DEBUG) as tmpdirname: dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) if not input.scheme or input.scheme == "file": @@ -183,6 +195,24 @@ def export_data( export_to_s3(dicom_session_ds, output_remote, session_metas) +def set_bot_privileges(gitlab_conn: gitlab.Gitlab, gitlab_group_path: str) -> None: + # add maint permissions for the dicom bot user on the study repos + study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) + bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME) + if not bot_user: + raise RuntimeError( + f"bot_user: {GITLAB_BOT_USERNAME} does not exists in gitlab instance" + ) + bot_user = bot_user[0] + if not any(m.id == bot_user.id for m in study_group.members.list()): + study_group.members.create( + { + "user_id": bot_user.id, + "access_level": gitlab.const.AccessLevel.MAINTAINER, + } + ) + + def setup_gitlab_repos( dicom_session_ds: dlad.Dataset, gitlab_url: urllib.parse.ParseResult, @@ -207,31 +237,32 @@ def setup_gitlab_repos( name=GITLAB_REMOTE_NAME, url=dicom_session_repo._attrs["ssh_url_to_repo"], ) - # and push - dicom_session_ds.push(to=GITLAB_REMOTE_NAME) + """ + # prevent warnings + dicom_session_ds.config.add( + f"remote.{GITLAB_REMOTE_NAME}.annex-ignore", + value='false', + scope='local' + )""" - # add maint permissions for the dicom bot user on the study repos - study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) - bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME).get(0, None) - if not bot_user: - raise RuntimeError( - 
f"bot_user: {GITLAB_BOT_USERNAME} does not exists in gitlab instance" - ) - if not any(m.id == bot_user.id for m in study_group.members.list()): - study_group.members.create( - { - "user_id": bot_user.id, - "access_level": gitlab.const.AccessLevel.MAINTAINER, - } - ) + set_bot_privileges(gitlab_conn, gitlab_group_path) + # and push + dicom_session_ds.push(to=GITLAB_REMOTE_NAME, force='gitpush') ## add the session to the dicom study repo dicom_study_repo = get_or_create_gitlab_project(gitlab_conn, dicom_study_path) - with tempfile.TemporaryDirectory() as tmpdir: + with tempfile.TemporaryDirectory(delete=not DEBUG) as tmpdir: dicom_study_ds = dlad.install( source=dicom_study_repo._attrs["ssh_url_to_repo"], path=tmpdir, ) + """ + # prevent warnings when pushing + dicom_study_ds.config.add( + f"remote.origin.annex-ignore", + value='false', + scope='local' + )""" if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None: dicom_study_ds.create(force=True) @@ -250,7 +281,7 @@ def setup_gitlab_repos( ) # Push to gitlab - dicom_study_ds.push(to="origin", force="gitpush") + dicom_study_ds.push(to="origin") def init_bids( @@ -398,16 +429,10 @@ def export_to_s3( bucket_name, path = pathlib.Path(s3_url.path).parts ds.repo.initremote( remote_name, - [ - "type=S3", - "encryption=none", - "autoenable=true", + S3_REMOTE_DEFAULT_PARAMETERS + + [ f"host={s3_url.hostname}", - "port=443", - "protocol=https", - "chunk=1GiB", f"bucket={bucket_name}", - "requeststyle=path", f"fileprefix={'/'.join(path)}", ], ) From b6053a4c47a381da1f2bbacaa5cad2523fbfb22f Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 26 Jan 2024 15:41:13 -0500 Subject: [PATCH 15/26] wip: alpine docker with storescp + dicom indexer --- docker/dicom_indexer/Dockerfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docker/dicom_indexer/Dockerfile b/docker/dicom_indexer/Dockerfile index 21b30c0..231c935 100644 --- a/docker/dicom_indexer/Dockerfile +++ b/docker/dicom_indexer/Dockerfile @@ -1,10 +1,14 @@ -FROM alpine:3.17.2 +FROM alpine:3.19 RUN apk add --no-cache ca-certificates tzdata \ python3 py3-pip git openssh-client git-annex curl bzip2 bash glab\ && cp /usr/share/zoneinfo/UTC /etc/localtime \ && apk del tzdata \ && rm -rf /tmp/* /var/cache/apk/* +RUN apk add --no-cache dcmtk --repository=https://dl-cdn.alpinelinux.org/alpine/edge/testing -RUN pip install --no-cache-dir datalad ssh_agent_setup python-gitlab pydicom + +RUN pip install --break-system-packages --no-cache-dir datalad ssh_agent_setup python-gitlab pydicom pyyaml + +ADD indexer /indexer WORKDIR /work From 386a10de229867ee03e575c91cf3ccb8bc42a930 Mon Sep 17 00:00:00 2001 From: Milton Camacho Date: Tue, 30 Jan 2024 16:47:30 +0000 Subject: [PATCH 16/26] wip: dicom indexer --- docker/dicom_indexer/indexer/index_dicom.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index d1e872a..4ee4b7e 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -17,7 +17,7 @@ DEBUG = bool(os.environ.get("DEBUG", False)) GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "origin") GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME", None) - +BIDS_DEV_BRANCH = os.environ.get("BIDS_DEV_BRANCH", "dev") S3_REMOTE_DEFAULT_PARAMETERS = [ "type=S3", @@ -291,7 +291,7 @@ def init_bids( ) -> None: bids_project_repo = 
get_or_create_gitlab_project(gl, f"{gitlab_group_path}/bids")
     with tempfile.TemporaryDirectory() as tmpdir:
-        bids_project_ds = datalad.api.install(
+        bids_project_ds = dlad.install(
             source=bids_project_repo._attrs["ssh_url_to_repo"],
             path=tmpdir,
         )
@@ -305,7 +305,7 @@ def init_bids(
     # TODO: setup sensitive / non-sensitive S3 buckets
     bids_project_ds.push(to="origin")
     # create dev branch and push for merge requests
-    bids_project_ds.gitrepo.checkout(BIDS_DEV_BRANCH, ["-b"])
+    bids_project_ds.repo.checkout(BIDS_DEV_BRANCH, ["-b"])
     bids_project_ds.push(to="origin")
     # set protected branches
     bids_project_repo.protectedbranches.create(data={"name": "convert/*"})
@@ -427,7 +427,9 @@ def export_to_s3(
     # TODO: check if we can reuse a single bucket (or per study) with fileprefix
     # git-annex initremote remotename ...
     remote_name = s3_url.hostname
     bucket_name, path = pathlib.Path(s3_url.path).parts
-    ds.repo.initremote(
+
+    # TODO: change the bucket information to datalad information
+    ds.repo.init_remote(
         remote_name,
         S3_REMOTE_DEFAULT_PARAMETERS
         + [
             f"host={s3_url.hostname}",
             f"bucket={bucket_name}",
             f"fileprefix={'/'.join(path)}",
         ],
     )
     ds.repo.set_preferred_content(
-        remote_name,
+        "wanted",
         "include=**.{7z,tar.gz,zip}",
+        remote=remote_name
     )
-    ds.push(to=remote_name)
+    ds.push(to=remote_name, data='auto')
+    # It does not push the data to the S3 unless I set data="anything" which pushes everything including the deflated archived data

From ad5eb5bbb626f63fe2be897dc4cfb3f4cac50ead Mon Sep 17 00:00:00 2001
From: bpinsard
Date: Tue, 30 Jan 2024 13:43:32 -0500
Subject: [PATCH 17/26] add repo templates, fix s3 export

---
 docker/dicom_indexer/indexer/index_dicom.py | 12 +++++++-----
 .../indexer/repo_templates/bids/.all-contributorsrc | 5 +++++
 .../indexer/repo_templates/bids/.bidsignore | 3 +++
 .../indexer/repo_templates/bids/.derivatives | 1 +
 .../indexer/repo_templates/bids/.gitattributes | 13 +++++++++++++
 .../indexer/repo_templates/bids/.gitignore | 2 ++
 .../indexer/repo_templates/bids/.gitlab-ci.yml | 4 ++++
 .../indexer/repo_templates/bids/README | 6 ++++++
 .../repo_templates/dicom_study/.gitattributes | 12 ++++++++++++
 .../repo_templates/dicom_study/.gitlab-ci.yml | 6 ++++++
 10 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.all-contributorsrc
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.bidsignore
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.derivatives
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.gitattributes
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.gitignore
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/README
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitattributes
 create mode 100644 docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml

diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py
index d1e872a..f495a85 100644
--- a/docker/dicom_indexer/indexer/index_dicom.py
+++ b/docker/dicom_indexer/indexer/index_dicom.py
@@ -426,19 +426,21 @@ def export_to_s3(
     # TODO: check if we can reuse a single bucket (or per study) with fileprefix
     # git-annex initremote remotename ...
remote_name = s3_url.hostname - bucket_name, path = pathlib.Path(s3_url.path).parts - ds.repo.initremote( + _, bucket_name, *fileprefix = pathlib.Path(s3_url.path).parts + fileprefix.append(session_metas['StudyInstanceUID']+'/') + ds.repo.init_remote( remote_name, S3_REMOTE_DEFAULT_PARAMETERS + [ f"host={s3_url.hostname}", f"bucket={bucket_name}", - f"fileprefix={'/'.join(path)}", + f"fileprefix={'/'.join(fileprefix)}", ], ) ds.repo.set_preferred_content( - remote_name, - "include=**.{7z,tar.gz,zip}", + "wanted", + "include=*7z or include=*.tar.gz or include=*zip", + remote=remote_name, ) ds.push(to=remote_name) diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.all-contributorsrc b/docker/dicom_indexer/indexer/repo_templates/bids/.all-contributorsrc new file mode 100644 index 0000000..ead2a2a --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.all-contributorsrc @@ -0,0 +1,5 @@ +{ + "files": [ + "README" + ] +} diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.bidsignore b/docker/dicom_indexer/indexer/repo_templates/bids/.bidsignore new file mode 100644 index 0000000..e6dbe00 --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.bidsignore @@ -0,0 +1,3 @@ +**/anat/*localizer* +**/anat/*scout* +**/*__dup* diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.derivatives b/docker/dicom_indexer/indexer/repo_templates/bids/.derivatives new file mode 100644 index 0000000..c8a15dd --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.derivatives @@ -0,0 +1 @@ +qc/mriqc diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.gitattributes b/docker/dicom_indexer/indexer/repo_templates/bids/.gitattributes new file mode 100644 index 0000000..b63f3e1 --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.gitattributes @@ -0,0 +1,13 @@ +**/.git* annex.largefiles=nothing +* annex.largefiles=(largerthan=100kb) +*.yml annex.largefiles=nothing +*.json annex.largefiles=nothing +*.txt annex.largefiles=nothing +*.tsv annex.largefiles=nothing +*.nii.gz annex.largefiles=anything +*.tgz annex.largefiles=anything +*_scans.tsv annex.largefiles=anything +# annex event files as they contain subjects behavioral responses +sub-*/**/*_events.tsv annex.largefiles=anything +*.bk2 annex.largefiles=anything +.bidsignore annex.largefiles=nothing diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.gitignore b/docker/dicom_indexer/indexer/repo_templates/bids/.gitignore new file mode 100644 index 0000000..899093a --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.gitignore @@ -0,0 +1,2 @@ +.pybids_cache/** +.heudiconv diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml new file mode 100644 index 0000000..51dba8d --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml @@ -0,0 +1,4 @@ +include: + - project: 'unf/ni-dataops' + file: + - 'ci-pipelines/bids/bids_repo.yml' diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/README b/docker/dicom_indexer/indexer/repo_templates/bids/README new file mode 100644 index 0000000..eb7bcdd --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/README @@ -0,0 +1,6 @@ +# xyz dataset + +## Contributors + + + diff --git a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitattributes b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitattributes new file mode 100644 index 0000000..637ed86 --- /dev/null 
+++ b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitattributes @@ -0,0 +1,12 @@ +**/.git* annex.largefiles=nothing +* annex.largefiles=(largerthan=100kb) +*.yml annex.largefiles=nothing +*.json annex.largefiles=nothing +*.txt annex.largefiles=nothing +*.tsv annex.largefiles=nothing +*.nii.gz annex.largefiles=anything +*.tgz annex.largefiles=anything +*_scans.tsv annex.largefiles=anything +# annex event files as they contain subjects behavioral responses +sub-*/**/*_events.tsv annex.largefiles=anything +*.bk2 annex.largefiles=anything diff --git a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml new file mode 100644 index 0000000..78031e8 --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml @@ -0,0 +1,6 @@ + +include: + - local: /ci-env.yml + - project: 'unf/ni-dataops' + file: + - 'ci-pipelines/sources/dicoms_study.yml' From 010e5d7fc4934c2b39b17b9f7239ae8fd0137724 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Wed, 31 Jan 2024 10:31:07 -0500 Subject: [PATCH 18/26] use more pathlib --- docker/dicom_indexer/indexer/index_dicom.py | 50 ++++++++------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index d33ba5b..5adac1c 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -13,6 +13,8 @@ import yaml from contextlib import contextmanager DEBUG = bool(os.environ.get("DEBUG", False)) +if DEBUG: + logging.basicConfig(level=logging.DEBUG) GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "origin") GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) @@ -223,12 +225,10 @@ def setup_gitlab_repos( gitlab_conn = connect_gitlab(gitlab_url) # generate gitlab group/repo paths - gitlab_group_path = gitlab_group_template.format(**session_metas) - dicom_sourcedata_path = "/".join([gitlab_group_path, "sourcedata/dicoms"]) - dicom_session_path = "/".join( - [dicom_sourcedata_path, session_metas["StudyInstanceUID"]] - ) - dicom_study_path = "/".join([dicom_sourcedata_path, "study"]) + gitlab_group_path = pathlib.Path(gitlab_group_template.format(**session_metas)) + dicom_sourcedata_path = gitlab_group_path / "sourcedata/dicoms" + dicom_session_path = dicom_sourcedata_path / session_metas["StudyInstanceUID"] + dicom_study_path = dicom_sourcedata_path / "study" # create repo (should not exists unless rerun) dicom_session_repo = get_or_create_gitlab_project(gitlab_conn, dicom_session_path) @@ -247,7 +247,7 @@ def setup_gitlab_repos( set_bot_privileges(gitlab_conn, gitlab_group_path) # and push - dicom_session_ds.push(to=GITLAB_REMOTE_NAME, force="gitpush") + dicom_session_ds.push(to=GITLAB_REMOTE_NAME) ## add the session to the dicom study repo dicom_study_repo = get_or_create_gitlab_project(gitlab_conn, dicom_study_path) @@ -266,7 +266,6 @@ def setup_gitlab_repos( if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None: dicom_study_ds.create(force=True) - dicom_study_ds.push(to="origin") # add default study DS structure. 
init_dicom_study(dicom_study_ds, gitlab_group_path) # initialize BIDS project @@ -287,9 +286,9 @@ def setup_gitlab_repos( def init_bids( gl: gitlab.Gitlab, dicom_study_repo: dlad.Dataset, - gitlab_group_path: str, + gitlab_group_path: pathlib.Path, ) -> None: - bids_project_repo = get_or_create_gitlab_project(gl, f"{gitlab_group_path}/bids") + bids_project_repo = get_or_create_gitlab_project(gl, gitlab_group_path / "bids") with tempfile.TemporaryDirectory() as tmpdir: bids_project_ds = dlad.install( source=bids_project_repo._attrs["ssh_url_to_repo"], @@ -462,14 +461,14 @@ def connect_gitlab( def get_or_create_gitlab_group( gl: gitlab.Gitlab, - group_path: str, + group_path: pathlib.Path, ): """fetch or create a gitlab group""" - group_list = group_path.split("/") + group_list = group_path.parts found = False for keep_groups in reversed(range(len(group_list) + 1)): tmp_repo_path = "/".join(group_list[0:keep_groups]) - logging.warning(tmp_repo_path) + logging.debug(tmp_repo_path) gs = gl.groups.list(search=tmp_repo_path) for g in gs: if g.full_path == tmp_repo_path: @@ -479,16 +478,12 @@ def get_or_create_gitlab_group( break for nb_groups in range(keep_groups, len(group_list)): if nb_groups == 0: - msg = "Creating group {}".format(group_list[nb_groups]) - logging.warning(msg) - logging.warning(len(msg) * "=") + logging.debug(f"Creating group {group_list[nb_groups]}") g = gl.groups.create( {"name": group_list[nb_groups], "path": group_list[nb_groups]} ) else: - msg = "Creating group {} from {}".format(group_list[nb_groups], g.name) - logging.warning(msg) - logging.warning(len(msg) * "=") + logging.debug(f"Creating group {group_list[nb_groups]} from {g.name}") g = gl.groups.create( { "name": group_list[nb_groups], @@ -500,26 +495,19 @@ def get_or_create_gitlab_group( return g -def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: str): +def get_or_create_gitlab_project(gl: gitlab.Gitlab, project_path: pathlib.Path): """fetch or create a gitlab repo""" - project_name = project_path.split("/") - if len(project_name) == 1: - # Check if exists - p = gl.projects.list(search=project_name[0]) - if not p: - p = gl.projects.create({"name": project_name[0], "path": project_name[0]}) - return p.id - else: - return p[0].id + project_name = project_path.parts # Look for exact repo/project: p = gl.projects.list(search=project_name[-1]) if p: for curr_p in p: - if curr_p.path_with_namespace == project_path: + if curr_p.path_with_namespace == str(project_path): return curr_p - g = get_or_create_gitlab_group(gl, "/".join(project_name[:-1])) + g = get_or_create_gitlab_group(gl, project_path.parent) + logging.debug(f"Creating project {project_name[-1]} from {g.name}") p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id}) return p From 4339b20b3c98075d45b2c51bd2d6544173648ad7 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Wed, 31 Jan 2024 11:04:50 -0500 Subject: [PATCH 19/26] dicom_indexer docker:wip --- docker/dicom_indexer/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/dicom_indexer/Dockerfile b/docker/dicom_indexer/Dockerfile index 231c935..f7f4c4c 100644 --- a/docker/dicom_indexer/Dockerfile +++ b/docker/dicom_indexer/Dockerfile @@ -10,5 +10,8 @@ RUN apk add --no-cache dcmtk --repository=https://dl-cdn.alpinelinux.org/alpine/ RUN pip install --break-system-packages --no-cache-dir datalad ssh_agent_setup python-gitlab pydicom pyyaml ADD indexer /indexer +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh +ENTRYPOINT /entrypoint.sh WORKDIR /work 
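Note on the pathlib rework in the patch above: the GitLab layout that setup_gitlab_repos derives from session metadata can be exercised standalone. The sketch below is illustrative only; the metadata values and the group template are fabricated placeholders, and the caret-to-slash mapping simply mirrors the common convention of splitting a DICOM StudyDescription into nested groups:

    import pathlib

    # fabricated metadata, shaped like what extract_session_metas() returns
    session_metas = {
        "ReferringPhysicianName": "lab",
        "StudyDescription": "project^study",
        "StudyInstanceUID": "1.2.840.113619.2.55.99",
    }
    # hypothetical template; '^' separators become nested gitlab subgroups
    gitlab_group_template = "{ReferringPhysicianName}/{StudyDescription}"
    gitlab_group_path = pathlib.Path(
        gitlab_group_template.format(**session_metas).replace("^", "/")
    )
    dicom_sourcedata_path = gitlab_group_path / "sourcedata/dicoms"
    # one repo per session, one study-wide repo, one BIDS repo
    print(dicom_sourcedata_path / session_metas["StudyInstanceUID"])
    # lab/project/study/sourcedata/dicoms/1.2.840.113619.2.55.99
    print(dicom_sourcedata_path / "study")  # lab/project/study/sourcedata/dicoms/study
    print(gitlab_group_path / "bids")       # lab/project/study/bids

get_or_create_gitlab_group then walks the components of such a path, reusing the deepest group that already exists and creating only the missing tail, so reruns on the same study reuse the existing hierarchy.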
From 44105daf4ad46dfb8a0e4dd63e4e3ef053518c82 Mon Sep 17 00:00:00 2001
From: bpinsard
Date: Wed, 31 Jan 2024 13:38:32 -0500
Subject: [PATCH 20/26] add command/entrypoint

---
 docker/dicom_indexer/Dockerfile | 6 ++---
 docker/dicom_indexer/entrypoint.sh | 37 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)
 create mode 100644 docker/dicom_indexer/entrypoint.sh

diff --git a/docker/dicom_indexer/Dockerfile b/docker/dicom_indexer/Dockerfile
index f7f4c4c..22ec83d 100644
--- a/docker/dicom_indexer/Dockerfile
+++ b/docker/dicom_indexer/Dockerfile
@@ -1,6 +1,6 @@
-FROM alpine:3.19
+FROM python:3.12-alpine
 RUN apk add --no-cache ca-certificates tzdata \
-    python3 py3-pip git openssh-client git-annex curl bzip2 bash glab\
+    git openssh-client git-annex curl bzip2 bash glab\
     && cp /usr/share/zoneinfo/UTC /etc/localtime \
     && apk del tzdata \
     && rm -rf /tmp/* /var/cache/apk/*
@@ -12,6 +12,6 @@ RUN pip install --break-system-packages --no-cache-dir datalad ssh_agent_setup p
 ADD indexer /indexer
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
-ENTRYPOINT /entrypoint.sh
+CMD /entrypoint.sh

 WORKDIR /work
diff --git a/docker/dicom_indexer/entrypoint.sh b/docker/dicom_indexer/entrypoint.sh
new file mode 100644
index 0000000..05d50ac
--- /dev/null
+++ b/docker/dicom_indexer/entrypoint.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+CONTAINER_ID=$(basename $(cat /proc/1/cpuset))
+GITLAB_TOKEN_SECRET=/var/run/secrets/dicom_bot_gitlab_token
+
+# only export keys when deploying as a service on swarm
+# TODO: should try using gitlab runner mechanism if not
+if [ -e $GITLAB_TOKEN_SECRET ] ; then
+    # generate container specific ssh-key
+    ssh-keygen -f /root/.ssh/id_rsa -N ''
+    # register it for dicom_bot user
+    curl -X POST -F "private_token=$(cat $GITLAB_TOKEN_SECRET)" \
+      -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12} -F "key=$(cat ~/.ssh/id_rsa.pub)" \
+      "${GITLAB_API_URL}/user/keys"
+  fi
+
+git config --global init.defaultBranch main
+
+
+# example
+# /usr/bin/storescp \
+#    -aet DICOM_SERVER_SEQUOIA\
+#    -pm\
+#    -od $DICOM_TMP_DIR -su ''\
+#    --eostudy-timeout ${STORESCP_STUDY_TIMEOUT:=60} \
+#    --exec-on-eostudy "python3 $DICOM_ROOT/exec_on_study_received.py #p " 2100 >> $DICOM_DATA_ROOT/storescp.log
+
+# run whatever command was passed (storescp or python index_dicoms directly)
+echo $@
+bash -c "$@"
+
+if [ -e $GITLAB_TOKEN_SECRET ] ; then
+    # unregister the temporary ssh key
+    curl -X DELETE -F "private_token=$(cat $GITLAB_TOKEN_SECRET)" \
+      -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12}
+      "${GITLAB_API_URL}/user/keys"
+fi
From c336879fa51aca22c256c241da2e0c45b56a3c78 Mon Sep 17 00:00:00 2001
From: Milton Camacho
Date: Wed, 31 Jan 2024 19:38:49 +0000
Subject: [PATCH 21/26] str to pathlib.Path correction

---
 docker/dicom_indexer/indexer/index_dicom.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py
index 5adac1c..859e72f 100644
--- a/docker/dicom_indexer/indexer/index_dicom.py
+++ b/docker/dicom_indexer/indexer/index_dicom.py
@@ -197,7 +197,7 @@ def export_data(
         export_to_s3(dicom_session_ds, output_remote, session_metas)


-def set_bot_privileges(gitlab_conn: gitlab.Gitlab, gitlab_group_path: str) -> None:
+def set_bot_privileges(gitlab_conn: gitlab.Gitlab, gitlab_group_path: pathlib.Path) -> None:
     # add maint permissions for the dicom bot user on the study repos
     study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path)
     bot_user =
gitlab_conn.users.list(username=GITLAB_BOT_USERNAME)
@@ -271,8 +271,8 @@ def setup_gitlab_repos(
     # initialize BIDS project
     init_bids(gitlab_conn, dicom_study_repo, gitlab_group_path)
     # create subgroup for QC and derivatives repos
-    get_or_create_gitlab_group(gitlab_conn, f"{gitlab_group_path}/derivatives")
-    get_or_create_gitlab_group(gitlab_conn, f"{gitlab_group_path}/qc")
+    get_or_create_gitlab_group(gitlab_conn, gitlab_group_path / "derivatives")
+    get_or_create_gitlab_group(gitlab_conn, gitlab_group_path / "qc")

     dicom_study_ds.install(
         source=dicom_session_repo._attrs["ssh_url_to_repo"],
From 483cc64d53b45706ff3204d22592d025687f9f71 Mon Sep 17 00:00:00 2001
From: Milton Camacho
Date: Wed, 31 Jan 2024 20:58:41 +0000
Subject: [PATCH 22/26] corrected creation of ci-env.yml and more use of pathlib

---
 docker/dicom_indexer/indexer/index_dicom.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py
index 859e72f..5243abf 100644
--- a/docker/dicom_indexer/indexer/index_dicom.py
+++ b/docker/dicom_indexer/indexer/index_dicom.py
@@ -313,18 +313,18 @@ def init_dicom_study(
     dicom_study_ds: dlad.Dataset,
-    gitlab_group_path: str,
+    gitlab_group_path: pathlib.Path,
 ) -> None:
     shutil.copytree(
         "repo_templates/dicom_study", dicom_study_ds.path, dirs_exist_ok=True
     )
     env = {
         "variables": {
-            "STUDY_PATH": gitlab_group_path,
-            "BIDS_PATH": f"{gitlab_group_path}/bids",
+            "STUDY_PATH": str(gitlab_group_path),
+            "BIDS_PATH": str(gitlab_group_path / "bids"),
         }
     }
-    with open(os.path.join(dicom_study_ds.path, "ci-env.yml"), "w") as outfile:
+    with (pathlib.Path(dicom_study_ds.path) / "ci-env.yml").open("w") as outfile:
         yaml.dump(env, outfile, default_flow_style=False)
     dicom_study_ds.save(path=".", message="init structure and pipelines")
     dicom_study_ds.push(to="origin")
From 3005669ebab4e3a905ac8d9707a9cfd591d966e2 Mon Sep 17 00:00:00 2001
From: bpinsard
Date: Fri, 2 Feb 2024 10:40:52 -0500
Subject: [PATCH 23/26] set ni-dataops path as an env var

---
 .gitlab-ci.yml | 2 ++
 docker/dicom_indexer/indexer/index_dicom.py | 16 +++++++++++++---
 .../indexer/repo_templates/bids/.gitlab-ci.yml | 3 ++-
 .../repo_templates/dicom_study/.gitlab-ci.yml | 4 ++--
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 919358b..de079ad 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -59,3 +59,5 @@ build_all_dockers:
     - heudiconv
     - deface
     - dicom_indexer
+
+# TODO: add deploy job for dicom indexer, if deployed with storescp
diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py
index 5243abf..bfe8fe8 100644
--- a/docker/dicom_indexer/indexer/index_dicom.py
+++ b/docker/dicom_indexer/indexer/index_dicom.py
@@ -20,6 +20,7 @@ GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "origin")
 GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None)
 GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME", None)
 BIDS_DEV_BRANCH = os.environ.get("BIDS_DEV_BRANCH", "dev")
+NI_DATAOPS_GITLAB_ROOT = os.environ.get("NI_DATAOPS_GITLAB_ROOT", "ni-dataops")

 S3_REMOTE_DEFAULT_PARAMETERS = [
     "type=S3",
@@ -296,6 +297,7 @@ def init_bids(
     )
     bids_project_ds.create(force=True)
     shutil.copytree("repo_templates/bids", bids_project_ds.path, dirs_exist_ok=True)
+    write_ci_env(bids_project_ds, gitlab_group_path)
     bids_project_ds.save(path=".", message="init structure and pipelines")
     bids_project_ds.install(
         path="sourcedata/dicoms",
@@
-318,16 +320,24 @@ def init_dicom_study( shutil.copytree( "repo_templates/dicom_study", dicom_study_ds.path, dirs_exist_ok=True ) + write_ci_env(dicom_study_ds, gitlab_group_path) + dicom_study_ds.save(path=".", message="init structure and pipelines") + dicom_study_ds.push(to="origin") + + +def write_ci_env( + ds: dlad.Dataset + gitlab_group_path: pathlib.Path +): env = { "variables": { "STUDY_PATH": str(gitlab_group_path), "BIDS_PATH": str(gitlab_group_path / "bids"), + "NI_DATAOPS_GITLAB_ROOT": NI_DATAOPS_GITLAB_ROOT, } } - with (pathlib.Path(dicom_study_ds.path) / "ci-env.yml").open("w") as outfile: + with (pathlib.Path(ds.path) / ".ci-env.yml").open("w") as outfile: yaml.dump(env, outfile, default_flow_style=False) - dicom_study_ds.save(path=".", message="init structure and pipelines") - dicom_study_ds.push(to="origin") SESSION_META_KEYS = [ diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml index 51dba8d..606af35 100644 --- a/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml @@ -1,4 +1,5 @@ include: - - project: 'unf/ni-dataops' + - local: /.ci-env.yml + - project: "$NI_DATAOPS_GITLAB_ROOT/ci-pipelines" file: - 'ci-pipelines/bids/bids_repo.yml' diff --git a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml index 78031e8..aaa831e 100644 --- a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml +++ b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml @@ -1,6 +1,6 @@ include: - - local: /ci-env.yml - - project: 'unf/ni-dataops' + - local: /.ci-env.yml + - project: "$NI_DATAOPS_GITLAB_ROOT/ci-pipelines" file: - 'ci-pipelines/sources/dicoms_study.yml' From fe95059f1c990575683b618a272d2e799f1df1f7 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Thu, 8 Feb 2024 13:40:01 -0500 Subject: [PATCH 24/26] wip --- docker/dicom_indexer/Dockerfile | 4 ++-- docker/dicom_indexer/entrypoint.sh | 23 +++++++++++-------- .../repo_templates/bids/.gitlab-ci.yml | 1 + .../repo_templates/dicom_study/.gitlab-ci.yml | 1 + 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/docker/dicom_indexer/Dockerfile b/docker/dicom_indexer/Dockerfile index 22ec83d..e7c3e77 100644 --- a/docker/dicom_indexer/Dockerfile +++ b/docker/dicom_indexer/Dockerfile @@ -1,5 +1,5 @@ FROM python:3.12-alpine -RUN apk add --no-cache ca-certificates tzdata \ +RUN apk add --no-cache ca-certificates tzdata 7zip jq\ git openssh-client git-annex curl bzip2 bash glab\ && cp /usr/share/zoneinfo/UTC /etc/localtime \ && apk del tzdata \ @@ -12,6 +12,6 @@ RUN pip install --break-system-packages --no-cache-dir datalad ssh_agent_setup p ADD indexer /indexer COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh -CMD /entrypoint.sh +ENTRYPOINT ["bash", "/entrypoint.sh"] WORKDIR /work diff --git a/docker/dicom_indexer/entrypoint.sh b/docker/dicom_indexer/entrypoint.sh index 05d50ac..1ae1665 100644 --- a/docker/dicom_indexer/entrypoint.sh +++ b/docker/dicom_indexer/entrypoint.sh @@ -1,21 +1,24 @@ #!/bin/bash CONTAINER_ID=$(basename $(cat /proc/1/cpuset)) -GITLAB_TOKEN_SECRET=/var/run/secrets/dicom_bot_gitlab_token +GITLAB_TOKEN_SECRET=$(cat /var/run/secrets/dicom_bot_gitlab_token 2>/dev/null) +GITLAB_TOKEN=${GITLAB_TOKEN_SECRET:=$GITLAB_TOKEN} # only export keys when deploying as a service on swarm # TODO: should try using gitlab runner 
mechanism if not -if [ -e $GITLAB_TOKEN_SECRET ] ; then +if [ -n "${GITLAB_TOKEN}" ] ; then # generate container specific ssh-key ssh-keygen -f /root/.ssh/id_rsa -N '' # register it for dicom_bot user - curl -X POST -F "private_token=$(cat $GITLAB_TOKEN_SECRET)" \ + echo 'registering the ssh key' + ssh_key_json=$(curl -X POST -F "private_token=${GITLAB_TOKEN}" \ -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12} -F "key=$(cat ~/.ssh/id_rsa.pub)" \ - "${GITLAB_API_URL}/user/keys" + "${GITLAB_API_URL}/user/keys") fi git config --global init.defaultBranch main - +mkdir -p ~/.ssh/known_hosts +install -m 600 /dev/stdin ~/.ssh/known_hosts <<< "$SSH_KNOWN_HOSTS" # example # /usr/bin/storescp \ @@ -26,12 +29,12 @@ git config --global init.defaultBranch main # --exec-on-eostudy "python3 $DICOM_ROOT/exec_on_study_received.py #p " 2100 >> $DICOM_DATA_ROOT/storescp.log # run whatever command was passed (storescp or python index_dicoms directly) -echo $@ -bash -c "$@" +$@ -if [ -e $GITLAB_TOKEN_SECRET ] ; then +if [ -n "${GITLAB_TOKEN}" ] ; then # unregister the temporary ssh key - curl -X DELETE -F "private_token=$(cat $GITLAB_TOKEN_SECRET)" \ + ssh_key_id=$(jq .id <<< $ssh_key_json) + curl -X DELETE -F "private_token=${GITLAB_TOKEN}" \ -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12} - "${GITLAB_API_URL}/user/keys" + "${GITLAB_API_URL}/users/keys/${ssh_key_id}" fi diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml index 606af35..dec518c 100644 --- a/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.gitlab-ci.yml @@ -1,5 +1,6 @@ include: - local: /.ci-env.yml - project: "$NI_DATAOPS_GITLAB_ROOT/ci-pipelines" + ref: refactor file: - 'ci-pipelines/bids/bids_repo.yml' diff --git a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml index aaa831e..e48223a 100644 --- a/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml +++ b/docker/dicom_indexer/indexer/repo_templates/dicom_study/.gitlab-ci.yml @@ -2,5 +2,6 @@ include: - local: /.ci-env.yml - project: "$NI_DATAOPS_GITLAB_ROOT/ci-pipelines" + ref: refactor file: - 'ci-pipelines/sources/dicoms_study.yml' From 559ab180161d2406972e2f40349c983597a26844 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 9 Feb 2024 10:22:18 -0500 Subject: [PATCH 25/26] fix ssh server key --- docker/dicom_indexer/entrypoint.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/dicom_indexer/entrypoint.sh b/docker/dicom_indexer/entrypoint.sh index 1ae1665..560db8e 100644 --- a/docker/dicom_indexer/entrypoint.sh +++ b/docker/dicom_indexer/entrypoint.sh @@ -1,9 +1,12 @@ #!/bin/bash -CONTAINER_ID=$(basename $(cat /proc/1/cpuset)) +export CONTAINER_ID=$(basename $(cat /proc/1/cpuset)) GITLAB_TOKEN_SECRET=$(cat /var/run/secrets/dicom_bot_gitlab_token 2>/dev/null) -GITLAB_TOKEN=${GITLAB_TOKEN_SECRET:=$GITLAB_TOKEN} +export GITLAB_TOKEN=${GITLAB_TOKEN_SECRET:=$GITLAB_TOKEN} +export GITLAB_API_URL=https://${CI_SERVER_HOST}/api/v4 +export GIT_SSH_PORT=${GIT_SSH_PORT:=222} +mkdir -p ~/.ssh # only export keys when deploying as a service on swarm # TODO: should try using gitlab runner mechanism if not if [ -n "${GITLAB_TOKEN}" ] ; then @@ -11,14 +14,14 @@ if [ -n "${GITLAB_TOKEN}" ] ; then ssh-keygen -f /root/.ssh/id_rsa -N '' # register it for dicom_bot user echo 
'registering the ssh key' - ssh_key_json=$(curl -X POST -F "private_token=${GITLAB_TOKEN}" \ - -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12} -F "key=$(cat ~/.ssh/id_rsa.pub)" \ + export ssh_key_json=$(curl -X POST -F "private_token=${GITLAB_TOKEN}" \ + -F "title="${HOSTNAME} -F "key=$(cat ~/.ssh/id_rsa.pub)" \ "${GITLAB_API_URL}/user/keys") - fi + export ssh_key_id=$(jq .id <<< "$ssh_key_json") +fi git config --global init.defaultBranch main -mkdir -p ~/.ssh/known_hosts -install -m 600 /dev/stdin ~/.ssh/known_hosts <<< "$SSH_KNOWN_HOSTS" +ssh-keyscan -p ${GIT_SSH_PORT} -H ${CI_SERVER_HOST} | install -m 600 /dev/stdin $HOME/.ssh/known_hosts # example # /usr/bin/storescp \ @@ -33,8 +36,5 @@ $@ if [ -n "${GITLAB_TOKEN}" ] ; then # unregister the temporary ssh key - ssh_key_id=$(jq .id <<< $ssh_key_json) - curl -X DELETE -F "private_token=${GITLAB_TOKEN}" \ - -F "title="$(cat /etc/hostname)${CONTAINER_ID:0:12} - "${GITLAB_API_URL}/users/keys/${ssh_key_id}" + curl -X DELETE -F "private_token=${GITLAB_TOKEN}" "${GITLAB_API_URL}/user/keys/${ssh_key_id}" fi From 6385f4f72f7af103fbfe10373d2fdb62e65a8e5c Mon Sep 17 00:00:00 2001 From: bpinsard Date: Fri, 9 Feb 2024 10:22:49 -0500 Subject: [PATCH 26/26] fix bids init with hidden file --- docker/dicom_indexer/indexer/index_dicom.py | 53 +++++++++++++++---- .../indexer/repo_templates/bids/.bids_init | 1 + 2 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 docker/dicom_indexer/indexer/repo_templates/bids/.bids_init diff --git a/docker/dicom_indexer/indexer/index_dicom.py b/docker/dicom_indexer/indexer/index_dicom.py index bfe8fe8..ea1396b 100644 --- a/docker/dicom_indexer/indexer/index_dicom.py +++ b/docker/dicom_indexer/indexer/index_dicom.py @@ -4,6 +4,7 @@ import argparse import pathlib import urllib.parse import datalad.api as dlad +import datalad.config import shutil import gitlab import tempfile @@ -12,6 +13,10 @@ import subprocess import yaml from contextlib import contextmanager +REPO_TEMPLATES_PATH = ( + pathlib.Path(os.path.dirname(os.path.realpath(__file__))) / "repo_templates" +) + DEBUG = bool(os.environ.get("DEBUG", False)) if DEBUG: logging.basicConfig(level=logging.DEBUG) @@ -19,6 +24,7 @@ if DEBUG: GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "origin") GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN", None) GITLAB_BOT_USERNAME = os.environ.get("GITLAB_BOT_USERNAME", None) +GITLAB_BOT_EMAIL = os.environ.get("GITLAB_BOT_EMAIL", None) BIDS_DEV_BRANCH = os.environ.get("BIDS_DEV_BRANCH", "dev") NI_DATAOPS_GITLAB_ROOT = os.environ.get("NI_DATAOPS_GITLAB_ROOT", "ni-dataops") @@ -33,6 +39,23 @@ S3_REMOTE_DEFAULT_PARAMETERS = [ ] +def git_global_setup( + storage_remote_url: urllib.parse.ParseResult, scope="global" +) -> None: + git_config = datalad.config.ConfigManager() + git_config.add("user.name", GITLAB_BOT_USERNAME, scope=scope) + git_config.add("user.email", GITLAB_BOT_EMAIL, scope=scope) + if storage_remote_url.scheme == "s3": + import socket + + s3_ip = socket.gethostbyname(storage_remote_url.hostname) + git_config.add( + "annex.security.allowed-ip-addresses", + s3_ip, + scope=scope, + ) + + # TODO: rewrite for pathlib.Path input def sort_series(path: pathlib.Path) -> None: """Sort series in separate folder @@ -82,7 +105,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: p.add_argument("--storage-remote", help="url to the datalad remote") p.add_argument( "--sort-series", - type=bool, + action="store_true", default=True, help="sort dicom series in separate folders", ) @@ -115,6 +138,8 @@ def main() 
-> None: output_remote = urllib.parse.urlparse(args.storage_remote) gitlab_url = urllib.parse.urlparse(args.gitlab_url) + git_global_setup(output_remote) + with index_dicoms( input, sort_series=args.sort_series, @@ -177,7 +202,7 @@ def index_dicoms( # cannot pass message above so commit now dicom_session_ds.save(message=f"index dicoms from archive {archive}") # # optimize git index after large import - dicom_session_ds.repo.gc() # aggressive by default + #dicom_session_ds.repo.gc() # aggressive by default yield dicom_session_ds @@ -198,7 +223,9 @@ def export_data( export_to_s3(dicom_session_ds, output_remote, session_metas) -def set_bot_privileges(gitlab_conn: gitlab.Gitlab, gitlab_group_path: pathlib.Path) -> None: +def set_bot_privileges( + gitlab_conn: gitlab.Gitlab, gitlab_group_path: pathlib.Path +) -> None: # add maint permissions for the dicom bot user on the study repos study_group = get_or_create_gitlab_group(gitlab_conn, gitlab_group_path) bot_user = gitlab_conn.users.list(username=GITLAB_BOT_USERNAME) @@ -295,9 +322,11 @@ def init_bids( source=bids_project_repo._attrs["ssh_url_to_repo"], path=tmpdir, ) - bids_project_ds.create(force=True) - shutil.copytree("repo_templates/bids", bids_project_ds.path, dirs_exist_ok=True) + shutil.copytree( + REPO_TEMPLATES_PATH / "bids", bids_project_ds.path, dirs_exist_ok=True + ) write_ci_env(bids_project_ds, gitlab_group_path) + bids_project_ds.create(force=True) bids_project_ds.save(path=".", message="init structure and pipelines") bids_project_ds.install( path="sourcedata/dicoms", @@ -318,7 +347,7 @@ def init_dicom_study( gitlab_group_path: pathlib.Path, ) -> None: shutil.copytree( - "repo_templates/dicom_study", dicom_study_ds.path, dirs_exist_ok=True + REPO_TEMPLATES_PATH / "dicom_study", dicom_study_ds.path, dirs_exist_ok=True ) write_ci_env(dicom_study_ds, gitlab_group_path) dicom_study_ds.save(path=".", message="init structure and pipelines") @@ -326,14 +355,13 @@ def init_dicom_study( def write_ci_env( - ds: dlad.Dataset - gitlab_group_path: pathlib.Path -): + ds: dlad.Dataset, + gitlab_group_path: pathlib.Path, +) -> None: env = { "variables": { "STUDY_PATH": str(gitlab_group_path), "BIDS_PATH": str(gitlab_group_path / "bids"), - "NI_DATAOPS_GITLAB_ROOT": NI_DATAOPS_GITLAB_ROOT, } } with (pathlib.Path(ds.path) / ".ci-env.yml").open("w") as outfile: @@ -435,7 +463,10 @@ def export_to_s3( # TODO: check if we can reuse a single bucket (or per study) with fileprefix # git-annex initremote remotename ... remote_name = s3_url.hostname - _, bucket_name, *fileprefix = pathlib.Path(s3_url.path).parts + s3_path = s3_url.path + if '{' in s3_path: + s3_path = s3_path.format(**session_metas) + _, bucket_name, *fileprefix = pathlib.Path(s3_path).parts fileprefix.append(session_metas["StudyInstanceUID"] + "/") ds.repo.init_remote( remote_name, diff --git a/docker/dicom_indexer/indexer/repo_templates/bids/.bids_init b/docker/dicom_indexer/indexer/repo_templates/bids/.bids_init new file mode 100644 index 0000000..a2f7595 --- /dev/null +++ b/docker/dicom_indexer/indexer/repo_templates/bids/.bids_init @@ -0,0 +1 @@ +dummy file to trigger ci to init BIDS branches and remotes
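As a closing reference, the S3 parameter derivation that the series converges on (patches 13, 17 and 26 above) can be traced with a standalone snippet. The URL and metadata values below are fabricated for illustration, and only the three URL-derived parameters are shown; the remaining ones come from S3_REMOTE_DEFAULT_PARAMETERS:

    import pathlib
    import urllib.parse

    # fabricated inputs
    session_metas = {"PatientID": "sub-01", "StudyInstanceUID": "1.2.840.99"}
    s3_url = urllib.parse.urlparse("s3://minio.example.org/dicom-store/{PatientID}")

    # templated path segments are expanded from session metadata (patch 26)
    s3_path = s3_url.path
    if "{" in s3_path:
        s3_path = s3_path.format(**session_metas)
    # Path("/dicom-store/sub-01").parts == ('/', 'dicom-store', 'sub-01'):
    # skip the leading '/', take the bucket, keep the rest as the key prefix
    _, bucket_name, *fileprefix = pathlib.Path(s3_path).parts
    fileprefix.append(session_metas["StudyInstanceUID"] + "/")

    print(f"host={s3_url.hostname}")             # host=minio.example.org
    print(f"bucket={bucket_name}")               # bucket=dicom-store
    print(f"fileprefix={'/'.join(fileprefix)}")  # fileprefix=sub-01/1.2.840.99/

Appended to S3_REMOTE_DEFAULT_PARAMETERS, these strings are what ds.repo.init_remote hands to git-annex initremote; the trailing '/' keeps each session's keys under its own prefix in the bucket.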