wip: dicom indexer

This commit is contained in:
bpinsard 2024-01-24 09:15:36 -05:00
parent a579defbb0
commit 54881b3166
1 changed files with 102 additions and 105 deletions

View File

@ -7,7 +7,8 @@ import datalad.api as dlad
import shutil import shutil
GITLAB_REMOTE_NAME = os.environ.get('GITLAB_REMOTE_NAME', 'gitlab') GITLAB_REMOTE_NAME = os.environ.get("GITLAB_REMOTE_NAME", "gitlab")
def sort_series(path: str) -> None: def sort_series(path: str) -> None:
"""Sort series in separate folder """Sort series in separate folder
@ -18,7 +19,7 @@ def sort_series(path: str) -> None:
path to dicoms path to dicoms
""" """
files = glob.glob(os.path.join(path, '*')) files = glob.glob(os.path.join(path, "*"))
for f in files: for f in files:
if not os.path.isfile(f): if not os.path.isfile(f):
continue continue
@ -33,18 +34,21 @@ def sort_series(path: str) -> None:
def _build_arg_parser() -> argparse.ArgumentParser: def _build_arg_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser( p = argparse.ArgumentParser(
description="dicom_indexer - indexes dicoms into datalad") description="dicom_indexer - indexes dicoms into datalad"
)
p.add_argument("input", help="path/url of the dicom.")
p.add_argument( p.add_argument(
'input', nargs='+', "--gitlab-url",
help='path/url of the dicom.') type=str,
p.add_argument() help="http(s) url to the gitlab server where to push repos",
)
p.add_argument( p.add_argument(
'gitlab_group_template', "--gitlab-group-template",
default='{ReferringPhysicianName}/{StudyDescription.replace('^','/')}' default="{ReferringPhysicianName}/{StudyDescription.replace(" ^ "," / ")}",
type=str) type=str,
p.add_argument( help="string with placeholder for dicom tags",
'--storage-remote', )
help='url to the datalad remote') p.add_argument("--storage-remote", help="url to the datalad remote")
p.add_argument( p.add_argument(
"--sort-series", "--sort-series",
action="store_true", action="store_true",
@ -60,8 +64,8 @@ def _build_arg_parser() -> argparse.ArgumentParser:
) )
return p return p
def main() -> None:
def main() -> None:
parser = _build_arg_parser() parser = _build_arg_parser()
args = parser.parse_args() args = parser.parse_args()
@ -76,25 +80,24 @@ def main() -> None:
fake_dates=p.fake_dates, fake_dates=p.fake_dates,
) )
def process( def process(
input:urllib.parse.ParseResult, input: urllib.parse.ParseResult,
output_remote: urllib.parse.ParseResult, output_remote: urllib.parse.ParseResult,
sort_series: bool, sort_series: bool,
fake_dates: bool, fake_dates: bool,
p7z_opts: str, p7z_opts: str,
gitlab_url: urllib.parse.ParseResult, gitlab_url: urllib.parse.ParseResult,
gitlab_group_template: str, gitlab_group_template: str,
force_export: bool=False, force_export: bool = False,
) -> None: ) -> None:
"""Process incoming dicoms into datalad repo """Process incoming dicoms into datalad repo"""
"""
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates) dicom_session_ds = dlad.create(tmpdirname, fake_dates=fake_dates)
do_export = force_export do_export = force_export
if not input.scheme or input.scheme == 'file': if not input.scheme or input.scheme == "file":
dest = import_local_data( dest = import_local_data(
dicom_session_ds, dicom_session_ds,
pathlib.Path(input.path), pathlib.Path(input.path),
@ -102,7 +105,7 @@ def process(
p7z_opts=p7z_opts, p7z_opts=p7z_opts,
) )
do_export = True do_export = True
elif input.scheme in ['http', 'https', 's3']: elif input.scheme in ["http", "https", "s3"]:
dest = import_remote_data(dicom_session_ds, input_url) dest = import_remote_data(dicom_session_ds, input_url)
# index dicoms files # index dicoms files
@ -112,25 +115,21 @@ def process(
commit=False, commit=False,
) )
# cannot pass message above so commit now # cannot pass message above so commit now
dicom_session_ds.save(message='index dicoms from archive')# dicom_session_ds.save(message="index dicoms from archive") #
# optimize git index after large import # optimize git index after large import
dicom_session_ds.repo.gc() # aggressive by default dicom_session_ds.repo.gc() # aggressive by default
session_metas = extract_session_metas(dicom_session_ds) session_metas = extract_session_metas(dicom_session_ds)
if do_export: if do_export:
if output_remote.scheme == 'ria': if output_remote.scheme == "ria":
export_to_ria(dicom_session_ds, output_remote, session_metas) export_to_ria(dicom_session_ds, output_remote, session_metas)
elif output_remote.scheme == 's3': elif output_remote.scheme == "s3":
export_to_s3(dicom_session_ds, output_remote, session_metas) export_to_s3(dicom_session_ds, output_remote, session_metas)
setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas) setup_gitlab_remote(dicom_session_ds, gitlab_url, session_metas)
def setup_gitlab_repos( def setup_gitlab_repos(
dicom_session_ds: dlad.Dataset, dicom_session_ds: dlad.Dataset,
gitlab_url: urllib.parse.ParseResult, gitlab_url: urllib.parse.ParseResult,
@ -139,36 +138,37 @@ def setup_gitlab_repos(
gitlab_conn = connect_gitlab() gitlab_conn = connect_gitlab()
gitlab_group_path = gitlab_group_template.format(session_metas) gitlab_group_path = gitlab_group_template.format(session_metas)
dicom_sourcedata_path = '/'.join([dicom_session_path, 'sourcedata/dicoms']) dicom_sourcedata_path = "/".join([dicom_session_path, "sourcedata/dicoms"])
dicom_session_path = '/'.join([dicom_sourcedata_path, ['StudyInstanceUID']]) dicom_session_path = "/".join([dicom_sourcedata_path, ["StudyInstanceUID"]])
dicom_study_path = '/'.join([dicom_sourcedata_path, 'study']) dicom_study_path = "/".join([dicom_sourcedata_path, "study"])
dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path) dicom_session_repo = get_or_create_gitlab_project(gl, dicom_session_path)
ds.siblings( ds.siblings(
action='configure', # allow to overwrite existing config action="configure", # allow to overwrite existing config
name=GITLAB_REMOTE_NAME, name=GITLAB_REMOTE_NAME,
url=dicom_session_repo._attrs['ssh_url_to_repo'], url=dicom_session_repo._attrs["ssh_url_to_repo"],
) )
ds.push(to=GITLAB_REMOTE_NAME) ds.push(to=GITLAB_REMOTE_NAME)
study_group = get_or_create_group(gl, gitlab_group_path) study_group = get_or_create_group(gl, gitlab_group_path)
bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0] bot_user = gl.users.list(username=GITLAB_BOT_USERNAME)[0]
study_group.members.create({ study_group.members.create(
'user_id': bot_user.id, {
'access_level': gitlab.const.AccessLevel.MAINTAINER, "user_id": bot_user.id,
}) "access_level": gitlab.const.AccessLevel.MAINTAINER,
}
)
dicom_study_repo = get_or_create_project(gl, dicom_study_path) dicom_study_repo = get_or_create_project(gl, dicom_study_path)
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
dicom_study_ds = datalad.api.install( dicom_study_ds = datalad.api.install(
source = dicom_study_repo._attrs['ssh_url_to_repo'], source=dicom_study_repo._attrs["ssh_url_to_repo"],
path=tmpdir, path=tmpdir,
) )
if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None: if dicom_study_ds.repo.get_hexsha() is None or dicom_study_ds.id is None:
dicom_study_ds.create(force=True) dicom_study_ds.create(force=True)
dicom_study_ds.push(to='origin') dicom_study_ds.push(to="origin")
# add default study DS structure. # add default study DS structure.
init_dicom_study(dicom_study_ds, PI, study_name) init_dicom_study(dicom_study_ds, PI, study_name)
# initialize BIDS project # initialize BIDS project
@ -177,103 +177,100 @@ def setup_gitlab_repos(
create_group(gl, [PI, study_name, "qc"]) create_group(gl, [PI, study_name, "qc"])
dicom_study_ds.install( dicom_study_ds.install(
source=dicom_session_repo._attrs['ssh_url_to_repo'], source=dicom_session_repo._attrs["ssh_url_to_repo"],
path=session_meta['PatientName'], path=session_meta["PatientName"],
) )
dicom_study_ds.create_sibling_ria( dicom_study_ds.create_sibling_ria(
UNF_DICOMS_RIA_URL, UNF_DICOMS_RIA_URL,
name=UNF_DICOMS_RIA_NAME, name=UNF_DICOMS_RIA_NAME,
alias=study_name, alias=study_name,
existing='reconfigure') existing="reconfigure",
)
# Push to gitlab + local ria-store # Push to gitlab + local ria-store
dicom_study_ds.push(to='origin') dicom_study_ds.push(to="origin")
dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME) dicom_study_ds.push(to=UNF_DICOMS_RIA_NAME)
# DICOM header attributes collected for each session by extract_session_metas().
SESSION_META_KEYS = [
    "StudyInstanceUID",
    "PatientID",
    "PatientName",
    "ReferringPhysicianName",
    "StudyDate",
    "StudyDescription",
]
def extract_session_metas(dicom_session_ds: dlad.Dataset):
    """Read the session-level tags from the first readable DICOM file.

    Files listed by ``dicom_session_ds.repo.find("*")`` are tried in order;
    anything that cannot be read as DICOM is skipped.

    Parameters
    ----------
    dicom_session_ds:
        dataset holding the indexed dicoms of one session.

    Returns
    -------
    dict or None
        mapping of every name in ``SESSION_META_KEYS`` to the value found in
        the first readable file, or ``None`` when no file could be read.

    Raises
    ------
    AttributeError
        if the first readable file lacks one of the ``SESSION_META_KEYS``.
    """
    for candidate in dicom_session_ds.repo.find("*"):
        try:
            # headers are enough here, pixel data is never loaded
            dic = dicom.read_file(candidate, stop_before_pixels=True)
        except Exception:  # TODO: what exception occurs when non-dicom ?
            continue
        # return at first dicom found
        return {key: getattr(dic, key) for key in SESSION_META_KEYS}
    # no readable DICOM at all: made explicit instead of falling off the end
    return None
def import_local_data(
    dicom_session_ds: dlad.Dataset,
    input_path: pathlib.Path,
    sort_series: bool = True,
    p7z_opts: str = "-mx5",
):
    """Bring local dicoms into the session dataset as one archive file.

    A directory is packed with ``7z`` into ``<directory name>.7z`` inside the
    dataset. A regular file is hard-linked into the dataset under its own
    name, falling back to a copy when linking fails. The resulting file is
    then saved in the dataset.

    Parameters
    ----------
    dicom_session_ds:
        dataset receiving the archive.
    input_path:
        directory of dicoms, or an already existing archive file.
    sort_series:
        accepted for interface compatibility; not used by this function.
    p7z_opts:
        whitespace-separated extra options handed to ``7z``.

    Returns
    -------
    pathlib.Path
        path of the archive inside the dataset.

    Raises
    ------
    FileNotFoundError
        if ``input_path`` is neither a directory nor a regular file.
    subprocess.CalledProcessError
        if ``7z`` exits with a non-zero status.
    """
    # wrap in Path so both str and Path values of ``.path`` are supported
    ds_root = pathlib.Path(dicom_session_ds.path)

    if input_path.is_dir():
        dest = ds_root / (input_path.name + ".7z")
        # Run from inside the input directory so that "." archives the dicoms
        # with relative member names; the archive itself lands in the dataset.
        # NOTE(review): one-block-per-file archives were intended here, which
        # depends on what the caller puts in p7z_opts - confirm.
        subprocess.run(
            ["7z", "u", str(dest), ".", *p7z_opts.split()],
            cwd=str(input_path),
            check=True,
        )
    elif input_path.is_file():
        dest = ds_root / input_path.name
        try:  # try hard-linking to avoid copying
            os.link(str(input_path), str(dest))
        except OSError:  # fallback if hard-linking not supported
            shutil.copyfile(str(input_path), str(dest))
    else:
        raise FileNotFoundError(f"no such file or directory: {input_path}")

    dicom_session_ds.save(dest, message="add dicoms archive")
    return dest
def import_remote_data(
    dicom_session_ds: dlad.Dataset, input_url: urllib.parse.ParseResult
):
    """Register a remote archive in the session dataset by its URL.

    The file is named after the last component of the URL path and added
    through ``dicom_session_ds.repo.add_url_to_file``.

    Parameters
    ----------
    dicom_session_ds:
        dataset receiving the file.
    input_url:
        parsed URL of the remote archive.

    Returns
    -------
    str
        name of the file inside the dataset.

    Raises
    ------
    ValueError
        if the URL path does not end with a file name.
    """
    dest = pathlib.Path(input_url.path).name
    if not dest:
        raise ValueError(f"cannot derive a file name from {input_url.geturl()}")
    # TODO: check how things can fail here and deal with it.
    # Errors propagate: swallowing them would hand a file name to the caller
    # for content that was never registered.
    dicom_session_ds.repo.add_url_to_file(dest, input_url.geturl())
    return dest
def export_to_ria(
    ds: dlad.Dataset,
    ria_url: urllib.parse.ParseResult,
    session_metas: dict,
):
    """Push the session dataset to a RIA store and export its content there.

    Steps: create (or reconfigure) the RIA sibling, push git history without
    annexed data, export the annexed content as ``archives/archive.7z`` next
    to the sibling, run a fast fsck against the storage remote, then push
    again without data.

    Parameters
    ----------
    ds:
        session dataset to export.
    ria_url:
        parsed URL of the RIA store; the last component of its path is used
        as the sibling name.
    session_metas:
        session metadata; ``"PatientID"`` is used as the alias in the store.
    """
    ria_name = pathlib.Path(ria_url.path).name
    # NOTE(review): geturl() re-serialises the parsed URL as a string; confirm
    # that it round-trips for the RIA URL forms in use.
    ds.create_sibling_ria(
        ria_url.geturl(),
        name=ria_name,
        alias=session_metas["PatientID"],
        existing="reconfigure",
    )
    ds.push(to=ria_name, data="nothing")

    ria_sibling_path = pathlib.Path(ds.siblings(name=ria_name)[0]["url"])
    archive_path = ria_sibling_path / "archives" / "archive.7z"
    ds.export_archive_ora(
        archive_path,
        opts=[f"-mx{COMPRESSION_LEVEL}"],
        missing_content="error",
    )
    # No storage_name is passed above, so the storage sibling is addressed as
    # "<sibling name>-storage", not by the store URL.
    ds.repo.fsck(remote=f"{ria_name}-storage", fast=True)  # index
    ds.push(to=ria_name, data="nothing")
def export_to_s3(
    ds: dlad.Dataset,
    s3_url: urllib.parse.ParseResult,
    session_metas: dict,
):
    """Export the session dataset to an S3 remote (not implemented yet).

    Placeholder: the call currently does nothing and returns ``None``.

    Parameters
    ----------
    ds:
        session dataset to export.
    s3_url:
        parsed URL of the S3 destination.
    session_metas:
        session metadata.
    """
    ...
@ -294,17 +291,16 @@ def connect_gitlab(debug=False):
def get_or_create_gitlab_group(gl, group_list): def get_or_create_gitlab_group(gl, group_list):
""" """ """
"""
found = False found = False
for keep_groups in reversed(range(len(group_list)+1)): for keep_groups in reversed(range(len(group_list) + 1)):
tmp_repo_path = '/'.join(group_list[0:keep_groups]) tmp_repo_path = "/".join(group_list[0:keep_groups])
logging.warning(tmp_repo_path) logging.warning(tmp_repo_path)
gs = gl.groups.list(search=tmp_repo_path) gs = gl.groups.list(search=tmp_repo_path)
for g in gs: for g in gs:
if g.full_path == tmp_repo_path: if g.full_path == tmp_repo_path:
found = True found = True
break break
if found: if found:
break break
for nb_groups in range(keep_groups, len(group_list)): for nb_groups in range(keep_groups, len(group_list)):
@ -312,34 +308,36 @@ def get_or_create_gitlab_group(gl, group_list):
msg = "Creating group {}".format(group_list[nb_groups]) msg = "Creating group {}".format(group_list[nb_groups])
logging.warning(msg) logging.warning(msg)
logging.warning(len(msg) * "=") logging.warning(len(msg) * "=")
g = gl.groups.create({'name': group_list[nb_groups], g = gl.groups.create(
'path': group_list[nb_groups]}) {"name": group_list[nb_groups], "path": group_list[nb_groups]}
)
else: else:
msg = 'Creating group {} from {}'.format(group_list[nb_groups], msg = "Creating group {} from {}".format(group_list[nb_groups], g.name)
g.name)
logging.warning(msg) logging.warning(msg)
logging.warning(len(msg) * "=") logging.warning(len(msg) * "=")
g = gl.groups.create({'name': group_list[nb_groups], g = gl.groups.create(
'path': group_list[nb_groups], {
'parent_id': g.id}) "name": group_list[nb_groups],
"path": group_list[nb_groups],
"parent_id": g.id,
}
)
return g return g
def get_or_create_gitlab_project(gl, project_name): def get_or_create_gitlab_project(gl, project_name):
""" """ """
"""
if len(project_name) == 1: if len(project_name) == 1:
# Check if exists # Check if exists
p = gl.projects.list(search=project_name[0]) p = gl.projects.list(search=project_name[0])
if not p: if not p:
p = gl.projects.create({'name': project_name[0], p = gl.projects.create({"name": project_name[0], "path": project_name[0]})
'path': project_name[0]})
return p.id return p.id
else: else:
return p[0].id return p[0].id
repo_full_path = '/'.join(project_name) repo_full_path = "/".join(project_name)
# Look for exact repo/project: # Look for exact repo/project:
p = gl.projects.list(search=project_name[-1]) p = gl.projects.list(search=project_name[-1])
@ -349,6 +347,5 @@ def get_or_create_gitlab_project(gl, project_name):
return curr_p return curr_p
g = get_or_create_gitlab_group(gl, project_name[:-1]) g = get_or_create_gitlab_group(gl, project_name[:-1])
p = gl.projects.create({'name': project_name[-1], p = gl.projects.create({"name": project_name[-1], "namespace_id": g.id})
'namespace_id': g.id})
return p return p