"""Stage a segmentation dataset into the nnU-Net v2 raw layout and preprocess it.

Copies target images and label maps into ``nnUNet_raw``, writes the
``dataset.json`` nnU-Net expects, then shells out to the nnUNetv2 CLI tools to
extract the fingerprint, transfer plans from a source dataset, and preprocess.
"""

import glob
import json
import os
import shutil
from concurrent.futures import ThreadPoolExecutor

import SimpleITK as sitk
import numpy as np
from tqdm import tqdm


def copy_dataset_json(raw_path, preprocessed_path, dataset_name):
    """Copy dataset.json from nnUNet_raw to nnUNet_preprocessed.

    Skips the copy (with a message) if the destination file already exists.
    """
    src_path = os.path.join(raw_path, dataset_name, 'dataset.json')
    dst_path = os.path.join(preprocessed_path, dataset_name, 'dataset.json')
    if not os.path.exists(dst_path):
        shutil.copy(src_path, dst_path)
    else:
        print(f"Dataset JSON already exists at {dst_path}, skipping copy.")


def create_dataset_json(num_train, preprocessing, dataset_data_path, label_to_use):
    """Write an nnU-Net v2 dataset.json into *dataset_data_path*.

    Parameters
    ----------
    num_train : int
        Number of training cases (``numTraining``).
    preprocessing : str
        Channel name / normalization scheme for channel "0" (e.g. ``"CT"``).
    dataset_data_path : str
        The ``Dataset###_Name`` folder under ``nnUNet_raw``.
    label_to_use : sequence
        Original label values; remapped to consecutive ids 1..N here, with
        background fixed at 0 as nnU-Net requires.
    """
    labels = {str(label): i + 1 for i, label in enumerate(label_to_use)}
    labels["background"] = 0
    data_dataset_json = {
        "labels": labels,
        "channel_names": {
            "0": preprocessing,
        },
        "numTraining": num_train,
        "file_ending": ".mha",
    }
    dump_data_datasets_path = os.path.join(dataset_data_path, 'dataset.json')
    with open(dump_data_datasets_path, 'w') as f:
        json.dump(data_dataset_json, f)


def makedirs_raw_dataset(dataset_data_path):
    """Create the raw dataset folder with its imagesTr/ and labelsTr/ subfolders."""
    os.makedirs(dataset_data_path, exist_ok=True)
    os.makedirs(os.path.join(dataset_data_path, 'imagesTr'), exist_ok=True)
    os.makedirs(os.path.join(dataset_data_path, 'labelsTr'), exist_ok=True)


def process_file(data_path, dataset_path, modality_suffix="_0000"):
    """Copy one image into imagesTr/, named with the nnU-Net channel suffix.

    nnU-Net expects training images named ``<case><suffix>.mha`` (the suffix is
    the zero-padded channel index, e.g. ``_0000``).
    """
    curr_img = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    if not filename.endswith(f'{modality_suffix}.mha'):
        # BUGFIX: insert the suffix *before* the extension. The original
        # appended it after ".mha", producing e.g. "case.mha_0000.mha",
        # which violates the declared file_ending ".mha".
        stem, _ext = os.path.splitext(filename)
        filename = f'{stem}{modality_suffix}.mha'
    # BUGFIX: the original wrote to a garbled literal path and never used the
    # computed filename; write the image under its proper name.
    sitk.WriteImage(curr_img, os.path.join(dataset_path, 'imagesTr', filename))


def process_file_labels(data_path, dataset_path):
    """Copy one label map into labelsTr/ under its original basename."""
    curr_img = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    # BUGFIX: as in process_file, use the computed filename for the output path.
    sitk.WriteImage(curr_img, os.path.join(dataset_path, 'labelsTr', filename))


def nnsyn_plan_and_preprocess_seg(data_origin_path: str,
                                  dataset_id: int,
                                  dataset_id_src: int,
                                  preprocessing_target: str,
                                  configuration: str = '3d_fullres',
                                  plan: str = 'nnUNetPlans',
                                  dataset_name: str = None):
    """Build the nnUNet_raw dataset and run plan transfer + preprocessing.

    Parameters
    ----------
    data_origin_path : str
        Folder containing ``TARGET_IMAGES/*.mha`` and ``LABELS/*.mha``.
    dataset_id : int
        Numeric id of the dataset being created.
    dataset_id_src : int
        Dataset id whose plans are moved onto this dataset.
    preprocessing_target : str
        Channel name written to dataset.json (e.g. ``'CT'``).
    configuration : str
        nnU-Net configuration to preprocess (default ``'3d_fullres'``).
    plan : str
        Source plan identifier; the target identifier gets
        ``_Dataset{dataset_id_src}`` appended.
    dataset_name : str, optional
        Defaults to ``'SEG_' + basename(data_origin_path)``.

    Requires the ``nnUNet_raw`` and ``nnUNet_preprocessed`` environment
    variables and the nnUNetv2 CLI tools on PATH.
    """
    list_data_ct = sorted(glob.glob(
        os.path.join(data_origin_path, 'TARGET_IMAGES', '*.mha'), recursive=True))
    list_data_labels = sorted(glob.glob(
        os.path.join(data_origin_path, 'LABELS', '*.mha'), recursive=True))
    print("target ---", len(list_data_ct), list_data_ct[:2])
    print("labels ---", len(list_data_labels), list_data_labels[:2])

    if dataset_name is None:
        dataset_name = 'SEG_' + os.path.basename(data_origin_path)

    # Copy data from origin to nnUNet_raw.
    dataset_data_path = os.path.join(
        os.environ['nnUNet_raw'], f'Dataset{dataset_id:03d}_{dataset_name}')
    makedirs_raw_dataset(dataset_data_path)
    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(
            lambda data_path: process_file(data_path, dataset_data_path, "_0000"),
            list_data_ct), total=len(list_data_ct)))
    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(
            lambda target_path: process_file_labels(target_path, dataset_data_path),
            list_data_labels), total=len(list_data_labels)))

    # Create dataset.json.
    num_train = len(list_data_ct)
    assert len(list_data_labels) == len(list_data_ct)
    labels_to_use = [
        2,   # kidney right
        3,   # kidney left
        5,   # liver
        6,   # stomach
        *range(10, 14 + 1),    # lungs
        *range(26, 50 + 1),    # vertebrae
        51,  # heart
        79,  # spinal cord
        *range(92, 115 + 1),   # ribs
        116,  # sternum
    ]
    create_dataset_json(num_train, preprocessing_target, dataset_data_path, labels_to_use)

    SOURCE_PLAN_IDENTIFIER = plan
    TARGET_PLAN_IDENTIFIER = plan + f'_Dataset{dataset_id_src}'
    # NOTE(review): os.system with interpolated args is shell-injection-prone;
    # inputs here are ints/identifiers, but subprocess.run([...]) would be safer.
    os.system(f'nnUNetv2_extract_fingerprint -d {dataset_id} --verify_dataset_integrity')
    os.system(f'nnUNetv2_move_plans_between_datasets -s {dataset_id_src} -t {dataset_id} '
              f'-sp {SOURCE_PLAN_IDENTIFIER} -tp {TARGET_PLAN_IDENTIFIER}')
    copy_dataset_json(os.environ['nnUNet_raw'], os.environ['nnUNet_preprocessed'],
                      f'Dataset{dataset_id:03d}_{dataset_name}')
    os.system(f'nnUNetv2_preprocess -d {dataset_id} -c {configuration} '
              f'-plans_name {TARGET_PLAN_IDENTIFIER} -np 4')


if __name__ == '__main__':
    # example usage:
    # python -m nnsyn_preprocessing_entry -d 982 --data_origin_path '/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/Synthrad2025_MRI2CT_AB' --preprocessing_target CT --dataset_id_src 960
    # nnsyn_plan_and_preprocess -d 982 --data_origin_path '/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/Synthrad2025_MRI2CT_AB' --preprocessing_target CT --dataset_id_src 960
    nnsyn_plan_and_preprocess_seg(
        data_origin_path='/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/synthrad2025_task1_mri2ct_AB',
        dataset_id=961,
        dataset_id_src=960,
        preprocessing_target='CT',
        configuration='3d_fullres',
        plan='nnUNetResEncUNetLPlans')