# synthrad2025_docker / docker_task_2 / nnunetv2 / nnsyn / nnsyn_preprocessing_seg.py
# Uploaded by FelixzeroSun via huggingface_hub (commit 19c1f58, verified)
import glob
import os
import SimpleITK as sitk
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import json
import shutil
def copy_dataset_json(raw_path, preprocessed_path, dataset_name):
src_path = os.path.join(raw_path, dataset_name, 'dataset.json')
dst_path = os.path.join(preprocessed_path, dataset_name, 'dataset.json')
if not os.path.exists(dst_path):
shutil.copy(src_path, dst_path)
else:
print(f"Dataset JSON already exists at {dst_path}, skipping copy.")
def create_dataset_json(num_train, preprocessing, dataset_data_path, label_to_use):
labels = {str(label): i + 1 for i, label in enumerate(label_to_use)}
labels["background"] = 0
data_dataset_json = {
"labels": labels,
"channel_names": {
"0": preprocessing,
},
"numTraining": num_train,
"file_ending": ".mha"
}
dump_data_datasets_path = os.path.join(dataset_data_path, 'dataset.json')
with open(dump_data_datasets_path, 'w') as f:
json.dump(data_dataset_json, f)
def makedirs_raw_dataset(dataset_data_path):
os.makedirs(dataset_data_path, exist_ok = True)
os.makedirs(os.path.join(dataset_data_path, 'imagesTr'), exist_ok=True)
os.makedirs(os.path.join(dataset_data_path, 'labelsTr'), exist_ok = True)
def process_file(data_path, dataset_path, modality_suffix="_0000"):
    """Copy one image into ``<dataset_path>/imagesTr`` with the nnU-Net channel suffix.

    nnU-Net expects training images to be named ``<case>_0000.mha``. The
    original code appended the suffix AFTER the existing extension, producing
    ``case.mha_0000.mha`` for every ``*.mha`` input; here the suffix is
    inserted before the extension instead (``case.mha`` -> ``case_0000.mha``).
    Files already carrying the suffix are kept unchanged.

    NOTE(review): the original destination filename was garbled in the source
    (``imagesTr/(unknown)``); reconstructed as the (suffixed) basename of
    ``data_path`` — confirm against the original repository.
    """
    image = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    if not filename.endswith(f'{modality_suffix}.mha'):
        stem, ext = os.path.splitext(filename)
        filename = f'{stem}{modality_suffix}{ext}'
    sitk.WriteImage(image, os.path.join(dataset_path, 'imagesTr', filename))
def process_file_labels(data_path, dataset_path):
    """Copy one label map into ``<dataset_path>/labelsTr``, keeping its filename.

    Labels carry no channel suffix in nnU-Net, so the basename is used as-is.

    NOTE(review): the original destination filename was garbled in the source
    (``labelsTr/(unknown)``); reconstructed as the basename of ``data_path`` —
    confirm against the original repository.
    """
    label_img = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    sitk.WriteImage(label_img, os.path.join(dataset_path, 'labelsTr', filename))
def nnsyn_plan_and_preprocess_seg(data_origin_path: str, dataset_id: int, dataset_id_src: int,
                                  preprocessing_target: str,
                                  configuration: str = '3d_fullres', plan: str = 'nnUNetPlans',
                                  dataset_name: str = None):
    """Stage a segmentation dataset under nnUNet_raw and preprocess it using
    plans transferred from another dataset.

    Pipeline: copy target images and label maps into the nnU-Net raw layout,
    write dataset.json, extract the dataset fingerprint, move the plans from
    ``dataset_id_src`` to ``dataset_id``, and run nnUNetv2_preprocess with the
    transferred plan identifier.
    """
    ct_paths = sorted(glob.glob(os.path.join(data_origin_path, 'TARGET_IMAGES', '*.mha'), recursive=True))
    label_paths = sorted(glob.glob(os.path.join(data_origin_path, 'LABELS', '*.mha'), recursive=True))
    print("target ---", len(ct_paths), ct_paths[:2])
    print("labels ---", len(label_paths), label_paths[:2])

    if dataset_name is None:
        dataset_name = 'SEG_' + os.path.basename(data_origin_path)

    # Copy data from origin into the nnUNet_raw layout.
    dataset_dir_name = f'Dataset{dataset_id:03d}_{dataset_name}'
    dataset_data_path = os.path.join(os.environ['nnUNet_raw'], dataset_dir_name)
    makedirs_raw_dataset(dataset_data_path)
    with ThreadPoolExecutor() as executor:
        jobs = executor.map(lambda p: process_file(p, dataset_data_path, "_0000"), ct_paths)
        list(tqdm(jobs, total=len(ct_paths)))
    with ThreadPoolExecutor() as executor:
        jobs = executor.map(lambda p: process_file_labels(p, dataset_data_path), label_paths)
        list(tqdm(jobs, total=len(label_paths)))

    # Write dataset.json; images and labels must pair up one-to-one.
    assert len(label_paths) == len(ct_paths)
    labels_to_use = [
        2,                    # kidney right
        3,                    # kidney left
        5,                    # liver
        6,                    # stomach
        *range(10, 14 + 1),   # lungs
        *range(26, 50 + 1),   # vertebrae
        51,                   # heart
        79,                   # spinal cord
        *range(92, 115 + 1),  # ribs
        116,                  # sternum
    ]
    create_dataset_json(len(ct_paths), preprocessing_target, dataset_data_path, labels_to_use)

    # Transfer the plans from the source dataset, then preprocess with them.
    source_plan = plan
    target_plan = plan + f'_Dataset{dataset_id_src}'
    os.system(f'nnUNetv2_extract_fingerprint -d {dataset_id} --verify_dataset_integrity')
    os.system(f'nnUNetv2_move_plans_between_datasets -s {dataset_id_src} -t {dataset_id} -sp {source_plan} -tp {target_plan}')
    copy_dataset_json(os.environ['nnUNet_raw'], os.environ['nnUNet_preprocessed'], dataset_dir_name)
    os.system(f'nnUNetv2_preprocess -d {dataset_id} -c {configuration} -plans_name {target_plan} -np 4')
if __name__ == '__main__':
    # Example CLI equivalents:
    # python -m nnsyn_preprocessing_entry -d 982 --data_origin_path '/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/Synthrad2025_MRI2CT_AB' --preprocessing_target CT --dataset_id_src 960
    # nnsyn_plan_and_preprocess -d 982 --data_origin_path '/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/Synthrad2025_MRI2CT_AB' --preprocessing_target CT --dataset_id_src 960
    nnsyn_plan_and_preprocess_seg(
        data_origin_path='/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/synthrad2025_task1_mri2ct_AB',
        dataset_id=961,
        dataset_id_src=960,
        preprocessing_target='CT',
        configuration='3d_fullres',
        plan='nnUNetResEncUNetLPlans',
    )