|
|
import glob |
|
|
import os |
|
|
import SimpleITK as sitk |
|
|
import numpy as np |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
from tqdm import tqdm |
|
|
import json |
|
|
import shutil |
|
|
|
|
|
def copy_dataset_json(raw_path, preprocessed_path, dataset_name):
    """Copy a dataset's dataset.json from the raw tree into the preprocessed tree.

    The copy is skipped (with a notice) when the destination file already
    exists, so a previously preprocessed dataset.json is never overwritten.

    Args:
        raw_path: root of the nnUNet_raw directory.
        preprocessed_path: root of the nnUNet_preprocessed directory.
        dataset_name: dataset folder name (e.g. 'Dataset961_SEG_...').
    """
    source = os.path.join(raw_path, dataset_name, 'dataset.json')
    destination = os.path.join(preprocessed_path, dataset_name, 'dataset.json')
    if os.path.exists(destination):
        print(f"Dataset JSON already exists at {destination}, skipping copy.")
        return
    shutil.copy(source, destination)
|
|
|
|
|
def create_dataset_json(num_train, preprocessing, dataset_data_path, label_to_use):
    """Write the nnU-Net dataset.json file into *dataset_data_path*.

    Each entry of *label_to_use* becomes a label key (stringified) mapped to a
    consecutive id starting at 1; id 0 is reserved for "background", which is
    inserted after the foreground labels (matching nnU-Net's expected layout).

    Args:
        num_train: number of training cases ("numTraining").
        preprocessing: name recorded as channel "0" in "channel_names".
        dataset_data_path: dataset root directory to write dataset.json into.
        label_to_use: iterable of label values to register as foreground.
    """
    label_map = {}
    for index, label in enumerate(label_to_use, start=1):
        label_map[str(label)] = index
    label_map["background"] = 0

    dataset_json = {
        "labels": label_map,
        "channel_names": {"0": preprocessing},
        "numTraining": num_train,
        "file_ending": ".mha",
    }
    output_path = os.path.join(dataset_data_path, 'dataset.json')
    with open(output_path, 'w') as out_file:
        json.dump(dataset_json, out_file)
|
|
|
|
|
def makedirs_raw_dataset(dataset_data_path):
    """Create the nnU-Net raw dataset skeleton (idempotent).

    Ensures the dataset root exists along with its 'imagesTr' and
    'labelsTr' subdirectories.
    """
    for subdir in ('', 'imagesTr', 'labelsTr'):
        os.makedirs(os.path.join(dataset_data_path, subdir), exist_ok=True)
|
|
|
|
|
def process_file(data_path, dataset_path, modality_suffix="_0000"):
    """Copy one training image into <dataset_path>/imagesTr with the nnU-Net
    channel suffix in its filename (e.g. 'case.mha' -> 'case_0000.mha').

    Args:
        data_path: path to the source .mha image.
        dataset_path: root of the nnU-Net raw dataset (must contain imagesTr/).
        modality_suffix: channel suffix nnU-Net expects before the extension.
    """
    curr_img = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    if not filename.endswith(f'{modality_suffix}.mha'):
        # Bug fix: insert the suffix BEFORE the extension instead of appending
        # it to the whole name. Appending produced 'case.mha_0000.mha', whose
        # case identifier ('case.mha') no longer matches its label ('case'),
        # breaking nnU-Net's image/label pairing.
        stem = filename[:-len('.mha')] if filename.endswith('.mha') else filename
        filename = f'{stem}{modality_suffix}.mha'
    # Bug fix: the destination previously lost the filename interpolation
    # (literal 'imagesTr/(unknown)'), so every image overwrote the same file.
    sitk.WriteImage(curr_img, os.path.join(dataset_path, 'imagesTr', filename))
|
|
|
|
|
def process_file_labels(data_path, dataset_path):
    """Copy one label map into <dataset_path>/labelsTr, keeping its filename.

    Args:
        data_path: path to the source .mha label image.
        dataset_path: root of the nnU-Net raw dataset (must contain labelsTr/).
    """
    curr_img = sitk.ReadImage(data_path)
    filename = os.path.basename(data_path)
    # Bug fix: the destination previously lost the filename interpolation
    # (literal 'labelsTr/(unknown)'), so every label overwrote the same file.
    sitk.WriteImage(curr_img, os.path.join(dataset_path, 'labelsTr', filename))
|
|
|
|
|
|
|
|
def nnsyn_plan_and_preprocess_seg(data_origin_path: str, dataset_id: int, dataset_id_src: int,
                                  preprocessing_target: str,
                                  configuration: str = '3d_fullres', plan: str = 'nnUNetPlans',
                                  dataset_name: str = None):
    """Build an nnU-Net raw segmentation dataset and preprocess it with plans
    transferred from a source dataset.

    Steps:
      1. Collect .mha images from <data_origin_path>/TARGET_IMAGES and label
         maps from <data_origin_path>/LABELS (sorted so pairs align by name).
      2. Copy them (in parallel) into the nnUNet_raw Dataset<id>_<name> layout.
      3. Write dataset.json, extract the fingerprint, move the plans from
         dataset *dataset_id_src* under a dataset-specific plan identifier,
         and run nnUNetv2_preprocess.

    Args:
        data_origin_path: folder containing TARGET_IMAGES/ and LABELS/.
        dataset_id: numeric id of the dataset to create (nnUNet_raw side).
        dataset_id_src: dataset whose plans are transplanted onto this one.
        preprocessing_target: modality name recorded as channel "0"
            (e.g. 'CT').
        configuration: nnU-Net configuration to preprocess.
        plan: source plans identifier (e.g. 'nnUNetResEncUNetLPlans').
        dataset_name: dataset folder suffix; defaults to
            'SEG_' + basename(data_origin_path).

    Raises:
        ValueError: if the number of images and labels differ.
        KeyError: if nnUNet_raw / nnUNet_preprocessed are not in os.environ.
    """
    list_data_ct = sorted(glob.glob(os.path.join(data_origin_path, 'TARGET_IMAGES', '*.mha'), recursive=True))
    list_data_labels = sorted(glob.glob(os.path.join(data_origin_path, 'LABELS', '*.mha'), recursive=True))
    print("target ---", len(list_data_ct), list_data_ct[:2])
    print("labels ---", len(list_data_labels), list_data_labels[:2])

    # Fail fast (before any copying). The original used `assert`, which is
    # stripped under `python -O`; an explicit exception always fires.
    if len(list_data_labels) != len(list_data_ct):
        raise ValueError(
            f"Image/label count mismatch: {len(list_data_ct)} images vs "
            f"{len(list_data_labels)} labels in {data_origin_path}"
        )

    if dataset_name is None:
        dataset_name = 'SEG_' + os.path.basename(data_origin_path)

    dataset_data_path = os.path.join(os.environ['nnUNet_raw'], f'Dataset{dataset_id:03d}_{dataset_name}')
    makedirs_raw_dataset(dataset_data_path)

    # Copy images (with the '_0000' channel suffix) and labels in parallel;
    # list(...) drains the lazy executor.map so tqdm shows real progress.
    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(lambda data_path: process_file(data_path, dataset_data_path, "_0000"), list_data_ct), total=len(list_data_ct)))

    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(lambda target_path: process_file_labels(target_path, dataset_data_path), list_data_labels), total=len(list_data_labels)))

    num_train = len(list_data_ct)
    # TotalSegmentator-style label ids kept for this task; background (0) is
    # added inside create_dataset_json.
    labels_to_use = [
        2,
        3,
        5,
        6,
        *range(10, 14 + 1),
        *range(26, 50 + 1),
        51,
        79,
        *range(92, 115 + 1),
        116
    ]
    create_dataset_json(num_train, preprocessing_target, dataset_data_path, labels_to_use)

    # Give the transplanted plans a dataset-specific identifier so they do not
    # clash with plans generated natively for this dataset.
    SOURCE_PLAN_IDENTIFIER = plan
    TARGET_PLAN_IDENTIFIER = plan + f'_Dataset{dataset_id_src}'

    os.system(f'nnUNetv2_extract_fingerprint -d {dataset_id} --verify_dataset_integrity')
    os.system(f'nnUNetv2_move_plans_between_datasets -s {dataset_id_src} -t {dataset_id} -sp {SOURCE_PLAN_IDENTIFIER} -tp {TARGET_PLAN_IDENTIFIER}')
    # nnUNetv2_preprocess expects dataset.json in the preprocessed folder too.
    copy_dataset_json(os.environ['nnUNet_raw'], os.environ['nnUNet_preprocessed'], f'Dataset{dataset_id:03d}_{dataset_name}')
    os.system(f'nnUNetv2_preprocess -d {dataset_id} -c {configuration} -plans_name {TARGET_PLAN_IDENTIFIER} -np 4')
|
|
|
|
|
if __name__ == '__main__':

    # Example invocation for the SynthRAD2025 task-1 (MRI->CT, abdomen) data:
    # creates raw Dataset961 from the ORIGIN folder, transplants the
    # ResEnc-L plans from dataset 960, and preprocesses the 3d_fullres
    # configuration. NOTE(review): paths/ids are site-specific — adjust
    # before running elsewhere.
    nnsyn_plan_and_preprocess_seg(data_origin_path='/datasets/work/hb-synthrad2023/work/synthrad2025/bw_workplace/data/nnunet_struct/ORIGIN/synthrad2025_task1_mri2ct_AB',
                                  dataset_id=961, dataset_id_src=960,
                                  preprocessing_target='CT',
                                  configuration='3d_fullres', plan='nnUNetResEncUNetLPlans')