|
|
| '''
|
| Written by ygq
|
| Created on 2025-07-24
|
| Updated: M&Ms data cleaning
|
| https://github.com/openmedlab/Awesome-Medical-Dataset/blob/main/resources/M&Ms.md
|
| https://zhuanlan.zhihu.com/p/694831343
|
|
|
| 来自 6 个国际医疗中心 的 340 名受试者 的 CMR 数据。
|
| 覆盖 4 个主流 MRI 设备厂商(Siemens, Philips, GE, Canon)。
|
| 数据集文件结构如下,数据集被组织成训练集、验证集和测试集三个主目录,其中训练集进一步分为有标注和无标注的子目录。每个有标注的子目录包含病人的成像文件以及相应的标注数据。
|
| M&Ms
|
| ├── Training
|
| │ ├── Labeled
|
| │ │ ├── A0S9V9
|
| │ │ │ ├── A0S9V9_sa.nii.gz
|
| │ │ │ └── A0S9V9_sa_gt.nii.gz
|
| │ │ ├── A1D0Q7
|
| │ │ ├── A1D9Z7
|
| │ │ └── ...
|
| │ └── Unlabeled
|
| ├── Validation
|
| ├── Testing
|
| └── 211230_M&Ms_Dataset_information_diagnosis_opendataset.csv
|
|
|
| 对训练集有标注的 150 例数据进行图像尺寸统计,size 的格式为 (x,y,z,frame)
|
| 经验丰富的临床医生对心脏磁共振(CMR)图像进行了分割,参考了 ACDC 的标注标准,标注了左心室(LV)、右心室(RV)血池以及左心室心肌(MYO)的轮廓,标签分别为:1(LV)、2(MYO)和3(RV)。
|
|
|
| '''
|
| import os
|
| import glob
|
| import pandas as pd
|
| import SimpleITK as sitk
|
| import argparse
|
| import json
|
| from tqdm import tqdm
|
| from util import meta_data
|
| import util
|
| import numpy as np
|
|
|
|
|
|
|
|
|
# Column names read from the M&Ms dataset-information CSV.
meta_id_name = 'External code'
meta_vendor_name = 'VendorName'
meta_centre_name = 'Centre'
meta_pathology_name = 'Pathology'
meta_ed_name = 'ED'
meta_es_name = 'ES'
meta_age_name = 'Age'
meta_sex_name = 'Sex'
meta_height_name = 'Height'
meta_weight_name = 'Weight'

# Task name; used as a sub-directory of each case's output folder and as the
# key under which label paths are recorded in the metadata.
TASK_VALUE = "segmentation"
# Intensity clamp ranges per modality; None means "no clamp range configured".
CLAMP_RANGE_CT = [-300, 300]
CLAMP_RANGE_MRI = None
# Target voxel spacing for resampling; None keeps each image's own spacing.
TARGET_VOXEL_SPACING = None

# Label value -> structure name, as described in the module docstring
# (1 = LV, 2 = MYO, 3 = RV).  "0" was previously misspelled "backgroud".
LABEL_DICT = {
    "0": "background",
    "1": "LV",
    "2": "MYO",
    "3": "RV",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_metadata_files(path):
    """Return the paths of the ``*.csv`` files located directly in ``path``.

    Args:
        path: Directory to look in.

    Returns:
        A list of matching path strings (empty when nothing matches).
    """
    # NOTE: ``recursive=True`` only changes how a ``**`` component is
    # expanded; the pattern built here has none, so sub-directories of
    # ``path`` are not searched.
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
|
|
|
def find_image_dirs(path):
    """Return the names of all entries found in the directory ``path``.

    Files and directories are both returned; nothing is filtered here.
    """
    entry_names = os.listdir(path)
    return entry_names
|
|
|
|
|
def load_dicom_images(folder_path):
    """Read the DICOM series stored in ``folder_path`` with SimpleITK.

    Args:
        folder_path: Directory handed to ``GetGDCMSeriesFileNames``.

    Returns:
        A ``(file_names, image)`` tuple: the series file names reported by
        GDCM and the image obtained by reading them.
    """
    series_reader = sitk.ImageSeriesReader()
    series_files = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(series_files)
    volume = series_reader.Execute()
    return series_files, volume
|
|
|
|
|
def load_dicom_tag(imgs):
    """Read a single image file with ``sitk.ImageFileReader``.

    The header is read first via ``ReadImageInformation`` and the reader is
    then executed; the object returned by ``Execute`` is handed back to the
    caller.

    Args:
        imgs: Path of the file to read.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()
|
|
|
def load_nrrd(fp):
    """Load the image file at ``fp`` through ``sitk.ReadImage`` and return it."""
    loaded_image = sitk.ReadImage(fp)
    return loaded_image
|
|
|
def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: SimpleITK image to write.  Its ``FolderPath`` metadata entry
            is set to ``folder_path`` before writing, so the caller's image
            object is modified.
        output_path: Destination file path.
        folder_path: Source location recorded under the ``FolderPath``
            metadata key.
    """
    output_dirpath = os.path.dirname(output_path)
    # ``dirname`` is '' for a bare file name; there is nothing to create in
    # that case and ``os.makedirs('')`` would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(output_dirpath, exist_ok=True)

    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
|
|
|
|
|
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a forward-slash path without a drive.

    Backslashes become forward slashes, and when the result contains a
    colon everything up to and including the first colon is dropped.

    Args:
        windows_path: Path string to convert.

    Returns:
        The converted path string.
    """
    slashed = windows_path.replace('\\', '/')
    _prefix, colon, remainder = slashed.partition(':')
    if not colon:
        # No colon at all: nothing to strip.
        return slashed
    return remainder
|
|
|
def _extract_frame(image_4d, frame_index):
    """Return the 3D volume at ``frame_index`` along the 4th axis of a 4D image."""
    size = image_4d.GetSize()
    extractor = sitk.ExtractImageFilter()
    # Size 0 on the 4th axis asks the filter to drop that axis from the output.
    extractor.SetSize([size[0], size[1], size[2], 0])
    extractor.SetIndex([0, 0, 0, frame_index])
    return extractor.Execute(image_4d)


def _join_frames(frames):
    """Return the single frame as-is, or stack several 3D frames into a 4D image."""
    if len(frames) == 1:
        return frames[0]
    return sitk.JoinSeriesImageFilter().Execute(frames)


def _list_split_cases(target_path, split_name):
    """Return ``(case_id, case_dir, is_labeled)`` tuples for one dataset split."""
    split_root = os.path.join(target_path, split_name)
    if split_name != "Training":
        # Only Training/Labeled cases are treated as labeled by this script.
        return [
            (case_id, os.path.join(split_root, case_id), False)
            for case_id in find_image_dirs(split_root)
        ]

    labeled_root = os.path.join(split_root, 'Labeled')
    unlabeled_root = os.path.join(split_root, 'Unlabeled')
    # A missing sub-directory is treated as empty instead of aborting the run.
    labeled_ids = find_image_dirs(labeled_root) if os.path.isdir(labeled_root) else []
    unlabeled_ids = find_image_dirs(unlabeled_root) if os.path.isdir(unlabeled_root) else []
    unlabeled_lookup = set(unlabeled_ids)

    cases = []
    for case_id in labeled_ids + unlabeled_ids:
        # A name listed under Unlabeled always resolves to the Unlabeled folder.
        if case_id in unlabeled_lookup:
            cases.append((case_id, os.path.join(unlabeled_root, case_id), False))
        else:
            cases.append((case_id, os.path.join(labeled_root, case_id), True))
    return cases


def _lookup_case_metadata(df_meta, case_id):
    """Return the CSV fields of ``case_id``; fields are '' when no row is available."""
    columns = {
        'vendor': meta_vendor_name,
        'centre': meta_centre_name,
        'pathology': meta_pathology_name,
        'age': meta_age_name,
        'sex': meta_sex_name,
        'height': meta_height_name,
        'weight': meta_weight_name,
        'ed': meta_ed_name,
        'es': meta_es_name,
    }
    info = {field: '' for field in columns}
    info['image_id'] = case_id
    if df_meta is None:
        return info

    rows = df_meta[df_meta[meta_id_name] == case_id]
    if rows.shape[0] == 0:
        return info

    # Use the first matching row; column-wise access keeps each column's dtype.
    info['image_id'] = rows[meta_id_name].iloc[0]
    for field, column in columns.items():
        info[field] = rows[column].iloc[0]
    return info


def _resample_image(image):
    """Resample ``image`` and return ``(processed_image, frame_count)``.

    4D images are resampled frame by frame and re-joined; ``frame_count`` is
    the number of frames handled (0 for images that are not 4D).  When
    ``util.get_unisize_resampler`` returns a falsy value the data is kept
    unchanged.
    """
    size = list(image.GetSize())
    spacing = list(image.GetSpacing())

    if image.GetDimension() != 4:
        resampler = util.get_unisize_resampler(
            image, 'linear',
            spacing=TARGET_VOXEL_SPACING or spacing, size=size,
        )
        return (resampler.Execute(image) if resampler else image), 0

    frame_spacing = TARGET_VOXEL_SPACING or spacing[:3]
    frame_size = size[:3]
    frames = []
    for frame_index in range(size[3]):
        frame = _extract_frame(image, frame_index)
        resampler = util.get_unisize_resampler(
            frame, 'linear', spacing=frame_spacing, size=frame_size,
        )
        frames.append(resampler.Execute(frame) if resampler else frame)

    if not frames:
        return image, 0
    return _join_frames(frames), len(frames)


def _clamp_image(image, clamp_range):
    """Apply ``util.clamp_image`` to ``image``, frame by frame when it is 4D."""
    if image.GetDimension() != 4:
        return util.clamp_image(image, clamp_range)
    frames = [
        util.clamp_image(_extract_frame(image, frame_index), clamp_range)
        for frame_index in range(image.GetSize()[3])
    ]
    return _join_frames(frames) if frames else image


def _resample_label(label, reference_3d):
    """Resample ``label`` onto ``reference_3d`` with nearest-neighbour interpolation."""
    resampler = sitk.ResampleImageFilter()
    # Nearest neighbour keeps label values intact (no interpolated classes).
    resampler.SetInterpolator(sitk.sitkNearestNeighbor)
    resampler.SetOutputPixelType(label.GetPixelID())
    resampler.SetReferenceImage(reference_3d)

    if label.GetDimension() != 4:
        return resampler.Execute(label)

    frames = [
        resampler.Execute(_extract_frame(label, frame_index))
        for frame_index in range(label.GetSize()[3])
    ]
    return _join_frames(frames)


def _update_mapping_file(json_path, key, entry):
    """Insert or replace ``entry`` under ``key`` in the JSON mapping file."""
    # json.dump escapes non-ASCII by default, so the file content is ASCII.
    with open(json_path, 'r+', encoding='utf-8') as json_file:
        mappings = json.load(json_file)
        mappings[key] = entry
        json_file.seek(0)
        json.dump(mappings, json_file, indent=4)
        json_file.truncate()


def main(target_path, output_dir):
    """Convert the M&Ms dataset under ``target_path`` into ``output_dir``.

    For every case directory in the Training (Labeled + Unlabeled), Testing
    and Validation splits the ``<case>_sa.nii.gz`` image is loaded,
    resampled and written to ``<output_dir>/<case>/<case>.nii.gz``.  For
    Training/Labeled cases that have a ``<case>_sa_gt.nii.gz`` file, the
    label is resampled onto the image grid and written to
    ``<output_dir>/<case>/segmentation/<case>.nii.gz``.  A metadata entry
    per image is stored in ``nifti_mappings.json`` and the paths that failed
    are stored in ``failed_files.json`` (both inside ``output_dir``).

    Args:
        target_path: Dataset root holding the split directories and,
            optionally, the dataset-information CSV.
        output_dir: Directory receiving the converted files and JSON indexes.
    """
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Keep an existing mapping file so entries from earlier runs are preserved.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w', encoding='utf-8') as json_file:
            json.dump({}, json_file)

    meta_file = os.path.join(target_path, '211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
    mf_flag = os.path.isfile(meta_file)
    # Without the CSV every case simply gets empty metadata fields.
    df_meta = pd.read_csv(meta_file, sep=',') if mf_flag else None

    for pid_dir in tqdm(["Training", "Testing", "Validation"], desc="Processing pid dirs"):
        if not os.path.isdir(os.path.join(target_path, pid_dir)):
            continue

        cases = _list_split_cases(target_path, pid_dir)
        for data_dir, full_path, label_flag in tqdm(cases, desc="Processing images files"):
            if not os.path.isdir(full_path):
                continue

            case_info = _lookup_case_metadata(df_meta, data_dir)
            full_path_image = os.path.join(full_path, "%s_sa.nii.gz" % data_dir)

            # The label is optional even for labeled cases: use it only if present.
            full_path_label = None
            if label_flag:
                label_candidate = os.path.join(full_path, "%s_sa_gt.nii.gz" % data_dir)
                if os.path.isfile(label_candidate):
                    full_path_label = label_candidate

            processed_lbl_full_path = None
            try:
                print(full_path)
                sitk_img_original = util.load_nifti(full_path_image)
                if sitk_img_original is None:
                    print(f" Failed to load image: {full_path_image}")
                    continue

                original_spacing = list(sitk_img_original.GetSpacing())
                sitk_img_processed, frame_count = _resample_image(sitk_img_original)

                # MRI clamp range from the module constants (None disables clamping).
                clamp_range_to_use = CLAMP_RANGE_MRI
                if clamp_range_to_use:
                    sitk_img_processed = _clamp_image(sitk_img_processed, clamp_range_to_use)

                output_path = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_processed, output_path, full_path_image)
                print(f"Saved NIfTI file to {output_path}")

                if full_path_label is not None:
                    sitk_lbl_original = util.load_nifti(full_path_label)
                    if sitk_lbl_original is None:
                        print(f" Failed to load label: {full_path_label}")
                        continue

                    processed_lbl_full_path = os.path.join(
                        output_dir, data_dir, TASK_VALUE, f"{data_dir}.nii.gz")

                    # Labels are resampled against a 3D grid: the first frame
                    # of a 4D image, or the image itself otherwise.
                    reference_for_label = sitk_img_processed
                    if sitk_img_processed.GetDimension() == 4:
                        try:
                            reference_for_label = _extract_frame(sitk_img_processed, 0)
                        except RuntimeError:
                            print(f" Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
                            reference_for_label = None

                    if reference_for_label is not None:
                        sitk_lbl_processed = _resample_label(sitk_lbl_original, reference_for_label)
                        if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
                            print(f" Mismatch between image and label size (ignoring channels):")
                            print(f" Image size: {sitk_img_processed.GetSize()}")
                            print(f" Label size: {sitk_lbl_processed.GetSize()}")
                    else:
                        print(f" Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
                        sitk_lbl_processed = sitk_lbl_original
                    util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)

                    print('compare image and label size', sitk_img_original.GetSize(), sitk_lbl_original.GetSize())
                    print('compare image and label size', sitk_img_processed.GetSize(), sitk_lbl_processed.GetSize())
                    # Explicit comparison: an ``assert`` would vanish under ``python -O``.
                    if sitk_img_processed.GetSize() != sitk_lbl_processed.GetSize():
                        failed_files.append(full_path_label)
                        continue

            except RuntimeError:
                failed_files.append(full_path_image)
                print(f"Failed to load MnMs images from {full_path_image}")
                continue

            # A fresh metadata object per image, so keys written for one case
            # (ImgDict, Task, Label_path, ...) cannot carry over to the next.
            meta = meta_data()

            if frame_count > 1:
                # One entry per frame; the ED / ES frames named in the CSV are tagged.
                img_dict = {str(frame_index): 'none' for frame_index in range(frame_count)}
                if str(case_info['ed']):
                    img_dict[str(case_info['ed'])] = 'ed'
                if str(case_info['es']):
                    img_dict[str(case_info['es'])] = 'es'
                meta.add_keyvalue('ImgDict', img_dict)

            meta.add_keyvalue('Spacing_mm', min(original_spacing[:3]))
            meta.add_keyvalue('OriImg_path', full_path_image)
            meta.add_keyvalue('Size', list(sitk_img_processed.GetSize()))
            meta.add_keyvalue('Modality', "MRI")
            meta.add_keyvalue('Dataset_name', 'MnMs')
            meta.add_keyvalue('ROI', 'chest')

            if processed_lbl_full_path:
                label_path_dict = {'heart': processed_lbl_full_path}
                meta.add_keyvalue('Task', TASK_VALUE)
                meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})
                meta.add_keyvalue('Label_Dict', LABEL_DICT)

            CIA_other_info = {
                'metadata_file': meta_file if mf_flag else '',
                'split': pid_dir,
                'Image_id': case_info['image_id'],
                'Vendor': case_info['vendor'],
                'Centre': str(case_info['centre']),
                'Pathology': str(case_info['pathology']),
                'Age': str(case_info['age']),
                'Sex': case_info['sex'],
                'Height': str(case_info['height']),
                'Weight': str(case_info['weight']),
                'ED': str(case_info['ed']),
                'ES': str(case_info['es']),
            }
            meta.add_extra_keyvalue('Metadata', CIA_other_info)

            # Written after every image so an interrupted run keeps its progress.
            _update_mapping_file(json_output_path, output_path, meta.get_meta_data())

    with open(failed_files_path, "w", encoding='utf-8') as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both arguments are optional and fall back to
    # the defaults below.  The description used to mention DICOM although
    # the script reads the dataset's NIfTI files.
    parser = argparse.ArgumentParser(
        description="Process the M&Ms NIfTI dataset and save processed NIfTI files with a JSON index.",
    )
    parser.add_argument(
        "--target_path",
        type=str,
        help="Path to the target directory containing metadata files.",
        default="/home/data/Github/data/data_gen_def/DATASETS/MnMs/OpenDataset/",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Directory to save the NIfTI files.",
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnMs/",
    )
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|