File size: 24,663 Bytes

da9fb1e

#coding:utf-8
'''

write by ygq

create on 2025-07-24

update MnMs data clean

https://github.com/openmedlab/Awesome-Medical-Dataset/blob/main/resources/M&Ms.md

https://zhuanlan.zhihu.com/p/694831343



来自 6 个国际医疗中心 的 340 名受试者 的 CMR 数据。

覆盖 4 个主流 MRI 设备厂商（Siemens, Philips, GE, Canon）。

数据集文件结构如下，数据集被组织成训练集、验证集和测试集三个主目录，其中训练集进一步分为有标注和无标注的子目录。每个有标注的子目录包含病人的成像文件以及相应的标注数据。

M&Ms

├── Training

│   ├── Labeled

│   │   ├── A0S9V9

│   │   │   ├── A0S9V9_sa.nii.gz 

│   │   │   └── A0S9V9_sa_gt.nii.gz 

│   │   ├── A1D0Q7

│   │   ├── A1D9Z7

│   │   └── ...

│   └── Unlabeled

├── Validation

├── Testing

└── 211230_M&Ms_Dataset_information_diagnosis_opendataset.csv 



对训练集有标注的 150 例数据进行图像尺寸统计，size 的格式为 (x,y,z,frame)

经验丰富的临床医生对心脏磁共振（CMR）图像进行了分割，参考了 ACDC 的标注标准，标注了左心室（LV）、右心室（RV）血池以及左心室心肌（MYO）的轮廓，标签分别为：1（LV）、2（MYO）和3（RV）。



'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *



meta_id_name='External code'
meta_vendor_name='VendorName'
meta_centre_name='Centre'
meta_pathology_name='Pathology'
meta_ed_name='ED'
meta_es_name='ES'
meta_age_name='Age'
meta_sex_name='Sex'
meta_height_name='Height'
meta_weight_name='Weight'

TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING=None

LABEL_DICT={
    "0":"backgroud",
    "1":"LV",#左心室 Blood Pools
    "2":"MYO",#左心室心肌
    "3":"RV"#右心室 Blood Pools
}

# def find_metadata_files(path):
#     # for Cancer Image Archive (TCIA) dataset
#     search_pattern = os.path.join(path, '**', 'metadata.csv')
#     return glob.glob(search_pattern, recursive=True)

def find_metadata_files(path):
    # for Cancer Image Archive (TCIA) dataset
    search_pattern = os.path.join(path, '*.csv')
    return glob.glob(search_pattern, recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    return os.listdir(path)

##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names,image

##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    reader = sitk.ImageFileReader()
    # dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileName(imgs)
    reader.ReadImageInformation()  # 仅读取元信息，不加载像素数据
    # metadata_keys = reader.GetMetaDataKeys()
    tag=reader.Execute()
    return tag

def load_nrrd(fp):
    return sitk.ReadImage(fp)

def save_nifti(image, output_path, folder_path):
    # Set metadata in the NIfTI file's header
    output_dirpath = os.path.dirname(output_path)
    if not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)

##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    # Replace backslashes with forward slashes and remove the drive letter
    # Some meta files have windows paths, but the data is stored on a linux server
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path

def main(target_path, output_dir):
    metadata_files = find_metadata_files(target_path)
    pid_dirs=find_image_dirs(target_path)
    pid_dirs=["Training","Testing","Validation"]
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    
    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    meta_file=os.path.join(target_path,'211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
    if os.path.isfile(meta_file):
        mf_flag=True
        df_meta=pd.read_csv(meta_file,sep=',')
    else:
        mf_flag=False

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            if not os.path.isdir(os.path.join(target_path,pid_dir)):
                continue
            if pid_dir =="Training":
                tr_flag=True
            else:
                tr_flag=False
            label_flag=False
            
            if not tr_flag:
                image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
                unlabeled_list=image_dirs
            else:
                image_dir_1=find_image_dirs(os.path.join(target_path,pid_dir,'Labeled'))
                image_dir_2=find_image_dirs(os.path.join(target_path,pid_dir,'Unlabeled'))
                unlabeled_list=image_dir_2
                image_dirs=image_dir_1+image_dir_2
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
            
                location=data_dir
                if not tr_flag:
                    full_path=os.path.join(target_path,pid_dir,data_dir)
                else:
                    if data_dir in unlabeled_list:
                        full_path=os.path.join(target_path,pid_dir,"Unlabeled",data_dir)
                    else:
                        full_path=os.path.join(target_path,pid_dir,"Labeled",data_dir)
                        label_flag=True
                data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
                
                if data_info_row.shape[0]>0:
                    data_info_row=data_info_row.reset_index()
                    #print(data_info_row[meta_id_name])
                    meta_image_id=data_info_row[meta_id_name][0]
                    meta_vendor=data_info_row[meta_vendor_name][0]
                    meta_centre=data_info_row[meta_centre_name][0]
                    meta_pathology=data_info_row[meta_pathology_name][0]
                    meta_age=data_info_row[meta_age_name][0]
                    meta_sex=data_info_row[meta_sex_name][0]
                    meta_height=data_info_row[meta_height_name][0]
                    meta_weigth=data_info_row[meta_weight_name][0]
                    meta_ed=data_info_row[meta_ed_name][0]
                    meta_es=data_info_row[meta_es_name][0]
                else:
                    meta_image_id=data_dir
                    meta_vendor=''
                    meta_centre=''
                    meta_pathology=''
                    meta_age=''
                    meta_sex=''
                    meta_height=''
                    meta_weigth=''
                    meta_ed=''
                    meta_es=''
                # full_path = convert_windows_to_linux_path(full_path)
                if not os.path.isdir(full_path):
                    continue
                try:
                    print(full_path)
                    full_path_image=os.path.join(full_path,"%s_sa.nii.gz"%data_dir)
                    
                    if label_flag:
                        full_path_label=os.path.join(full_path,"%s_sa_gt.nii.gz"%data_dir)
                        if not os.path.isfile(full_path_label):
                            full_path_label=None
                    else:
                        full_path_label=None

                    sitk_img_original = util.load_nifti(full_path_image)
                    if sitk_img_original is None:
                        print(f"  Failed to load image: {full_path_image}")
                        continue 
                    
                    modality="MRI"
                    study='MnMs'##Dataset_name
                    CIA_other_info = {
                    'metadata_file':''
                    # 'Series_Description':serise_desc
                    }
                    CIA_other_info['split'] = pid_dir
                    if mf_flag:
                        CIA_other_info['metadata_file']=meta_file

                    original_spacing = list(sitk_img_original.GetSpacing())
                    original_size = list(sitk_img_original.GetSize())
                    sitk_img_processed = sitk_img_original
                    # is_4d_image = msd_dataset_info.get("tensorImageSize", "3D").upper() == "4D" or sitk_img_original.GetDimension() == 4
                    is_4d_image = sitk_img_original.GetDimension() == 4

                    frame_flag=False
                    # --- Resampling Logic (Revised for 4D) ---
                    if is_4d_image:
                        
                        
                        # Always process 4D images channel-wise for resampling
                        # logging.info(f"    Processing 4D image channel-wise: {original_img_full_path}") # Keep log for errors only
                        channels = []
                        num_channels = original_size[3] if len(original_size) == 4 and sitk_img_original.GetDimension() == 4 else 1
                        channel_target_spacing = TARGET_VOXEL_SPACING if TARGET_VOXEL_SPACING else original_spacing[:3] # Use 3D spacing
                        
                        
                        for i in range(num_channels):
                            extractor = sitk.ExtractImageFilter()
                            current_3d_channel_size = original_size[:3]
                            
                            if sitk_img_original.GetDimension() == 4:
                                extractor.SetSize([current_3d_channel_size[0], current_3d_channel_size[1], current_3d_channel_size[2], 0])
                                extractor.SetIndex([0,0,0,i])
                                channel_3d_img = extractor.Execute(sitk_img_original)
                            else: 
                                channel_3d_img = sitk_img_original
                                if i > 0: break 

                            channel_resampler = util.get_unisize_resampler(
                                channel_3d_img, 'linear',
                                spacing=channel_target_spacing, size=current_3d_channel_size 
                            )
                            if channel_resampler: 
                                channels.append(channel_resampler.Execute(channel_3d_img))
                            else: 
                                channels.append(channel_3d_img)
                        
                        if channels:
                            if len(channels) > 1: # Only join if there are multiple channels
                                sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(channels)
                                ##aded by yanguoqing on 2025-08-11
                                frame_flag=True
                                imgDict={}
                                for kf_idx in range(num_channels):
                                    imgDict[str(kf_idx)]='none'
                                if str(meta_ed):imgDict[str(meta_ed)]='ed'
                                if str(meta_es):imgDict[str(meta_es)]='es'
                                meta.add_keyvalue('ImgDict',imgDict)
                            elif len(channels) == 1: # If only one channel resulted (e.g. original was 3D misidentified as 4D by tensorImageSize)
                                sitk_img_processed = channels[0]
                    elif TARGET_VOXEL_SPACING: # 3D image with target spacing
                        img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
                                                                    spacing=TARGET_VOXEL_SPACING, size=original_size)
                        if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)
                    else: # 3D image, no TARGET_VOXEL_SPACING
                        img_resampler_obj = util.get_unisize_resampler(sitk_img_original, 'linear',
                                                                    spacing=original_spacing, size=original_size)
                        if img_resampler_obj: sitk_img_processed = img_resampler_obj.Execute(sitk_img_original)            
                    


                    ##
                    CIA_other_info['Image_id']=meta_image_id
                    CIA_other_info['Vendor']=meta_vendor
                    CIA_other_info['Centre']=str(meta_centre)
                    CIA_other_info['Pathology']=str(meta_pathology)
                    CIA_other_info['Age']=str(meta_age)
                    CIA_other_info['Sex']=meta_sex
                    CIA_other_info['Height']=str(meta_height)
                    CIA_other_info['Weight']=str(meta_weigth)
                    CIA_other_info['ED']=str(meta_ed)
                    CIA_other_info['ES']=str(meta_es)

                    

                    # --- End Resampling Logic ---
                    
                    is_processed_4d = sitk_img_processed.GetDimension() == 4
                    clamp_range_to_use=None
                    if clamp_range_to_use and is_processed_4d:
                        clamped_channels_final = []
                        num_channels_final = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
                        for i in range(num_channels_final):
                            extractor = sitk.ExtractImageFilter()
                            proc_size_final = sitk_img_processed.GetSize()
                            extractor.SetSize([proc_size_final[0], proc_size_final[1], proc_size_final[2], 0])
                            extractor.SetIndex([0,0,0,i])
                            channel_3d_img_to_clamp = extractor.Execute(sitk_img_processed)
                            clamped_channels_final.append(util.clamp_image(channel_3d_img_to_clamp, clamp_range_to_use))
                        if clamped_channels_final:
                            if len(clamped_channels_final) > 1:
                                sitk_img_processed = sitk.JoinSeriesImageFilter().Execute(clamped_channels_final)
                            elif len(clamped_channels_final) == 1:
                                sitk_img_processed = clamped_channels_final[0]
                    elif clamp_range_to_use: # 3D image
                        sitk_img_processed = util.clamp_image(sitk_img_processed, clamp_range_to_use)
                    

                    output_path = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
                    # output_path=convert_windows_to_linux_path(output_path)
                    save_nifti(sitk_img_processed, output_path, full_path_image)
                    print(f"Saved NIfTI file to {output_path}")

                    label_path_dict = {}

                    processed_lbl_full_path = os.path.join(output_dir, data_dir, TASK_VALUE, f"{data_dir}.nii.gz")
                    print(processed_lbl_full_path,full_path_label,tr_flag,label_flag)
                    if tr_flag and label_flag and os.path.exists(full_path_label):
                        sitk_lbl_original = util.load_nifti(full_path_label)
                        if not sitk_lbl_original:
                            print(f"  Failed to load label: {full_path_label}")
                            processed_lbl_full_path = None
                            continue
                        if sitk_lbl_original:
                            label_resampler = sitk.ResampleImageFilter()
                            reference_for_label = sitk_img_processed # Default to processed image
                            
                            if sitk_img_processed.GetDimension() == 4:
                                num_comp_proc = sitk_img_processed.GetSize()[3] if len(sitk_img_processed.GetSize()) == 4 else 1
                                if num_comp_proc > 0:
                                    extractor = sitk.ExtractImageFilter()
                                    proc_img_size_for_lbl_ref = sitk_img_processed.GetSize()
                                    extractor.SetSize([proc_img_size_for_lbl_ref[0], proc_img_size_for_lbl_ref[1], proc_img_size_for_lbl_ref[2], 0])
                                    extractor.SetIndex([0,0,0,0])
                                    try:
                                        reference_for_label = extractor.Execute(sitk_img_processed)
                                    except Exception as ref_err:
                                        print(f"  Failed to extract 3D reference from 4D image: {output_path} for label alignment.")
                                        # print(traceback.format_exc())
                                        reference_for_label = None
                                else: # Fallback if extraction fails
                                    print(f"      Could not extract 3D reference for label from 4D image {output_path}. Label may not be correctly resampled.")
                                    reference_for_label = None # This will cause an issue below if not handled
                            
                                sitk_lbl_processed = None

                                if reference_for_label and reference_for_label.GetDimension() > 0:
                                    label_resampler.SetInterpolator(sitk.sitkNearestNeighbor)
                                    label_resampler.SetOutputPixelType(sitk_lbl_original.GetPixelID())

                                    if sitk_lbl_original.GetDimension() == 4:
                                        lbl_channels = []
                                        lbl_size = list(sitk_lbl_original.GetSize())
                                        for i in range(lbl_size[3]):
                                            extractor = sitk.ExtractImageFilter()
                                            extractor.SetSize([lbl_size[0], lbl_size[1], lbl_size[2], 0])
                                            extractor.SetIndex([0, 0, 0, i])
                                            single_channel = extractor.Execute(sitk_lbl_original)

                                            label_resampler.SetReferenceImage(reference_for_label)
                                            resampled_channel = label_resampler.Execute(single_channel)
                                            lbl_channels.append(resampled_channel)

                                        if len(lbl_channels) > 1:
                                            sitk_lbl_processed = sitk.JoinSeriesImageFilter().Execute(lbl_channels)
                                        elif len(lbl_channels) == 1:
                                            sitk_lbl_processed = lbl_channels[0]
                                    else:
                                        label_resampler.SetReferenceImage(reference_for_label)
                                        sitk_lbl_processed = label_resampler.Execute(sitk_lbl_original)
                                if processed_lbl_full_path:
                                    if sitk_img_processed.GetSize()[:3] != sitk_lbl_processed.GetSize()[:3]:
                                        print(f"  Mismatch between image and label size (ignoring channels):")
                                        print(f"     Image size: {sitk_img_processed.GetSize()}")
                                        print(f"     Label size: {sitk_lbl_processed.GetSize()}")
                                util.save_nifti(sitk_lbl_processed, processed_lbl_full_path, full_path_label)
                            else:
                                print(f"      Failed to set reference image for label resampling for {full_path_label}. Saving original label.")
                                util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_path_label) # Save original
                                # processed_lbl_full_path should still point to this saved original label
                        else: 
                            processed_lbl_full_path = None
                    else:
                        processed_lbl_full_path = None

                    if processed_lbl_full_path:
                        label_path_dict['heart'] = processed_lbl_full_path

                        print('compare image and label size',sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
                        print('compare image and label size',sitk_img_processed.GetSize(),sitk_lbl_processed.GetSize())
                    try:
                        assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()

                    except Exception as e:
                        failed_files.append(full_path_label)
                        continue

                except RuntimeError:
                    failed_files.append(full_path_image)
                    print(f"Failed to load MnMs images from {full_path_image}")
                    continue

                

                
                size_processed = list(sitk_img_processed.GetSize())
                print('size_processed',size_processed)

                # meta.add_keyvalue('Image_id',meta_image_id)
                meta.add_keyvalue('Spacing_mm',min(original_spacing[:3]))##保留前三个x,y,z的最小spacing
                meta.add_keyvalue('OriImg_path',full_path_image)
                meta.add_keyvalue('Size',size_processed)  # 这里用处理后的size -- YH Jachin
                meta.add_keyvalue('Modality',modality)
                meta.add_keyvalue('Dataset_name',study)
                meta.add_keyvalue('ROI','chest')

                
                if processed_lbl_full_path:
                    print(label_path_dict.keys())
                    meta.add_keyvalue('Task',TASK_VALUE)
                    # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
                    meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                    meta.add_keyvalue('Label_Dict',LABEL_DICT)
                meta.add_extra_keyvalue('Metadata',CIA_other_info)

                
                

                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_path] = meta.get_meta_data()
                    json_file.seek(0)
                    print(existing_mappings)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
        
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/MnMs/OpenDataset/")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/MnMs/")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)