#coding:utf-8
'''
Written by ygq
Created on 2025-08-03

BRATS_2020
BraTS 2020 is an important milestone of the BraTS series. Building on BraTS 2019,
it sets a higher bar for multimodal MRI glioma segmentation and survival prediction
research by substantially enlarging the dataset, increasing data diversity (notably
by including data from China), refining the evaluation of the survival prediction
task (the validation and test sets include survival information) and introducing
additional unlabeled data to encourage new learning paradigms.

Data content and scale (substantially enlarged):
    Training set: complete multimodal MRI scans of 369 patients together with the
        experts' manual segmentation (ground truth). (Up from 335 cases in 2019.)
    Validation set: complete multimodal MRI scans of 125 patients. No annotations
        are provided. Used for online evaluation during development.
    Test set: complete multimodal MRI scans of 166 patients. No annotations are
        provided. This is the independent test set used for the final ranking.
        (Same size as the 2019 test set, but different content.)

Key feature - multimodal MRI (same as 2019):
    Every case still contains four pre-processed 3D MRI sequences:
        Native (T1)
        Post-contrast T1-weighted (T1Gd/T1ce)
        T2-weighted (T2)
        T2 Fluid Attenuated Inversion Recovery (T2-FLAIR)

Key feature - tumor annotation (same as 2019):
    The training set provides fine-grained annotations drawn manually by experts.
    The annotations define the same three sub-regions:
        necrotic and non-enhancing tumor core: label value = 1
        peritumoral edema: label value = 2
        enhancing tumor: label value = 4
        whole tumor region: label values 1+2+4
        tumor core region: label values 1+4

As agreed, the output follows the structure of the BRATS task in MSD:
    1. The separate modalities are merged into an array with a fourth dimension,
       stored in the order FLAIR, T1, T1CE, T2;
    2. Survival information also has to be added to the HGG dataset accordingly.

Trainning:
    meta_info:[keep Grade,BraTS_2019_subject_ID]--name_mapping.csv
        Grade,BraTS_2017_subject_ID,BraTS_2018_subject_ID,TCGA_TCIA_subject_ID,BraTS_2019_subject_ID,BraTS_2020_subject_ID
        HGG,Brats17_CBICA_AAB_1,Brats18_CBICA_AAB_1,NA,BraTS19_CBICA_AAB_1,BraTS20_Training_001
        HGG,Brats17_CBICA_AAG_1,Brats18_CBICA_AAG_1,NA,BraTS19_CBICA_AAG_1,BraTS20_Training_002
    survival_info:--survival_info.csv
        Brats20ID,Age,Survival_days,Extent_of_Resection
        BraTS20_Training_001,60.463,289,GTR
        BraTS20_Training_002,52.263,616,GTR
Validation:
    meta_info:[keep Grade,BraTS_2019_subject_ID]--name_mapping_validation_data.csv
        BraTS_2017_subject_ID,BraTS_2018_subject_ID,TCGA_TCIA_subject_ID,BraTS_2019_subject_ID,BraTS_2020_subject_ID
        Brats17_CBICA_AAM_1,Brats18_CBICA_AAM_1,NA,BraTS19_CBICA_AAM_1,BraTS20_Validation_001
        Brats17_CBICA_ABT_1,Brats18_CBICA_ABT_1,NA,BraTS19_CBICA_ABT_1,BraTS20_Validation_002
    survival_info:--survival_evaluation.csv
        BraTS20ID,Age,ResectionStatus
        BraTS20_Validation_001,68.170,GTR
        BraTS20_Validation_002,50.153,GTR
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil

##trainning_dataset
##dataset_meta
# Column names used to look a case up in the per-split CSV files.
meta_info_dict={
    "training":{
        'meta_id_name':'BraTS_2020_subject_ID',
        'meta_grade_name':'Grade',
        'survival_id_name':'Brats20ID',
        'meta_age_name':'Age',
        'meta_survival_name':'Survival_days',
        'meta_status_name':'Extent_of_Resection'
    },
    'validation':{
        'meta_id_name':'BraTS_2020_subject_ID',
        'survival_id_name':'BraTS20ID',
        'meta_age_name':'Age',
        'meta_status_name':'ResectionStatus'
    }
}

TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING=None
## sub_modality descriptions, following the MSD naming
SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"]
## file-name suffixes, in the order the modalities are stacked
SERIES_ORDER=["flair","t1","t1ce","t2"]
LABEL_DICT={
    "0":"backgroud",
    "1":"non-enhancing tumor",
    "2":"edema",
    "4":"enhancing tumour"
}


def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside ``path``.

    The pattern contains no ``**`` component, so only the top level of
    ``path`` is searched.
    """
    # for Cancer Image Archive (TCIA) dataset
    search_pattern = os.path.join(path, '*.csv')
    return glob.glob(search_pattern)


##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries (files and directories) in ``path``."""
    return os.listdir(path)


##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series found in ``folder_path``.

    Returns:
        A tuple ``(dicom_names, image)``: the file names reported by GDCM
        for the folder and the image produced by the series reader.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names,image


##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read the single image file ``imgs`` and return the reader's result.

    ``ReadImageInformation()`` is called first (meta information only),
    followed by ``Execute()``.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    reader.ReadImageInformation()
    # NOTE(review): the original intent was to read only the meta information
    # without loading pixel data, but Execute() is still called below, which
    # presumably loads the pixels as well -- confirm which one is wanted.
    tag=reader.Execute()
    return tag


def load_nrrd(fp):
    """Read the image stored at ``fp`` with ``sitk.ReadImage``."""
    return sitk.ReadImage(fp)


##modify by yanguoqing on 20250805
def load_brtas_images(series_files):
    '''
    Read the per-modality files of one case as a single image series.

    Every case contains 3D MRI scans of four different sequences (all already
    pre-processed: co-registered, resampled to 1 mm^3 isotropic, skull-stripped).
    The separate modalities are merged into an array with a fourth dimension,
    stored in the order FLAIR, T1, T1CE, T2.
    '''
    reader = sitk.ImageSeriesReader()
    reader.SetFileNames(series_files)
    image = reader.Execute()
    return image


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory if needed.

    ``folder_path`` is stored on the image under the ``"FolderPath"`` metadata
    key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a POSIX-style one.

    Backslashes become forward slashes and everything up to and including the
    first ``:`` (the drive letter) is removed.
    """
    # Some meta files have windows paths, but the data is stored on a linux server
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path


def _read_optional_csv(csv_path):
    """Load ``csv_path`` as a DataFrame, or return None when the file is absent."""
    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path, sep=',')
    return None


def _matching_rows(table, id_column, case_id):
    """Return the rows of ``table`` whose ``id_column`` equals ``case_id``, or None if there are none."""
    if table is None:
        return None
    rows = table[table[id_column] == case_id]
    if rows.shape[0] == 0:
        return None
    return rows


def _lookup_case_metadata(case_id, columns, meta_table, survival_table):
    """Collect id, grade, age, survival and resection status of one case ('' where unknown)."""
    meta_rows = _matching_rows(meta_table, columns['meta_id_name'], case_id)
    survival_rows = _matching_rows(survival_table, columns['survival_id_name'], case_id)

    def first_value(rows, column_key):
        # Splits that do not define a column (e.g. no grade/survival for
        # validation) and cases without a matching row both yield ''.
        column = columns.get(column_key)
        if rows is None or column is None:
            return ''
        return rows[column].iloc[0]

    if meta_rows is None:
        image_id = case_id
    else:
        image_id = meta_rows[columns['meta_id_name']].iloc[0]

    return {
        'Image_id': image_id,
        'Grade': first_value(meta_rows, 'meta_grade_name'),
        'Age': str(first_value(survival_rows, 'meta_age_name')),
        'Survival': str(first_value(survival_rows, 'meta_survival_name')),
        'ResectionStatus': first_value(survival_rows, 'meta_status_name'),
    }


def _collect_series(case_dir, case_id):
    """Return the existing modality files of a case (in SERIES_ORDER) and their Sub_modality mapping."""
    series_files = []
    sub_modality_dict = {}
    for idx, suffix in enumerate(SERIES_ORDER):
        candidate = os.path.join(case_dir, "%s_%s.nii" % (case_id, suffix))
        if os.path.isfile(candidate):
            series_files.append(candidate)
            # NOTE(review): the key is the index in SERIES_ORDER, not the
            # position in the stacked image; the two differ when a modality
            # file is missing -- confirm which one consumers expect.
            sub_modality_dict[str(idx)] = SUB_MODALITY[idx]
    return series_files, sub_modality_dict


def _record_mapping(json_output_path, image_file, meta_dict):
    """Add ``image_file -> meta_dict`` to the JSON mapping file, rewriting it in place."""
    with open(json_output_path, 'r+') as json_file:
        existing_mappings = json.load(json_file)
        existing_mappings[image_file] = meta_dict
        json_file.seek(0)
        json.dump(existing_mappings, json_file, indent=4)
        json_file.truncate()


def main(target_path, output_dir):
    """Convert every case directory below ``target_path`` and record its metadata.

    For each sub-directory of ``target_path`` the available modality files
    (``<case>_<flair|t1|t1ce|t2>.nii``) are read as one image series and
    written to ``<output_dir>/<case>/<case>.nii.gz``. Directories whose name
    contains ``Training`` are treated as the training split: their
    ``<case>_seg.nii`` file, when present, is copied to
    ``<output_dir>/<case>/segmentation/<case>.nii.gz``. Per-case metadata is
    appended to ``nifti_mappings.json`` as cases are processed, and the names
    of cases that raised an exception are written to ``failed_files.json``.

    Args:
        target_path: directory holding the case directories and, optionally,
            ``name_mapping.csv``, ``survival_info.csv``,
            ``name_mapping_validation_data.csv`` and
            ``survival_evaluation.csv``. A missing CSV file simply leaves the
            corresponding metadata fields empty.
        output_dir: directory that receives the converted files and the two
            JSON reports; created when it does not exist.
    """
    pid_dirs=find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    meta_file=os.path.join(target_path,'name_mapping.csv')
    survival_file=os.path.join(target_path,'survival_info.csv')
    val_meta_file=os.path.join(target_path,'name_mapping_validation_data.csv')
    val_survival_file=os.path.join(target_path,'survival_evaluation.csv')

    # Each table is None when its CSV is missing, so the lookups below fall
    # back to empty values instead of touching an undefined DataFrame.
    split_tables = {
        'training': (_read_optional_csv(meta_file), _read_optional_csv(survival_file)),
        'validation': (_read_optional_csv(val_meta_file), _read_optional_csv(val_survival_file)),
    }

    if pid_dirs:
        for data_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            full_path=os.path.join(target_path,data_dir)
            if not os.path.isdir(full_path):
                continue

            tr_flag = 'Training' in data_dir
            split_key = 'training' if tr_flag else 'validation'
            meta = meta_data()
            meta_table, survival_table = split_tables[split_key]
            case_info = _lookup_case_metadata(
                data_dir, meta_info_dict[split_key], meta_table, survival_table)

            try:
                ## Read the MRI files, stacked in the order flair,t1,t1ce,t2;
                ## the seg file is left out here and handled separately below.
                series_files, sub_modality_dict = _collect_series(full_path, data_dir)
                if not series_files:
                    print("病例数据%s为空"%data_dir)
                    continue
                ## valid MRI image data exists, carry on with the processing
                sitk_img_original=load_brtas_images(series_files)
                original_size = list(sitk_img_original.GetSize())

                modality="MRI"
                study='BRATS_2020'##Dataset_name
                CIA_other_info = {
                    'metadata_file': meta_file if tr_flag else val_meta_file,
                    'split': "train" if tr_flag else "validation",
                }
                CIA_other_info.update(case_info)

                meta.add_keyvalue('Spacing_mm',1.0)
                meta.add_keyvalue('OriImg_path',",".join(series_files))
                meta.add_keyvalue('Size',original_size)  # use the processed size here -- YH Jachin
                meta.add_keyvalue('Modality',modality)
                meta.add_keyvalue('Dataset_name',study)
                meta.add_keyvalue('ROI','head')
                meta.add_keyvalue('Sub_modality',sub_modality_dict)
                if tr_flag:
                    meta.add_keyvalue('Label_Dict',LABEL_DICT)

                output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
                save_nifti(sitk_img_original, output_image_file, full_path)
                print(f"Saved NIfTI file to {output_image_file}")

                ##Label processing
                if tr_flag:
                    label_path_dict={}
                    full_label_file=os.path.join(full_path,"%s_seg.nii"%(data_dir))
                    process_label_path=os.path.join(output_dir,data_dir,'segmentation')
                    processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz")
                    os.makedirs(process_label_path,exist_ok=True)
                    if os.path.isfile(full_label_file):
                        sitk_lbl_original = util.load_nifti(full_label_file)
                        util.save_nifti(sitk_lbl_original, processed_lbl_full_path, full_label_file)  # Save original
                        print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
                        label_path_dict['brain'] = processed_lbl_full_path
                        meta.add_keyvalue('Task',TASK_VALUE)
                        meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                        # Only report the label size when a label was actually
                        # loaded; otherwise sitk_lbl_original does not exist.
                        print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
            except Exception as e:
                # Best effort: remember the failing case and move on.
                print(e)
                failed_files.append(data_dir)
                print(f"Failed to load BRATS images from {data_dir}")
                continue

            meta.add_extra_keyvalue('Metadata',CIA_other_info)
            # Write the mapping to the JSON file on the fly
            _record_mapping(json_output_path, output_image_file, meta.get_meta_data())

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/BRATS/BRATS2020/")
    parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2020")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)