#coding:utf-8 ''' writebyygq createon2025-08-30 BL = Baseline(基线) FU = Follow-up(随访) 1. Baseline (基线) 含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。 作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。 2. Follow-up (随访) 含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。 作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。 “BL FU” 在报告中的应用场景: 当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是: “本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。” 例如: 肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。” 慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。 label: 0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息 编号ID:10位的16进制编号,每一个对应一个csv文件,对一个或多个BL和FU。。每个对应相应的json文件和mask标签文件-- 备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict BL的以及对应的MASK都是inputsTr目录下面 命名形式: 93dd4de5cd_BL_img_BL_img_00.nii.gz 93dd4de5cd_BL_mask_BL_img_00.nii.gz 93dd4de5cd_BL_00.json FU在inputsTr目录下面,对应的mask在targetsTr力猛 命名形式: c6f057b865_FU_img_FU_img_00.nii.gz c6f057b865_FU_mask_FU_img_00.nii.gz c6f057b865_FU_img_FU_img_01.nii.gz c6f057b865_FU_mask_FU_img_01.nii.gz c6f057b865_FU_00.json c6f057b865_FU_01.json 元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置 lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type 1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung 2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node json格式样例 { "name": "Points of interest", "points": [ { "name": "1", "point": [ 84.9530896759608, 273.525433308214, 148.780708364732 ] }, { "name": "2", "point": [ 206.307026476578, 258.39816700611, 177.256619144603 ] } ], "type": "Multiple points", "version": { "major": 1, "minor": 0 } } 20251101补充增加,将病灶编号进行合并同类项目, 注意处理完成后保留原影像的几何空间信息以及元数据文件信息 ''' 
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil

# Unified label encoding: maps a lesion_type string from the metadata CSV to
# the integer code written into the merged mask.
# NOTE: 'backgroud' (sic) is deliberately kept misspelled -- downstream
# consumers of the emitted JSON already depend on this exact key.
label_id_lut = {
    'backgroud': 0,
    'Lymph node': 1,
    'Lung': 2,
    'Soft tissue / Skin': 3,
    'Liver': 4,
    'Skeleton': 5,
    'Adrenals': 6,
    'Spleen': 7,
    'CNS': 8,
    'Kidney': 9,
    'Heart': 10,
    'Others': 11,
    'unclear': 12,
}

TASK_VALUE = "segmentation"
CLAMP_RANGE_CT = [-300, 300]
CLAMP_RANGE_MRI = None  # MRI intensity clamp placeholder, TBC
TARGET_VOXEL_SPACING = None

# Per-case label dictionary template; entries 1..N are filled in at
# processing time from the csv/json information of each case.
LABEL_DICT = {
    "0": "backgroud",
}

# Column layout of the per-case metadata CSV.
META_COLUMN = ['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated',
               'cog_fu', 'img_id_fu', 'lesion_type']


def find_metadata_files(path):
    """Return every ``*.csv`` metadata file directly under ``path``."""
    search_pattern = os.path.join(path, '*.csv')
    return glob.glob(search_pattern, recursive=True)


def find_image_dirs(path):
    """Return the directory entries (case folders) under ``path``."""
    return os.listdir(path)


def load_dicom_images(folder_path):
    """Load a DICOM series from ``folder_path``.

    Returns:
        tuple: (list of series file names, assembled SimpleITK 3-D image)
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image


def load_dicom_tag(imgs):
    """Read a single DICOM file ``imgs`` (header first, then pixels)."""
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    reader.ReadImageInformation()  # read meta information before pixel data
    tag = reader.Execute()
    return tag


def load_nrrd(fp):
    """Load an image file (nrrd/nifti/...) with SimpleITK."""
    return sitk.ReadImage(fp)


def merge_images(series_files):
    """Merge per-modality volumes (e.g. CT/PET stored as 0000/0001) into a
    single image, stacked in the order of ``series_files``."""
    reader = sitk.ImageSeriesReader()
    reader.SetFileNames(series_files)
    image = reader.Execute()
    return image


def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the output directory on
    demand and recording the source folder in the NIfTI header."""
    output_dirpath = os.path.dirname(output_path)
    if not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Convert a Windows path from a meta file to a Linux-style path.

    Some meta files carry Windows paths while the data lives on a Linux
    server: flip the separators and drop the drive letter.
    """
    linux_path = windows_path.replace('\\', '/')
    if ':' in linux_path:
        linux_path = linux_path.split(':', 1)[1]
    return linux_path


def get_filename_list(fp_dir):
    """Return all per-case metadata csv files under ``fp_dir``."""
    return glob.glob("%s/*.csv" % fp_dir)


def check_fname(fname):
    """Split a file name into ``(study_id, study_date)``.

    File names starting with ``fdg`` use a 14-char id followed by a
    10-char date; all others use a 21-char id.
    """
    if fname.startswith("fdg"):
        sid = fname[:14]
        sdate = fname[15:25]
    else:
        sid = fname[:21]
        sdate = fname[22:]
    return sid, sdate


def _locate_mask(mask_name, input_dir, target_dir):
    """Find the mask file ``mask_name``.

    BL masks live next to the images in inputsTr, FU masks in targetsTr,
    so both directories are probed in that order.

    Returns:
        tuple: (absolute mask path or None, bool flag whether it was found)
    """
    for folder in (input_dir, target_dir):
        candidate = os.path.join(folder, mask_name)
        if os.path.isfile(candidate):
            return candidate, True
    return None, False


def _merge_lesion_labels(image_array, df_meta):
    """Remap per-lesion ids in ``image_array`` to unified tissue codes.

    The metadata CSV is grouped by ``lesion_type``; every lesion_id of a
    group is rewritten to the code from ``label_id_lut`` so that lesions of
    the same tissue share one label value.

    Returns:
        tuple: (remapped numpy array, label dictionary for the output json)
    """
    label_dict = {"0": "backgroud"}
    updated = np.copy(image_array)
    for lesion_type, ids in df_meta.groupby('lesion_type')['lesion_id']:
        target_id = label_id_lut[lesion_type]
        label_dict[str(target_id)] = lesion_type
        for lesion_id in ids.tolist():
            updated[image_array == lesion_id] = target_id
    return updated, label_dict


def _copy_geometry_and_meta(dst, src):
    """Copy spatial information and every header key/value from ``src`` to
    ``dst`` so the output keeps the original geometry and metadata."""
    dst.CopyInformation(src)
    for key in src.GetMetaDataKeys():
        dst.SetMetaData(key, src.GetMetaData(key))


def _resample_label(label_img, spacing_info, size):
    """Resample ``label_img`` (nearest neighbour) to the image geometry.

    The resampler occasionally fills the last slice with one constant
    non-zero value at the volume border; such a slice is zeroed out.
    Returns ``label_img`` unchanged when no resampler is required.
    """
    resampler = util.get_unisize_resampler(label_img, interpolator='nearest',
                                           spacing=spacing_info, size=size)
    if resampler is None:
        return label_img
    resampled = resampler.Execute(label_img)
    ary = sitk.GetArrayFromImage(resampled)
    if ary[-1, :, :].mean() == ary[-1, 0, 0] and ary[-1, 0, 0] > 0:
        print('WARNING: constant non-zero last slice after resampling '
              '(value {}); zeroing it out'.format(ary[-1, 0, 0]))
        ary[-1, :, :] = 0
    fixed = sitk.GetImageFromArray(ary)
    _copy_geometry_and_meta(fixed, resampled)
    return fixed


def main(target_path, output_dir):
    """Convert every BL/FU case under ``target_path`` into NIfTI files.

    For each per-case metadata csv in ``inputsTr``: locate the BL and FU
    image/mask/json triplets, resample image and mask to a uniform
    geometry, merge lesion labels by tissue type, and append the result
    description to ``nifti_mappings.json``. Cases whose image and mask
    sizes disagree after resampling are recorded in ``failed_files.json``.

    Args:
        target_path: dataset root containing ``inputsTr`` and ``targetsTr``.
        output_dir:  destination directory for NIfTI files and mappings.
    """
    pid_dirs = ["inputsTr"]
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the mapping JSON so it can be opened 'r+' later.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    input_dir = os.path.join(target_path, 'inputsTr')
    target_dir = os.path.join(target_path, 'targetsTr')
    # One csv per case id; each case owns one or more BL and FU volumes.
    fp_files = get_filename_list(input_dir)
    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
            for fp_file in tqdm(fp_files, desc="Processing all dataset"):
                meta_file = fp_file
                df_meta = pd.read_csv(meta_file)
                fp_name = os.path.basename(fp_file)[:-4]  # case id (csv stem)
                # Look up every BL and FU image and its mask in turn.
                for sub_mod in ['BL', 'FU']:
                    json_fps = glob.glob("%s/%s_%s*.json"
                                         % (input_dir, fp_name, sub_mod))
                    for json_fp in json_fps:
                        basename = os.path.basename(json_fp)[:-5]
                        # BUG FIX: the original hard-coded "_BL_" here, so FU
                        # file names were never resolved and all FU cases were
                        # silently skipped; build the token from sub_mod.
                        token = "_%s_" % sub_mod
                        img_name = os.path.basename(json_fp).replace(
                            token, "_%s_img_%s_img_" % (sub_mod, sub_mod)
                        ).replace(".json", ".nii.gz")
                        img_fp = os.path.join(input_dir, img_name)
                        if not os.path.isfile(img_fp):
                            continue
                        mask_name = os.path.basename(json_fp).replace(
                            token, "_%s_mask_%s_img_" % (sub_mod, sub_mod)
                        ).replace(".json", ".nii.gz")
                        label_fp, label_flag = _locate_mask(
                            mask_name, input_dir, target_dir)
                        modality = "CT"
                        study = 'PSMA_Longitudinal_CT'  # Dataset_name
                        CIA_other_info = {
                            'Image_id': basename,
                            'metadata_file': meta_file,
                            'split': "train",
                        }
                        stk_image = util.load_nifti(img_fp)
                        spacing_info = stk_image.GetSpacing()
                        size = list(stk_image.GetSize())
                        resampler = util.get_unisize_resampler(
                            stk_image, interpolator='linear',
                            spacing=spacing_info, size=size)
                        if resampler is not None:
                            proces_image = resampler.Execute(stk_image)
                            print('spacing after resample',
                                  proces_image.GetSpacing())
                            CIA_other_info['Resample'] = True
                        else:
                            proces_image = stk_image
                            CIA_other_info['Resample'] = False
                        output_path = os.path.join(output_dir, fp_name,
                                                   f"{basename}.nii.gz")
                        save_nifti(proces_image, output_path, input_dir)
                        print(f"Saved NIfTI file to {output_path}")
                        # Without a mask there is nothing to record for this
                        # volume (original behavior: only labelled volumes
                        # get a mapping entry).
                        if not label_flag:
                            continue
                        label_path_dict = {}
                        label_stk_img = util.load_nifti(label_fp)
                        image_array = sitk.GetArrayFromImage(label_stk_img)
                        # POI json is parsed for validation; lesion typing is
                        # taken from the csv (json currently unused beyond
                        # the parse check).
                        with open(json_fp, 'r') as fi:
                            json_info = json.load(fi)
                        update_array, label_dict = _merge_lesion_labels(
                            image_array, df_meta)
                        label_update = sitk.GetImageFromArray(update_array)
                        _copy_geometry_and_meta(label_update, label_stk_img)
                        label_processed = _resample_label(
                            label_update, spacing_info, size)
                        # Image and label must agree after resampling.
                        if proces_image.GetSize() != label_processed.GetSize():
                            failed_files.append(label_fp)
                            continue
                        label_output_path = os.path.join(
                            output_dir, fp_name, TASK_VALUE,
                            f"{basename}.nii.gz")
                        label_path_dict['tumor'] = label_output_path
                        util.save_nifti(label_processed, label_output_path,
                                        label_fp)
                        print(f"Saved Label Segment NIfTI file to "
                              f"{label_output_path}")
                        size_processed = list(proces_image.GetSize())
                        print('size_processed', size_processed, size)
                        # Keep the smallest of the x/y/z spacings.
                        meta.add_keyvalue('Spacing_mm', min(spacing_info[:3]))
                        meta.add_keyvalue('OriImg_path', img_fp)
                        # Record the size AFTER processing.
                        meta.add_keyvalue('Size', size_processed)
                        meta.add_keyvalue('Modality', modality)
                        meta.add_keyvalue('Dataset_name', study)
                        meta.add_keyvalue('ROI', 'whole-body')
                        meta.add_keyvalue('Task', TASK_VALUE)
                        meta.add_keyvalue('Label_path',
                                          {TASK_VALUE: label_path_dict})
                        meta.add_keyvalue('Label_Dict', label_dict)
                        meta.add_extra_keyvalue('Metadata', CIA_other_info)
                        # Append the mapping to the JSON file on the fly.
                        with open(json_output_path, 'r+') as json_file:
                            existing_mappings = json.load(json_file)
                            existing_mappings[output_path] = \
                                meta.get_meta_data()
                            json_file.seek(0)
                            json.dump(existing_mappings, json_file, indent=4)
                            json_file.truncate()
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process DICOM files and save as NIfTI.")
    parser.add_argument("--target_path", type=str,
                        help="Path to the target directory containing metadata files.",
                        default="/home/data/ygq/Data_Engineering/PSMA_clean/demo")
    parser.add_argument("--output_dir", type=str,
                        help="Directory to save the NIfTI files.",
                        default="//home/data/ygq/Data_Engineering/PSMA_clean/sample/")
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)