|
|
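'''
Written by ygq.
Created on 2025-07-24.
Update: Kaggle data cleaning.

Parse the train.csv and test.csv files in turn to obtain the basic
information of each dataset.
Using the parsed ids, locate the matching images under the train/test
directories and normalise them; at the same time, locate the labels under the
label "segment" directory and save the label locations for the CT of each body
part to a JSON file.
When finished, save the JSON file and exit.
'''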
|
| import os
|
| import glob
|
| import pandas as pd
|
| import SimpleITK as sitk
|
| import argparse
|
| import json
|
| from tqdm import tqdm
|
| from util import meta_data
|
| import util
|
| import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Column names looked up in the per-split metadata CSV (``<split>.csv``).
meta_id_name = 'Patient'
meta_weeks_name = 'Weeks'
meta_fvc_name = 'FVC'
meta_percent_name = 'Percent'
meta_age_name = 'Age'
meta_sex_name = 'Sex'
meta_status_name = 'SmokingStatus'

# Stored under the 'Task' key and used as the label sub-directory name.
TASK_VALUE = "segmentation"
# Range handed to util.clamp_image for volumes whose modality contains 'CT'.
CLAMP_RANGE_CT = [-300, 300]
# NOTE(review): not referenced anywhere in the visible code; the intended
# meaning of [-1, 0] for MRI is unclear -- confirm before relying on it.
CLAMP_RANGE_MRI = [-1, 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*.

    Args:
        path: Directory to search (sub-directories are not searched).

    Returns:
        Sorted list of matching file paths; empty if none exist.
    """
    search_pattern = os.path.join(path, '*.csv')
    # The pattern has no '**', so glob's ``recursive`` flag would have no
    # effect; sort because glob returns entries in arbitrary order.
    return sorted(glob.glob(search_pattern))
|
|
|
def find_image_dirs(path):
    """Return the names of all entries in *path*, sorted.

    The result contains every directory entry (files as well as
    sub-directories); callers are expected to filter with ``os.path.isdir``.

    Args:
        path: Directory to list.

    Returns:
        Sorted list of entry names (not full paths).
    """
    # os.listdir order is arbitrary; sorting makes processing reproducible.
    return sorted(os.listdir(path))
|
|
|
|
|
def load_dicom_images(folder_path):
    """Read the DICOM series stored in *folder_path* as one image.

    Args:
        folder_path: Directory containing the DICOM slices of a series.

    Returns:
        Tuple ``(dicom_names, image)``: the file names reported by
        ``GetGDCMSeriesFileNames`` and the image produced by the series reader.

    Raises:
        RuntimeError: If no DICOM series is found in *folder_path*, or if
            SimpleITK fails to read it.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    # Fail early with a clear message instead of letting the reader choke on
    # an empty file list; RuntimeError is what the caller already handles.
    if not dicom_names:
        raise RuntimeError(f"No DICOM series found in {folder_path}")
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image
|
|
|
|
|
def load_dicom_tag(imgs):
    """Read a single image file and return the loaded image.

    Args:
        imgs: Path of one image file (the caller passes the first slice of a
            DICOM series).

    Returns:
        The object returned by ``ImageFileReader.Execute``; the caller queries
        its DICOM tags through ``GetMetaData``.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()
|
|
|
def load_nrrd(fp):
    """Load the file at *fp* with ``sitk.ReadImage`` and return the image."""
    nrrd_image = sitk.ReadImage(fp)
    return nrrd_image
|
|
|
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating the parent directory if needed.

    The source location is recorded on the image under the ``"FolderPath"``
    metadata key before writing.

    Args:
        image: Image object supporting ``SetMetaData`` and accepted by
            ``sitk.WriteImage``.
        output_path: Destination file path.
        folder_path: Path of the source data, stored as metadata.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(output_dirpath, exist_ok=True)

    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
|
|
|
|
|
def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path to a POSIX-style one.

    Backslashes become forward slashes and a leading drive specifier such as
    ``C:`` is removed. Colons elsewhere in the path are left untouched.

    Args:
        windows_path: Path string, possibly using backslashes and a drive
            letter.

    Returns:
        The converted path string.
    """
    linux_path = windows_path.replace('\\', '/')
    # Only a single ASCII letter followed by ':' at the very start is a drive
    # specifier; splitting on any ':' would truncate POSIX paths that merely
    # contain a colon.
    if len(linux_path) >= 2 and linux_path[1] == ':':
        drive = linux_path[0]
        if 'A' <= drive <= 'Z' or 'a' <= drive <= 'z':
            linux_path = linux_path[2:]
    return linux_path
|
|
|
def _lookup_patient_metadata(df_meta, patient_id):
    """Return the metadata CSV fields of *patient_id*, or empty defaults."""
    fields = {
        'image_id': patient_id,
        'weeks': '',
        'fvc': '',
        'percent': '',
        'age': '',
        'sex': '',
        'status': '',
    }
    # No CSV for this split: keep the defaults instead of touching an
    # undefined (or stale) DataFrame.
    if df_meta is None:
        return fields

    rows = df_meta[df_meta[meta_id_name] == patient_id]
    if rows.shape[0] > 0:
        # A patient may appear on several rows; the first one is used.
        fields['image_id'] = rows[meta_id_name].iloc[0]
        fields['weeks'] = rows[meta_weeks_name].iloc[0]
        fields['fvc'] = rows[meta_fvc_name].iloc[0]
        fields['percent'] = rows[meta_percent_name].iloc[0]
        fields['age'] = rows[meta_age_name].iloc[0]
        fields['sex'] = rows[meta_sex_name].iloc[0]
        fields['status'] = rows[meta_status_name].iloc[0]
    return fields


def _append_mapping(json_output_path, key, value):
    """Insert ``key -> value`` into the JSON mapping file, rewriting it in place."""
    with open(json_output_path, 'r+') as json_file:
        existing_mappings = json.load(json_file)
        existing_mappings[key] = value
        json_file.seek(0)
        json.dump(existing_mappings, json_file, indent=4)
        json_file.truncate()


def main(target_path, output_dir):
    """Convert every DICOM series under *target_path* to NIfTI and index it.

    Expected layout, as used by the code below:
      * ``<target_path>/<split>/<patient>/`` holds one DICOM series;
      * ``<target_path>/<split>.csv`` optionally holds the patient metadata;
      * ``<parent of target_path>/GT/*/*/<patient>_*.nrrd`` holds the
        segmentation labels, the tissue name being the part after the first
        ``_`` of the label's parent directory name.

    For each series the image is resampled (if ``util.get_unisize_resampler``
    returns a resampler), clamped when the modality contains ``'CT'``, written
    to ``<output_dir>/<patient>/<patient>.nii.gz`` and registered in
    ``<output_dir>/nifti_mappings.json``. Matching labels are resampled and
    written under ``<output_dir>/<id>/segmentation/``. Series that cannot be
    read, or whose labels differ in size from the image, are listed in
    ``<output_dir>/failed_files.json``.

    Args:
        target_path: Root directory of the raw dataset.
        output_dir: Directory receiving the NIfTI files and JSON indexes.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): one meta_data object is reused for every image. If
    # add_keyvalue persists keys, 'Task'/'Label_*' written for a labelled scan
    # may leak into a following unlabelled one -- confirm against
    # util.meta_data.
    meta = meta_data()

    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    # Labels live in a 'GT' directory next to target_path (loop invariant).
    label_root = os.path.join(os.path.dirname(target_path), 'GT')

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            split_path = os.path.join(target_path, pid_dir)
            if not os.path.isdir(split_path):
                continue

            meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
            mf_flag = os.path.isfile(meta_file)
            # Reset per split so a missing CSV never reuses the previous one.
            df_meta = pd.read_csv(meta_file, sep=',') if mf_flag else None

            image_dirs = find_image_dirs(split_path)
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                full_path = os.path.join(split_path, data_dir)
                if not os.path.isdir(full_path):
                    continue

                info = _lookup_patient_metadata(df_meta, data_dir)

                try:
                    print(full_path)
                    dicom_fp, dicom_image = load_dicom_images(full_path)

                    spacing_info = dicom_image.GetSpacing()
                    print('SPACING INFO:', spacing_info)

                    # Series UID and modality are read from the first slice.
                    dtag = load_dicom_tag(dicom_fp[0])
                    uid = dtag.GetMetaData('0020|000e')
                    modality = dtag.GetMetaData('0008|0060')
                    study = 'OSIC_PFP'
                    CIA_other_info = {
                        'Study_UID': uid,
                        'metadata_file': meta_file if mf_flag else '',
                        'split': pid_dir,
                    }

                    size = list(dicom_image.GetSize())
                    resampler = util.get_unisize_resampler(
                        dicom_image, interpolator='linear',
                        spacing=spacing_info, size=size)

                    if resampler is not None:
                        proces_image = resampler.Execute(dicom_image)
                        print('SPACIE INFO AFTER', proces_image.GetSpacing())
                        CIA_other_info['Resample'] = True
                    else:
                        proces_image = dicom_image
                        CIA_other_info['Resample'] = False

                    CIA_other_info['Image_id'] = info['image_id']
                    CIA_other_info['Weeks'] = str(info['weeks'])
                    CIA_other_info['FVC'] = str(info['fvc'])
                    CIA_other_info['Percent'] = str(info['percent'])
                    CIA_other_info['Age'] = str(info['age'])
                    CIA_other_info['Sex'] = info['sex']
                    CIA_other_info['Smoke_Status'] = info['status']

                    if 'CT' in modality:
                        proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

                    output_path = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
                    save_nifti(proces_image, output_path, full_path)
                    print(f"Saved NIfTI file to {output_path}")

                    label_path_dict = {}
                    label_sizes = []
                    label_files = glob.glob("%s/*/*/%s_*.nrrd" % (label_root, data_dir))
                    label_flag = len(label_files) > 0

                    for lf in label_files:
                        lf_name = os.path.basename(lf)
                        lf_id = lf_name.split("_")[0]
                        lf_tissue = os.path.basename(os.path.dirname(lf)).split("_")[1]
                        label_image = load_nrrd(lf)
                        # Nearest-neighbour keeps label values discrete.
                        label_resampler = util.get_unisize_resampler(
                            label_image, interpolator='nearest',
                            spacing=spacing_info, size=size)
                        if label_resampler is not None:
                            proces_label = label_resampler.Execute(label_image)
                        else:
                            proces_label = label_image

                        label_output_path = os.path.join(
                            output_dir, lf_id, TASK_VALUE, f"{lf_name}.nii.gz")

                        label_path_dict[lf_tissue] = label_output_path
                        label_sizes.append(proces_label.GetSize())
                        util.save_nifti(proces_label, label_output_path, lf)
                        print(f"Saved Label Segment NIfTI file to {label_output_path}")
                except RuntimeError:
                    failed_files.append(full_path)
                    print(f"Failed to load DICOM images from {full_path}")
                    continue

                image_size = proces_image.GetSize()
                print(image_size, *label_sizes)
                # Explicit comparison rather than `assert` (stripped by -O),
                # applied to every label of this scan; a scan without labels
                # has nothing to compare and is still indexed.
                if any(label_size != image_size for label_size in label_sizes):
                    print(f"Size mismatch between image and label for {full_path}")
                    failed_files.append(full_path)
                    continue
                size_processed = list(image_size)

                meta.add_keyvalue('Image_id', info['image_id'])
                meta.add_keyvalue('Spacing_mm', min(spacing_info))
                meta.add_keyvalue('OriImg_path', full_path)
                meta.add_keyvalue('Size', size_processed)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', study)
                meta.add_keyvalue('ROI', 'whole-body')

                if label_flag:
                    print(label_path_dict.keys())
                    meta.add_keyvalue('Task', TASK_VALUE)
                    meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
                    meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})

                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                # Persist after every image so progress survives a crash.
                _append_mapping(json_output_path, output_path, meta.get_meta_data())
    else:
        print("No metadata.csv files found.")

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
|
|
|
if __name__ == "__main__":
    # Command-line entry point; both options fall back to the defaults below.
    cli = argparse.ArgumentParser(
        description="Process DICOM files and save as NIfTI.")
    cli.add_argument(
        "--target_path",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression",
        help="Path to the target directory containing metadata files.",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic_new/",
        help="Directory to save the NIfTI files.",
    )
    options = cli.parse_args()
    print(options.target_path, options.output_dir)
    main(options.target_path, options.output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|