"""Convert DICOM series of the OSIC dataset to NIfTI and record per-volume metadata."""

import argparse
import glob
import json
import os

import pandas as pd
import SimpleITK as sitk
from tqdm import tqdm

import util
from util import meta_data

# ---------------------------------------------------------------------------
# Disabled scratch experiment (text-embedding distance check), kept for
# reference only. It depends on ``bert_helper`` and ``torch``, neither of
# which is imported by this script.
# ---------------------------------------------------------------------------
# from bert_helper import *
# model_name = "bert-large-uncased"
# reduce_method = 'mean'
# max_words_num = 32  # max number of words in the caption > 2
# embeder, tokenizer = get_frozen_embeder(model_name)
# string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
# string2 = "modality: ct, gender: female, age: 50, roi: head"
# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)
# input_size = embeder.config.vocab_size
# in_size = embeder.config.hidden_size
# print(embeder, input_size, in_size)
# print(tokenizer)
# print(embeder_output1)
# print(embeder_output1.shape)  # torch.Size([1, 8, 768])
# print(embeder_output2)
# print(embeder_output2.shape)  # torch.Size([1, 8, 768])
# error = torch.abs(embeder_output1 - embeder_output2)
# print(error)
# print("Embedding distance between the two sentences: ")
# print(f"String1: {string1}")
# print(f"String2: {string2}")
# print(torch.mean(error))
# exit()

# Intensity window handed to ``util.clamp_image`` for CT volumes
# (presumably Hounsfield units -- TODO confirm against ``util``).
CLAMP_RANGE_CT = [-300, 300]

# Placeholder window for MRI volumes; the value is still to be confirmed and
# nothing in this script reads it yet.
CLAMP_RANGE_MRI = [-1, 0]
# NOTE: for the Cancer Imaging Archive (TCIA) layout the metadata lives in
# nested ``metadata.csv`` files; the equivalent search would be
# ``glob.glob(os.path.join(path, '**', 'metadata.csv'), recursive=True)``.
def find_metadata_files(path):
    """Return the ``*.csv`` files located directly inside *path*.

    Args:
        path: Directory to search (not searched recursively).

    Returns:
        Sorted list of matching file paths; empty when nothing matches.
    """
    # Sorted so that repeated runs see the files in the same order.
    return sorted(glob.glob(os.path.join(path, '*.csv')))


def find_image_dirs(path):
    """Return the names of all entries directly inside *path*, sorted.

    Files are included as well as directories; callers filter with
    ``os.path.isdir`` themselves.

    Args:
        path: Directory to list.

    Returns:
        Sorted list of entry names (not full paths).
    """
    # os.listdir order is arbitrary; sorting keeps processing deterministic.
    return sorted(os.listdir(path))


def load_dicom_images(folder_path):
    """Read the DICOM series stored in *folder_path* as one image.

    Args:
        folder_path: Directory containing the DICOM slices of a series.

    Returns:
        Tuple ``(dicom_names, image)``: the file names reported by GDCM for
        the series and the image assembled from them.
    """
    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(folder_path)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()
    return dicom_names, image


def load_dicom_tag(imgs):
    """Read a single DICOM file so that its tags can be queried.

    Args:
        imgs: Path of one DICOM file.

    Returns:
        The object returned by ``ImageFileReader.Execute()``; callers query
        tags on it through ``GetMetaData``.
    """
    reader = sitk.ImageFileReader()
    reader.SetFileName(imgs)
    # Reads only the header information. The Execute() call below still runs
    # the full read, so this is kept solely to preserve the original sequence.
    reader.ReadImageInformation()
    tag = reader.Execute()
    return tag


def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, creating the parent directory if needed.

    Args:
        image: Image object to write. It is mutated: the key ``FolderPath``
            is set on it before writing.
        output_path: Destination file path.
        folder_path: Source folder recorded under the ``FolderPath`` key.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(output_dirpath, exist_ok=True)

    # Record the source folder on the image before it is written.
    # NOTE(review): whether the NIfTI writer persists this key is not
    # verified here.
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)


def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a Linux-style one.

    Backslashes become forward slashes and a leading drive letter
    (``C:``) is dropped. Some meta files carry Windows paths although the
    data is stored on a Linux server.

    Args:
        windows_path: Path string, Windows- or Linux-style.

    Returns:
        The converted path. Colons that are not part of a leading drive
        letter are left untouched.
    """
    linux_path = windows_path.replace('\\', '/')
    # Only a leading "<letter>:" is a drive prefix; splitting on any colon
    # would truncate paths that legitimately contain one further in.
    has_drive = (
        len(linux_path) >= 2
        and linux_path[1] == ':'
        and linux_path[0].isascii()
        and linux_path[0].isalpha()
    )
    return linux_path[2:] if has_drive else linux_path


def _process_series(full_path, metadata_file):
    """Load one DICOM series, resample it and clamp CT intensities.

    Args:
        full_path: Directory holding the series.
        metadata_file: Path of the per-patient CSV, or ``''`` when absent.

    Returns:
        Tuple ``(image, spacing, size, modality, other_info)``.

    Raises:
        RuntimeError: Propagated from the reading/processing calls; the
            caller treats it as a failed series.
    """
    dicom_fp, dicom_image = load_dicom_images(full_path)
    spacing_info = dicom_image.GetSpacing()

    dtag = load_dicom_tag(dicom_fp[0])
    uid = dtag.GetMetaData('0020|000e')  # Series Instance UID
    modality = dtag.GetMetaData('0008|0060')  # Modality

    # The key is named 'Study_UID' but holds the Series Instance UID; the
    # name is kept because it is part of the emitted JSON schema.
    other_info = {
        'Study_UID': uid,
        'metadata_file': metadata_file,
    }

    size = list(dicom_image.GetSize())
    resampler = util.get_unisize_resampler(
        dicom_image, interpolator='linear', spacing=spacing_info, size=size
    )
    # A None resampler means the image is used as-is.
    if resampler is not None:
        proces_image = resampler.Execute(dicom_image)
        other_info['Resample'] = True
    else:
        proces_image = dicom_image
        other_info['Resample'] = False

    # Only CT volumes are clamped; other modalities pass through unchanged.
    if 'CT' in modality:
        proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

    return proces_image, spacing_info, size, modality, other_info


def _append_mapping(json_output_path, output_path, metadata):
    """Add ``output_path -> metadata`` to the JSON mapping file in place."""
    with open(json_output_path, 'r+') as json_file:
        existing_mappings = json.load(json_file)
        existing_mappings[output_path] = metadata
        json_file.seek(0)
        json.dump(existing_mappings, json_file, indent=4)
        # Drop leftover bytes in case the new content is shorter.
        json_file.truncate()


def main(target_path, output_dir):
    """Convert every DICOM series under *target_path* to NIfTI.

    The expected layout is ``target_path/<pid>/<series>/`` with an optional
    ``target_path/<pid>.csv`` next to each patient directory. For every
    series a ``<series>.nii.gz`` is written to ``output_dir/<pid>/`` and its
    metadata is appended to ``nifti_mappings.json``. Series that raise
    ``RuntimeError`` while loading are listed in ``failed_files.json``.

    Args:
        target_path: Root directory of the dataset.
        output_dir: Directory receiving the NIfTI files and JSON reports.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []

    os.makedirs(output_dir, exist_ok=True)

    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')

    meta = meta_data()

    # Initialize the JSON file; an existing one is extended, not replaced.
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            pid_path = os.path.join(target_path, pid_dir)
            if not os.path.isdir(pid_path):
                continue

            meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
            # Recorded as '' when the patient has no CSV next to its folder.
            metadata_file = meta_file if os.path.isfile(meta_file) else ''

            image_dirs = find_image_dirs(pid_path)
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                full_path = os.path.join(target_path, pid_dir, data_dir)
                if not os.path.isdir(full_path):
                    continue

                try:
                    print(full_path)
                    (proces_image, spacing_info, size,
                     modality, CIA_other_info) = _process_series(full_path, metadata_file)
                except RuntimeError:
                    failed_files.append(full_path)
                    print(f"Failed to load DICOM images from {full_path}")
                    continue

                meta.add_keyvalue('Spacing_mm', min(spacing_info))
                meta.add_keyvalue('OriImg_path', full_path)
                meta.add_keyvalue('Size', size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', 'OSIC_PFP')
                meta.add_keyvalue('ROI', 'lung')
                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                output_path = os.path.join(
                    output_dir, pid_dir, f"{os.path.basename(full_path)}.nii.gz"
                )
                save_nifti(proces_image, output_path, full_path)
                print(f"Saved NIfTI file to {output_path}")

                # Write the mapping on the fly so finished work survives a
                # later crash.
                _append_mapping(json_output_path, output_path, meta.get_meta_data())
    else:
        # Reached when the target directory has no entries at all.
        print(f"No patient directories found in {target_path}.")

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    parser.add_argument(
        "--target_path",
        type=str,
        help="Path to the target directory containing metadata files.",
        default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Directory to save the NIfTI files.",
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic/",
    )
    args = parser.parse_args()
    print(args.target_path, args.output_dir)
    main(args.target_path, args.output_dir)