# File size: 8,896 Bytes

# da9fb1e

import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
# from bert_helper import *

# model_name = "bert-large-uncased"
# reduce_method = 'mean'
# max_words_num = 32  # max number of words in the caption > 2
    
# embeder, tokenizer = get_frozen_embeder(model_name)

# string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

# string2 = "modality: ct, gender: female, age: 50, roi: head"

# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

# input_size = embeder.config.vocab_size
# in_size = embeder.config.hidden_size
    
# print(embeder, input_size, in_size)
# print(tokenizer)
    

# print(embeder_output1)
# print(embeder_output1.shape)  # torch.Size([1, 8, 768])
    
    
# print(embeder_output2)
# print(embeder_output2.shape)  # torch.Size([1, 8, 768])
    

# error = torch.abs(embeder_output1 - embeder_output2)
# print(error)
# print("Embedding distance between the two sentences: ")
# print(f"String1: {string1}")
# print(f"String2: {string2}")
# print(torch.mean(error))


# exit()
# Intensity window [low, high] handed to util.clamp_image for volumes whose
# modality tag contains 'CT' (see main). Presumably Hounsfield units -- TODO confirm.
CLAMP_RANGE_CT = [-300,300]
# Window for MRI volumes. Placeholder only: the values are still to be
# confirmed and the constant is not referenced by the code visible in this file.
CLAMP_RANGE_MRI = [-1,0] # MRI images threshold placeholder TBC...


# def find_metadata_files(path):
#     # for Cancer Image Archive (TCIA) dataset
#     search_pattern = os.path.join(path, '**', 'metadata.csv')
#     return glob.glob(search_pattern, recursive=True)

def find_metadata_files(path):
    """Return the paths matching ``<path>/*.csv``.

    Written for Cancer Image Archive (TCIA) style datasets. The pattern built
    here adds no ``**`` component, so ``recursive=True`` does not make the
    search descend into sub-directories of ``path``.
    """
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries directly inside ``path``, sorted.

    Despite the name, nothing is filtered: plain files are returned as well,
    so callers have to check ``os.path.isdir`` themselves.

    ``os.listdir`` yields entries in arbitrary order; sorting makes the
    processing order (and therefore logs and progress output) reproducible
    from one run to the next.

    Raises:
        OSError: If ``path`` does not exist or is not a directory.
    """
    return sorted(os.listdir(path))

## modified by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series found in ``folder_path`` as one image.

    Returns:
        A ``(dicom_names, image)`` pair: the file names reported by
        ``GetGDCMSeriesFileNames`` for the folder, and the image obtained by
        reading exactly those files.
    """
    series_reader = sitk.ImageSeriesReader()
    slice_files = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(slice_files)
    return slice_files, series_reader.Execute()

##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Read one DICOM file and return it so that its tags can be queried.

    Args:
        imgs: Path of a single DICOM file (despite the plural name).

    Returns:
        The object produced by ``ImageFileReader.Execute()``.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    # Read only the meta information, without loading the pixel data.
    file_reader.ReadImageInformation()
    # NOTE(review): Execute() is still called right after the header-only
    # read, which presumably loads the pixel data anyway -- confirm whether
    # the full read is actually needed by callers.
    return file_reader.Execute()

def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: Image object to write; must support ``SetMetaData`` and be
            accepted by ``sitk.WriteImage``.
        output_path: Destination file path (e.g. ``.../series.nii.gz``).
        folder_path: Source folder of the image; attached to the image under
            the ``"FolderPath"`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise, and
    # the current working directory needs no creation anyway.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok guards against another process creating the directory
        # between the existence check and this call.
        os.makedirs(output_dirpath, exist_ok=True)
    # Record where the data came from on the image itself.
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)

## modified by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Convert a Windows-style path into a POSIX-style one.

    Some meta files contain Windows paths while the data is stored on a Linux
    server. Backslashes become forward slashes and a leading drive prefix
    (``C:``) is dropped.

    Only a drive prefix at the very start of the path is removed. A colon
    elsewhere in the path (legal in POSIX file names) is left untouched
    instead of truncating everything in front of it.

    Args:
        windows_path: Path string, possibly using ``\\`` separators and a
            drive letter.

    Returns:
        The converted path string.
    """
    linux_path = windows_path.replace('\\', '/')
    has_drive_prefix = (
        len(linux_path) >= 2
        and linux_path[1] == ':'
        and ('A' <= linux_path[0] <= 'Z' or 'a' <= linux_path[0] <= 'z')
    )
    if has_drive_prefix:
        linux_path = linux_path[2:]
    return linux_path

def main(target_path, output_dir):
    """Convert every DICOM series under ``target_path`` to NIfTI and index it.

    The layout walked here is ``target_path/<pid>/<series>/`` with an optional
    per-patient ``target_path/<pid>.csv``. Each series folder is loaded,
    resampled when ``util.get_unisize_resampler`` returns a resampler, clamped
    to ``CLAMP_RANGE_CT`` when its modality tag contains ``CT``, and written
    to ``output_dir/<pid>/<series>.nii.gz``.

    Two JSON files are maintained in ``output_dir``:

    * ``nifti_mappings.json`` -- output path -> metadata. Created empty if
      missing, otherwise extended; rewritten after every converted series.
    * ``failed_files.json`` -- series folders whose processing raised
      ``RuntimeError``; written once, after all folders were visited.

    Args:
        target_path: Root directory holding one sub-directory per patient id.
        output_dir: Directory receiving the NIfTI files and both JSON files;
            created when it does not exist.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    # NOTE(review): a single meta_data instance is reused for every series;
    # this assumes add_keyvalue / add_extra_keyvalue overwrite earlier values
    # rather than accumulate -- confirm against util.meta_data.
    meta = meta_data()
    dataset_name = 'OSIC_PFP'

    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            pid_path = os.path.join(target_path, pid_dir)
            # target_path also holds the per-patient csv files; skip them.
            if not os.path.isdir(pid_path):
                continue
            meta_file = os.path.join(target_path, f"{pid_dir}.csv")
            has_meta_file = os.path.isfile(meta_file)

            for data_dir in tqdm(find_image_dirs(pid_path), desc="Processing images files"):
                full_path = os.path.join(pid_path, data_dir)
                if not os.path.isdir(full_path):
                    continue

                # Any RuntimeError while loading, reading tags, resampling or
                # clamping marks the whole series as failed and moves on.
                try:
                    print(full_path)
                    dicom_fp, dicom_image = load_dicom_images(full_path)
                    spacing_info = dicom_image.GetSpacing()

                    # Tags are read from the first file of the series.
                    dtag = load_dicom_tag(dicom_fp[0])
                    # 0020|000e is the Series Instance UID; it is stored under
                    # the 'Study_UID' key below (key kept for compatibility
                    # with existing mapping files).
                    uid = dtag.GetMetaData('0020|000e')
                    modality = dtag.GetMetaData('0008|0060')  # Modality
                    CIA_other_info = {
                        'Study_UID': uid,
                        'metadata_file': meta_file if has_meta_file else '',
                    }
                    size = list(dicom_image.GetSize())

                    resampler = util.get_unisize_resampler(
                        dicom_image,
                        interpolator='linear',
                        spacing=spacing_info,
                        size=size,
                    )

                    # resize the image
                    if resampler is not None:
                        proces_image = resampler.Execute(dicom_image)
                        CIA_other_info['Resample'] = True
                    else:
                        proces_image = dicom_image
                        CIA_other_info['Resample'] = False

                    # threshold the image (only CT has a defined window so far)
                    if 'CT' in modality:
                        proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
                except RuntimeError:
                    failed_files.append(full_path)
                    print(f"Failed to load DICOM images from {full_path}")
                    continue

                meta.add_keyvalue('Spacing_mm', min(spacing_info))
                meta.add_keyvalue('OriImg_path', full_path)
                meta.add_keyvalue('Size', size)
                meta.add_keyvalue('Modality', modality)
                meta.add_keyvalue('Dataset_name', dataset_name)
                meta.add_keyvalue('ROI', 'lung')
                meta.add_extra_keyvalue('Metadata', CIA_other_info)

                output_path = os.path.join(output_dir, pid_dir, f"{os.path.basename(full_path)}.nii.gz")
                save_nifti(proces_image, output_path, full_path)
                print(f"Saved NIfTI file to {output_path}")

                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_path] = meta.get_meta_data()
                    # Rewind and truncate so the new dump fully replaces the
                    # old content even when it is shorter.
                    json_file.seek(0)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()
    else:
        # Reached when target_path has no entries at all.
        print(f"No patient directories found in {target_path}.")

    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)

    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")

if __name__ == "__main__":
    # Command-line entry point: both options fall back to the defaults below.
    arg_parser = argparse.ArgumentParser(
        description="Process DICOM files and save as NIfTI."
    )
    arg_parser.add_argument(
        "--target_path",
        type=str,
        help="Path to the target directory containing metadata files.",
        default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression",
    )
    arg_parser.add_argument(
        "--output_dir",
        type=str,
        help="Directory to save the NIfTI files.",
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic/",
    )
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)