File size: 20,725 Bytes
da9fb1e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 | #coding:utf-8
'''
writebyygq
createon2025-08-30
BL = Baseline(基线)
FU = Follow-up(随访)
1. Baseline (基线)
含义:指的是在疾病初期、治疗前或某个特定时间点第一次拍摄的影像(如CT、MRI、X光)。
作用:这份影像作为评估病情严重程度和后续变化的“起跑线”或“参照物”。医生通过将未来的影像与基线影像进行比较,来判断病情的变化。
2. Follow-up (随访)
含义:指的是在基线影像之后,按计划或根据病情需要再次拍摄的影像。
作用:用于评估治疗效果(如肿瘤是否缩小)、监测疾病进展(如病灶是否增大或增多)、或观察术后恢复情况。
“BL FU” 在报告中的应用场景:
当放射科医生在报告中写下“BL FU”或“compare to BL FU”时,他们的意思是:
“本次的影像检查结果,需要与之前拍摄的基线影像进行对比,以评估变化。”
例如:
肿瘤患者:一位肺癌患者在化疗前做了一次CT(作为基线BL),化疗2个周期后又做了一次CT(作为随访FU)。放射科医生会在新报告中将两次影像进行对比,并描述:“与20XX年X月X日的基线CT(BL FU) 相比,右肺下叶肿块明显缩小。”
慢性病患者:如肺炎、肝硬化、多发性硬化等需要长期监测的疾病,医生都会通过对比基线片和随访片来精确判断病情是好转、稳定还是恶化。
label:
0:backgroud 1-N: tumor,其中具体多少数值需要读取对应json文件信息
编号ID:10位的16进制编号,每一个对应一个csv文件,对应一个或多个BL和FU。每个对应相应的json文件和mask标签文件。
备注:CSV包含所有的label信息和编号,如果考虑按照tissue进行分别存储,可以考虑对mask文件结合csv/json信息进行提取相同的lesion_type分别存储label_dict
BL影像以及对应的MASK都在inputsTr目录下面
命名形式:
93dd4de5cd_BL_img_BL_img_00.nii.gz
93dd4de5cd_BL_mask_BL_img_00.nii.gz
93dd4de5cd_BL_00.json
FU影像在inputsTr目录下面,对应的mask在targetsTr目录里面
命名形式:
c6f057b865_FU_img_FU_img_00.nii.gz
c6f057b865_FU_mask_FU_img_00.nii.gz
c6f057b865_FU_img_FU_img_01.nii.gz
c6f057b865_FU_mask_FU_img_01.nii.gz
c6f057b865_FU_00.json
c6f057b865_FU_01.json
元数据信息CSV-病灶或者癌症信息--对应基线的位置,对应的基线影像编号,位置,以及对应的随访位置编号以及病灶位置
lesion_id,cog_bl,img_id_bl,cog_propagated,cog_fu,img_id_fu,lesion_type
1,84.9530896759608 273.525433308214 148.780708364732,00,108.78432777048911 320.7355032513338 543.6178096475021,116.270833333333 317.46130952381 548.446428571429,00,Lung
2,206.307026476578 258.39816700611 177.256619144603,00,202.79674663210054 297.81536880017677 566.3173808142716,197.325938566553 300.598976109215 565.804607508532,00,Lymph node
json格式样例
{
"name": "Points of interest",
"points": [
{
"name": "1",
"point": [
84.9530896759608,
273.525433308214,
148.780708364732
]
},
{
"name": "2",
"point": [
206.307026476578,
258.39816700611,
177.256619144603
]
}
],
"type": "Multiple points",
"version": {
"major": 1,
"minor": 0
}
}
20251101补充:将病灶编号进行合并同类项。
注意处理完成后保留原影像的几何空间信息以及元数据文件信息
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
## Unified label encoding: lesion_type name -> integer code written into the
## output masks.
## NOTE(review): 'backgroud' is misspelled on purpose — it must match the key
## used in LABEL_DICT below and in main(); do not "fix" one without the others.
label_id_lut={'backgroud': 0,
    'Lymph node': 1,
    'Lung': 2,
    'Soft tissue / Skin': 3,
    'Liver': 4,
    'Skeleton': 5,
    'Adrenals': 6,
    'Spleen': 7,
    'CNS': 8,
    'Kidney': 9,
    'Heart': 10,
    'Others': 11,
    'unclear': 12,
}
# Task name recorded in the output metadata and used as a sub-directory name
TASK_VALUE="segmentation"
# CT intensity clamp window (HU); not referenced in this script's visible code
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
# None -> keep each volume's native voxel spacing
TARGET_VOXEL_SPACING=None
# ##sub_modality description following the MSD convention
# SUB_MODALITY=["CT","PET"]
# ##sort order implied by the file-name suffixes
# SERIES_ORDER=["0000","0001"]
## Base label dictionary; entries 1..N are filled in per case from the json/CSV info
LABEL_DICT={
    "0":"backgroud",
}
# Expected column layout of the per-case metadata CSV files
META_COLUMN=['lesion_id', 'cog_bl', 'img_id_bl', 'cog_propagated', 'cog_fu','img_id_fu', 'lesion_type']
# def find_metadata_files(path):
# # for Cancer Image Archive (TCIA) dataset
# search_pattern = os.path.join(path, '**', 'metadata.csv')
# return glob.glob(search_pattern, recursive=True)
def find_metadata_files(path):
    """Return every CSV file directly under *path*.

    Used for Cancer Image Archive (TCIA) style datasets whose metadata
    lives in per-case CSV files.
    """
    csv_pattern = os.path.join(path, '*.csv')
    return glob.glob(csv_pattern, recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """List the entries directly inside *path* (candidate image directories)."""
    entries = os.listdir(path)
    return entries
##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read a DICOM series from *folder_path*.

    Returns:
        tuple: (sorted DICOM file names of the series, the assembled
        SimpleITK volume).
    """
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume
##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Load a single DICOM file, header first, and return the image.

    ``ReadImageInformation`` parses only the meta information (no pixel
    data); ``Execute`` then loads the full image, whose metadata carries
    the DICOM tags.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    # Parse header/meta information only — pixel data is not loaded yet
    file_reader.ReadImageInformation()
    loaded = file_reader.Execute()
    return loaded
def load_nrrd(fp):
    """Load an image file (e.g. NRRD) from path *fp* via SimpleITK."""
    image = sitk.ReadImage(fp)
    return image
##modify by yanguoqing on 20250830
def merge_images(series_files):
    """Read an ordered list of files as one SimpleITK image.

    Originally intended for cases holding two series per case
    (CT/PET, file suffixes 0000/0001): the files are read in the
    given order so the modalities stack in a known sequence.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    merged = series_reader.Execute()
    return merged
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path* as NIfTI, recording *folder_path*.

    The source folder is stashed in the NIfTI header metadata
    (key ``FolderPath``) for provenance.  The destination directory is
    created if it does not exist yet.

    Args:
        image: SimpleITK image to write.
        output_path: destination file path (e.g. ``.../case.nii.gz``).
        folder_path: origin folder recorded in the header metadata.
    """
    output_dirpath = os.path.dirname(output_path)
    # exist_ok avoids the check-then-create race of the previous
    # `if not exists: makedirs` pattern; the guard also tolerates a bare
    # filename whose dirname is '' (makedirs('') would raise).
    if output_dirpath:
        os.makedirs(output_dirpath, exist_ok=True)
    # Set metadata in the NIfTI file's header
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a Linux-style one.

    Backslashes become forward slashes and any drive-letter prefix
    (everything up to the first ':') is dropped.  Some meta files carry
    Windows paths although the data lives on a Linux server.
    """
    normalized = windows_path.replace('\\', '/')
    _drive, colon, tail = normalized.partition(':')
    return tail if colon else normalized
##added by yanguoqing on 2025-08-31
##根据csv文件返回的所有数据文件名称,获取所有数据id的
##added by yanguoqing on 2025-08-31
def get_filename_list(fp_dir):
    """Return the paths of all CSV files directly under *fp_dir*.

    One CSV exists per case id; callers derive the case list from the
    returned file names.
    """
    return glob.glob(f"{fp_dir}/*.csv")
##获取study_id以及study_date
##get the study_id and study_date from a study file name
def check_fname(fname):
    """Split a study file name into ``(study_id, study_date)``.

    Two fixed-width layouts are handled:
      * names starting with ``fdg``: id = chars 0-13, date = chars 15-24;
      * all other names: id = chars 0-20, date = everything from char 22.
    """
    if fname.startswith("fdg"):
        return fname[:14], fname[15:25]
    return fname[:21], fname[22:]
def main(target_path, output_dir):
    """Convert the PSMA longitudinal CT dataset to NIfTI plus a metadata index.

    For every per-case CSV under ``<target_path>/inputsTr`` the function:
      1. finds each BL (baseline) / FU (follow-up) json of the case,
      2. loads the matching image volume, resamples it when a resampler is
         returned, and saves it under *output_dir*,
      3. when a mask exists (in ``inputsTr`` or ``targetsTr``) remaps the
         per-lesion ids to the unified tissue codes of ``label_id_lut``
         (merging lesions sharing a ``lesion_type``) while preserving the
         original geometry and metadata, and saves it as the segmentation
         label,
      4. appends an entry to ``nifti_mappings.json`` describing the output.

    Label/image size mismatches are collected in ``failed_files.json``.

    Args:
        target_path: dataset root containing ``inputsTr`` and ``targetsTr``.
        output_dir: destination directory for converted volumes and logs.
    """
    pid_dirs=["inputsTr"]
    failed_files = []  # label files whose size mismatched the processed image
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the JSON mapping file so it can be opened with 'r+' below
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    input_dir=os.path.join(target_path,'inputsTr')
    target_dir=os.path.join(target_path,'targetsTr')
    fp_files=get_filename_list(input_dir)
    ## One CSV per case id; each case may own several BL/FU volumes that are
    ## handled in turn (the source dataset holds 1614 cases).
    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing all dataset"):
            for fp_file in tqdm(fp_files, desc="Processing all dataset"):
                meta_file=fp_file
                df_meta=pd.read_csv(meta_file)
                fp_name=os.path.basename(fp_file)[:-4]  # case id (strip '.csv')
                ## Walk all BL then all FU images and their corresponding masks
                for sub_mod in ['BL','FU']:
                    bl_fps=glob.glob("%s/%s_%s*.json"%(input_dir,fp_name,sub_mod))
                    if len(bl_fps)>0:
                        for bl_fp in bl_fps:
                            basename=os.path.basename(bl_fp)[:-5]  # strip '.json'
                            # Map '<id>_BL_00.json' -> '<id>_BL_img_BL_img_00.nii.gz'
                            # (and likewise '<id>_FU_00.json' -> '<id>_FU_img_FU_img_00.nii.gz').
                            # BUGFIX: the original code replaced the literal '_BL_'
                            # marker only, so FU image/mask names never resolved and
                            # every FU volume was silently skipped.
                            bl_fp_name=os.path.basename(bl_fp).replace("_%s_"%sub_mod,"_%s_img_%s_img_"%(sub_mod,sub_mod)).replace(".json",".nii.gz")
                            bl_fp_img=os.path.join(input_dir,bl_fp_name)
                            if os.path.isfile(bl_fp_img):
                                ## Image exists -> process it normally
                                bl_mask_name=os.path.basename(bl_fp).replace("_%s_"%sub_mod,"_%s_mask_%s_img_"%(sub_mod,sub_mod)).replace(".json",".nii.gz")
                                # Masks live next to the images (BL case) or in targetsTr (FU case)
                                bl_fp_mask=os.path.join(input_dir,bl_mask_name)
                                if os.path.isfile(bl_fp_mask):
                                    label_fp=bl_fp_mask
                                    label_flag=True
                                else:
                                    bl_fp_mask=os.path.join(target_dir,bl_mask_name)
                                    if os.path.isfile(bl_fp_mask):
                                        label_fp=bl_fp_mask
                                        label_flag=True
                                    else:
                                        label_fp=None
                                        label_flag=False
                                modality="CT"
                                study='PSMA_Longitudinal_CT'##Dataset_name
                                CIA_other_info = {
                                    'Image_id':basename,
                                    'metadata_file':''
                                    # 'Series_Description':serise_desc
                                }
                                CIA_other_info['split'] = "train"
                                CIA_other_info['metadata_file']=meta_file
                                stk_image=util.load_nifti(bl_fp_img)
                                spacing_info = stk_image.GetSpacing()
                                size = list(stk_image.GetSize())
                                resampler =util.get_unisize_resampler(stk_image, interpolator='linear', spacing=spacing_info, size=size)
                                if resampler is not None:
                                    proces_image = resampler.Execute(stk_image)
                                    print('SPACIE INFO AFTER', proces_image.GetSpacing())
                                    CIA_other_info['Resample'] = True
                                else:
                                    proces_image = stk_image
                                    CIA_other_info['Resample'] = False
                                output_path = os.path.join(output_dir,fp_name, f"{basename}.nii.gz")
                                # output_path=convert_windows_to_linux_path(output_path)
                                save_nifti(proces_image, output_path, input_dir)
                                print(f"Saved NIfTI file to {output_path}")
                                if label_flag:
                                    label_path_dict = {}
                                    label_stk_img=util.load_nifti(label_fp)
                                    image_array = sitk.GetArrayFromImage(label_stk_img)
                                    ## Remap lesion-id voxels to the unified tissue codes while
                                    ## restoring the original geometric/meta information afterwards
                                    with open(bl_fp,'r') as fi:
                                        json_info=json.load(fi)
                                    label_dict={
                                        "0":"backgroud"
                                    }
                                    update_image_array=np.copy(image_array)
                                    ## Merge all lesions of the same lesion_type into one label
                                    group_meta=df_meta.groupby('lesion_type')['lesion_id']
                                    for name,group in group_meta:
                                        ## group name plus every lesion_id belonging to it
                                        ids=group_meta.get_group(name)
                                        target_id=label_id_lut[name]
                                        # ##assign using the minimum lesion_id of each group
                                        # ids_min=ids.min()
                                        # label_dict[str(ids_min)]=name
                                        label_dict[str(target_id)]=name
                                        ## rewrite every voxel of each lesion id to the merged code
                                        for v in ids.tolist():
                                            print(name,v,target_id)
                                            update_image_array[image_array==v]=target_id
                                    print(np.where(update_image_array==10))
                                    image_array=None
                                    label_stk_img_update=sitk.GetImageFromArray(update_image_array)
                                    label_stk_img_update.CopyInformation(label_stk_img)
                                    # Manually copy every metadata key/value pair as well
                                    meta_keys = label_stk_img.GetMetaDataKeys()
                                    for key in meta_keys:
                                        value = label_stk_img.GetMetaData(key)
                                        label_stk_img_update.SetMetaData(key, value)
                                    # (older per-json-point label naming, kept for reference)
                                    # for lesion_info in json_info['points']:
                                    #     df_row=df_meta['lesion_type'][df_meta['lesion_id']==int(lesion_info['name'])]
                                    #     df_row=df_row.reset_index()
                                    #     lesion_type=df_row['lesion_type'][0]
                                    #     label_dict[lesion_info['name']]=lesion_type
                                    resampler =util.get_unisize_resampler(label_stk_img_update, interpolator='nearest', spacing=spacing_info, size=size)
                                    if resampler is not None:
                                        proces_label = resampler.Execute(label_stk_img_update)
                                        ary_process_label=sitk.GetArrayFromImage(proces_label)
                                        # Heuristic guard: if the last slice is a constant
                                        # non-zero value (presumably a resampling artifact —
                                        # TODO confirm), zero it out
                                        if ary_process_label[-1,:,:].mean()==ary_process_label[-1,0,0] and ary_process_label[-1,0,0]>0:
                                            print('momingqimiao',ary_process_label[-1,0,0])
                                            ary_process_label[-1,:,:]=0
                                        label_stk_img_process=sitk.GetImageFromArray(ary_process_label)
                                        label_stk_img_process.CopyInformation(proces_label)
                                        meta_keys = proces_label.GetMetaDataKeys()
                                        for key in meta_keys:
                                            value = proces_label.GetMetaData(key)
                                            label_stk_img_process.SetMetaData(key, value)
                                    else:
                                        label_stk_img_process = label_stk_img_update
                                    # print(proces_image.GetSize(),proces_label.GetSize())
                                    try:
                                        assert proces_image.GetSize() == label_stk_img_process.GetSize()
                                    except Exception as e:
                                        failed_files.append(label_fp)
                                        continue
                                    label_output_path = os.path.join(output_dir, fp_name, TASK_VALUE, f"{basename}.nii.gz")
                                    label_path_dict['tumor'] = label_output_path
                                    util.save_nifti(label_stk_img_process, label_output_path, label_fp)
                                    print(f"Saved Label Segment NIfTI file to {label_output_path}")
                            else:
                                # Image volume missing for this json -> skip the entry
                                continue
                            size_processed = list(proces_image.GetSize())
                            print('size_processed',size_processed,size)
                            # meta.add_keyvalue('Image_id',meta_image_id)
                            meta.add_keyvalue('Spacing_mm',min(spacing_info[:3]))## keep the smallest of the x,y,z spacings
                            meta.add_keyvalue('OriImg_path',bl_fp_img)
                            meta.add_keyvalue('Size',size_processed) # use the post-processing size here -- YH Jachin
                            meta.add_keyvalue('Modality',modality)
                            meta.add_keyvalue('Dataset_name',study)
                            meta.add_keyvalue('ROI','whole-body')
                            if label_flag:
                                # print(label_path_dict.keys())
                                meta.add_keyvalue('Task',TASK_VALUE)
                                # meta.add_keyvalue('Label_tissue',list(label_path_dict.keys()))
                                meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                                meta.add_keyvalue('Label_Dict',label_dict)
                            meta.add_extra_keyvalue('Metadata',CIA_other_info)
                            # Write the mapping to the JSON file on the fly
                            with open(json_output_path, 'r+') as json_file:
                                existing_mappings = json.load(json_file)
                                existing_mappings[output_path] = meta.get_meta_data()
                                json_file.seek(0)
                                # print(existing_mappings)
                                json.dump(existing_mappings, json_file, indent=4)
                                json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
if __name__ == "__main__":
    # CLI entry point: convert the dataset rooted at --target_path into
    # NIfTI volumes plus metadata written under --output_dir.
    cli = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    cli.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/ygq/Data_Engineering/PSMA_clean/demo")
    cli.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="//home/data/ygq/Data_Engineering/PSMA_clean/sample/")
    cli_args = cli.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)
|