|
|
| '''
|
| write by ygq
|
| create on 2025-8-18
|
| update AbdomenAtlas3.0 data clean
|
|
|
| https://arxiv.org/pdf/2407.16697
|
| https://zhuanlan.zhihu.com/p/19339643417
|
|
|
| AbdomenAtlas 3.0 是目前公开的最大规模腹部 CT 图像-文本配对数据集,旨在解决医学影像中的肿瘤检测与报告生成难题。
|
| 该数据集包含 9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家,并且是首个提供逐像素(per-voxel)标注、详细肿瘤报告以及肿瘤分期信息的公开数据集。
|
| 这些 CT 扫描数据通过标准医学影像格式(NIfTI 和 DICOM)存储,具备体素间距及 HU 值等临床信息。AbdomenAtlas 3.0 整合并重新标注了 17 个公共数据集,经过 12 位放射科医生的审核,共标注了 8,562 个肿瘤实例,其中包括 3,036 个肝脏肿瘤、354 个胰腺肿瘤和 4,239 个肾脏肿瘤。此外,数据集包含 2,947 份肿瘤报告,其中 948 份为早期肿瘤报告(≤2 cm),260 份报告提供了胰腺肿瘤的 T 分期(T1-T4),并首次公开肝脏 8 个亚段和胰腺 3 个亚段的逐像素标注,以及肿瘤与关键血管(如 SMA、CA 等)的接触标注。
|
| 通过 RadGPT 自动生成的结构化和叙述性报告,数据集详细描述了肿瘤大小、形状、位置、体积以及与周围血管和器官的相互作用。这些报告的生成准确性经过验证,在检测小肿瘤(≤2 cm)方面,RadGPT 的敏感性/特异性显著优于现有方法(例如肝脏:80%/73%,胰腺:77%/77%)。数据集还包含 240 份“人类-AI 融合报告”,结合了放射科医生的临床笔记和 AI 的精确量化结果。AbdomenAtlas 3.0 的意义在于,它首次提供了一个全面的腹部 CT 图像-文本配对数据集,填补了公开领域中腹部肿瘤检测数据的空白,并为推动医学影像中的自动化肿瘤检测、分期和报告生成奠定了基础。这一数据集不仅在规模和多样性上领先,还通过结合 AI 和放射科医生的专业知识,提供了高质量的标注和诊断支持,将有助于提升 AI 模型在医学影像分析中的实际临床应用能力。
|
|
|
| 数据集统计信息
|
| 总数据量:
|
| 9,262 例 3D CT 扫描,来源于 88 家医疗机构,覆盖 19 个国家。
|
| 包含 8,562 个肿瘤实例:
|
| 肝脏肿瘤:3,036 个实例(929 份报告)
|
| 胰腺肿瘤:354 个实例(344 份报告)
|
| 肾脏肿瘤:4,239 个实例(1,674 份报告)
|
| 6,061 份无肿瘤报告(作为对照组)
|
| 小肿瘤(≤2 cm):
|
| 943 份小肿瘤相关报告:
|
| 肝脏:347 个实例(占肝脏肿瘤的 37.4%)
|
| 胰腺:83 个实例(占胰腺肿瘤的 24.1%)
|
| 肾脏:466 个实例(占肾脏肿瘤的 27.8%)
|
| 肿瘤分期与解剖结构:
|
| 260 份胰腺肿瘤分期报告(T1–T4)
|
| 提供肝脏 8 个亚段和胰腺 3 个亚段(头、体、尾)的逐像素分割
|
| 标注了肿瘤与关键血管(如 SMA、CA、CHA 等)的接触角度
|
| 图像与文本配对:
|
| 1.8M 文本 Token,包含三类报告:
|
| 结构化报告:基于模板生成,提供定量信息(如肿瘤体积、位置等)
|
| 叙述性报告:通过 LLM 转换,模仿目标医院的报告风格
|
| 人类-AI 融合报告:240 份,结合临床笔记与 AI 生成的内容
|
|
|
| AbdomenAtlas数据集中每个病例的segmentations目录都包含25个器官组织的标注文件,同时也包含一个combined_labels.nii.gz文件【加上背景值,里面包含了0-25的数值】
|
| 1 aorta
|
| 2 gall_bladder
|
| 3 kidney_left
|
| 4 kidney_right
|
| 5 liver
|
| 6 pancreas
|
| 7 postcava
|
| 8 spleen
|
| 9 stomach
|
| 10 adrenal_gland_left
|
| 11 adrenal_gland_right
|
| 12 bladder
|
| 13 celiac_trunk
|
| 14 colon
|
| 15 duodenum
|
| 16 esophagus
|
| 17 femur_left
|
| 18 femur_right
|
| 19 hepatic_vessel
|
| 20 intestine
|
| 21 lung_left
|
| 22 lung_right
|
| 23 portal_vein_and_splenic_vein
|
| 24 prostate
|
| 25 rectum
|
|
|
|
|
| 参考TotalSegmentator的存储方式,分别存储25个器官的label处理后的数据文件
|
| '''
|
| import os
|
| import glob
|
| import pandas as pd
|
| import SimpleITK as sitk
|
| import argparse
|
| import json
|
| from tqdm import tqdm
|
| from util import meta_data
|
| import util
|
| import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Task name for the label export; not used by any active code in this file.
TASK_VALUE="segmentation"

# [low, high] clamp window for CT intensities (presumably Hounsfield units --
# TODO confirm); not used by any active code in this file.
CLAMP_RANGE_CT = [-300,300]

# [low, high] clamp window for MRI; not used by any active code in this file.
CLAMP_RANGE_MRI = [-1,0]


# Volume thresholds that main() compares against the value returned by
# simpleitk_volume_calculation() (millilitres) for the matching organ mask.
# Lung masks: reaching this marks the thorax as present.
LUNG_VOL_THRESH=1000

# Femur masks: reaching this marks the pelvis as present.
FEMUR_VOL_THRESH=80

# Right-kidney mask: reaching this sets a kidney flag.
KIDNEY_VOL_THRESH=100

# Gall-bladder mask: reaching this sets the gall-bladder flag used for the ROI.
gall_bladder_VOL_THRESH=12

# Default region-of-interest name. main() works with its own local ``roi``
# variable and does not read this constant.
ROI="abdomen"

# Not read by any code visible in this file.
PROCESS_FLAG=True
|
|
|
# Label value (as a string) -> organ name, following the 0-25 numbering listed
# in the module docstring for combined_labels.nii.gz.
# The values are runtime strings, so the "backgroud" spelling is left as-is.
# Not used by any active code in this file.
LABEL_DICT={
    "0":"backgroud",
    "1":"aorta",
    "2":"gall_bladder",
    "3":"kidney_left",
    "4":"kidney_right",
    "5":"liver",
    "6":"pancreas",
    "7":"postcava",
    "8":"spleen",
    "9":"stomach",
    "10":"adrenal_gland_left",
    "11":"adrenal_gland_right",
    "12":"bladder",
    "13":"celiac_trunk",
    "14":"colon",
    "15":"duodenum",
    "16":"esophagus",
    "17":"femur_left",
    "18":"femur_right",
    "19":"hepatic_vessel",
    "20":"intestine",
    "21":"lung_left",
    "22":"lung_right",
    "23":"portal_vein_and_splenic_vein",
    "24":"prostate",
    "25":"rectum"
}
|
|
|
|
|
|
|
|
|
|
|
def find_metadata_files(path):
    """Return the paths of the ``*.csv`` files located directly in ``path``.

    ``recursive=True`` is forwarded to ``glob.glob``, but the pattern has no
    ``**`` component, so sub-directories are not searched.
    """
    return glob.glob(os.path.join(path, '*.csv'), recursive=True)
|
|
|
def find_image_dirs(path):
    """Return the names of all entries in ``path``.

    Despite the name, nothing is filtered here: plain files are returned
    alongside directories, exactly as ``os.listdir`` reports them.
    """
    entry_names = os.listdir(path)
    return entry_names
|
|
|
|
|
def load_dicom_images(folder_path):
    """Read the DICOM series stored in ``folder_path``.

    Returns:
        A ``(file_names, image)`` pair: the series file names reported by
        GDCM for the folder and the image produced by reading them.
    """
    series_reader = sitk.ImageSeriesReader()
    series_files = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(series_files)
    return series_files, series_reader.Execute()
|
|
|
|
|
def load_dicom_tag(imgs):
    """Read the single image file ``imgs`` and return the loaded image.

    The header is read first with ``ReadImageInformation`` and the file is
    then fully loaded with ``Execute``; the result of ``Execute`` is returned.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()
|
|
|
def load_nrrd(fp):
    """Load the image file at ``fp`` with ``sitk.ReadImage`` and return it."""
    loaded_image = sitk.ReadImage(fp)
    return loaded_image
|
|
|
def save_nifti(image, output_path, folder_path):
    """Write ``image`` to ``output_path``, creating the parent directory.

    Args:
        image: SimpleITK image to write.
        output_path: Destination file path.
        folder_path: Source location, stored on the image under the
            ``"FolderPath"`` metadata key before writing.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname is '' when output_path is a bare file name; os.makedirs('')
    # would raise, so only create a directory when there is one to create.
    if output_dirpath and not os.path.exists(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        # exist_ok covers the directory appearing between the check and here.
        os.makedirs(output_dirpath, exist_ok=True)

    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
|
|
|
|
|
def convert_windows_to_linux_path(windows_path):
    """Turn a Windows-style path into a forward-slash path.

    Backslashes become ``/``. If the result contains a colon, everything up
    to and including the first colon (e.g. a drive letter) is dropped.
    """
    forward_slashed = windows_path.replace('\\', '/')
    _prefix, colon, remainder = forward_slashed.partition(':')
    return remainder if colon else forward_slashed
|
|
|
def simpleitk_volume_calculation(image_path):
    """Return the volume of the label-1 region of a mask file.

    The function is not organ specific: it loads ``image_path`` with
    ``util.load_nifti``, counts the voxels whose value is exactly 1 and
    multiplies the count by the voxel volume taken from the image spacing.
    Thresholding the result (lung, femur, ...) is left to the caller.

    Args:
        image_path: Path of the mask image to measure.

    Returns:
        ``0`` when fewer than 10 voxels carry the value 1; otherwise the
        volume as a float, in millilitres assuming the spacing is in mm.
    """
    image = util.load_nifti(image_path)

    # Assumes a 3-D image: only the first three spacing components are used.
    spacing = image.GetSpacing()
    voxel_volume = spacing[0] * spacing[1] * spacing[2]

    # A single pass over the array gives the voxel count; the previous second
    # count through BinaryThreshold + LabelShapeStatistics yielded the same
    # number at the cost of another full pass over the volume.
    mask_array = sitk.GetArrayFromImage(image)
    voxel_count = int((mask_array == 1).sum())
    if voxel_count < 10:
        # Treat a handful of stray voxels as "organ not present".
        return 0

    # mm^3 -> ml
    return voxel_count * voxel_volume / 1000.0
|
|
|
def _scan_label_volumes(label_files):
    """Measure the landmark organ masks of one case and derive presence flags.

    Each file whose name contains ``femur``, ``lung``, ``kidney_right`` or
    ``gall_bladder`` is measured with ``simpleitk_volume_calculation`` and the
    volume is printed and compared with the matching ``*_VOL_THRESH`` constant.

    Returns:
        ``(pelvis_flag, thorax_flag, kidney_flag, gall_bladder_flag)``.
    """
    pelvis_flag = False
    thorax_flag = False
    kidney_flag = False
    gall_bladder_flag = False
    lung_min = 0  # 0 doubles as "no lung volume recorded yet"

    for lf in label_files:
        lf_tissue = os.path.basename(lf).replace(".nii.gz", "")

        if 'femur' in lf_tissue:
            vol_femur = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_femur)
            if vol_femur >= FEMUR_VOL_THRESH:
                pelvis_flag = True

        if 'lung' in lf_tissue:
            vol_lung = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_lung)
            if lung_min == 0:
                lung_min = vol_lung
            else:
                lung_min = min(lung_min, vol_lung)
            # NOTE(review): the check runs after every lung file, so the flag
            # can be set by the first lung alone and is never cleared if the
            # second lung is smaller. Kept as written; confirm whether
            # "either lung" or "both lungs" above the threshold is intended.
            if lung_min >= LUNG_VOL_THRESH:
                thorax_flag = True

        if 'kidney_right' in lf_tissue:
            vol_kidney = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_kidney)
            if vol_kidney >= KIDNEY_VOL_THRESH:
                kidney_flag = True

        if 'gall_bladder' in lf_tissue:
            vol_gall_bladder = simpleitk_volume_calculation(lf)
            print(lf_tissue, vol_gall_bladder)
            if vol_gall_bladder >= gall_bladder_VOL_THRESH:
                gall_bladder_flag = True

    return pelvis_flag, thorax_flag, kidney_flag, gall_bladder_flag


def _roi_from_flags(thorax_flag, pelvis_flag, gall_bladder_flag):
    """Build the ROI name from the organ-presence flags.

    The checks are applied in sequence, so later ones can overwrite earlier
    results.
    """
    roi = 'abdomen'
    if thorax_flag and gall_bladder_flag:
        roi = 'thorax-' + roi
    if thorax_flag and not gall_bladder_flag:
        roi = 'thorax'
    if pelvis_flag and gall_bladder_flag:
        roi = roi + "-pelvis"
    # NOTE(review): without a gall bladder, a scan showing thorax AND pelvis
    # ends up as plain 'pelvis'. Kept as written; confirm this is intended.
    if pelvis_flag and not gall_bladder_flag:
        roi = 'pelvis'
    return roi


def _index_by_study_uid(mappings):
    """Map each ``Metadata.Study_UID`` to the first mapping key that has it."""
    uid_to_key = {}
    for key, entry in mappings.items():
        metadata = entry.get('Metadata') if isinstance(entry, dict) else None
        uid = metadata.get('Study_UID') if isinstance(metadata, dict) else None
        if uid is not None:
            # First occurrence wins, like the original scan-and-break loop.
            uid_to_key.setdefault(uid, key)
    return uid_to_key


def main(target_path, output_dir):
    """Tag every case of an existing mapping file with its body-region ROI.

    ``<output_dir>/xx.json`` must already exist. For each ``BDMAP_*``
    directory in ``target_path`` the masks under ``segmentations/`` are
    measured, an ROI name is derived, and it is stored under the ``'ROI'`` key
    of the mapping entry whose ``Metadata.Study_UID`` equals the directory
    name. The updated mapping is written back to ``xx.json`` and the cases
    that failed to load are written to ``<output_dir>/yy.json``.

    Args:
        target_path: Directory holding the ``BDMAP_*`` case directories.
        output_dir: Directory holding ``xx.json``; created if missing.

    Raises:
        FileNotFoundError: If ``xx.json`` does not exist in ``output_dir``.
    """
    pid_dirs = find_image_dirs(target_path)
    failed_files = []
    os.makedirs(output_dir, exist_ok=True)
    json_output_path = os.path.join(output_dir, 'xx.json')
    failed_files_path = os.path.join(output_dir, 'yy.json')

    with open(json_output_path, 'r') as json_file:
        fj = json.load(json_file)
    # Built once so each case is a dictionary lookup instead of a scan over
    # the whole mapping (which also touched unrelated entries).
    uid_to_key = _index_by_study_uid(fj)

    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            if not os.path.isdir(os.path.join(target_path, pid_dir)):
                continue
            if not pid_dir.startswith("BDMAP_"):
                continue

            full_path = os.path.join(target_path, pid_dir, "ct.nii.gz")
            label_paths = os.path.join(target_path, pid_dir, 'segmentations')
            # Sorted so the order-dependent lung check in _scan_label_volumes
            # behaves the same on every filesystem.
            label_files = sorted(glob.glob("%s/*.nii.gz" % (label_paths)))

            try:
                # kidney_flag is computed but does not take part in the ROI.
                pelvis_flag, thorax_flag, _kidney_flag, gall_bladder_flag = (
                    _scan_label_volumes(label_files)
                )
            except RuntimeError:
                failed_files.append(full_path)
                print(f"Failed to load DICOM images from {full_path}")
                continue

            roi = _roi_from_flags(thorax_flag, pelvis_flag, gall_bladder_flag)
            print(pid_dir, roi)

            entry_key = uid_to_key.get(pid_dir)
            if entry_key is None:
                continue
            entry = fj[entry_key]
            # Longest image extent: Spacing_mm * voxel count * 0.001
            # (presumably metres -- TODO confirm).
            max_length = entry['Spacing_mm'] * max(entry['Size']) * 0.001
            if roi == 'thorax-abdomen-pelvis' and max_length > 1.2:
                roi = 'whole-body'
            entry['ROI'] = roi
            print(pid_dir, max_length, roi)
    else:
        print(f"No entries found in {target_path}.")

    with open(json_output_path, 'w') as json_file:
        json.dump(fj, json_file)
    # The failure list was previously announced below but never written.
    with open(failed_files_path, 'w') as failed_file:
        json.dump(failed_files, failed_file, indent=4)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the two directory options, echo them,
    # then hand over to main().
    arg_parser = argparse.ArgumentParser(description="Process NIIGZ files and save as NIfTI.")
    arg_parser.add_argument(
        "--target_path",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS/AbdomenAtlas/uncompressed2",
        help="Path to the target directory containing metadata files.",
    )
    arg_parser.add_argument(
        "--output_dir",
        type=str,
        default="/home/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/",
        help="Directory to save the NIfTI files.",
    )
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|