#coding:utf-8
'''
write by ygq
create on 2025-09-01
OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。
OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。
1. 目录与文件命名规则
根目录下按受试者会话ID建立文件夹。
受试者ID格式:OAS1_xxxx (例如 OAS1_0012)
会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像)
OAS1_xxxx_MRy/
│
├── OAS1_xxxx_MRy.xml # 包含采集细节和解剖指标的XML元数据文件
├── OAS1_xxxx_MRy.txt # 与XML内容相同的文本格式文件(便于查看)
├── RAW/ # 存储原始扫描图像(DICOM或Analyze格式)
├── PROCESSED/ # 预处理后的图像
│ ├── SUBJ_111/ # 原始空间下的平均配准图像(各向同性1mm³)
│ └── T88_111/ # 图谱配准空间下的图像
│ ├── t4_files/ # 存储配准变换矩阵文件
│ └── ... # 配准后的图像文件
└── FSL_SEG/ # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1)
所有图像均以 Analyze 7.5格式 存储,包含:
一个图像文件(.img)
一个头文件(.hdr)
使用 16位大端序(big-endian) 存储
OAS1_xxxx_MRy_mpr-z_anon 单次原始扫描 256x256x128 1x1x1.25 mm 矢状位
OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 多次扫描平均配准图像 256x256x160 1x1x1 mm 矢状位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc 增益场校正后的图谱配准图像 176x208x176 1x1x1 mm 横断位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc 去除非脑组织的掩模图像 176x208x176 1x1x1 mm 横断位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg 脑组织分割图像(灰/白/CSF) 176x208x176 1x1x1 mm 横断位
1. 人口统计学信息
性别(M/F)
用手习惯(Hand)(均为右利手)
年龄(Age)
教育程度(Educ)(1-5级)
社会经济地位(SES)
2. 临床评估
MMSE(简易精神状态检查)
CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度)
3. 衍生解剖指标
eTIV:估计颅内容积
ASF:图谱缩放因子
nWBV:标准化全脑体积
OASIS Cross-Sectional 数据集经过 FreeSurfer 处理后的版本。这通常被称为 OASIS Cross-Sectional FreeSurfer Processed 数据集
经过 FreeSurfer 处理后,每个受试者的数据都会存储在一个独立的目录中,其结构遵循 FreeSurfer 的标准输出格式。
├── sub-OASIS10001/ # 受试者1的FreeSurfer输出目录
│ ├── mri/ # 体积数据(Volume-based data)
│ │ ├── orig.mgz # 原始图像(转换为FreeSurfer格式)
│ │ ├── nu.mgz # 强度归一化后的图像
│ │ ├── T1.mgz # 用于分割的图像
│ │ ├── aseg.mgz # 自动亚结构分割(皮质下分割)
│ │ ├── aparc+aseg.mgz # 皮层+皮质下融合分割
│ │ ├── brain.mgz # 去除非脑组织后的图像
│ │ ├── brainmask.mgz # 大脑掩模
│ │ └── ... (其他文件)
│ ├── surf/ # 表面数据(Surface-based data)
│ │ ├── lh.pial # 左半球软脑膜表面
│ │ ├── lh.white # 左半球白质表面
│ │ ├── rh.pial # 右半球软脑膜表面
│ │ ├── rh.white # 右半球白质表面
│ │ ├── lh.thickness # 左半球皮层厚度图
│ │ └── ... (其他文件)
│ ├── stats/ # 统计结果(文本文件)
│ │ ├── aseg.stats # 皮质下结构体积统计
│ │ ├── lh.aparc.stats # 左半球皮层脑区厚度/面积统计
│ │ └── rh.aparc.stats # 右半球皮层脑区厚度/面积统计
│ └── label/ # 标签文件
│ └── ...
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
import warnings
warnings.filterwarnings("ignore")
meta_id_name='ID'
## Demographic/clinical columns from the OASIS-1 spreadsheet:
## Sex (M/F), handedness (Hand, all right-handed), Age, education level (Educ, 1-5),
## socioeconomic status (SES), MMSE (Mini-Mental State Examination),
## CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate),
## eTIV (estimated total intracranial volume), ASF (atlas scaling factor),
## nWBV (normalized whole-brain volume)
META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING=None
## Sub-modality names following the MSD (Medical Segmentation Decathlon) convention
SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"]
## Sort order of the series as they appear in the file names
SERIES_ORDER=["flair","t1","t1ce","t2"]
LABEL_DICT={
"0":"backgroud",# NOTE(review): "backgroud" typo kept as-is -- it is a stored label value
"1":"cerebrospinal fluid",#CSF
"2":"gray matter",#GM
"3":"white matter"#WM
}
# def find_metadata_files(path):
# # for Cancer Image Archive (TCIA) dataset
# search_pattern = os.path.join(path, '**', 'metadata.csv')
# return glob.glob(search_pattern, recursive=True)
def find_metadata_files(path):
    """Return the ``*.csv`` metadata files located directly under *path*."""
    # TCIA-style exports ship a per-dataset metadata CSV next to the image folders.
    csv_pattern = os.path.join(path, '*.csv')
    return glob.glob(csv_pattern, recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries directly inside *path*."""
    return list(os.listdir(path))
##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series in *folder_path*; return (file names, 3D volume)."""
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume
##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Open the single DICOM file *imgs* and return it as a SimpleITK image.

    NOTE(review): despite the name and the original intent of reading only
    header information, ``Execute()`` loads the pixel data as well;
    ``ReadImageInformation()`` alone would suffice for header-only access.
    Kept as-is to preserve the returned image for existing callers.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()
def load_nrrd(fp):
    """Read a single image file (e.g. NRRD) with SimpleITK."""
    image = sitk.ReadImage(fp)
    return image
##modify by yanguoqing on 20250805
def load_brtas_images(series_files):
    '''
    Build a single multi-sequence volume from one case's per-sequence files.

    Each case provides four preprocessed 3D MRI sequences (registered,
    resampled to 1mm isotropic, skull-stripped); reading them as one series
    stacks them along a fourth axis in FLAIR, T1, T1CE, T2 order.
    '''
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    return series_reader.Execute()
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, recording its source folder in the header.

    Parameters
    ----------
    image : SimpleITK image to serialize.
    output_path : destination file path (e.g. ``.nii.gz``); missing parent
        directories are created.
    folder_path : original data directory, stored under the ``FolderPath``
        metadata key of the written file.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname() is '' for a bare filename -- only create real directories.
    # exist_ok avoids a race if several processes target the same directory.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Map a Windows-style path from a metadata file onto the Linux layout.

    Backslashes become forward slashes, and any drive prefix (everything up
    to and including the first ``:``) is dropped. Some meta files carry
    Windows paths while the data lives on a Linux server.
    """
    posixish = windows_path.replace('\\', '/')
    if ':' not in posixish:
        return posixish
    _, _, tail = posixish.partition(':')
    return tail
def main(target_path, output_dir):
    '''
    Convert every OASIS-1 session under *target_path* to NIfTI in *output_dir*.

    For each <subject>/<session> directory it:
      * loads the masked, atlas-registered brain image (PROCESSED/MPRAGE/T88_111),
      * loads the FSL tissue segmentation (FSL_SEG) as the label volume,
      * saves both as .nii.gz and appends per-case metadata (demographics from
        the OASIS CSV plus image geometry) to <output_dir>/nifti_mappings.json.
    Sessions that raise during processing are recorded in
    <output_dir>/failed_files.json.
    '''
    pid_dirs=find_image_dirs(target_path)
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    ## The original .xlsx metadata was pre-converted to CSV for easier parsing
    meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori=os.path.join(target_path,'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    # NOTE(review): mf_flag is never consulted afterwards; if the CSV is
    # missing, df_meta stays undefined and the lookup below raises NameError.
    if os.path.isfile(meta_file):
        mf_flag=True
        df_meta=pd.read_csv(meta_file,sep=',')
    else:
        mf_flag=False
    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            if not os.path.isdir(os.path.join(target_path,pid_dir)):
                continue
            ## Walk every session directory of this subject
            image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                ## data_dir doubles as the session id (e.g. OAS1_0001_MR1)
                full_path=os.path.join(target_path,pid_dir,data_dir)
                modality="MRI"
                study='OASIS_1'##Dataset_name
                CIA_other_info = {'metadata_file':''}
                CIA_other_info['split'] = "train"
                CIA_other_info['metadata_file']=meta_file_ori
                # Look up the demographics row whose ID matches this session
                data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
                if data_info_row.shape[0]>0:
                    data_info_row=data_info_row.reset_index()
                    #print(data_info_row[meta_id_name])
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname]=str(data_info_row[keyname][0])
                    CIA_other_info['Image_id']=data_dir
                else:
                    # NOTE(review): meta_image_id is assigned but never used, and
                    # 'Image_id' is not set on this branch (inconsistent with above).
                    meta_image_id=data_dir
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname]=''
                try:
                    ## Load the skull-stripped (brain-only) atlas-registered image
                    #\PROCESSED\MPRAGE\T88_111\OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
                    full_file=glob.glob("%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_masked_gfc.img"%(full_path,data_dir))[0]
                    # full_file=os.path.join(full_path,"PROCESSED/MPRAGE/T88_111","%s_mpr_n4_anon_111_t88_masked_gfc.img"%data_dir)
                    if os.path.isfile(full_file):
                        ## A valid MRI volume exists -- continue processing
                        sitk_img_original=util.load_nifti(full_file)
                    else:
                        print("病例数据%s为空"%data_dir)
                        continue
                    original_spacing = list(sitk_img_original.GetSpacing())
                    original_size = list(sitk_img_original.GetSize())
                    meta.add_keyvalue('Spacing_mm',min(original_spacing))
                    meta.add_keyvalue('OriImg_path',full_file)
                    meta.add_keyvalue('Size',original_size) # processed size is used here -- YH Jachin
                    meta.add_keyvalue('Modality',modality)
                    meta.add_keyvalue('Dataset_name',study)
                    meta.add_keyvalue('ROI','head')
                    meta.add_keyvalue('Label_Dict',LABEL_DICT)
                    output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
                    # output_path=convert_windows_to_linux_path(output_path)
                    ##
                    save_nifti(sitk_img_original, output_image_file, full_path)
                    print(f"Saved NIfTI file to {output_image_file}")
                    ##Label processing
                    label_path_dict={}
                    #OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
                    full_label_file=glob.glob("%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img"%(full_path,data_dir))[0]
                    process_label_path=os.path.join(output_dir,data_dir,'segmentation')
                    processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz")
                    if not os.path.isdir(process_label_path):
                        os.makedirs(process_label_path,exist_ok=True)
                    if not os.path.isfile(full_label_file):
                        label_flag=False
                    else:
                        sitk_lbl_original = util.load_nifti(full_label_file)
                        util.save_nifti(sitk_lbl_original, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original
                        print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
                        label_path_dict['head'] = processed_lbl_full_path
                        label_flag=True
                    if label_flag:
                        meta.add_keyvalue('Task',TASK_VALUE)
                        meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                    # try:
                    #     assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
                    # except Exception as e:
                    #     failed_files.append(full_path_label)
                    #     continue
                    # NOTE(review): sitk_lbl_original is unbound when label_flag is
                    # False; the NameError is swallowed by the broad except below.
                    print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
                except Exception as e:
                    print(e)
                    failed_files.append(data_dir)
                    print(f"Failed to load BRATS images from {data_dir}")
                    continue
                meta.add_extra_keyvalue('Metadata',CIA_other_info)
                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_image_file] = meta.get_meta_data()
                    json_file.seek(0)
                    # print(existing_mappings)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
if __name__ == "__main__":
    # Command-line entry point: pick the raw OASIS-1 tree and the output tree.
    arg_parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    arg_parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
    arg_parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL")
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)
|