#coding:utf-8
'''
write by ygq
create on 2025-09-01
OASIS(Open Access Series of Imaging Studies) 是一个旨在向科研界免费提供脑部MRI数据的项目。本横断面(Cross-Sectional)数据集是其第一个版本,发布于2007年。
OASIS-1 是横断面的,意味着它无法捕捉个体随时间的动态变化。对于研究疾病进展,后续的 OASIS-2 和 OASIS-3(纵向数据集)是更好的选择。
1. 目录与文件命名规则
根目录下按受试者会话ID建立文件夹。
受试者ID格式:OAS1_xxxx (例如 OAS1_0012)
会话ID格式:OAS1_xxxx_MRy (例如 OAS1_0012_MR1,y代表第几次访问成像)
OAS1_xxxx_MRy/
│
├── OAS1_xxxx_MRy.xml # 包含采集细节和解剖指标的XML元数据文件
├── OAS1_xxxx_MRy.txt # 与XML内容相同的文本格式文件(便于查看)
├── RAW/ # 存储原始扫描图像(DICOM或Analyze格式)
├── PROCESSED/ # 预处理后的图像
│ ├── SUBJ_111/ # 原始空间下的平均配准图像(各向同性1mm³)
│ └── T88_111/ # 图谱配准空间下的图像
│ ├── t4_files/ # 存储配准变换矩阵文件
│ └── ... # 配准后的图像文件
└── FSL_SEG/ # 基于图谱配准图像生成的脑组织分割结果(灰质2/白质3/脑脊液1)
所有图像均以 Analyze 7.5格式 存储,包含:
一个图像文件(.img)
一个头文件(.hdr)
使用 16位大端序(big-endian) 存储
OAS1_xxxx_MRy_mpr-z_anon 单次原始扫描 256x256x128 1x1x1.25 mm 矢状位
OAS1_xxxx_MRy_mpr_ni_anon_sbj_111 多次扫描平均配准图像 256x256x160 1x1x1 mm 矢状位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_gfc 增益场校正后的图谱配准图像 176x208x176 1x1x1 mm 横断位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc 去除非脑组织的掩模图像 176x208x176 1x1x1 mm 横断位
OAS1_xxxx_MRy_mpr_ni_anon_111_t88_masked_gfc_fseg 脑组织分割图像(灰/白/CSF) 176x208x176 1x1x1 mm 横断位
1. 人口统计学信息
性别(M/F)
用手习惯(Hand)(均为右利手)
年龄(Age)
教育程度(Educ)(1-5级)
社会经济地位(SES)
2. 临床评估
MMSE(简易精神状态检查)
CDR(临床痴呆评级:0=正常,0.5=非常轻微,1=轻度,2=中度)
3. 衍生解剖指标
eTIV:估计颅内容积
ASF:图谱缩放因子
nWBV:标准化全脑体积
OASIS Cross-Sectional 数据集经过 FreeSurfer 处理后的版本。这通常被称为 OASIS Cross-Sectional FreeSurfer Processed 数据集
经过 FreeSurfer 处理后,每个受试者的数据都会存储在一个独立的目录中,其结构遵循 FreeSurfer 的标准输出格式。
├── sub-OASIS10001/ # 受试者1的FreeSurfer输出目录
│ ├── mri/ # 体积数据(Volume-based data)
│ │ ├── orig.mgz # 原始图像(转换为FreeSurfer格式)
│ │ ├── nu.mgz # 强度归一化后的图像
│ │ ├── T1.mgz # 用于分割的图像
│ │ ├── aseg.mgz # 自动亚结构分割(皮质下分割)
│ │ ├── aparc+aseg.mgz # 皮层+皮质下融合分割
│ │ ├── brain.mgz # 去除非脑组织后的图像
│ │ ├── brainmask.mgz # 大脑掩模
│ │ └── ... (其他文件)
│ ├── surf/ # 表面数据(Surface-based data)
│ │ ├── lh.pial # 左半球软脑膜表面
│ │ ├── lh.white # 左半球白质表面
│ │ ├── rh.pial # 右半球软脑膜表面
│ │ ├── rh.white # 右半球白质表面
│ │ ├── lh.thickness # 左半球皮层厚度图
│ │ └── ... (其他文件)
│ ├── stats/ # 统计结果(文本文件)
│ │ ├── aseg.stats # 皮质下结构体积统计
│ │ ├── lh.aparc.stats # 左半球皮层脑区厚度/面积统计
│ │ └── rh.aparc.stats # 右半球皮层脑区厚度/面积统计
│ └── label/ # 标签文件
│ └── ...
'''
import os
import glob
import pandas as pd
import SimpleITK as sitk
import argparse
import json
from tqdm import tqdm
from util import meta_data
import util
import numpy as np
# from bert_helper import *
import shutil
import warnings
warnings.filterwarnings("ignore")
meta_id_name='ID'
## Demographic/clinical columns from the OASIS-1 spreadsheet:
## Sex (M/F), handedness (Hand, all right-handed), Age, education level (Educ, 1-5),
## socioeconomic status (SES), MMSE (Mini-Mental State Examination),
## CDR (Clinical Dementia Rating: 0=normal, 0.5=very mild, 1=mild, 2=moderate),
## eTIV (estimated total intracranial volume), ASF (atlas scaling factor),
## nWBV (normalized whole-brain volume)
META_COLUMN=['ID', 'M/F', 'Hand', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV','nWBV', 'ASF', 'Delay']
TASK_VALUE="segmentation"
CLAMP_RANGE_CT = [-300,300]
CLAMP_RANGE_MRI = None # MRI images threshold placeholder TBC...
TARGET_VOXEL_SPACING=None
## Sub-modality names following the MSD (Medical Segmentation Decathlon) convention
SUB_MODALITY=["FLAIR","T1w","t1gd","T2w"]
## Sort order of the series as they appear in the file names
SERIES_ORDER=["flair","t1","t1ce","t2"]
LABEL_DICT={
"0":"backgroud",# NOTE(review): "backgroud" typo kept as-is -- it is a stored label value
"1":"cerebrospinal fluid",#CSF
"2":"gray matter",#GM
"3":"white matter"#WM
}
# def find_metadata_files(path):
# # for Cancer Image Archive (TCIA) dataset
# search_pattern = os.path.join(path, '**', 'metadata.csv')
# return glob.glob(search_pattern, recursive=True)
def find_metadata_files(path):
    """Return the ``*.csv`` metadata files located directly under *path*."""
    # TCIA-style exports ship a per-dataset metadata CSV next to the image folders.
    csv_pattern = os.path.join(path, '*.csv')
    return glob.glob(csv_pattern, recursive=True)
##added by yanguoqing on 20250527
def find_image_dirs(path):
    """Return the names of all entries directly inside *path*."""
    return list(os.listdir(path))
##modify by yanguoqing on 20250527
def load_dicom_images(folder_path):
    """Read the DICOM series in *folder_path*; return (file names, 3D volume)."""
    series_reader = sitk.ImageSeriesReader()
    file_names = series_reader.GetGDCMSeriesFileNames(folder_path)
    series_reader.SetFileNames(file_names)
    volume = series_reader.Execute()
    return file_names, volume
##added by yanguoqing on 20250527
def load_dicom_tag(imgs):
    """Open the single DICOM file *imgs* and return it as a SimpleITK image.

    NOTE(review): despite the name and the original intent of reading only
    header information, ``Execute()`` loads the pixel data as well;
    ``ReadImageInformation()`` alone would suffice for header-only access.
    Kept as-is to preserve the returned image for existing callers.
    """
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(imgs)
    file_reader.ReadImageInformation()
    return file_reader.Execute()
def load_nrrd(fp):
    """Read a single image file (e.g. NRRD) with SimpleITK."""
    image = sitk.ReadImage(fp)
    return image
##modify by yanguoqing on 20250805
def load_brtas_images(series_files):
    '''
    Build a single multi-sequence volume from one case's per-sequence files.

    Each case provides four preprocessed 3D MRI sequences (registered,
    resampled to 1mm isotropic, skull-stripped); reading them as one series
    stacks them along a fourth axis in FLAIR, T1, T1CE, T2 order.
    '''
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(series_files)
    return series_reader.Execute()
def save_nifti(image, output_path, folder_path):
    """Write *image* to *output_path*, recording its source folder in the header.

    Parameters
    ----------
    image : SimpleITK image to serialize.
    output_path : destination file path (e.g. ``.nii.gz``); missing parent
        directories are created.
    folder_path : original data directory, stored under the ``FolderPath``
        metadata key of the written file.
    """
    output_dirpath = os.path.dirname(output_path)
    # dirname() is '' for a bare filename -- only create real directories.
    # exist_ok avoids a race if several processes target the same directory.
    if output_dirpath and not os.path.isdir(output_dirpath):
        print(f"Creating directory {output_dirpath}")
        os.makedirs(output_dirpath, exist_ok=True)
    image.SetMetaData("FolderPath", folder_path)
    sitk.WriteImage(image, output_path)
##modify by yanguoqing on 20250527
def convert_windows_to_linux_path(windows_path):
    """Map a Windows-style path from a metadata file onto the Linux layout.

    Backslashes become forward slashes, and any drive prefix (everything up
    to and including the first ``:``) is dropped. Some meta files carry
    Windows paths while the data lives on a Linux server.
    """
    posixish = windows_path.replace('\\', '/')
    if ':' not in posixish:
        return posixish
    _, _, tail = posixish.partition(':')
    return tail
def main(target_path, output_dir):
    '''
    Convert every OASIS-1 session under *target_path* to NIfTI in *output_dir*.

    For each <subject>/<session> directory it:
      * loads the masked, atlas-registered brain image (PROCESSED/MPRAGE/T88_111),
      * loads the FSL tissue segmentation (FSL_SEG) as the label volume,
      * saves both as .nii.gz and appends per-case metadata (demographics from
        the OASIS CSV plus image geometry) to <output_dir>/nifti_mappings.json.
    Sessions that raise during processing are recorded in
    <output_dir>/failed_files.json.
    '''
    pid_dirs=find_image_dirs(target_path)
    failed_files = []
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
    failed_files_path = os.path.join(output_dir, 'failed_files.json')
    meta = meta_data()
    # Initialize the JSON file
    if not os.path.exists(json_output_path):
        with open(json_output_path, 'w') as json_file:
            json.dump({}, json_file)
    ## The original .xlsx metadata was pre-converted to CSV for easier parsing
    meta_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'oasis_cross-sectional-5708aa0a98d82080.csv')
    meta_file_ori=os.path.join(target_path,'oasis_cross-sectional-5708aa0a98d82080.xlsx')
    # NOTE(review): mf_flag is never consulted afterwards; if the CSV is
    # missing, df_meta stays undefined and the lookup below raises NameError.
    if os.path.isfile(meta_file):
        mf_flag=True
        df_meta=pd.read_csv(meta_file,sep=',')
    else:
        mf_flag=False
    if pid_dirs:
        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
            if not os.path.isdir(os.path.join(target_path,pid_dir)):
                continue
            ## Walk every session directory of this subject
            image_dirs=find_image_dirs(os.path.join(target_path,pid_dir))
            for data_dir in tqdm(image_dirs, desc="Processing images files"):
                ## data_dir doubles as the session id (e.g. OAS1_0001_MR1)
                full_path=os.path.join(target_path,pid_dir,data_dir)
                modality="MRI"
                study='OASIS_1'##Dataset_name
                CIA_other_info = {'metadata_file':''}
                CIA_other_info['split'] = "train"
                CIA_other_info['metadata_file']=meta_file_ori
                # Look up the demographics row whose ID matches this session
                data_info_row=df_meta[df_meta[meta_id_name]==data_dir]
                if data_info_row.shape[0]>0:
                    data_info_row=data_info_row.reset_index()
                    #print(data_info_row[meta_id_name])
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname]=str(data_info_row[keyname][0])
                    CIA_other_info['Image_id']=data_dir
                else:
                    # NOTE(review): meta_image_id is assigned but never used, and
                    # 'Image_id' is not set on this branch (inconsistent with above).
                    meta_image_id=data_dir
                    for keyname in META_COLUMN[1:]:
                        CIA_other_info[keyname]=''
                try:
                    ## Load the skull-stripped (brain-only) atlas-registered image
                    #\PROCESSED\MPRAGE\T88_111\OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc.img
                    full_file=glob.glob("%s/PROCESSED/MPRAGE/T88_111/%s_*_anon_111_t88_masked_gfc.img"%(full_path,data_dir))[0]
                    # full_file=os.path.join(full_path,"PROCESSED/MPRAGE/T88_111","%s_mpr_n4_anon_111_t88_masked_gfc.img"%data_dir)
                    if os.path.isfile(full_file):
                        ## A valid MRI volume exists -- continue processing
                        sitk_img_original=util.load_nifti(full_file)
                    else:
                        print("病例数据%s为空"%data_dir)
                        continue
                    original_spacing = list(sitk_img_original.GetSpacing())
                    original_size = list(sitk_img_original.GetSize())
                    meta.add_keyvalue('Spacing_mm',min(original_spacing))
                    meta.add_keyvalue('OriImg_path',full_file)
                    meta.add_keyvalue('Size',original_size) # processed size is used here -- YH Jachin
                    meta.add_keyvalue('Modality',modality)
                    meta.add_keyvalue('Dataset_name',study)
                    meta.add_keyvalue('ROI','head')
                    meta.add_keyvalue('Label_Dict',LABEL_DICT)
                    output_image_file = os.path.join(output_dir,data_dir, f"{data_dir}.nii.gz")
                    # output_path=convert_windows_to_linux_path(output_path)
                    ##
                    save_nifti(sitk_img_original, output_image_file, full_path)
                    print(f"Saved NIfTI file to {output_image_file}")
                    ##Label processing
                    label_path_dict={}
                    #OAS1_0001_MR1_mpr_n4_anon_111_t88_masked_gfc_fseg.img
                    full_label_file=glob.glob("%s/FSL_SEG/%s_*_anon_111_t88_masked_gfc_fseg.img"%(full_path,data_dir))[0]
                    process_label_path=os.path.join(output_dir,data_dir,'segmentation')
                    processed_lbl_full_path=os.path.join(process_label_path, f"{data_dir}.nii.gz")
                    if not os.path.isdir(process_label_path):
                        os.makedirs(process_label_path,exist_ok=True)
                    if not os.path.isfile(full_label_file):
                        label_flag=False
                    else:
                        sitk_lbl_original = util.load_nifti(full_label_file)
                        util.save_nifti(sitk_lbl_original, processed_lbl_full_path, os.path.dirname(full_label_file)) # Save original
                        print(f"Saved Segemention NIfTI file to {processed_lbl_full_path}")
                        label_path_dict['head'] = processed_lbl_full_path
                        label_flag=True
                    if label_flag:
                        meta.add_keyvalue('Task',TASK_VALUE)
                        meta.add_keyvalue('Label_path',{TASK_VALUE:label_path_dict})
                    # try:
                    #     assert sitk_img_processed.GetSize() == sitk_lbl_processed.GetSize()
                    # except Exception as e:
                    #     failed_files.append(full_path_label)
                    #     continue
                    # NOTE(review): sitk_lbl_original is unbound when label_flag is
                    # False; the NameError is swallowed by the broad except below.
                    print(sitk_img_original.GetSize(),sitk_lbl_original.GetSize())
                except Exception as e:
                    print(e)
                    failed_files.append(data_dir)
                    print(f"Failed to load BRATS images from {data_dir}")
                    continue
                meta.add_extra_keyvalue('Metadata',CIA_other_info)
                # Write the mapping to the JSON file on the fly
                with open(json_output_path, 'r+') as json_file:
                    existing_mappings = json.load(json_file)
                    existing_mappings[output_image_file] = meta.get_meta_data()
                    json_file.seek(0)
                    # print(existing_mappings)
                    json.dump(existing_mappings, json_file, indent=4)
                    json_file.truncate()
    # else:
    #     print("No metadata.csv files found.")
    with open(failed_files_path, "w") as json_file:
        json.dump(failed_files, json_file)
    print(f"The list has been written to {failed_files_path}")
    print(f"Saved NIfTI mappings to {json_output_path}")
if __name__ == "__main__":
    # Command-line entry point: pick the raw OASIS-1 tree and the output tree.
    arg_parser = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
    arg_parser.add_argument("--target_path", type=str, help="Path to the target directory containing metadata files.", default="/home/data/Github/data/data_gen_def/DATASETS/OASIS/OASIS_1/oasis_cs_sectional/")
    arg_parser.add_argument("--output_dir", type=str, help="Directory to save the NIfTI files.", default="/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL")
    cli_args = arg_parser.parse_args()
    print(cli_args.target_path, cli_args.output_dir)
    main(cli_args.target_path, cli_args.output_dir)
|