Data_Engineering / kaggle_osic_clean /dataclean_kaggle_update.py

maxmo2009

Initial upload: data cleanup pipeline for 12 medical imaging datasets

da9fb1e verified 17 days ago

14 kB

	#coding:utf-8
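	'''
	Written by ygq
	Created on 2025-07-24
	Updated Kaggle data cleaning

	Parse train.csv and test.csv in turn to obtain the basic information of each dataset;
	use the parsed ids to locate the corresponding images under the train/test directories and normalise them, and also locate the labels under the label "segment" directory, extracting the label locations of the CT scans of the different body parts and saving them to a JSON file;
	save the JSON when finished and exit.

	'''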
	import os
	import glob
	import pandas as pd
	import SimpleITK as sitk
	import argparse
	import json
	from tqdm import tqdm
	from util import meta_data
	import util
	import numpy as np
	# from bert_helper import *

	# model_name = "bert-large-uncased"
	# reduce_method = 'mean'
	# max_words_num = 32 # max number of words in the caption > 2

	# embeder, tokenizer = get_frozen_embeder(model_name)

	# string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
	# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

	# string2 = "modality: ct, gender: female, age: 50, roi: head"

	# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

	# input_size = embeder.config.vocab_size
	# in_size = embeder.config.hidden_size

	# print(embeder, input_size, in_size)
	# print(tokenizer)


	# print(embeder_output1)
	# print(embeder_output1.shape) # torch.Size([1, 8, 768])


	# print(embeder_output2)
	# print(embeder_output2.shape) # torch.Size([1, 8, 768])


	# error = torch.abs(embeder_output1 - embeder_output2)
	# print(error)
	# print("Embedding distance between the two sentences: ")
	# print(f"String1: {string1}")
	# print(f"String2: {string2}")
	# print(torch.mean(error))


	# exit()


	# Column names looked up in the per-split metadata CSV by main().
	meta_id_name = 'Patient'
	meta_weeks_name = 'Weeks'
	meta_fvc_name = 'FVC'
	meta_percent_name = 'Percent'
	meta_age_name = 'Age'
	meta_sex_name = 'Sex'
	meta_status_name = 'SmokingStatus'

	# Task name written to the mapping JSON and used as the label sub-folder.
	TASK_VALUE = "segmentation"
	# Range handed to util.clamp_image for images whose modality contains 'CT'.
	CLAMP_RANGE_CT = [-300, 300]
	CLAMP_RANGE_MRI = [-1, 0]  # MRI images threshold placeholder TBC...


	# def find_metadata_files(path):
	# # for Cancer Image Archive (TCIA) dataset
	# search_pattern = os.path.join(path, '**', 'metadata.csv')
	# return glob.glob(search_pattern, recursive=True)

	def find_metadata_files(path):
	    """Return the ``*.csv`` files located directly inside ``path``.

	    Args:
	        path: Directory to search.

	    Returns:
	        list[str]: Matching paths in sorted order. ``glob`` itself gives no
	        ordering guarantee, so the result is sorted to keep runs
	        reproducible. An empty list is returned when nothing matches.
	    """
	    # for Cancer Image Archive (TCIA) dataset
	    search_pattern = os.path.join(path, '*.csv')
	    # recursive=True is kept from the original call; it only has an effect
	    # if ``path`` itself contains a ``**`` component.
	    return sorted(glob.glob(search_pattern, recursive=True))
	##added by yanguoqing on 20250527
	def find_image_dirs(path):
	    """Return the names of all entries inside ``path`` in sorted order.

	    Both files and directories are returned; callers in this file filter
	    with ``os.path.isdir`` themselves. ``os.listdir`` yields entries in
	    arbitrary order, so the names are sorted to make the processing order
	    (and therefore logs and output files) reproducible between runs.

	    Args:
	        path: Directory to list.

	    Returns:
	        list[str]: Entry names (not full paths).

	    Raises:
	        OSError: Propagated from ``os.listdir`` (e.g. ``path`` is missing).
	    """
	    return sorted(os.listdir(path))

	##modify by yanguoqing on 20250527
	def load_dicom_images(folder_path):
	    """Read the DICOM series found in ``folder_path``.

	    Args:
	        folder_path: Directory handed to ``GetGDCMSeriesFileNames``.

	    Returns:
	        tuple: ``(dicom_names, image)`` - the file names reported for the
	        folder and the image produced by executing the series reader on
	        exactly those files.
	    """
	    series_reader = sitk.ImageSeriesReader()
	    series_files = series_reader.GetGDCMSeriesFileNames(folder_path)
	    series_reader.SetFileNames(series_files)
	    return series_files, series_reader.Execute()

	##added by yanguoqing on 20250527
	def load_dicom_tag(imgs):
	    """Open one image file and return the object produced by the reader.

	    Args:
	        imgs: Path of a single file; it is passed to ``SetFileName`` as-is.

	    Returns:
	        The result of ``ImageFileReader.Execute()``. Callers in this file
	        only query it through ``GetMetaData``.
	    """
	    file_reader = sitk.ImageFileReader()
	    file_reader.SetFileName(imgs)
	    # Header pass: read the image information (metadata) first.
	    file_reader.ReadImageInformation()
	    # NOTE(review): Execute() is still called afterwards, which presumably
	    # reads the pixel data as well - confirm whether a metadata-only read
	    # was intended here.
	    return file_reader.Execute()

	def load_nrrd(fp):
	    """Load the image stored at ``fp`` via ``sitk.ReadImage`` and return it."""
	    nrrd_image = sitk.ReadImage(fp)
	    return nrrd_image

	def save_nifti(image, output_path, folder_path):
	    """Write ``image`` to ``output_path``, creating its directory if needed.

	    Args:
	        image: Image object exposing ``SetMetaData``; written with
	            ``sitk.WriteImage``.
	        output_path: Destination file path.
	        folder_path: Source folder, stored on the image under the
	            ``"FolderPath"`` metadata key before writing.

	    Raises:
	        OSError: If the output directory cannot be created.
	    """
	    output_dirpath = os.path.dirname(output_path)
	    # dirname is '' for a bare file name; os.makedirs('') would raise, so
	    # only create a directory when there actually is one to create.
	    if output_dirpath and not os.path.exists(output_dirpath):
	        print(f"Creating directory {output_dirpath}")
	        # exist_ok covers the directory appearing between the check above
	        # and this call (e.g. another worker writing the same case).
	        os.makedirs(output_dirpath, exist_ok=True)
	    # Record where the data came from on the image itself.
	    # NOTE(review): whether the NIfTI writer persists this key in the file
	    # header is not visible from this code - confirm if it is relied upon.
	    image.SetMetaData("FolderPath", folder_path)
	    sitk.WriteImage(image, output_path)

	##modify by yanguoqing on 20250527
	def convert_windows_to_linux_path(windows_path):
	    """Convert a Windows-style path into a POSIX-style one.

	    Some meta files contain Windows paths while the data is stored on a
	    Linux server. Backslashes become forward slashes and a leading drive
	    letter (``X:``) is removed.

	    Only a drive prefix at the very start of the path is stripped. The
	    previous implementation split on the first ``:`` anywhere in the string,
	    which truncated valid paths that merely contain a colon in a directory
	    or file name.

	    Args:
	        windows_path: Path string, Windows- or POSIX-style.

	    Returns:
	        str: The path with forward slashes and without a drive prefix.
	    """
	    linux_path = windows_path.replace('\\', '/')
	    has_drive_prefix = (
	        len(linux_path) >= 2
	        and linux_path[1] == ':'
	        and ('A' <= linux_path[0] <= 'Z' or 'a' <= linux_path[0] <= 'z')
	    )
	    if has_drive_prefix:
	        linux_path = linux_path[2:]
	    return linux_path

	def _lookup_patient_info(df_meta, patient_id):
	    """Return the clinical CSV fields for ``patient_id`` as a dict.

	    When no CSV was loaded (``df_meta`` is None) or it has no row for the
	    patient, the id itself and empty strings are returned. If several rows
	    match, the first one is used.
	    """
	    info = {
	        'Image_id': patient_id,
	        'Weeks': '',
	        'FVC': '',
	        'Percent': '',
	        'Age': '',
	        'Sex': '',
	        'Smoke_Status': '',
	    }
	    if df_meta is None:
	        return info
	    rows = df_meta[df_meta[meta_id_name] == patient_id]
	    if rows.shape[0] == 0:
	        return info
	    first = rows.iloc[0]
	    info['Image_id'] = first[meta_id_name]
	    info['Weeks'] = first[meta_weeks_name]
	    info['FVC'] = first[meta_fvc_name]
	    info['Percent'] = first[meta_percent_name]
	    info['Age'] = first[meta_age_name]
	    info['Sex'] = first[meta_sex_name]
	    info['Smoke_Status'] = first[meta_status_name]
	    return info


	def _label_tissue(label_file):
	    """Derive the tissue name from the label file's parent directory name.

	    The second ``_``-separated token of the directory name is used; a
	    directory name without ``_`` is returned whole instead of raising.
	    """
	    parent_name = os.path.basename(os.path.dirname(label_file))
	    tokens = parent_name.split("_")
	    return tokens[1] if len(tokens) > 1 else parent_name


	def main(target_path, output_dir):
	    """Convert every DICOM series under ``target_path`` to NIfTI.

	    Each sub-directory of ``target_path`` is treated as a split; an optional
	    ``<split>.csv`` next to it supplies per-patient fields. For every case
	    directory inside a split the series is loaded, optionally resampled via
	    ``util.get_unisize_resampler``, passed through ``util.clamp_image`` when
	    its modality contains ``CT``, and written to
	    ``<output_dir>/<case>/<case>.nii.gz``. Matching ``<case>_*.nrrd`` label
	    files below the sibling ``GT`` directory are resampled and written too.

	    Side effects: writes the NIfTI files, updates
	    ``<output_dir>/nifti_mappings.json`` after every case, and finally
	    writes ``<output_dir>/failed_files.json`` with the cases that failed.

	    Args:
	        target_path: Directory containing the split directories and CSVs.
	        output_dir: Directory that receives all outputs; created if needed.
	    """
	    pid_dirs = find_image_dirs(target_path)
	    failed_files = []
	    os.makedirs(output_dir, exist_ok=True)
	    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
	    failed_files_path = os.path.join(output_dir, 'failed_files.json')
	    # NOTE(review): one meta_data instance is reused for every case, so keys
	    # set for an earlier case (e.g. the label keys) may carry over to a later
	    # case without labels, depending on how meta_data stores them - confirm.
	    meta = meta_data()

	    # Initialize the JSON file
	    if not os.path.exists(json_output_path):
	        with open(json_output_path, 'w') as json_file:
	            json.dump({}, json_file)

	    # Labels live in a "GT" directory next to target_path (loop-invariant).
	    label_root = os.path.join(os.path.dirname(target_path), 'GT')

	    if pid_dirs:
	        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
	            split_path = os.path.join(target_path, pid_dir)
	            if not os.path.isdir(split_path):
	                continue
	            meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
	            mf_flag = os.path.isfile(meta_file)
	            # Without a CSV there is nothing to look up. Previously df_meta
	            # was left undefined (or stale from the previous split) here.
	            df_meta = pd.read_csv(meta_file, sep=',') if mf_flag else None

	            image_dirs = find_image_dirs(split_path)
	            for data_dir in tqdm(image_dirs, desc="Processing images files"):
	                full_path = os.path.join(split_path, data_dir)
	                if not os.path.isdir(full_path):
	                    continue
	                patient = _lookup_patient_info(df_meta, data_dir)

	                try:
	                    print(full_path)
	                    dicom_fp, dicom_image = load_dicom_images(full_path)

	                    spacing_info = dicom_image.GetSpacing()
	                    print('SPACING INFO:', spacing_info)

	                    dtag = load_dicom_tag(dicom_fp[0])
	                    # SimpleITK exposes DICOM tags under "gggg|eeee" keys.
	                    # The former '0020\|000e' spelling kept a literal
	                    # backslash in the key, so the lookup could not match.
	                    uid = dtag.GetMetaData('0020|000e')  # Series Instance UID
	                    modality = dtag.GetMetaData('0008|0060')  # Modality
	                    study = 'OSIC_PFP'  # Dataset_name
	                    CIA_other_info = {
	                        'Study_UID': uid,
	                        'metadata_file': meta_file if mf_flag else '',
	                        'split': pid_dir,
	                    }

	                    size = list(dicom_image.GetSize())
	                    resampler = util.get_unisize_resampler(dicom_image, interpolator='linear', spacing=spacing_info, size=size)

	                    # resize the image
	                    if resampler is not None:
	                        proces_image = resampler.Execute(dicom_image)
	                        print('SPACIE INFO AFTER', proces_image.GetSpacing())
	                        CIA_other_info['Resample'] = True
	                    else:
	                        proces_image = dicom_image
	                        CIA_other_info['Resample'] = False

	                    CIA_other_info['Image_id'] = patient['Image_id']
	                    CIA_other_info['Weeks'] = str(patient['Weeks'])
	                    CIA_other_info['FVC'] = str(patient['FVC'])
	                    CIA_other_info['Percent'] = str(patient['Percent'])
	                    CIA_other_info['Age'] = str(patient['Age'])
	                    CIA_other_info['Sex'] = patient['Sex']
	                    CIA_other_info['Smoke_Status'] = patient['Smoke_Status']

	                    # threshold the image
	                    if 'CT' in modality:
	                        proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)

	                    output_path = os.path.join(output_dir, data_dir, f"{data_dir}.nii.gz")
	                    save_nifti(proces_image, output_path, full_path)
	                    print(f"Saved NIfTI file to {output_path}")

	                    # Segmentation labels.
	                    # The former pattern "%s///%s_*.nrrd" collapses to a
	                    # single separator and so only matched files directly
	                    # under GT, while the tissue name is taken from the
	                    # label's parent directory. Searching recursively finds
	                    # that layout and nested tissue directories alike.
	                    # NOTE(review): confirm the intended GT directory depth.
	                    label_pattern = os.path.join(label_root, '**', '%s_*.nrrd' % data_dir)
	                    label_files = sorted(glob.glob(label_pattern, recursive=True))
	                    label_flag = len(label_files) > 0
	                    label_path_dict = {}
	                    labels_mismatched = False
	                    image_size = proces_image.GetSize()
	                    for lf in label_files:
	                        lf_name = os.path.basename(lf)
	                        lf_id = lf_name.split("_")[0]
	                        lf_tissue = _label_tissue(lf)
	                        label_image = load_nrrd(lf)
	                        resampler = util.get_unisize_resampler(label_image, interpolator='nearest', spacing=spacing_info, size=size)
	                        if resampler is not None:
	                            proces_label = resampler.Execute(label_image)
	                        else:
	                            proces_label = label_image

	                        label_output_path = os.path.join(output_dir, lf_id, TASK_VALUE, f"{lf_name}.nii.gz")
	                        label_path_dict[lf_tissue] = label_output_path
	                        util.save_nifti(proces_label, label_output_path, lf)
	                        print(f"Saved Label Segment NIfTI file to {label_output_path}")

	                        # Every label must share the image grid. This is an
	                        # explicit check, not an assert, so it also runs
	                        # under "python -O".
	                        print(image_size, proces_label.GetSize())
	                        if proces_label.GetSize() != image_size:
	                            labels_mismatched = True
	                except RuntimeError:
	                    failed_files.append(full_path)
	                    print(f"Failed to load DICOM images from {full_path}")
	                    continue

	                if labels_mismatched:
	                    failed_files.append(full_path)
	                    print(f"Label size does not match image size for {full_path}")
	                    continue

	                # The size after processing is recorded, not the raw size.
	                size_processed = list(proces_image.GetSize())

	                meta.add_keyvalue('Image_id', patient['Image_id'])
	                meta.add_keyvalue('Spacing_mm', min(spacing_info))
	                meta.add_keyvalue('OriImg_path', full_path)
	                meta.add_keyvalue('Size', size_processed)
	                meta.add_keyvalue('Modality', modality)
	                meta.add_keyvalue('Dataset_name', study)
	                meta.add_keyvalue('ROI', 'whole-body')

	                if label_flag:
	                    print(label_path_dict.keys())
	                    meta.add_keyvalue('Task', TASK_VALUE)
	                    meta.add_keyvalue('Label_tissue', list(label_path_dict.keys()))
	                    meta.add_keyvalue('Label_path', {TASK_VALUE: label_path_dict})

	                meta.add_extra_keyvalue('Metadata', CIA_other_info)

	                # Write the mapping to the JSON file on the fly so finished
	                # cases are kept even if a later case aborts the run.
	                with open(json_output_path, 'r+') as json_file:
	                    existing_mappings = json.load(json_file)
	                    existing_mappings[output_path] = meta.get_meta_data()
	                    json_file.seek(0)
	                    json.dump(existing_mappings, json_file, indent=4)
	                    json_file.truncate()
	    else:
	        print("No metadata.csv files found.")

	    with open(failed_files_path, "w") as json_file:
	        json.dump(failed_files, json_file)

	    print(f"The list has been written to {failed_files_path}")
	    print(f"Saved NIfTI mappings to {json_output_path}")

	if __name__ == "__main__":
	    # Command-line entry point: parse the two directory options, echo them,
	    # then run the conversion.
	    cli = argparse.ArgumentParser(description="Process DICOM files and save as NIfTI.")
	    cli.add_argument(
	        "--target_path",
	        type=str,
	        default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression",
	        help="Path to the target directory containing metadata files.",
	    )
	    cli.add_argument(
	        "--output_dir",
	        type=str,
	        default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic_new/",
	        help="Directory to save the NIfTI files.",
	    )
	    options = cli.parse_args()
	    print(options.target_path, options.output_dir)
	    main(options.target_path, options.output_dir)