maxmo2009

Initial upload: data cleanup pipeline for 12 medical imaging datasets

da9fb1e verified 16 days ago

8.9 kB

	import os
	import glob
	import pandas as pd
	import SimpleITK as sitk
	import argparse
	import json
	from tqdm import tqdm
	from util import meta_data
	import util
	# from bert_helper import *

	# model_name = "bert-large-uncased"
	# reduce_method = 'mean'
	# max_words_num = 32 # max number of words in the caption > 2

	# embeder, tokenizer = get_frozen_embeder(model_name)

	# string1 = "modality: ct, gender: female, age: 51, roi: abdomen"
	# embeder_output1 = str2emb(string1, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

	# string2 = "modality: ct, gender: female, age: 50, roi: head"

	# embeder_output2 = str2emb(string2, max_words_num, embeder, tokenizer, reduce_method=reduce_method)

	# input_size = embeder.config.vocab_size
	# in_size = embeder.config.hidden_size

	# print(embeder, input_size, in_size)
	# print(tokenizer)


	# print(embeder_output1)
	# print(embeder_output1.shape) # torch.Size([1, 8, 768])


	# print(embeder_output2)
	# print(embeder_output2.shape) # torch.Size([1, 8, 768])


	# error = torch.abs(embeder_output1 - embeder_output2)
	# print(error)
	# print("Embedding distance between the two sentences: ")
	# print(f"String1: {string1}")
	# print(f"String2: {string2}")
	# print(torch.mean(error))


	# exit()
	# Intensity window [low, high] handed to util.clamp_image for CT volumes in main().
	# NOTE(review): presumably Hounsfield units -- confirm against util.clamp_image.
	CLAMP_RANGE_CT = [
	    -300,
	    300,
	]
	# Placeholder window for MRI volumes (still to be confirmed); it is not
	# referenced by any code visible in this file.
	CLAMP_RANGE_MRI = [
	    -1,
	    0,
	]


	# def find_metadata_files(path):
	# # for Cancer Image Archive (TCIA) dataset
	# search_pattern = os.path.join(path, '**', 'metadata.csv')
	# return glob.glob(search_pattern, recursive=True)

	def find_metadata_files(path):
	    """Return the ``*.csv`` files located directly inside ``path``.

	    The original author's note marks this as written for the Cancer Image
	    Archive (TCIA) dataset layout.

	    The pattern built here adds no ``**`` component, so ``recursive=True``
	    does not make the search descend into sub-directories; the flag is kept
	    so the ``glob`` call is unchanged.

	    Args:
	        path: Directory whose top level is searched.

	    Returns:
	        A list of matching paths (empty when nothing matches).
	    """
	    csv_pattern = os.path.join(path, '*.csv')
	    matches = glob.glob(csv_pattern, recursive=True)
	    return matches
	##added by yanguoqing on 20250527
	def find_image_dirs(path):
	    """List the names of all entries found in ``path``.

	    Despite the name, nothing is filtered here: plain files and directories
	    are both returned, and the caller in this file checks ``os.path.isdir``
	    on each entry itself.

	    Args:
	        path: Directory to list.

	    Returns:
	        The entry names exactly as ``os.listdir`` reports them.
	    """
	    entry_names = os.listdir(path)
	    return entry_names

	##modify by yanguoqing on 20250527
	def load_dicom_images(folder_path):
	    """Read the DICOM series stored in ``folder_path`` as one image.

	    Args:
	        folder_path: Directory containing the DICOM slices of a series.

	    Returns:
	        A ``(dicom_names, image)`` pair: the file names GDCM reports for the
	        series in that folder, and the image produced by executing the
	        series reader on those files.
	    """
	    series_reader = sitk.ImageSeriesReader()
	    series_files = series_reader.GetGDCMSeriesFileNames(folder_path)
	    series_reader.SetFileNames(series_files)
	    return series_files, series_reader.Execute()

	##added by yanguoqing on 20250527
	def load_dicom_tag(imgs):
	    """Open one image file and return it so its DICOM tags can be queried.

	    Args:
	        imgs: Path of a single file (the caller in this file passes the
	            first slice of a DICOM series).

	    Returns:
	        The object produced by ``ImageFileReader.Execute()``; the caller
	        reads tags from it through ``GetMetaData``.
	    """
	    file_reader = sitk.ImageFileReader()
	    file_reader.SetFileName(imgs)
	    # Header pass: read only the meta information, without pixel data.
	    file_reader.ReadImageInformation()
	    # NOTE(review): Execute() below still reads the file in full, so the
	    # header-only pass above does not avoid loading the pixels.
	    return file_reader.Execute()

	def save_nifti(image, output_path, folder_path):
	    """Write ``image`` to ``output_path``, creating its directory if needed.

	    Args:
	        image: Image object exposing ``SetMetaData``; it is handed to
	            ``sitk.WriteImage``.
	        output_path: Destination file path.
	        folder_path: Source folder of the image, stored on the image under
	            the ``"FolderPath"`` metadata key before writing.
	    """
	    output_dirpath = os.path.dirname(output_path)
	    # An empty dirname means "current directory": os.makedirs('') would
	    # raise, so only create a directory when there is one to create.
	    if output_dirpath and not os.path.isdir(output_dirpath):
	        print(f"Creating directory {output_dirpath}")
	        # exist_ok guards against another process creating it in between.
	        os.makedirs(output_dirpath, exist_ok=True)
	    # Record where the data came from in the image's metadata dictionary.
	    image.SetMetaData("FolderPath", folder_path)
	    sitk.WriteImage(image, output_path)

	##modify by yanguoqing on 20250527
	def convert_windows_to_linux_path(windows_path):
	    """Turn a Windows-style path into a POSIX-style one.

	    Some metadata files carry Windows paths while the data lives on a Linux
	    server. Backslashes become forward slashes and a leading drive prefix
	    such as ``C:`` is removed.

	    Only a drive prefix at the very start (one ASCII letter followed by a
	    colon) is stripped; a colon appearing later in the path is left alone
	    so paths that are already POSIX-style are not truncated.

	    Args:
	        windows_path: Path string, Windows- or POSIX-style.

	    Returns:
	        The converted path string.
	    """
	    import string

	    linux_path = windows_path.replace('\\', '/')
	    has_drive_prefix = (
	        len(linux_path) >= 2
	        and linux_path[1] == ':'
	        and linux_path[0] in string.ascii_letters
	    )
	    if has_drive_prefix:
	        linux_path = linux_path[2:]
	    return linux_path

	def main(target_path, output_dir):
	    """Convert every DICOM series under ``target_path`` to NIfTI.

	    Layout walked by this function: each directory ``target_path/<pid_dir>``
	    is scanned, and every sub-directory ``<pid_dir>/<data_dir>`` is loaded
	    as one DICOM series. If ``target_path/<pid_dir>.csv`` exists, its path
	    is recorded in the series metadata.

	    Files written below ``output_dir``:
	      * ``<pid_dir>/<data_dir>.nii.gz`` for each converted series,
	      * ``nifti_mappings.json`` mapping each output path to its metadata,
	        rewritten after every series so progress survives an interruption,
	      * ``failed_files.json`` listing series directories that raised
	        ``RuntimeError`` while being loaded or preprocessed.

	    Args:
	        target_path: Root directory of the source dataset.
	        output_dir: Directory receiving the NIfTI files and JSON reports.
	    """
	    pid_dirs = find_image_dirs(target_path)
	    failed_files = []
	    os.makedirs(output_dir, exist_ok=True)
	    json_output_path = os.path.join(output_dir, 'nifti_mappings.json')
	    failed_files_path = os.path.join(output_dir, 'failed_files.json')
	    # NOTE(review): one meta_data instance is reused for every series;
	    # this assumes add_keyvalue overwrites earlier values -- confirm in util.
	    meta = meta_data()
	    study = 'OSIC_PFP'  # Dataset_name

	    # Initialize the JSON file
	    if not os.path.exists(json_output_path):
	        with open(json_output_path, 'w') as json_file:
	            json.dump({}, json_file)

	    if pid_dirs:
	        for pid_dir in tqdm(pid_dirs, desc="Processing pid dirs"):
	            pid_path = os.path.join(target_path, pid_dir)
	            if not os.path.isdir(pid_path):
	                continue
	            meta_file = os.path.join(target_path, '%s.csv' % pid_dir)
	            mf_flag = os.path.isfile(meta_file)
	            image_dirs = find_image_dirs(pid_path)
	            for data_dir in tqdm(image_dirs, desc="Processing images files"):
	                full_path = os.path.join(target_path, pid_dir, data_dir)
	                if not os.path.isdir(full_path):
	                    continue
	                try:
	                    print(full_path)
	                    dicom_fp, dicom_image = load_dicom_images(full_path)

	                    spacing_info = dicom_image.GetSpacing()

	                    # Tags are read from the first slice of the series.
	                    dtag = load_dicom_tag(dicom_fp[0])
	                    # SimpleITK keys DICOM tags as 'gggg|eeee'. The previous
	                    # spelling put a backslash before the pipe, an invalid
	                    # escape that left a literal backslash in the key.
	                    uid = dtag.GetMetaData('0020|000e')  # Series Instance UID
	                    modality = dtag.GetMetaData('0008|0060')  # Modality
	                    # NOTE(review): the value stored under 'Study_UID' is the
	                    # Series Instance UID; the key is kept so existing
	                    # consumers of the mapping file are unaffected.
	                    CIA_other_info = {
	                        'Study_UID': uid,
	                        'metadata_file': '',
	                    }
	                    if mf_flag:
	                        CIA_other_info['metadata_file'] = meta_file
	                    size = list(dicom_image.GetSize())

	                    resampler = util.get_unisize_resampler(
	                        dicom_image,
	                        interpolator='linear',
	                        spacing=spacing_info,
	                        size=size,
	                    )

	                    # resize the image
	                    if resampler is not None:
	                        proces_image = resampler.Execute(dicom_image)
	                        CIA_other_info['Resample'] = True
	                    else:
	                        proces_image = dicom_image
	                        CIA_other_info['Resample'] = False

	                    # threshold the image; only CT is clamped, other
	                    # modalities pass through unchanged
	                    if 'CT' in modality:
	                        proces_image = util.clamp_image(proces_image, CLAMP_RANGE_CT)
	                except RuntimeError:
	                    failed_files.append(full_path)
	                    print(f"Failed to load DICOM images from {full_path}")
	                    continue

	                meta.add_keyvalue('Spacing_mm', min(spacing_info))
	                meta.add_keyvalue('OriImg_path', full_path)
	                meta.add_keyvalue('Size', size)
	                meta.add_keyvalue('Modality', modality)
	                meta.add_keyvalue('Dataset_name', study)
	                meta.add_keyvalue('ROI', 'lung')

	                meta.add_extra_keyvalue('Metadata', CIA_other_info)

	                output_path = os.path.join(
	                    output_dir, pid_dir, f"{os.path.basename(full_path)}.nii.gz"
	                )
	                save_nifti(proces_image, output_path, full_path)
	                print(f"Saved NIfTI file to {output_path}")

	                # Write the mapping to the JSON file on the fly
	                with open(json_output_path, 'r+') as json_file:
	                    existing_mappings = json.load(json_file)
	                    existing_mappings[output_path] = meta.get_meta_data()
	                    # Rewind, overwrite, then drop leftover bytes from the
	                    # previous (possibly longer) content.
	                    json_file.seek(0)
	                    json.dump(existing_mappings, json_file, indent=4)
	                    json_file.truncate()
	    else:
	        # Reached when target_path contains no entries at all.
	        print("No metadata.csv files found.")

	    with open(failed_files_path, "w") as json_file:
	        json.dump(failed_files, json_file)

	    print(f"The list has been written to {failed_files_path}")
	    print(f"Saved NIfTI mappings to {json_output_path}")

	if __name__ == "__main__":
	    # Command-line entry point: both options may be omitted, in which case
	    # the defaults below are used.
	    parser = argparse.ArgumentParser(
	        description="Process DICOM files and save as NIfTI.",
	    )
	    parser.add_argument(
	        "--target_path",
	        type=str,
	        default="/home/data/Github/data/data_gen_def/DATASETS/Kaggle/osic_pulmonary_fibrosis_progression_Segmentation/osic-pulmonary-fibrosis-progression",
	        help="Path to the target directory containing metadata files.",
	    )
	    parser.add_argument(
	        "--output_dir",
	        type=str,
	        default="/home/data/Github/data/data_gen_def/DATASETS_processed/Kaggle_osic/",
	        help="Directory to save the NIfTI files.",
	    )
	    args = parser.parse_args()
	    # Echo the resolved locations before starting the conversion.
	    print(args.target_path, args.output_dir)
	    main(args.target_path, args.output_dir)