# x_alhdrawi / run_preprocess.py
# Alhdrawi's picture
# Upload run_preprocess.py with huggingface_hub
# d275aec verified
import argparse
from pathlib import Path
from data_process import get_cxr_paths_list, img_to_hdf5, get_cxr_path_csv, write_report_csv
def parse_args(argv=None):
    """Parse the command-line options of the pre-processing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``csv_out_path``,
        ``cxr_out_path``, ``dataset_type``, ``mimic_impressions_path``,
        ``chest_x_ray_path`` and ``radiology_reports_path``.

    Raises:
        SystemExit: raised by argparse on invalid arguments (for example a
            ``--dataset_type`` outside the allowed choices) or on ``--help``.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the three output options below default to file-like
    # names (.csv / .h5) although their help text says "Directory" --
    # confirm against data_process before rewording the help strings.
    parser.add_argument(
        '--csv_out_path',
        type=str,
        default='data/cxr_paths.csv',
        help="Directory to save paths to all chest x-ray images in dataset.",
    )
    parser.add_argument(
        '--cxr_out_path',
        type=str,
        default='data/cxr.h5',
        help="Directory to save processed chest x-ray image data.",
    )
    parser.add_argument(
        '--dataset_type',
        type=str,
        default='mimic',
        choices=['mimic', 'chexpert-test'],
        help="Type of dataset to pre-process",
    )
    parser.add_argument(
        '--mimic_impressions_path',
        default='data/mimic_impressions.csv',
        help="Directory to save extracted impressions from radiology reports.",
    )
    parser.add_argument(
        '--chest_x_ray_path',
        default='/deep/group/data/mimic-cxr/mimic-cxr-jpg/2.0.0/files',
        help="Directory where chest x-ray image data is stored. This should point to the files folder from the MIMIC chest x-ray dataset.",
    )
    parser.add_argument(
        '--radiology_reports_path',
        default='/deep/group/data/med-data/files/',
        help="Directory radiology reports are stored. This should point to the files folder from the MIMIC radiology reports dataset.",
    )
    # Forwarding argv lets callers and tests supply their own argument list
    # instead of depending on the process-wide sys.argv.
    return parser.parse_args(argv)
def list_chexpert_test_paths(cxr_dir, expected_count=500):
    """Return the sorted ``*.jpg`` paths under *cxr_dir* that contain "view1".

    The directory is searched recursively. A path is kept when the substring
    ``"view1"`` occurs anywhere in its string form (the original script uses
    this to keep only the first frontal views). The result is sorted so the
    image order is deterministic and lines up with the ground truth.

    Args:
        cxr_dir: Root directory (``str`` or ``Path``) to search.
        expected_count: Number of images that must be found; the script
            expects 500. Pass ``None`` to skip the check.

    Returns:
        A sorted list of ``pathlib.Path`` objects.

    Raises:
        ValueError: if ``expected_count`` is not ``None`` and the number of
            matching images differs from it.
    """
    matches = sorted(
        path for path in Path(cxr_dir).rglob("*.jpg") if "view1" in str(path)
    )
    # An explicit exception instead of `assert`: asserts are stripped under
    # `python -O`, which would let an incomplete dataset through silently.
    if expected_count is not None and len(matches) != expected_count:
        raise ValueError(
            f"Expected {expected_count} 'view1' .jpg images under {cxr_dir}, "
            f"found {len(matches)}."
        )
    return matches


def main():
    """Run the pre-processing pipeline selected by ``--dataset_type``."""
    args = parse_args()

    if args.dataset_type == "mimic":
        # Write Chest X-ray Image HDF5 File
        get_cxr_path_csv(args.csv_out_path, args.chest_x_ray_path)
        cxr_paths = get_cxr_paths_list(args.csv_out_path)
        img_to_hdf5(cxr_paths, args.cxr_out_path)

        # Write CSV File Containing Impressions for each Chest X-ray
        write_report_csv(cxr_paths, args.radiology_reports_path, args.mimic_impressions_path)
    elif args.dataset_type == "chexpert-test":
        # Get all test paths based on cxr dir
        cxr_paths = list_chexpert_test_paths(args.chest_x_ray_path)
        img_to_hdf5(cxr_paths, args.cxr_out_path)


if __name__ == "__main__":
    main()