Alhdrawi committed on
Commit
7d1267d
·
verified ·
1 Parent(s): 114c58a

Upload data_process.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_process.py +148 -0
data_process.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import glob
4
+ import numpy as np
5
+ import pandas as pd
6
+ import csv
7
+ import matplotlib.pyplot as plt
8
+ from tqdm import tqdm
9
+
10
+ from PIL import Image
11
+ import h5py
12
+ import cv2
13
+ from typing import *
14
+ from pathlib import Path
15
+
16
+ import torch
17
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
18
+
19
def load_data(filepath):
    """Read the CSV at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)
22
+
23
def get_cxr_paths_list(filepath):
    """Return the ``'Path'`` column of the CSV stored at ``filepath``.

    The CSV is read through ``load_data``; the selected column is returned
    unchanged (a pandas Series).
    """
    return load_data(filepath)['Path']
27
+
28
def preprocess(img, desired_size=320):
    """Resize an image to fit a square canvas and zero-pad the remainder.

    The image is scaled, keeping its aspect ratio, so that its longer side
    becomes ``desired_size`` pixels. It is then pasted, centered, onto a new
    ``desired_size`` x ``desired_size`` image of mode ``'L'``.

    Args:
        img: PIL image to resize (anything exposing ``size`` and ``resize``).
        desired_size: Side length, in pixels, of the square output image.

    Returns:
        A new mode ``'L'`` PIL image containing the resized, centered input.
    """
    old_size = img.size
    ratio = float(desired_size) / max(old_size)
    new_size = tuple(int(x * ratio) for x in old_size)
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and also exists in older releases.
    img = img.resize(new_size, Image.LANCZOS)

    # Create the square canvas and paste the resized image in its center.
    new_img = Image.new('L', (desired_size, desired_size))
    new_img.paste(img, ((desired_size - new_size[0]) // 2,
                        (desired_size - new_size[1]) // 2))
    return new_img
42
+
43
def img_to_hdf5(cxr_paths: List[Union[str, Path]], out_filepath: str, resolution=320):
    """
    Convert a collection of images into a single .h5 file.

    Each image is read with OpenCV, converted to a PIL image, resized and
    padded by ``preprocess`` and stored at its own index in a dataset named
    ``'cxr'`` of shape ``(len(cxr_paths), resolution, resolution)``.

    Images that cannot be processed are skipped: the path and the exception
    are collected and reported in a summary printed at the end, and that
    image's row in the dataset is never written.

    Args:
        cxr_paths: Paths of the images to store, in dataset order.
        out_filepath: Destination .h5 file (opened in ``'w'`` mode).
        resolution: Side length passed to ``preprocess`` and used for the
            dataset's last two dimensions.
    """
    dset_size = len(cxr_paths)
    failed_images = []
    with h5py.File(out_filepath, 'w') as h5f:
        img_dset = h5f.create_dataset('cxr', shape=(dset_size, resolution, resolution))
        for idx, path in enumerate(tqdm(cxr_paths)):
            try:
                # read image using cv2
                img = cv2.imread(str(path))
                if img is None:
                    # cv2.imread signals failure by returning None instead of
                    # raising; fail here with a message naming the file
                    # rather than with an opaque error from cvtColor.
                    raise ValueError(f"cv2.imread could not read image: {path}")
                # convert to PIL Image object
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_pil = Image.fromarray(img)
                # preprocess
                img = preprocess(img_pil, desired_size=resolution)
                img_dset[idx] = img
            except Exception as e:
                # Deliberate best-effort: one bad image must not abort the
                # whole conversion, so record it and keep going.
                failed_images.append((path, e))
    print(f"{len(failed_images)} / {len(cxr_paths)} images failed to be added to h5.", failed_images)
65
+
66
def get_files(directory):
    """Collect the paths of all ``.jpg`` files found under ``directory``.

    The tree is traversed with ``os.walk``; a file is kept when its name ends
    with ``".jpg"`` (case-sensitive). Each returned path is the containing
    directory joined with the file name.
    """
    jpg_paths = []
    for root, _subdirs, names in os.walk(directory):
        jpg_paths.extend(
            os.path.join(root, name) for name in names if name.endswith(".jpg")
        )
    return jpg_paths
73
+
74
def get_cxr_path_csv(out_filepath, directory):
    """Write a one-column CSV listing the ``.jpg`` files under ``directory``.

    The paths come from ``get_files`` and are stored under the ``"Path"``
    column of ``out_filepath``; the DataFrame index is not written.
    """
    path_table = pd.DataFrame({"Path": get_files(directory)})
    path_table.to_csv(out_filepath, index=False)
79
+
80
def section_start(lines, section=' IMPRESSION'):
    """Return the index of the first line starting with ``section``.

    Returns -1 when no line in ``lines`` begins with ``section``.
    """
    return next(
        (position for position, text in enumerate(lines) if text.startswith(section)),
        -1,
    )
85
+
86
def section_end(lines, section_start):
    """Stub: measures ``lines`` and returns ``None``.

    NOTE(review): this looks unfinished - the line count is computed but never
    used, ``section_start`` is ignored and no end index is returned. Confirm
    the intended behaviour before relying on this function.
    """
    line_count = len(lines)  # computed but currently unused
    return None
88
+
89
def getIndexOfLast(l, element):
    """ Get index of last occurrence of element
    @param l (list): list of elements
    @param element (string): element to search for
    @returns (int): index of last occurrence of element
    @raises ValueError: if element does not occur in l
    """
    last_index = -1
    for loc, val in enumerate(l):
        if val == element:
            last_index = loc
    if last_index < 0:
        # Same exception type as before (max() over an empty sequence), but
        # with a message that says what was actually missing.
        raise ValueError(f"{element!r} does not occur in the given list")
    return last_index
97
+
98
# Tokens that mark the start of a trailing section to cut from an impression.
_IMPRESSION_END_MARKERS = frozenset({
    "RECOMMENDATION(S):",
    "RECOMMENDATION:",
    "RECOMMENDATIONS:",
    "NOTIFICATION:",
    "NOTIFICATIONS:",
})


def _extract_impression(report_text):
    """Return the impression section of a report as one space-joined string.

    The text is split on whitespace. The impression starts right after the
    last ``"IMPRESSION:"`` token and stops at the first recommendation or
    notification marker that follows it, or at the end of the report.
    Returns ``'NO IMPRESSION'`` when there is no ``"IMPRESSION:"`` token.
    """
    tokens = report_text.split()
    if "IMPRESSION:" not in tokens:
        return 'NO IMPRESSION'

    # Index just past the LAST "IMPRESSION:" token.
    begin = len(tokens) - tokens[::-1].index("IMPRESSION:")
    # Only markers located after the impression may end it; a marker earlier
    # in the report would otherwise produce an empty slice.
    end = next(
        (i for i in range(begin, len(tokens)) if tokens[i] in _IMPRESSION_END_MARKERS),
        len(tokens),
    )
    return " ".join(tokens[begin:end])


def write_report_csv(cxr_paths, txt_folder, out_path):
    """Write a CSV of report impressions, one row per image path.

    For each path in ``cxr_paths`` the last four ``/``-separated components
    are read as ``<patient_group>/<patient_num>/<study_num>/<image file>``.
    The matching report is loaded from
    ``<txt_folder>/<patient_group>/<patient_num>/<study_num>.txt`` and its
    impression section is extracted.

    Args:
        cxr_paths: Iterable of image paths with at least four ``/``-separated
            components.
        txt_folder: Root folder of the text reports, with or without a
            trailing separator.
        out_path: Destination CSV with columns ``filename`` and
            ``impression``; the DataFrame index is not written.

    Raises:
        OSError: if a report file cannot be opened.
        IndexError: if a path has fewer than four components.
    """
    imps = {"filename": [], "impression": []}
    for cxr_path in cxr_paths:
        tokens = str(cxr_path).split('/')
        study_num = tokens[-2]
        patient_num = tokens[-3]
        patient_group = tokens[-4]
        filename = study_num + '.txt'
        txt_report = os.path.join(txt_folder, patient_group, patient_num, filename)
        # Context manager so the report file is closed on every iteration.
        with open(txt_report, 'r') as f:
            report_text = f.read()

        imps["impression"].append(_extract_impression(report_text))
        imps["filename"].append(filename)

    df = pd.DataFrame(data=imps)
    df.to_csv(out_path, index=False)
148
+