Alhdrawi committed on
Commit
9b84499
·
verified ·
1 Parent(s): 99962e3

Upload preprocess_padchest.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. preprocess_padchest.py +240 -0
preprocess_padchest.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import numpy as np
3
+ import os
4
+ import pandas as pd
5
+ from PIL import Image
6
+ import h5py
7
+ import matplotlib.pyplot as plt
8
+ from typing import List
9
+
10
+ import torch
11
+ from torch.utils import data
12
+ from tqdm.notebook import tqdm
13
+ import torch.nn as nn
14
+ from torchvision.transforms import Compose, Normalize
15
+
16
+ import sklearn
17
+ from sklearn.metrics import confusion_matrix, accuracy_score, auc, roc_auc_score, roc_curve, classification_report
18
+ from sklearn.metrics import precision_recall_curve, f1_score
19
+ from sklearn.metrics import average_precision_score
20
+
21
+ import sys
22
+ sys.path.append('../..')
23
+ sys.path.append('../data-process')
24
+ sys.path.append('data/padchest')
25
+
26
+ from data_process import *
27
+
28
+
29
+
30
def preprocess_data(data_root):
    """
    Run the PadChest preprocessing for the images listed in `2.zip`.

    Reads the PadChest label CSV and the `unzip -l` listing of `2.zip` from
    `data_root`, multi-hot encodes the labels of the listed images, converts
    the PNGs under 'data/padchest/images/' to h5 through `write_h5`, and
    finally loads 'unique_labels.npy' from the working directory.

    Args:
        * data_root: directory holding the label CSV and '2.zip.unzip-l.txt'

    Returns the first entry of 'unique_labels.npy' as a length-1 slice.
    """
    # `write_h5` looks `df_lab` up in module scope when it orders the labels,
    # so the frame has to be published there; a plain local left that lookup
    # failing with a NameError.
    global df_lab

    labels_path = os.path.join(
        data_root, 'PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv')
    labels = pd.read_csv(labels_path)

    # keep only the rows whose image is part of 2.zip
    text_file_path = os.path.join(data_root, '2.zip.unzip-l.txt')
    image_paths = extract_filenames(text_file_path)
    labels_2_df = labels[labels['ImageID'].isin(image_paths)]

    # multi hot encoding for labels
    unique_labels = get_unique_labels(labels_2_df)
    df_lab = create_multi_hot_labels(labels_2_df, unique_labels)

    # NOTE(review): a second "localization" encoding used to be computed here,
    # but it re-read the same "Labels" column from the same rows and its
    # result was never used, so it only duplicated the work above. Restore it
    # with the intended column if localizations are actually needed.

    directory = 'data/padchest/images/'
    cxr_paths = get_paths(directory)
    write_h5(cxr_paths)

    # NOTE(review): nothing in this file writes 'unique_labels.npy';
    # presumably it is produced by an earlier step -- TODO confirm.
    unique_labels = np.load('unique_labels.npy')
    return unique_labels[0:1]
52
+
53
def extract_filenames(txt_path):
    """
    Given a filepath to a txt file with image file names,
    extract a list of filenames for this zip.

    The file is read as plain text. Blank lines are ignored; of the remaining
    lines the first three and the last two are dropped, which is the same set
    of rows the previous CSV-based parsing discarded (one header row plus two
    unnecessary rows at the top, two unnecessary rows at the bottom). The
    filename is taken as the last whitespace-separated token of each
    remaining line.

    Args:
        * txt_path: path to the listing file, read as UTF-8

    Returns a list of filenames; empty when the file has no entry lines.
    """
    # The listing is whitespace-aligned text, not CSV: reading it line by line
    # keeps commas inside a line from being treated as column separators.
    with open(txt_path, encoding="utf-8") as listing:
        rows = [line for line in listing if line.strip()]

    entries = rows[3:-2]
    return [entry.split()[-1] for entry in entries]
70
+
71
+ # get paths of all possible labels
72
def get_unique_labels(labels_df, column='Labels'):
    """
    Given labels_df, return a list containing all unique labels
    present in this dataset.

    Each cell of `column` is expected to be the string form of a list of
    single-quoted labels, e.g. "['pneumonia', ' infiltrates']". Labels are
    stripped of surrounding whitespace and lower-cased. Cells that cannot be
    parsed this way (non-string values such as NaN, or "[]") are skipped.

    Args:
        * labels_df: dataframe holding the label strings
        * column: name of the column to read

    Returns the unique labels as a sorted list, so that the order (and any
    column order derived from it) is the same on every run.
    """
    if labels_df.empty:
        return []

    unique_labels = set()
    for labels in labels_df[column]:
        try:
            # convert labels str to array, then clean up each entry
            for label in labels.strip('][').split(', '):
                unique_labels.add(label.split("'")[1].strip().lower())
        except (AttributeError, IndexError):
            # AttributeError: cell is not a string (e.g. NaN);
            # IndexError: entry carries no quoted label (e.g. "[]")
            continue

    return sorted(unique_labels)
94
+
95
def create_multi_hot_labels(labels_df, unique_labels_list, column='Labels'):
    """
    Given all entries and their corresponding labels, create a one(multi)-hot
    vector per image where a 1 represents the presence of that label.

    Args:
        * labels_df: original df with an 'ImageID' column and a `column`
          holding the string form of a list of single-quoted labels
        * unique_labels_list: all possible labels, in the desired column order
        * column: name of the column holding the label strings

    Returns a Pandas dataframe mapping 'ImageID' to its multi-hot
    representation, one column per entry of `unique_labels_list`. Rows whose
    label cell cannot be parsed (e.g. NaN) are reported and left out; a cell
    of "[]" yields an all-zero row. Labels not present in
    `unique_labels_list` are ignored.
    """
    # todo: check how the labels are represented for CheXpert
    label_columns = list(unique_labels_list)
    dict_list = []

    if not labels_df.empty:
        for image_id, labels in zip(labels_df['ImageID'], labels_df[column]):
            try:
                # convert labels str to array
                labels_arr = labels.strip('][').split(', ')

                # start from all zeros, then flag the labels that are present
                count_dict = dict.fromkeys(label_columns, 0)
                count_dict['ImageID'] = image_id
                if labels_arr[0] != '':
                    for label in labels_arr:
                        count_dict[label.split("'")[1].strip().lower()] = 1
            except (AttributeError, IndexError):
                # AttributeError: cell is not a string (e.g. NaN);
                # IndexError: entry carries no quoted label
                print("error when creating labels for this img.")
                continue
            dict_list.append(count_dict)

    # `columns` fixes the column order and drops labels outside label_columns
    return pd.DataFrame(dict_list, columns=['ImageID'] + label_columns)
140
+
141
+ # convert folder of images to h5 file
142
def get_paths(directory):
    """
    Given a directory, this function outputs
    all the `.png` image paths in that directory as a
    list.

    Only direct children of `directory` whose name ends with ".png" are
    returned, each joined onto `directory`. The list is sorted because
    `os.listdir` gives no ordering guarantee, and everything built from these
    paths should come out in the same order on every run.
    """
    return sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".png")
    )
155
+
156
def img_to_h5(
    cxr_paths: List[str],
    out_filepath: str,
    resolution: int = 320,
) -> List[str]:
    """
    Converts a set of images into a single `.h5` file.

    Images are stored in the 'cxr' dataset in the order in which they were
    successfully processed. An image that fails to load or preprocess is
    reported and skipped; the dataset is then shrunk so that it holds exactly
    one row per returned path, with no blank rows left at the end.

    Args:
        cxr_paths: List of paths to images as `.png`
        out_filepath: Path to store h5 file
        resolution: image resolution

    Returns a list of cxr_paths that were successfully stored in the
    `.h5` file; entry i corresponds to row i of the 'cxr' dataset.
    """
    # NOTE(review): `cv2` and `preprocess` are not imported by name in this
    # file; presumably they arrive via `from data_process import *` -- confirm.
    dset_size = len(cxr_paths)
    proper_cxr_paths = []
    with h5py.File(out_filepath, 'w') as h5f:
        # maxshape keeps axis 0 resizable so rows reserved for images that
        # fail can be dropped once the loop is done
        img_dset = h5f.create_dataset(
            'cxr',
            shape=(dset_size, resolution, resolution),
            maxshape=(None, resolution, resolution),
        )

        ctr = 0  # next free row in the dataset
        for path in tqdm(cxr_paths):
            try:
                # read image using cv2
                img = cv2.imread(path)
                # convert to PIL Image object
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_pil = Image.fromarray(img)
                # preprocess
                img = preprocess(img_pil, desired_size=resolution)
                img_dset[ctr] = img
            except Exception as err:
                # best effort: one unreadable image must not abort the batch
                print(f"Image {path} failed loading: {err}")
                continue
            ctr += 1
            proper_cxr_paths.append(path)

        if ctr < dset_size:
            img_dset.resize(ctr, axis=0)
        print(h5f)

    return proper_cxr_paths
196
+
197
def write_h5(cxr_paths, resolution: int = 320, df_lab=None):
    """
    Write the h5 image files and the ordered label CSV for the `2.zip` images.

    Produces, in this order:
      * 'data/padchest/images/2_cxr_dset_sample.h5' through `img_to_h5`
      * 'proper_cxr_paths.npy' with the paths that were stored successfully
      * 'data/padchest/images/2_cxr.h5' through `img_to_hdf5`
      * 'data/padchest/2_cxr_labels.csv' with the multi-hot labels reordered
        to follow the successfully stored paths

    Args:
        * cxr_paths: list of paths to images as `.png`
        * resolution: image resolution used for both h5 files
        * df_lab: multi-hot label dataframe with an 'ImageID' column. When
          omitted, a module-level `df_lab` is used if one exists, which is
          where this function previously looked it up.

    Raises:
        ValueError: if no label dataframe is available. The h5 files and the
        paths file have already been written at that point.
    """
    sample_filepath = 'data/padchest/images/2_cxr_dset_sample.h5'
    # img_to_h5 sizes the dataset from len(cxr_paths); the loop that used to
    # live here duplicated it with a hard-coded size of 2978 rows and drew
    # every image with plt.imshow without ever showing or closing the figure.
    proper_cxr_paths = img_to_h5(cxr_paths, sample_filepath, resolution=resolution)
    np.save("proper_cxr_paths.npy", np.array(proper_cxr_paths))

    out_filepath = 'data/padchest/images/2_cxr.h5'
    # NOTE(review): `img_to_hdf5` is not defined in this file; presumably it
    # comes from `from data_process import *` -- confirm.
    img_to_hdf5(cxr_paths, out_filepath, resolution=resolution)

    if df_lab is None:
        df_lab = globals().get('df_lab')
    if df_lab is None:
        raise ValueError(
            "write_h5 needs the multi-hot label dataframe: pass df_lab=... "
            "or define a module-level df_lab"
        )

    df_labels_new = order_labels(df_lab, proper_cxr_paths)
    labels_path = 'data/padchest/2_cxr_labels.csv'
    df_labels_new.to_csv(labels_path)
230
+
231
def order_labels(df, cxr_paths):
    """
    Fixes multi-hot labels to be in order of cxr_paths

    Args:
        * df: dataframe with an 'ImageID' column holding image file names
        * cxr_paths: image paths; only the file-name part is matched against
          'ImageID'

    Returns a dataframe with the same columns as `df` whose rows follow the
    order of `cxr_paths`. Paths without a matching 'ImageID' contribute no
    row; an 'ImageID' that occurs several times contributes all of its rows.
    The original index labels of the selected rows are kept.
    """
    # `DataFrame.append` was removed in pandas 2.0, so the rows are gathered
    # with one positional selection instead. groupby().indices maps every
    # ImageID to the positions of its rows, giving one lookup per path.
    positions_by_id = df.groupby('ImageID', sort=False).indices

    row_positions = []
    for path in cxr_paths:
        image_id = os.path.basename(path)
        row_positions.extend(positions_by_id.get(image_id, ()))

    return df.iloc[row_positions]