Stroke-ia committed on
Commit
740e526
·
verified ·
1 Parent(s): 907373e

Upload team_code.py

Browse files
Files changed (1) hide show
  1. team_code.py +466 -0
team_code.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Edit this script to add your team's code. Some functions are *required*, but you can edit most parts of the required functions,
4
+ # change or remove non-required functions, and add your own functions.
5
+
6
+ ################################################################################
7
+ #
8
+ # Import libraries and functions. You can change or remove them.
9
+ #
10
+ ################################################################################
11
+
12
+ from helper_code import *
13
+ import numpy as np, scipy as sp, scipy.stats, os, sys, joblib
14
+ from sklearn.impute import SimpleImputer
15
+ from sklearn.ensemble import RandomForestClassifier
16
+ from sklearn.utils.class_weight import compute_class_weight
17
+ import os
18
+ import tqdm
19
+ import numpy as np
20
+ import tensorflow as tf
21
+ from scipy import signal
22
+ from sklearn.model_selection import StratifiedKFold
23
+ from sklearn.utils.class_weight import compute_class_weight
24
+
25
+ ################################################################################
26
+ #
27
+ # Required functions. Edit these functions to add your code, but do not change the arguments.
28
+ #
29
+ ################################################################################
30
+
31
+
32
+
33
+
34
+
35
+ # Train your model.
36
def train_challenge_model(data_folder, model_folder, verbose):
    """Train the murmur and clinical-outcome models and save them as .h5 files.

    Every recording of every patient is one training example: it is resampled
    to ``NEW_FREQUENCY`` Hz, zero-padded to the longest resampled recording and
    paired with a per-recording murmur vector and the patient's outcome.

    Args:
        data_folder: Folder holding the Challenge patient files and recordings.
        model_folder: Output folder for ``murmur_model.h5`` and
            ``clinical_model.h5``; created when missing.
        verbose: Verbosity level; progress messages are printed when >= 1.

    Raises:
        Exception: If no patient files are found in ``data_folder``.
    """
    PRE_TRAIN = False
    NEW_FREQUENCY = 100  # Hz; target sampling rate for every recording
    EPOCHS_1 = 30  # murmur model
    EPOCHS_2 = 20  # clinical model
    BATCH_SIZE_1 = 20  # murmur model
    BATCH_SIZE_2 = 20  # clinical model

    # Find the patient data files.
    if verbose >= 1:
        print('Finding data files...')

    patient_files = find_patient_files(data_folder)
    if len(patient_files) == 0:
        raise Exception('No data was provided.')

    # Create a folder for the model if it does not already exist.
    os.makedirs(model_folder, exist_ok=True)

    murmur_classes = ['Present', 'Unknown', 'Absent']
    num_murmur_classes = len(murmur_classes)
    outcome_classes = ['Abnormal', 'Normal']
    num_outcome_classes = len(outcome_classes)

    # Extract the features and labels.
    if verbose >= 1:
        print('Extracting features and labels from the Challenge data...')

    data = []
    murmurs = []
    outcomes = []

    for patient_file in tqdm.tqdm(patient_files):
        # Load the current patient data and recordings.
        current_patient_data = load_patient_data(patient_file)
        current_recordings, freq = load_recordings(
            data_folder, current_patient_data, get_frequencies=True)
        num_recordings = len(current_recordings)
        if num_recordings == 0:
            continue

        # Patient-level values are computed once instead of once per recording.
        # The lines right after the header line describe the recordings; the
        # first token of each is used as the auscultation location.
        recording_lines = current_patient_data.split('\n')[1:num_recordings + 1]
        patient_murmur = get_murmur(current_patient_data)
        all_murmur_locations = get_murmur_locations(current_patient_data).split("+")

        current_outcome = np.zeros(num_outcome_classes, dtype=int)
        outcome = get_outcome(current_patient_data)
        if outcome in outcome_classes:
            current_outcome[outcome_classes.index(outcome)] = 1

        for rec_idx in range(num_recordings):
            recording = current_recordings[rec_idx]
            new_length = int((len(recording) / freq[rec_idx]) * NEW_FREQUENCY)
            data.append(signal.resample(recording, new_length))

            current_auscultation_location = recording_lines[rec_idx].split(" ")[0]
            current_murmur = np.zeros(num_murmur_classes, dtype=int)
            if patient_murmur == "Present":
                # NOTE(review): a recording from a "Present" patient taken at a
                # location without murmur keeps an all-zero label (as in the
                # original code) -- confirm this is intended rather than "Absent".
                if current_auscultation_location in all_murmur_locations:
                    current_murmur[0] = 1
            elif patient_murmur == "Unknown":
                current_murmur[1] = 1
            elif patient_murmur == "Absent":
                current_murmur[2] = 1
            murmurs.append(current_murmur)
            outcomes.append(current_outcome)

    data_padded = pad_array(data)
    data_padded = np.expand_dims(data_padded, 2)

    murmurs = np.vstack(murmurs)
    # Index into outcome_classes: 0 = Abnormal, 1 = Normal.
    outcomes = np.argmax(np.vstack(outcomes), axis=1)

    if verbose >= 1:
        print(f"Number of signals = {data_padded.shape[0]}")

        # Column sums are used so that all-zero label rows are not counted as
        # "Present" (which an argmax over the row would do).
        murmur_counts = murmurs.sum(axis=0)
        print("Murmurs prevalence:")
        print(f"Present = {murmur_counts[0]}, Unknown = {murmur_counts[1]}, Absent = {murmur_counts[2]}")

        print("Outcomes prevalence:")
        print(f"Abnormal = {np.count_nonzero(outcomes == 0)}, Normal = {np.count_nonzero(outcomes == 1)}")

    # Per-class weight = balanced weight of the positive (1) label of each
    # murmur column.
    new_weights_murmur = calculating_class_weights(murmurs)
    murmur_weight_dictionary = dict(enumerate(new_weights_murmur[:, 1]))

    # Weight the "Normal" class by the Abnormal/Normal count ratio.
    _, outcome_counts = np.unique(outcomes, return_counts=True)
    weight_outcome = outcome_counts[0] / outcome_counts[1]
    outcome_weight_dictionary = {0: 1.0, 1: weight_outcome}

    sig_len, n_features = data_padded.shape[1], data_padded.shape[2]

    gpus = tf.config.list_logical_devices('GPU')
    strategy = tf.distribute.MirroredStrategy(gpus)
    with strategy.scope():
        if not PRE_TRAIN:
            # Initiate the models from scratch.
            clinical_model = build_clinical_model(sig_len, n_features)
            murmur_model = build_murmur_model(sig_len, n_features)
        else:
            # Reuse the pretrained backbone and attach fresh output heads to
            # its second-to-last layer.
            model = base_model(sig_len, n_features)
            model.load_weights("./pretrained_model.h5")

            outcome_layer = tf.keras.layers.Dense(1, "sigmoid", name="clinical_output")(model.layers[-2].output)
            clinical_model = tf.keras.Model(inputs=model.layers[0].output, outputs=[outcome_layer])
            clinical_model.compile(
                loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC(curve='ROC')])

            murmur_layer = tf.keras.layers.Dense(3, "softmax", name="murmur_output")(model.layers[-2].output)
            murmur_model = tf.keras.Model(inputs=model.layers[0].output, outputs=[murmur_layer])
            murmur_model.compile(
                loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.AUC(curve='ROC')])

    murmur_model.fit(x=data_padded, y=murmurs, epochs=EPOCHS_1, batch_size=BATCH_SIZE_1,
                     verbose=1, shuffle=True,
                     class_weight=murmur_weight_dictionary)

    clinical_model.fit(x=data_padded, y=outcomes, epochs=EPOCHS_2, batch_size=BATCH_SIZE_2,
                       verbose=1, shuffle=True,
                       class_weight=outcome_weight_dictionary)

    # Save the models.
    murmur_model.save(os.path.join(model_folder, 'murmur_model.h5'))
    clinical_model.save(os.path.join(model_folder, 'clinical_model.h5'))
166
+
167
+
168
+
169
+ # Load your trained model. This function is *required*. You should edit this function to add your code, but do *not* change the
170
+ # arguments of this function.
171
def load_challenge_model(model_folder, verbose):
    """Load every saved Keras model found in ``model_folder``.

    Args:
        model_folder: Folder containing the saved model files.
        verbose: Verbosity level; loaded file names are printed when >= 1.

    Returns:
        dict mapping the file name up to its first ``.`` (for example
        ``"murmur_model"``) to the loaded model.
    """
    model_extensions = ('.h5', '.hdf5', '.keras')
    model_dict = {}
    # Sorted for a deterministic load order; entries that are not model files
    # are skipped so a stray file in the folder cannot crash loading.
    for filename in sorted(os.listdir(model_folder)):
        if os.path.splitext(filename)[1].lower() not in model_extensions:
            continue
        if verbose >= 1:
            print(f'Loading model file {filename}...')
        model_dict[filename.split(".")[0]] = tf.keras.models.load_model(
            os.path.join(model_folder, filename))
    return model_dict
177
+
178
+
179
+ # Run your trained model. This function is *required*. You should edit this function to add your code, but do *not* change the
180
+ # arguments of this function.
181
def run_challenge_model(model, data, recordings, verbose):
    """Predict murmur and outcome labels/probabilities for one patient.

    Args:
        model: dict with the keys ``"murmur_model"`` and ``"clinical_model"``,
            as returned by ``load_challenge_model``.
        data: Patient metadata; only its sampling frequency is read here.
        recordings: Sequence of 1-D recordings of the patient.
        verbose: Verbosity level (unused).

    Returns:
        Tuple ``(classes, labels, probabilities)`` ordered as
        ``['Present', 'Unknown', 'Absent', 'Abnormal', 'Normal']``.
    """
    NEW_FREQUENCY = 100  # Hz; must match the rate used during training

    murmur_classes = ['Present', 'Unknown', 'Absent']
    outcome_classes = ['Abnormal', 'Normal']

    murmur_model = model["murmur_model"]
    clinical_model = model["clinical_model"]

    # Length of the time axis the models were built with.
    sig_len = int(murmur_model.input_shape[1])
    freq = get_frequency(data)
    num_recordings = len(recordings)

    # Resample every recording and zero-pad it (or truncate it, when it is
    # longer than anything seen during training) to the model input length.
    batch = np.zeros((num_recordings, sig_len, 1))
    for i, recording in enumerate(recordings):
        rec = np.asarray(recording)
        resampled = signal.resample(rec, int((len(rec) / freq) * NEW_FREQUENCY))
        usable = min(len(resampled), sig_len)
        batch[i, :usable, 0] = resampled[:usable]

    murmur_probabilities_temp = np.asarray(murmur_model.predict(batch)).reshape(num_recordings, 3)
    outcome_probabilities_temp = np.asarray(clinical_model.predict(batch)).reshape(num_recordings, 1)

    # Average the per-recording predictions to get patient-level probabilities.
    avg_outcome_probability = np.sum(outcome_probabilities_temp) / num_recordings
    murmur_probabilities = np.sum(murmur_probabilities_temp, axis=0) / num_recordings

    binarized_murmur = np.argmax(murmur_probabilities_temp, axis=1)
    binarized_outcome = (outcome_probabilities_temp > 0.5) * 1

    # Murmur label priority over the recordings: Present > Absent > Unknown.
    murmur_labels = np.zeros(len(murmur_classes), dtype=np.int_)
    if 0 in binarized_murmur:
        murmur_labels[0] = 1
    elif 2 in binarized_murmur:
        murmur_labels[2] = 1
    elif 1 in binarized_murmur:
        murmur_labels[1] = 1

    # The clinical model is trained with 0 = Abnormal and 1 = Normal, so a
    # single recording predicted as 0 marks the patient as Abnormal.
    outcome_labels = np.zeros(len(outcome_classes), dtype=np.int_)
    if 0 in binarized_outcome:
        outcome_labels[0] = 1
    else:
        outcome_labels[1] = 1

    # The sigmoid output is the probability of class 1 (Normal); it therefore
    # belongs in the 'Normal' slot and its complement in the 'Abnormal' slot.
    outcome_probabilities = np.array([1 - avg_outcome_probability, avg_outcome_probability])

    classes = murmur_classes + outcome_classes
    labels = np.concatenate((murmur_labels, outcome_labels))
    probabilities = np.concatenate((murmur_probabilities.ravel(), outcome_probabilities.ravel()))

    return classes, labels, probabilities
238
+
239
+ ################################################################################
240
+ #
241
+ # Optional functions. You can change or remove these functions and/or add new functions.
242
+ #
243
+ ################################################################################
244
+
245
+
246
+ # Extract features from the data.
247
def get_features(data, recordings):
    """Build a flat float32 feature vector from patient metadata and recordings.

    Layout: age in months, two sex indicators (female, male), height, weight,
    pregnancy status, then for each of the locations AV, MV, PV, TV, PhC a
    presence flag plus mean, variance and skewness of the recording.

    Args:
        data: Patient metadata as consumed by the helper accessors.
        recordings: Recordings, expected in the same order as the locations.

    Returns:
        1-D ``np.float32`` array of features.
    """
    # Replace the age group with the (approximate) number of months for the
    # middle of the age group; unknown groups become NaN.
    age_group = get_age(data)
    months_by_group = (
        ('Neonate', 0.5),
        ('Infant', 6),
        ('Child', 6 * 12),
        ('Adolescent', 15 * 12),
        ('Young Adult', 20 * 12),
    )
    age = float('nan')
    for group_name, months in months_by_group:
        if compare_strings(age_group, group_name):
            age = months
            break

    # Extract sex. Use one-hot encoding.
    sex = get_sex(data)
    sex_features = np.zeros(2, dtype=int)
    if compare_strings(sex, 'Female'):
        sex_features[0] = 1
    elif compare_strings(sex, 'Male'):
        sex_features[1] = 1

    # Extract height, weight and pregnancy status.
    height = get_height(data)
    weight = get_weight(data)
    is_pregnant = get_pregnancy_status(data)

    # Per-location statistics. If there are multiple recordings for one
    # location, the last recording overwrites the earlier ones.
    locations = get_locations(data)
    recording_locations = ['AV', 'MV', 'PV', 'TV', 'PhC']
    recording_features = np.zeros((len(recording_locations), 4), dtype=float)
    # Features are only filled when locations and recordings line up 1:1.
    if len(locations) == len(recordings):
        for location, recording in zip(locations, recordings):
            for j, known_location in enumerate(recording_locations):
                if compare_strings(location, known_location) and np.size(recording) > 0:
                    recording_features[j, 0] = 1
                    recording_features[j, 1] = np.mean(recording)
                    recording_features[j, 2] = np.var(recording)
                    recording_features[j, 3] = sp.stats.skew(recording)
    recording_features = recording_features.flatten()

    features = np.hstack(([age], sex_features, [height], [weight], [is_pregnant], recording_features))

    return np.asarray(features, dtype=np.float32)
302
+
303
def _inception_module(input_tensor, stride=1, activation='linear', use_bottleneck=True, kernel_size=40, bottleneck_size=32, nb_filters=32):
    """Apply parallel 1-D convolutions of several kernel sizes and merge them.

    Args:
        input_tensor: Keras tensor; its last axis is treated as channels.
        stride: Stride of the parallel convolutions and of the max-pool branch.
        activation: Activation used inside the convolution layers.
        use_bottleneck: Whether to insert a kernel-size-1 convolution first
            (only done when the input has more than one channel).
        kernel_size: Largest kernel size; the branches use ``kernel_size``,
            ``kernel_size // 2`` and ``kernel_size // 4``.
        bottleneck_size: Number of filters of the bottleneck convolution.
        nb_filters: Number of filters of every branch.

    Returns:
        Keras tensor: concatenated branches after batch norm and ReLU.
    """
    if use_bottleneck and int(input_tensor.shape[-1]) > 1:
        input_inception = tf.keras.layers.Conv1D(
            filters=bottleneck_size, kernel_size=1,
            padding='same', activation=activation, use_bias=False)(input_tensor)
    else:
        input_inception = input_tensor

    kernel_size_s = [kernel_size // (2 ** i) for i in range(3)]

    conv_list = [
        tf.keras.layers.Conv1D(
            filters=nb_filters, kernel_size=branch_kernel,
            strides=stride, padding='same', activation=activation, use_bias=False)(input_inception)
        for branch_kernel in kernel_size_s
    ]

    # Extra branch: max-pool the raw input (not the bottleneck output), then
    # a kernel-size-1 convolution.
    max_pool_1 = tf.keras.layers.MaxPool1D(pool_size=3, strides=stride, padding='same')(input_tensor)
    conv_6 = tf.keras.layers.Conv1D(
        filters=nb_filters, kernel_size=1,
        padding='same', activation=activation, use_bias=False)(max_pool_1)
    conv_list.append(conv_6)

    x = tf.keras.layers.Concatenate(axis=2)(conv_list)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation(activation='relu')(x)
    return x
332
+
333
def _shortcut_layer(input_tensor, out_tensor):
    """Add a residual shortcut from ``input_tensor`` onto ``out_tensor``.

    The shortcut is passed through a kernel-size-1 convolution with as many
    filters as ``out_tensor`` has channels (so the two can be added) and a
    batch normalization; the sum is followed by a ReLU.

    Args:
        input_tensor: Keras tensor at the start of the residual connection.
        out_tensor: Keras tensor at the end of the residual connection.

    Returns:
        Keras tensor holding ``relu(shortcut + out_tensor)``.
    """
    shortcut_y = tf.keras.layers.Conv1D(
        filters=int(out_tensor.shape[-1]), kernel_size=1,
        padding='same', use_bias=False)(input_tensor)
    shortcut_y = tf.keras.layers.BatchNormalization()(shortcut_y)

    x = tf.keras.layers.Add()([shortcut_y, out_tensor])
    x = tf.keras.layers.Activation('relu')(x)
    return x
341
+
342
def _build_inception_backbone(sig_len, n_features, depth, use_residual):
    """Build the feature extractor shared by all three model builders.

    Stacks ``depth`` inception modules, adds a residual shortcut after every
    third module when ``use_residual`` is set, and finishes with global
    average pooling.

    Returns:
        Tuple ``(input_layer, pooled_features)`` of Keras tensors.
    """
    input_layer = tf.keras.layers.Input(shape=(sig_len, n_features))

    x = input_layer
    input_res = input_layer

    for d in range(depth):
        x = _inception_module(x)

        if use_residual and d % 3 == 2:
            x = _shortcut_layer(input_res, x)
            input_res = x

    gap_layer = tf.keras.layers.GlobalAveragePooling1D()(x)
    return input_layer, gap_layer


def base_model(sig_len, n_features, depth=10, use_residual=True):
    """Return the uncompiled backbone with a single sigmoid output unit.

    Args:
        sig_len: Number of time steps of the input.
        n_features: Number of input channels.
        depth: Number of stacked inception modules.
        use_residual: Whether to add residual shortcuts.

    Returns:
        Uncompiled ``tf.keras`` model.
    """
    input_layer, gap_layer = _build_inception_backbone(sig_len, n_features, depth, use_residual)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(gap_layer)
    return tf.keras.models.Model(inputs=input_layer, outputs=output)


def build_murmur_model(sig_len, n_features, depth=10, use_residual=True):
    """Return the compiled 3-class (softmax) murmur model.

    Args:
        sig_len: Number of time steps of the input.
        n_features: Number of input channels.
        depth: Number of stacked inception modules.
        use_residual: Whether to add residual shortcuts.

    Returns:
        ``tf.keras`` model compiled with categorical cross-entropy, Adam
        (learning rate 0.001), categorical accuracy and ROC AUC.
    """
    input_layer, gap_layer = _build_inception_backbone(sig_len, n_features, depth, use_residual)
    murmur_output = tf.keras.layers.Dense(3, activation='softmax', name="murmur_output")(gap_layer)

    model = tf.keras.models.Model(inputs=input_layer, outputs=murmur_output)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.AUC(curve='ROC')])
    return model


def build_clinical_model(sig_len, n_features, depth=10, use_residual=True):
    """Return the compiled binary (sigmoid) clinical-outcome model.

    Args:
        sig_len: Number of time steps of the input.
        n_features: Number of input channels.
        depth: Number of stacked inception modules.
        use_residual: Whether to add residual shortcuts.

    Returns:
        ``tf.keras`` model compiled with binary cross-entropy, Adam
        (learning rate 0.001), binary accuracy and ROC AUC.
    """
    input_layer, gap_layer = _build_inception_backbone(sig_len, n_features, depth, use_residual)
    clinical_output = tf.keras.layers.Dense(1, activation='sigmoid', name="clinical_output")(gap_layer)

    model = tf.keras.models.Model(inputs=input_layer, outputs=clinical_output)
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC(curve='ROC')])
    return model
410
+
411
+
412
def get_lead_index(patient_metadata):
    """Return the index of the first AV/PV/TV/MV line for each location.

    Lines whose first space-separated token is AV, PV, TV or MV are numbered
    from 0 in order of appearance; the number of the first line seen for each
    distinct location is collected.

    Args:
        patient_metadata: Patient metadata text, one entry per line.

    Returns:
        ``np.ndarray`` of the collected line numbers.
    """
    valve_locations = ("AV", "PV", "TV", "MV")
    seen_locations = []
    lead_num = []
    cnt = 0
    for line in patient_metadata.splitlines():
        location = line.split(" ")[0]
        if location in valve_locations:
            if location not in seen_locations:
                seen_locations.append(location)
                lead_num.append(cnt)
            # NOTE(review): the counter advances on every valve line, so lines
            # for other locations are not counted -- confirm against callers.
            cnt += 1
    return np.asarray(lead_num)
423
+
424
def scheduler(epoch, lr):
    """Step learning-rate schedule: divide the rate by 10 at epochs 10, 15, 20.

    Args:
        epoch: Index of the current epoch.
        lr: Current learning rate.

    Returns:
        ``lr * 0.1`` at epochs 10, 15 and 20, otherwise ``lr`` unchanged.
    """
    if epoch in (10, 15, 20):
        return lr * 0.1
    return lr
433
+
434
def scheduler_2(epoch, lr):
    """Learning-rate schedule that lowers the rate by 10 % on every call.

    Args:
        epoch: Index of the current epoch (unused).
        lr: Current learning rate.

    Returns:
        The learning rate reduced by one tenth.
    """
    return lr - (lr * 0.1)
436
+
437
def get_murmur_locations(data):
    """Return the value of the ``#Murmur locations:`` line of the metadata.

    When several such lines exist, the last one that carries a value wins.

    Args:
        data: Patient metadata text, one entry per line.

    Returns:
        The text after ``': '`` on that line with surrounding whitespace
        removed, e.g. ``"AV+MV"``.

    Raises:
        ValueError: If no ``#Murmur locations:`` line with a value exists.
    """
    murmur_location = None
    for line in data.split('\n'):
        if line.startswith('#Murmur locations:'):
            parts = line.split(': ')
            if len(parts) > 1:
                # Strip so a trailing '\r' or space cannot break later
                # comparisons against location names.
                murmur_location = parts[1].strip()
    if murmur_location is None:
        raise ValueError('No murmur locations available. Is your code trying to load labels from the hidden data?')
    return murmur_location
448
+
449
def pad_array(data, signal_length=None):
    """Stack variable-length 1-D signals into one zero-padded 2-D array.

    Args:
        data: Sequence of 1-D signals (lists or arrays).
        signal_length: Number of columns of the result. Defaults to the
            length of the longest signal. Signals longer than
            ``signal_length`` are truncated instead of raising.

    Returns:
        ``np.ndarray`` of shape ``(len(data), length)`` holding each signal
        left-aligned and padded with zeros on the right.
    """
    if signal_length is None:
        max_len = max((len(sig) for sig in data), default=0)
    else:
        max_len = signal_length
    new_arr = np.zeros((len(data), max_len))
    for row, sig in enumerate(data):
        usable = min(len(sig), max_len)
        new_arr[row, :usable] = sig[:usable]
    return new_arr
460
+
461
def calculating_class_weights(y_true):
    """Compute balanced 0/1 class weights for every column of a label matrix.

    Args:
        y_true: 2-D array of 0/1 labels, one column per output.

    Returns:
        ``np.ndarray`` of shape ``(n_columns, 2)``; entry ``[i, 0]`` is the
        weight of label 0 and ``[i, 1]`` the weight of label 1 in column ``i``.
    """
    y_true = np.asarray(y_true)
    number_dim = y_true.shape[1]
    # Passed as an ndarray: compute_class_weight documents `classes` as an
    # ndarray, and recent scikit-learn releases validate the parameter type.
    classes = np.array([0., 1.])
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight(class_weight='balanced', classes=classes, y=y_true[:, i])
    return weights