shivapriyasom committed on
Commit
03469f7
·
verified ·
1 Parent(s): 0d705ca

Upload 2 files

Browse files
Files changed (2) hide show
  1. app_external validation.py +623 -0
  2. year6.parquet +3 -0
app_external validation.py ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score
7
+ from sklearn.calibration import calibration_curve
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from io import StringIO
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+ import numpy as np
14
+ import pandas as pd
15
+ import pyarrow.parquet as pq
16
+ from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
17
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
18
+ from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,RepeatedStratifiedKFold
19
+ from sklearn.metrics import confusion_matrix,classification_report,precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score, matthews_corrcoef
20
+ from sklearn.metrics import roc_auc_score,auc
21
+ import pickle
22
+
23
+ from sklearn.utils.class_weight import compute_sample_weight
24
+
25
+ import xgboost as xgb
26
+ from xgboost.sklearn import XGBClassifier
27
+ from sklearn.naive_bayes import GaussianNB
28
+ from sklearn.ensemble import AdaBoostClassifier
29
+ from sklearn.svm import SVC
30
+ from sklearn.linear_model import LogisticRegression
31
+ from sklearn.preprocessing import StandardScaler
32
+ from sklearn.metrics import brier_score_loss
33
+ from sklearn.calibration import calibration_curve
34
+ import matplotlib.pyplot as plt
35
+ from sklearn.calibration import CalibratedClassifierCV
36
+ from sklearn.linear_model import LinearRegression
37
+
38
# Module-level state shared by the loader and the evaluation pipeline.
# training_data / column_names are filled in by load_training_data();
# test_list holds the outcome column names and is indexed by run_model.
training_data = column_names = None
test_list = list()
42
def _fit_predict_one(tracount, X_cl, y_cl, x_te):
    """Fit the model selected by ``tracount`` on one bag and score ``x_te``.

    Returns ``(out, probs)``: hard class predictions and the score of the
    second class (``predict_proba(...)[:, 1]``, or the booster output for
    XGBoost).  Raises ``ValueError`` for an unknown ``tracount``.
    """
    if tracount == 0:
        base_forest = RandomForestClassifier(
            n_estimators=100, criterion='entropy', max_features=None,
            random_state=42, bootstrap=True, oob_score=True,
            class_weight='balanced', ccp_alpha=0.01)
        model = CalibratedClassifierCV(estimator=base_forest, method='isotonic', cv=5)
    elif tracount == 1:
        dtrain = xgb.DMatrix(X_cl.to_numpy(), label=y_cl)
        # Prediction needs no labels; the original passed an undefined `y_te`.
        dtest = xgb.DMatrix(x_te.to_numpy())
        params = {
            'objective': 'binary:logistic',  # Binary classification problem
            'eval_metric': 'logloss',        # Logarithmic loss for evaluation
            'max_depth': 60,
            'eta': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'seed': 42}
        num_rounds = 100
        booster = xgb.train(params, dtrain, num_rounds)
        probs = booster.predict(dtest)
        return (probs > 0.5).astype(int), probs
    elif tracount == 2:
        # NOTE(review): this AdaBoost branch originally sat under a second,
        # unreachable `tracount == 1` test and was never fitted. Code 2 was
        # previously unused, so it is mapped here -- confirm the intended code.
        model = AdaBoostClassifier(
            n_estimators=100, random_state=42,
            estimator=RandomForestClassifier(
                n_estimators=100, criterion='entropy', random_state=42,
                bootstrap=True, oob_score=True, class_weight='balanced',
                ccp_alpha=0.01))
    elif tracount == 4:
        var_smoothing_value = 1e-9  # Adjust this value as needed
        model = GaussianNB(var_smoothing=var_smoothing_value)
    elif tracount == 5:
        model = LogisticRegression(penalty='l2', solver='newton-cholesky', max_iter=200)
    elif tracount == 6:
        model = SVC(probability=True, C=3)
    else:
        raise ValueError(f"Unsupported model code tracount={tracount!r}")

    model.fit(X_cl, y_cl)
    return model.predict(x_te), model.predict_proba(x_te)[:, 1]


def rand_for(neww_list, x_te, rf, lab, x_tr, actual, paramss, X_Tempp, enco,
             my_table_str, my_table_num, tabl, tracount):
    """Train one model per resampled frame and predict the test matrix.

    Parameters
    ----------
    neww_list : list of DataFrame
        Resampled training frames; each contains the feature columns plus
        the label column ``lab``.
    x_te : array-like
        Test features, positionally aligned with ``X_Tempp.columns``.
    lab : str
        Name of the label column inside every frame of ``neww_list``.
    X_Tempp : DataFrame
        Only its ``columns`` are used, to name the columns of ``x_te``.
    tracount : int
        Model selector: 0 calibrated random forest, 1 XGBoost, 2 AdaBoost,
        4 Gaussian naive Bayes, 5 logistic regression, 6 SVC.
    rf, x_tr, actual, paramss, enco, my_table_str, my_table_num, tabl
        Unused; kept so existing callers keep working.

    Returns
    -------
    (cl_list, pro_list) : two lists with one entry per frame -- the hard
    predictions and the class-1 scores for ``x_te``.

    Raises
    ------
    ValueError
        If ``tracount`` is not one of the supported codes.
    """
    cl_list = []
    pro_list = []
    if not neww_list:
        return cl_list, pro_list

    # The test matrix is the same for every bag, so label its columns once.
    x_te = pd.DataFrame(x_te, columns=X_Tempp.columns)

    for bag in neww_list:
        bag = bag.copy()
        y_cl = bag.loc[:, lab]
        X_cl = bag.drop([lab], axis=1)
        out, probs = _fit_predict_one(tracount, X_cl, y_cl, x_te)
        cl_list.append(out)
        pro_list.append(probs)

    return cl_list, pro_list
112
def ne_calib(some_prob, down_factor, origin_factor):
    """Re-weight a probability from one positive-class rate to another.

    The positive part of ``some_prob`` is scaled by
    ``origin_factor / down_factor`` and the negative part by
    ``(1 - origin_factor) / (1 - down_factor)``; the result is the
    renormalised positive share.  Callers in this file pass the
    positive-class fraction of the resampled training bags as
    ``down_factor`` and the fraction in the full training set as
    ``origin_factor``.
    """
    positive_part = some_prob * origin_factor / down_factor
    negative_part = (1 - some_prob) * (1 - origin_factor) / (1 - down_factor)
    return positive_part / (negative_part + positive_part)
117
def actualll(sl_list, pro_list, delt, down_factor, origin_factor):
    """Combine per-model votes and probabilities into ensemble outputs.

    Parameters
    ----------
    sl_list : list of sequences
        One sequence of votes per model (``sli_mod`` produces -1/+1),
        all of the same length (one vote per sample).
    pro_list : list of sequences
        One sequence of class-1 probabilities per model, aligned with
        ``sl_list``.
    delt : float
        A sample is labelled 1 when its mean vote is ``>= delt``.
    down_factor, origin_factor : float
        Forwarded to ``ne_calib`` for every individual probability.

    Returns
    -------
    (ac_list, probab_list, second_probab_list)
        Predicted labels, scores built from the ``ne_calib``-adjusted mean
        probability, and scores built from the raw mean probability.
        Three empty lists are returned when ``sl_list`` is empty.
    """
    ac_list = []
    probab_list = []
    second_probab_list = []

    n_models = len(sl_list)
    if n_models == 0:
        # Nothing to aggregate; the original raised IndexError here.
        return ac_list, probab_list, second_probab_list

    for i in range(len(sl_list[0])):
        vote_total = 0
        calibrated_total = 0
        raw_total = 0
        for j in range(n_models):
            calibrated_total += ne_calib(pro_list[j][i], down_factor, origin_factor)
            raw_total += pro_list[j][i]
            vote_total += sl_list[j][i]

        mean_vote = vote_total / n_models
        mean_calibrated = calibrated_total / n_models
        mean_raw = raw_total / n_models

        if mean_vote >= delt:
            ac_list.append(1)
            probab_list.append(mean_calibrated)
            second_probab_list.append(mean_raw)
        elif mean_vote >= 0:
            # NOTE(review): for a mean vote in [0, delt) the stored score is
            # 1 - mean probability, while the other two branches store the
            # mean probability itself. Kept exactly as in the original;
            # confirm this is intended, since mixed scales affect any
            # ranking metric computed from these scores.
            ac_list.append(0)
            probab_list.append(1 - mean_calibrated)
            second_probab_list.append(1 - mean_raw)
        else:
            ac_list.append(0)
            probab_list.append(mean_calibrated)
            second_probab_list.append(mean_raw)

    return ac_list, probab_list, second_probab_list
154
+
155
+
156
+
157
def sli_mod(c_lisy):
    """Recode every model's predictions as -1/+1 votes at a 0.5 cut-off.

    Each entry of ``c_lisy`` is copied into a NumPy array; values below
    0.5 become -1 and values of at least 0.5 become 1.  The result is a
    list of lists, one per input entry, in the same order.
    """
    def _as_votes(predictions):
        votes = np.array(predictions)  # np.array copies, so the input is untouched
        votes[votes < 0.5] = -1
        votes[votes >= 0.5] = 1
        return list(votes)

    return [_as_votes(predictions) for predictions in c_lisy]
167
+
168
def _resampling_plan(lab, n_minority):
    """Return ``(n_bags, majority_ratio)`` for an outcome and minority size."""
    if n_minority <= 60:
        if lab in ('VOD', 'STROKEHI'):
            return 20, 3
        if lab == 'ACSPSHI':
            return 20, 2.5
        return 20, 2
    return 10, 3


def run_model(x_tr, x_te, y_tr, deltaa, lab, rf, X_Tempp, track, actual, paramss,
              enco, my_table_str, my_table_num, tabl, tracount, origin_factor):
    """Train a bagged, majority-downsampled ensemble and predict ``x_te``.

    Rows with label 1 are treated as the minority class.  Several training
    bags are built, each holding every minority row plus majority rows drawn
    with replacement (2x, 2.5x or 3x the minority count depending on ``lab``
    and on whether there are at most 60 minority rows).  ``rand_for`` fits
    one model per bag, ``sli_mod`` turns the predictions into votes and
    ``actualll`` aggregates them.

    Parameters
    ----------
    x_tr, y_tr : array-like
        Training features (aligned with ``X_Tempp.columns``) and 0/1 labels.
    x_te : array-like
        Test features, forwarded to ``rand_for``.
    deltaa : float
        Mean-vote threshold forwarded to ``actualll``.
    lab : str
        Outcome name; used as the label column name and to pick the
        sampling ratio.
    track : int
        Kept for backward compatibility.  The original looked the label
        name up as ``test_list[track]``; that value had to equal ``lab``
        for the function to work, so ``lab`` is now used directly.
    origin_factor : float
        Forwarded to ``actualll``.
    rf, X_Tempp, actual, paramss, enco, my_table_str, my_table_num, tabl, tracount
        Forwarded unchanged to ``rand_for``.

    Returns
    -------
    The ``(labels, calibrated_scores, raw_scores)`` triple from ``actualll``.

    Raises
    ------
    ValueError
        If the training labels contain no row equal to 1, or only such rows.
    """
    x_tr = pd.DataFrame(x_tr, columns=X_Tempp.columns)
    y_tr = pd.DataFrame(y_tr, columns=[lab])
    master_table = pd.concat([x_tr, y_tr], axis=1)

    only_minority = master_table.loc[master_table[lab] == 1]
    only_majority = master_table.drop(only_minority.index)
    n_minority = len(only_minority.index)
    if n_minority == 0:
        raise ValueError(f"No positive (label 1) training rows for outcome '{lab}'")
    if only_majority.empty:
        raise ValueError(f"No negative (label 0) training rows for outcome '{lab}'")

    n_bags, ratio = _resampling_plan(lab, n_minority)
    # Positive-class fraction inside each bag.
    down_factor = 1 / (1 + ratio)
    sample_size = int(ratio * n_minority)

    df_list = []
    for i in range(n_bags):
        # Reseeding NumPy's global RNG per bag keeps the bags reproducible.
        np.random.seed(i + 30)
        sampled_array = np.random.choice(only_majority.index, size=sample_size, replace=True)
        df_list.append(pd.concat([only_majority.loc[sampled_array], only_minority]))

    c_lisy, pro_lisy = rand_for(df_list, x_te, rf, lab, x_tr, actual, paramss, X_Tempp,
                                enco, my_table_str, my_table_num, tabl, tracount)
    sli_lisy = sli_mod(c_lisy)

    a_lisy, probab_lisy, secondlisy = actualll(sli_lisy, pro_lisy, deltaa, down_factor, origin_factor)
    return a_lisy, probab_lisy, secondlisy
223
def load_training_data():
    """Load the training table and feature list into the module globals.

    Reads ``year6.parquet``, prints the ``YEARGPF`` value counts, drops the
    rows whose ``YEARGPF`` is ``'< 2008'`` and stores the re-indexed frame in
    ``training_data``.  The first column of ``may_final.csv`` is printed and
    stored as the list ``column_names``.

    Returns ``None`` on success, or the string ``"No training Data"`` when a
    ``FileNotFoundError`` is raised; in that case neither global is modified.
    """
    global training_data, column_names, test_list

    try:
        frame = pq.read_table('year6.parquet').to_pandas()
        print(frame['YEARGPF'].value_counts())
        keep_rows = frame['YEARGPF'] != '< 2008'
        frame = frame[keep_rows].reset_index(drop=True)

        feature_sheet = pd.read_csv('may_final.csv')
        feature_names = list(feature_sheet.iloc[:, 0])
        print(feature_names)
    except FileNotFoundError:
        return "No training Data"

    # Publish only after both files were read successfully.
    training_data = frame
    column_names = feature_names
249
+
250
def _recode_binary_columns(train_df, test_df):
    """Recode Yes/No and event/no-event columns to 1/0 in both frames.

    A column is recoded only when *every* value in train and test together
    belongs to the pair, so both frames always get the same treatment and
    the same column order: untouched columns first, then Yes/No columns,
    then event columns.
    """
    test_df = test_df[train_df.columns]
    both = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    yes_no_cols = [col for col in both.columns if both[col].isin(['Yes', 'No']).all()]
    event_cols = [col for col in both.columns
                  if both[col].isin(['Event happened', 'No event']).all()]
    binary_cols = yes_no_cols + event_cols

    def _recode(frame):
        yes_no = frame[yes_no_cols].eq('Yes').astype('int64')
        events = frame[event_cols].eq('Event happened').astype('int64')
        return pd.concat([frame.drop(binary_cols, axis=1), yes_no, events], axis=1)

    return _recode(train_df), _recode(test_df)


def _binary_targets(train_outcome, test_outcome):
    """Map an outcome to 0/1 with the training minority class as 1.

    The same mapping is applied to the test outcome so that the positive
    class means the same thing in both sets.
    """
    counts = train_outcome.value_counts()
    counts = counts[counts > 0]  # ignore unused categories of categorical columns
    if len(counts) != 2:
        raise ValueError(
            f"outcome '{train_outcome.name}' needs exactly two classes in the "
            f"training data, found {len(counts)}")
    minority = counts.index[-1]  # value_counts() sorts by descending frequency
    return train_outcome.eq(minority).astype(int), test_outcome.eq(minority).astype(int)


def _calibration_figure(outcome_name, mean_pred, fraction_pos):
    """Draw the calibration curve for one outcome and return the Figure."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    ax.plot(mean_pred, fraction_pos, 'o-', label=f'{outcome_name}')
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title(f'Calibration Plot - {outcome_name}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    # Release pyplot's reference so repeated runs do not accumulate figures;
    # the Figure object itself stays usable by the caller.
    plt.close(fig)
    return fig


def train_and_evaluate(input_file):
    """Train on the stored training table and evaluate on an uploaded CSV.

    Parameters
    ----------
    input_file : str, path-like, object with a ``.name`` path, or None
        The uploaded dataset.  ``None`` means "nothing uploaded".

    Returns
    -------
    (metrics_df, calibration_df, calibration_plots)
        On success: a metrics DataFrame, a calibration slope/intercept
        DataFrame and a list of matplotlib figures, one row/figure per
        outcome in the order EFS, DEAD, GF, AGVHD, CGVHD, VOCPSHI, STROKEHI.
        ``(None, None, None)`` when ``input_file`` is None.
        ``(error_message, None, None)`` when anything goes wrong.
    """
    global training_data, column_names, test_list

    if input_file is None:
        return None, None, None

    if training_data is None or column_names is None:
        load_training_data()
    if training_data is None or column_names is None:
        return "Error: training data could not be loaded", None, None

    # Column name -> display name, kept together so they cannot drift apart.
    # The original zipped 7 columns with 6 names, mislabelling every row.
    outcomes = [
        ('EFS', 'Event Free Survival'),
        ('DEAD', 'Overall Survival'),
        ('GF', 'Graft Failure'),
        ('AGVHD', 'Acute GVHD'),
        ('CGVHD', 'Chronic GVHD'),
        ('VOCPSHI', 'Vaso-Occlusive Crisis Post-HCT'),
        ('STROKEHI', 'Stroke Post-HCT'),
    ]

    try:
        # The upload may arrive as a plain path string or as a file-like
        # object exposing the path via `.name`; accept both.
        csv_path = getattr(input_file, "name", input_file)
        input_data = pd.read_csv(csv_path)

        available_features = [col for col in column_names if col in training_data.columns]
        available_features_input = [col for col in available_features if col in input_data.columns]
        if not available_features_input:
            return "Error: No matching columns found between datasets", None, None

        outcome_cols = [col for col, _ in outcomes]
        test_list = outcome_cols.copy()  # run_model may look labels up by index
        total_cols = available_features + outcome_cols

        required = list(dict.fromkeys(total_cols + ['YEARGPF']))
        missing = [col for col in required if col not in input_data.columns]
        if missing:
            return ("Error: uploaded dataset is missing required columns: "
                    + ", ".join(missing)), None, None

        inter_df = training_data[total_cols].dropna().reset_index(drop=True)

        input_data = input_data[input_data['YEARGPF'] != '< 2008'].reset_index(drop=True)
        inter_input = input_data[total_cols].dropna().reset_index(drop=True)
        if inter_df.empty or inter_input.empty:
            return "Error: no complete rows left after removing missing values", None, None

        # Binary text columns -> 1/0, decided jointly for train and test.
        train_mod, test_mod = _recode_binary_columns(
            inter_df[available_features], inter_input[available_features])

        # The train frame decides which columns are categorical vs numeric;
        # the test frame follows the same split so columns stay aligned.
        my_table_str = train_mod.select_dtypes(exclude=['number'])
        my_table_num = train_mod.select_dtypes(include=['number'])
        my_test_str = test_mod[my_table_str.columns]
        my_test_num = test_mod[my_table_num.columns]

        # Common encoding: fit on train + test so both share the same columns.
        df_combined = pd.concat([my_table_str, my_test_str], axis=0, ignore_index=True)
        enco = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_df = pd.DataFrame(enco.fit_transform(df_combined),
                                  columns=enco.get_feature_names_out())
        n_train = len(my_table_str)
        tabl = encoded_df.iloc[:n_train].reset_index(drop=True)
        test_encoded = encoded_df.iloc[n_train:].reset_index(drop=True)

        ftable = pd.concat([tabl, my_table_num], axis=1)
        X_train_full = ftable
        my_test_real = pd.concat([test_encoded, my_test_num], axis=1)

        metrics_results = []
        calibration_results = []
        calibration_plots = []

        for i, (outcome_col, outcome_name) in enumerate(outcomes):
            y_train_full, y_test_full = _binary_targets(
                inter_df[outcome_col], inter_input[outcome_col])

            X_train, y_train = X_train_full.values, y_train_full.values
            x_te, y_test = my_test_real.values, y_test_full.values
            # Positive-class rate in the full training set.
            vddc = int(y_train_full.sum()) / X_train_full.shape[0]
            deltaa = 0.2
            rf = RandomForestClassifier()
            paramss = {}
            tracount = 0
            y_pred, y_pred_proba, secondnaive = run_model(
                X_train, x_te, y_train, deltaa, outcome_col, rf, X_train_full, i,
                ftable, paramss, enco, my_table_str, my_table_num, tabl, tracount, vddc)

            accuracy = accuracy_score(y_test, y_pred)
            balanced_acc = balanced_accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
            roc_auc = roc_auc_score(y_test, y_pred_proba)

            metrics_results.append([outcome_name, f"{accuracy:.3f}", f"{balanced_acc:.3f}",
                                    f"{precision:.3f}", f"{recall:.3f}", f"{roc_auc:.3f}"])

            fraction_pos, mean_pred = calibration_curve(y_test, y_pred_proba, n_bins=10)
            if len(mean_pred) > 1 and len(fraction_pos) > 1:
                slope, intercept = np.polyfit(mean_pred, fraction_pos, 1)
            else:
                slope, intercept = 1.0, 0.0
            calibration_results.append([outcome_name, f"{slope:.3f}", f"{intercept:.3f}"])

            calibration_plots.append(_calibration_figure(outcome_name, mean_pred, fraction_pos))

        metrics_df = pd.DataFrame(
            metrics_results,
            columns=['Outcome', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'AUC'])
        calibration_df = pd.DataFrame(
            calibration_results,
            columns=['Outcome', 'Slope', 'Intercept'])

        return metrics_df, calibration_df, calibration_plots

    except Exception as e:
        # UI boundary: report the failure as a message instead of crashing.
        return f"Error processing data: {str(e)}", None, None
503
+
504
def create_interface():
    """Build and return the Gradio Blocks UI for the evaluation pipeline.

    Loads the training data once, then lays out a CSV upload, a
    "Process Dataset" button, a metrics table, a calibration table and one
    plot slot per outcome.  The button runs ``train_and_evaluate`` on the
    uploaded file and fills the tables and plot slots.
    """
    load_training_data()

    with gr.Blocks(
        css="""
        .gradio-container {
            max-width: none !important;
            height: 100vh;
            overflow-y: auto;
        }
        .main-container {
            padding: 20px;
        }
        .big-title {
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 30px;
            text-align: center;
        }
        .section-title {
            font-size: 2em;
            font-weight: bold;
            margin: 40px 0 20px 0;
            color: #2d5aa0;
        }
        .subsection-title {
            font-size: 1.5em;
            font-weight: bold;
            margin: 30px 0 15px 0;
            color: #4a4a4a;
        }
        """,
        title="ML Model Evaluation Pipeline"
    ) as demo:

        with gr.Column(elem_classes=["main-container"]):

            gr.HTML('<div class="big-title">Input</div>')

            gr.Markdown("### Please upload the dataset:")
            file_input = gr.File(
                label="Upload Dataset (CSV)",
                file_types=[".csv"],
                type="filepath"
            )

            process_btn = gr.Button("Process Dataset", variant="primary", size="lg")

            gr.HTML('<div class="section-title">Outputs</div>')

            gr.HTML('<div class="subsection-title">Metrics</div>')
            metrics_table = gr.Dataframe(
                headers=["Outcome", "Accuracy", "Balanced Accuracy", "Precision", "Recall", "AUC"],
                interactive=False,
                wrap=True
            )

            gr.HTML('<div class="subsection-title">Calibration</div>')
            calibration_table = gr.Dataframe(
                headers=["Outcome", "Slope", "Intercept"],
                interactive=False,
                wrap=True
            )

            gr.Markdown("#### Calibration Curves")

            # One slot per outcome, filled positionally in the order in which
            # train_and_evaluate evaluates its outcome columns (EFS first).
            plots = [
                gr.Plot(label="Event Free Survival"),
                gr.Plot(label="Overall Survival"),
                gr.Plot(label="Graft Failure"),
                gr.Plot(label="Acute GVHD"),
                gr.Plot(label="Chronic GVHD"),
                gr.Plot(label="Vaso-Occlusive Crisis Post-HCT"),
                gr.Plot(label="Stroke Post-HCT"),
            ]

        def process_and_display(file):
            """Run the pipeline and return one value per output component."""
            metrics_df, calibration_df, calibration_plots = train_and_evaluate(file)

            if isinstance(metrics_df, str):
                # Error case: show the message in the UI instead of pushing a
                # string into the Dataframe component.
                raise gr.Error(metrics_df)

            # Pad or truncate so the number of returned values always equals
            # the number of wired outputs (the original returned 7 for 6).
            figures = list(calibration_plots or [])[:len(plots)]
            figures += [None] * (len(plots) - len(figures))
            return (metrics_df, calibration_df, *figures)

        process_btn.click(
            fn=process_and_display,
            inputs=[file_input],
            outputs=[metrics_table, calibration_table] + plots
        )

    return demo
614
+
615
+
616
if __name__ == "__main__":
    # Script entry point: build the UI and serve it.
    launch_options = {
        "share": True,
        "inbrowser": True,
        "height": 800,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)
year6.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f0e8390efceb4d68ad535ff323c96ee8eea66ea6dc83523436cf8052572b58
3
+ size 706589