# HCT (hematopoietic cell transplant) outcome-prediction pipeline:
# loads training data from 'year6.parquet', trains per-outcome under-sampling
# ensembles, evaluates an uploaded CSV, and serves results through Gradio.
#
# NOTE(review): many imports are duplicated (numpy, pandas, matplotlib,
# RandomForestClassifier, AdaBoostClassifier, calibration_curve, ...) and
# several appear unused in the visible code (seaborn, StringIO, pickle,
# XGBClassifier, MinMaxScaler, StandardScaler, LinearRegression, ...) —
# candidates for cleanup.
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report,precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score,auc
import pickle
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LinearRegression

# Global variables for training data and column names
training_data = None   # DataFrame loaded from 'year6.parquet' (None until load_training_data runs)
column_names = None    # list of feature-column names read from 'final_variable.csv'
test_list=[]           # outcome column names; populated in train_and_evaluate()


def rand_for(neww_list,x_te,rf,lab,x_tr,actual,paramss,X_Tempp,enco,my_table_str,my_table_num,tabl,tracount):
    """Train one classifier per resampled training frame and predict on x_te.

    For each DataFrame in `neww_list`, splits off the label column `lab`,
    fits the model selected by `tracount`, and predicts on `x_te`
    (reconstructed as a DataFrame with `X_Tempp`'s columns).

    tracount selects the model:
      0 -> RandomForest wrapped in isotonic CalibratedClassifierCV (the only
           value used by train_and_evaluate below)
      1 -> XGBoost (xgb.train)          5 -> LogisticRegression
      4 -> GaussianNB                   6 -> SVC(probability=True)

    Returns (cl_list, pro_list): per-frame class predictions and per-frame
    positive-class probabilities.

    NOTE(review): rf, x_tr, actual, paramss, enco, my_table_str, my_table_num
    and tabl are accepted but never used here; teemp_list/ftli are dead locals.
    """
    cl_list=[]
    pro_list=[]
    for i in neww_list:
        dff_copy=i.copy()
        y_cl=dff_copy.loc[:,lab]
        teemp_list=[]
        ftli=[]
        X_cl=dff_copy.drop([lab],axis=1)
        # Rebuild the test matrix with the training feature columns so
        # column order/names match what the model was fitted on.
        x_te=pd.DataFrame(x_te,columns=X_Tempp.columns)
        if tracount==0:
            #mm=RandomForestClassifier(n_estimators=100, criterion='entropy',random_state=42,bootstrap=True, oob_score=True,class_weight='balanced',ccp_alpha=0.01)
            mm=RandomForestClassifier(n_estimators=100, criterion='entropy',max_features=None,random_state=42,bootstrap=True, oob_score=True,class_weight='balanced',ccp_alpha=0.01)
            #mm.fit(X_cl,y_cl)
            # Isotonic probability calibration via 5-fold CV; fitting the
            # calibrator also fits the underlying forest.
            calibrated_rf = CalibratedClassifierCV(estimator=mm, method='isotonic', cv=5)
            calibrated_rf.fit(X_cl, y_cl)
            #print(calibrated_rf.get_params())
            out=calibrated_rf.predict(x_te)
            probs=calibrated_rf.predict_proba(x_te)[:,1]
        elif tracount==1:
            dtrain = xgb.DMatrix(X_cl.to_numpy(), label=y_cl)
            # NOTE(review): `y_te` is not defined anywhere in this function —
            # this branch raises NameError if ever taken (it is dead in
            # practice since train_and_evaluate always passes tracount=0).
            dtest = xgb.DMatrix(x_te.to_numpy(), label=y_te)
            params = {
                'objective': 'binary:logistic',  # Binary classification problem
                'eval_metric': 'logloss',  # Logarithmic loss for evaluation
                'max_depth': 60,
                'eta': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'seed': 42}
            num_rounds = 100
            mm=xgb.train(params, dtrain, num_rounds)
            probs = mm.predict(dtest)
            out = (probs > 0.5).astype(int)
        elif tracount==5:
            mm=LogisticRegression(penalty='l2',solver='newton-cholesky',max_iter=200)
            mm.fit(X_cl,y_cl)
            out=mm.predict(x_te)
            probs=mm.predict_proba(x_te)[:,1]
        elif tracount==4:
            var_smoothing_value = 1e-9  # Adjust this value as needed
            mm = GaussianNB(var_smoothing=var_smoothing_value)
            mm.fit(X_cl, y_cl)
            out = mm.predict(x_te)
            probs = mm.predict_proba(x_te)[:, 1]
        elif tracount==1:
            # NOTE(review): duplicate condition — tracount==1 is already
            # handled above, so this AdaBoost branch is unreachable; it also
            # never calls mm.fit(...) before predicting.
            mm = AdaBoostClassifier(n_estimators=100,random_state=42,estimator=RandomForestClassifier(n_estimators=100, criterion='entropy',random_state=42,bootstrap=True, oob_score=True,class_weight='balanced',ccp_alpha=0.01))
            out = mm.predict(x_te)
            probs = mm.predict_proba(x_te)[:, 1]
        elif tracount==6:
            mm = SVC(probability=True, C=3)
            mm.fit(X_cl, y_cl)
            out = mm.predict(x_te)
            probs = mm.predict_proba(x_te)[:, 1]
        cl_list.append(out)
        pro_list.append(probs)
    return cl_list,pro_list


def ne_calib(some_prob,down_factor,origin_factor):
    """Correct a predicted probability for training-set under-sampling.

    Maps `some_prob` (predicted under minority prior `down_factor`) back to
    the original class prior `origin_factor` via the odds-reweighting
    p' = a / (a + d) with a = p*orig/down and d = (1-p)*(1-orig)/(1-down).
    NOTE(review): assumes 0 < down_factor < 1 — division by zero otherwise.
    """
    aa=some_prob*origin_factor/down_factor
    denone=(1-some_prob)*(1-origin_factor)/(1-down_factor)
    new_dum_prob=aa/(denone+aa)
    return new_dum_prob


def actualll(sl_list,pro_list,delt,down_factor,origin_factor):
    """Combine the per-model votes/probabilities into final predictions.

    sl_list holds per-model -1/+1 votes (from sli_mod), pro_list the matching
    positive-class probabilities. For each test row, averages the votes and
    both the prior-corrected (ne_calib) and raw probabilities; predicts 1
    when the mean vote >= delt, else 0.

    Returns (ac_list, probab_list, second_probab_list): 0/1 predictions,
    corrected probabilities, raw probabilities (both flipped to 1-p when the
    prediction is 0 and the mean vote is non-negative).

    NOTE(review): `sum` shadows the builtin; the inner if/else adds the same
    value in both branches; with votes in {-1,+1} and delt=0.2 the
    `0 <= sum < delt` middle branch uses 1-p while the `sum < 0` branch keeps
    p — asymmetry looks intentional but is worth confirming.
    """
    ac_list=[]
    probab_list=[]
    second_probab_list=[]
    for i in range(len(sl_list[0])):
        sum=0
        sum_pro=0
        sum_pro_pro=0
        for j in range(len(sl_list)):
            sum_pro+=ne_calib(pro_list[j][i],down_factor,origin_factor)
            sum_pro_pro+=pro_list[j][i]
            if sl_list[j][i]==-1:
                sum+=(sl_list[j][i])
            else:
                sum+=(sl_list[j][i])
        sum/=len(sl_list)
        sum_pro/=len(sl_list)
        sum_pro_pro/=len(sl_list)
        if sum>=delt:
            ac_list.append(1)
            probab_list.append(sum_pro)
            second_probab_list.append(sum_pro_pro)
        elif sum<=delt and sum >=0 :
            ac_list.append(0)
            probab_list.append(1-sum_pro)
            second_probab_list.append(1-sum_pro_pro)
        elif sum<=delt and sum <0:
            ac_list.append(0)
            probab_list.append(sum_pro)
            second_probab_list.append(sum_pro_pro)
    return ac_list,probab_list,second_probab_list


def sli_mod(c_lisy):
    """Map each model's 0/1 class predictions to -1/+1 votes (threshold 0.5)."""
    sli_list=[]
    ### I am changing the threshold
    for i in c_lisy:
        k=np.array(i)
        k[k<0.5]=-1
        k[k>=0.5]=1
        #k=k/len(c_lisy)
        sli_list.append(list(k))
    return sli_list


def run_model(x_tr,x_te,y_tr,deltaa,lab,rf,X_Tempp,track,actual,paramss,enco,my_table_str,my_table_num,tabl,tracount,origin_factor):
    """Build an under-sampling ensemble for outcome `lab` and predict x_te.

    Reassembles (x_tr, y_tr) into one frame, splits it into minority
    (label==1) / majority rows, then draws several bootstrap under-samples of
    the majority class (20 resamples with ratio 2x-3x when the minority has
    <=60 rows, else 10 resamples at 3x). Each resample trains one model
    (rand_for); votes are combined by sli_mod + actualll, with probabilities
    corrected from the down-sampled prior `down_factor` back to
    `origin_factor`.

    Returns (predictions, corrected probabilities, raw probabilities).

    NOTE(review): the special-cased labels 'VOD' and 'ACSPSHI' never occur in
    the outcome list set by train_and_evaluate
    (['DEAD','GF','AGVHD','CGVHD','VOCPSHI','STROKEHI']), so only the
    'STROKEHI' and else branches can fire — confirm the intended label names.
    Also, in the 'STROKEHI'/'VOD' case down_factor=0.25 = 1/(1+3) while the
    sample is drawn at 3*len(min_index); elsewhere the factor matches the
    ratio.  In the large-minority path both branches are identical.
    """
    x_tr=pd.DataFrame(x_tr,columns=X_Tempp.columns)
    y_tr=pd.DataFrame(y_tr,columns=[test_list[track]])
    master_table=pd.concat([x_tr,y_tr],axis=1).copy()
    only_minority=master_table.loc[master_table[lab]==1]
    only_majority=master_table.drop(only_minority.index)
    min_index=only_minority.index
    max_index=only_majority.index
    df_list=[]
    down_factor=0
    if (len(min_index)<=60):
        for i in range(20):
            # Deterministic but distinct seed per resample.
            np.random.seed(i+30)
            if test_list[track]=='VOD' or test_list[track]=='STROKEHI':# or test_list[track]=='ACSPSHI' or test_list[track]=='AVNPSHI':
                sampled_array = np.random.choice(max_index,size=int(3*len(min_index)), replace=True)
                down_factor=0.25
            elif test_list[track]=='ACSPSHI':
                sampled_array = np.random.choice(max_index,size=int(2.5*len(min_index)), replace=True)
                down_factor=1/(1+2.5)
            else:
                sampled_array = np.random.choice(max_index,size=int(2*len(min_index)), replace=True)
                down_factor=1/(1+2)
            temp_df=only_majority.loc[sampled_array]
            new_df=pd.concat([temp_df,only_minority])
            df_list.append(new_df)
    else:
        for i in range(10):
            np.random.seed(i+30)
            if test_list[track]=='DEAD':
                sampled_array = np.random.choice(max_index,size=int(3*len(min_index)), replace=True)
                down_factor=1/(1+3)
            else:
                sampled_array = np.random.choice(max_index,size=int(3*len(min_index)), replace=True)
                down_factor=1/(1+3)
            temp_df=only_majority.loc[sampled_array]
            new_df=pd.concat([temp_df,only_minority])
            df_list.append(new_df)
    #neww_list=my_tomek(df_list,lab)
    neww_list=df_list
    c_lisy,pro_lisy=rand_for(neww_list,x_te,rf,lab,x_tr,actual,paramss,X_Tempp,enco,my_table_str,my_table_num,tabl,tracount)
    sli_lisy=sli_mod(c_lisy)
    a_lisy,probab_lisy,secondlisy=actualll(sli_lisy,pro_lisy,deltaa,down_factor,origin_factor)
    return a_lisy,probab_lisy,secondlisy


def load_training_data():
    """Load 'year6.parquet' and 'final_variable.csv' into the module globals.

    Drops rows with YEARGPF == '< 2008', stores the frame in `training_data`
    and the first CSV column (feature names) in `column_names`.
    Returns "No training Data" if either file is missing; returns None on
    success (callers only check the globals).
    """
    global training_data, column_names, test_list
    try:
        my_table=pq.read_table('year6.parquet').to_pandas()
        print(my_table['YEARGPF'].value_counts())
        my_table=my_table[(my_table['YEARGPF']!='< 2008')]
        my_table=my_table.reset_index(drop=True)
        pa=pd.read_csv('final_variable.csv')
        pali=list(pa.iloc[:,0])
        print(pali)
        #pali.append(test_list[track])
        #pali.append('DUMMYID')
        #pali.remove('AGEGPFF')
        #pali.remove('COUNTRY')
        #print(pali)
        #my_table=my_table[pali]
        training_data = my_table
        column_names=pali
    except FileNotFoundError:
        return "No training Data"


def train_and_evaluate(input_file):
    """Gradio callback: evaluate the uploaded CSV against all six outcomes.

    Reads `input_file`, aligns its columns with the training features,
    Yes/No-binarizes and one-hot encodes both frames with a shared encoder,
    then for each outcome trains the tracount=0 ensemble (run_model) and
    computes accuracy / balanced accuracy / weighted precision & recall /
    AUC plus a calibration curve, slope/intercept and plot.

    Returns (metrics_df, calibration_df, calibration_plots) on success,
    (None, None, None) when no file is given, or (error-string, None, None)
    on failure.
    """
    global training_data, column_names,test_list
    if training_data is None or column_names is None:
        load_training_data()
    if input_file is None:
        return None, None, None
    try:
        input_data = pd.read_csv(input_file.name)
        available_features = [col for col in column_names if col in training_data.columns]
        available_features_input = [col for col in available_features if col in input_data.columns]
        if not available_features_input:
            return "Error: No matching columns found between datasets", None, None
        # Prepare training data
        #X_train_full = training_data[available_features]
        outcome_cols = ['DEAD', 'GF', 'AGVHD', 'CGVHD', 'VOCPSHI', 'STROKEHI']
        test_list=outcome_cols.copy()
        total_cols=available_features+outcome_cols
        inter_df=training_data[total_cols]
        inter_df=inter_df.dropna()
        inter_df=inter_df.reset_index(drop=True)
        input_data=input_data[(input_data['YEARGPF']!='< 2008')]
        input_data=input_data.reset_index(drop=True)
        inter_input=input_data[total_cols]
        inter_input=inter_input.dropna()
        inter_input=inter_input.reset_index(drop=True)
        my_table=inter_df[available_features]
        # Prepare input data
        X_input = inter_input[available_features]
        X_input = X_input[my_table.columns]
        my_test=X_input
        # NOTE(review): the triple-quoted block below is dead code (an
        # unassigned string literal) kept from an earlier encoding approach.
        '''li1=['Yes','No']
        li2=['Event happened', 'No event']
        cols_with_unique_values1 = []
        cols_with_unique_values2 = []
        #print(my_table['EXCHTFPR'].isin(li1))
        for col in my_table.columns:
            if my_table[col].isin(li1).all():
                cols_with_unique_values1.append(col)
        for col in my_table.columns:
            if my_table[col].isin(li2).all():
                cols_with_unique_values2.append(col)
        #print(len(cols_with_unique_values1))
        #print(len(cols_with_unique_values2))
        my_ye=my_table[cols_with_unique_values1].replace(['Yes','No'],[1,0]).astype('int64')
        my_eve=my_table[cols_with_unique_values2].replace(['Event happened','No event'],[1,0]).astype('int64')
        my_table2=my_table.copy()
        ccc=[elem for elem in cols_with_unique_values1+cols_with_unique_values2]
        #print(ccc)
        my_table_modify=my_table2.drop(ccc,axis=1)
        my_table_modify=pd.concat([my_table_modify,my_ye,my_eve],axis=1)
        #my_table_modify=my_table_modify.drop([test_list[track],'DUMMYID'],axis=1)
        my_table_str=my_table_modify.select_dtypes(exclude=['number'])
        print(my_table_str.shape)
        my_table_num=my_table_modify.select_dtypes(include=['number'])
        #print(my_table_num.shape)
        enco=OneHotEncoder(sparse_output=True)
        fito=enco.fit(my_table_str)
        #mmm=aa.inverse_transform(g)
        tabl=enco.transform(my_table_str)
        tabl=pd.DataFrame(tabl.toarray(),columns=enco.get_feature_names_out())
        #print(tabl.shape)
        #print(dfcopy)
        ftable=pd.concat([tabl,my_table_num],axis=1)
        X_train_full=ftable
        li1=['Yes','No']
        li2=['Event happened', 'No event']
        cols_with_unique_values1 = []
        cols_with_unique_values2 = []
        for col in my_test.columns:
            if my_test[col].isin(li1).all():
                cols_with_unique_values1.append(col)
        for col in my_test.columns:
            if my_test[col].isin(li2).all():
                cols_with_unique_values2.append(col)
        #print(len(cols_with_unique_values1))
        #print(len(cols_with_unique_values2))
        my_ye=my_test[cols_with_unique_values1].replace(['Yes','No'],[1,0]).astype('int64')
        my_eve=my_test[cols_with_unique_values2].replace(['Event happened','No event'],[1,0]).astype('int64')
        my_test2=my_test.copy()
        ccc=[elem for elem in cols_with_unique_values1+cols_with_unique_values2]
        #print(ccc)
        my_test_modify=my_test2.drop(ccc,axis=1)
        my_test=pd.concat([my_test_modify,my_ye,my_eve],axis=1)
        #print(my_table_str.shape)
        my_test_num=my_test.select_dtypes(include=['number'])
        my_test_str=my_test.select_dtypes(exclude=['number'])
        mm=my_test_str.columns
        my_test_str=enco.transform(my_test_str)
        my_test_str=pd.DataFrame(my_test_str.toarray(),columns=enco.get_feature_names_out())
        my_test_real=pd.concat([my_test_str,my_test_num],axis=1)'''
        # Train data numerical: binarize Yes/No columns to 1/0.
        li1=['Yes','No']
        #li2=['Event happened', 'No event']
        cols_with_unique_values1 = []
        cols_with_unique_values2 = []
        #print(my_table['EXCHTFPR'].isin(li1))
        for col in my_table.columns:
            if my_table[col].isin(li1).all():
                cols_with_unique_values1.append(col)
        #for col in my_table.columns:
            #if my_table[col].isin(li2).all():
                #cols_with_unique_values2.append(col)
        #print(len(cols_with_unique_values1))
        #print(len(cols_with_unique_values2))
        my_ye=my_table[cols_with_unique_values1].replace(['Yes','No'],[1,0]).astype('int64')
        #my_eve=my_table[cols_with_unique_values2].replace(['Event happened','No event'],[1,0]).astype('int64')
        my_table2=my_table.copy()
        ccc=[elem for elem in cols_with_unique_values1+cols_with_unique_values2]
        #print(ccc)
        my_table_modify=my_table2.drop(ccc,axis=1)
        my_table_modify=pd.concat([my_table_modify,my_ye],axis=1)
        #my_table_modify=my_table_modify.drop([test_list[track],'DUMMYID'],axis=1)
        my_table_str=my_table_modify.select_dtypes(exclude=['number'])
        print(my_table_str.shape)
        my_table_num=my_table_modify.select_dtypes(include=['number'])
        #Test Data Numerical
        li1=['Yes','No']
        li2=['Event happened', 'No event']
        cols_with_unique_values1 = []
        cols_with_unique_values2 = []
        for col in my_test.columns:
            if my_test[col].isin(li1).all():
                cols_with_unique_values1.append(col)
        for col in my_test.columns:
            if my_test[col].isin(li2).all():
                cols_with_unique_values2.append(col)
        #print(len(cols_with_unique_values1))
        #print(len(cols_with_unique_values2))
        my_ye=my_test[cols_with_unique_values1].replace(['Yes','No'],[1,0]).astype('int64')
        #my_eve=my_test[cols_with_unique_values2].replace(['Event happened','No event'],[1,0]).astype('int64')
        my_test2=my_test.copy()
        ccc=[elem for elem in cols_with_unique_values1+cols_with_unique_values2]
        #print(ccc)
        my_test_modify=my_test2.drop(ccc,axis=1)
        my_test=pd.concat([my_test_modify,my_ye],axis=1)
        #print(my_table_str.shape)
        my_test_num=my_test.select_dtypes(include=['number'])
        my_test_str=my_test.select_dtypes(exclude=['number'])
        mm=my_test_str.columns
        # Common encoding: fit one OneHotEncoder on train+test categoricals
        # stacked together so both frames share identical dummy columns.
        df_combined = pd.concat([my_table_str, my_test_str], axis=0, ignore_index=True)
        enco = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = enco.fit_transform(df_combined)
        encoded_df = pd.DataFrame(encoded, columns=enco.get_feature_names_out())
        # Split the combined encoding back into train / test halves.
        tabl = encoded_df.iloc[:len(my_table_str)].reset_index(drop=True)
        tabl=tabl.reset_index(drop=True)
        ftable=pd.concat([tabl,my_table_num],axis=1)
        X_train_full=ftable
        my_test_str = encoded_df.iloc[len(my_table_str):].reset_index(drop=True)
        my_test_str=my_test_str.reset_index(drop=True)
        my_test_real=pd.concat([my_test_str,my_test_num],axis=1)
        metrics_results = []
        calibration_results = []
        calibration_plots = []
        outcome_names = ['Overall Survival', 'Graft Failure', 'Acute GVHD', 'Chronic GVHD', 'Vaso-Occlusive Crisis Post-HCT', 'Stroke Post-HCT']
        for i, (outcome_col, outcome_name) in enumerate(zip(outcome_cols, outcome_names)):
            if outcome_col not in training_data.columns:
                continue
            y_train_full = inter_df[outcome_col]
            # Encode minority class as 1, majority as 0 (per value counts).
            amaj1=y_train_full.value_counts().idxmax()
            amin1=y_train_full.value_counts().idxmin()
            #print(y.value_counts().idxmax())
            y_train_full=y_train_full.replace([amin1,amaj1],[1,0]).astype(int)  # FIX 1: added .astype(int)
            y_test_full = inter_input[outcome_col]
            # NOTE(review): test labels are re-derived from the *test* frame's
            # own value counts — if its majority class differs from training,
            # the 0/1 encoding flips. Verify against the label semantics.
            amaj1=y_test_full.value_counts().idxmax()
            amin1=y_test_full.value_counts().idxmin()
            #print(y.value_counts().idxmax())
            y_test_full=y_test_full.replace([amin1,amaj1],[1,0]).astype(int)  # FIX 1: added .astype(int)
            X_train,y_train=X_train_full.values,y_train_full.values
            x_te,y_test=my_test_real.values,y_test_full.values
            # vddc = original positive-class prior (used by ne_calib).
            vddc=len(np.where(y_train_full.to_numpy()==1)[0])/X_train_full.shape[0]
            deltaa=0.2
            rf=RandomForestClassifier()
            paramss={}
            tracount=0
            y_pred,y_pred_proba,secondnaive=run_model(X_train,x_te,y_train,deltaa,outcome_col,rf,X_train_full,i,ftable,paramss,enco,my_table_str,my_table_num,tabl,tracount,vddc)
            #mm=RandomForestClassifier(n_estimators=100, criterion='entropy')
            #mm.fit(X_train,y_train)
            #y_pred=mm.predict(x_te)
            #y_pred_proba=mm.predict_proba(x_te)[:,1]
            accuracy = accuracy_score(y_test, y_pred)
            balanced_acc = balanced_accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
            # NOTE(review): local `auc` shadows the `auc` imported from
            # sklearn.metrics (harmless here; the import is unused).
            auc = roc_auc_score(y_test, y_pred_proba)
            metrics_results.append([outcome_name, f"{accuracy:.3f}", f"{balanced_acc:.3f}", f"{precision:.3f}", f"{recall:.3f}", f"{auc:.3f}"])
            fraction_pos, mean_pred = calibration_curve(y_test, y_pred_proba, n_bins=10)
            # Calibration slope/intercept from a degree-1 least-squares fit.
            if len(mean_pred) > 1 and len(fraction_pos) > 1:
                slope = np.polyfit(mean_pred, fraction_pos, 1)[0]
                intercept = np.polyfit(mean_pred, fraction_pos, 1)[1]
            else:
                slope, intercept = 1.0, 0.0
            calibration_results.append([outcome_name, f"{slope:.3f}", f"{intercept:.3f}"])
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
            ax.plot(mean_pred, fraction_pos, 'o-', label=f'{outcome_name}')
            ax.set_xlabel('Mean Predicted Probability')
            ax.set_ylabel('Fraction of Positives')
            ax.set_title(f'Calibration Plot - {outcome_name}')
            ax.legend()
            ax.grid(True, alpha=0.3)
            plt.tight_layout()
            calibration_plots.append(fig)
        metrics_df = pd.DataFrame(metrics_results, columns=['Outcome', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'AUC'])
        calibration_df = pd.DataFrame(calibration_results, columns=['Outcome', 'Slope', 'Intercept'])
        return metrics_df, calibration_df, calibration_plots
    except Exception as e:
        return f"Error processing data: {str(e)}", None, None


def create_interface():
    # Build the Gradio UI. NOTE(review): this definition is truncated in the
    # visible source (it ends mid-call below); left as-is.
    load_training_data()
    with gr.Blocks(
        css="""
    .gradio-container {
        max-width: none !important;
        height: 100vh;
        overflow-y: auto;
    }
    .main-container {
        padding: 20px;
    }
    .big-title {
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 30px;
        text-align: center;
    }
    .section-title {
        font-size: 2em;
        font-weight: bold;
        margin: 40px 0 20px 0;
        color: #2d5aa0;
    }
    .subsection-title {
        font-size: 1.5em;
        font-weight: bold;
        margin: 30px 0 15px 0;
        color: #4a4a4a;
    }
    """,
        title="ML Model Evaluation Pipeline"
    ) as demo:
        with gr.Column(elem_classes=["main-container"]):
            gr.HTML('