import os
import sys

import numpy as np
import pandas as pd
import mlflow
import model_h

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Model training and evaluation
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import ml_insights as mli

# Explainability
import shap

##############################################################
# Specify which model to perform cross validation on
##############################################################
# True: hospital-only cohort; False: hospital + community cohort.
model_only_hosp = True

if model_only_hosp:
    file_suffix = "_only_hosp"
else:
    file_suffix = "_hosp_comm"

##############################################################
# Load data
##############################################################
# Redirect stdout to a log file for the rest of the script.
log = open("./training/logging/modelling" + file_suffix + ".log", "w")
sys.stdout = log

# Load CV folds (one collection of patient ids per fold)
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)

# Load imputed train data
train_data_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
train_data_imp = train_data_imp.drop(columns=['Sex_F', 'Age_TEnc'])

# Load not imputed train data
train_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
train_data_no_imp = train_data_no_imp.drop(columns=['Sex_F', 'Age_TEnc'])

# Load imputed test data
test_data_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')
test_data_imp = test_data_imp.drop(columns=['Sex_F', 'Age_TEnc'])

# Load not imputed test data
test_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
test_data_no_imp = test_data_no_imp.drop(columns=['Sex_F', 'Age_TEnc'])

# Create a tuple with training and validation indices for each fold. Can be
# done with either imputed or not imputed data as both have the same patients.
cross_val_fold_indices = []
for fold in fold_patients:
    fold_val_ids = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
    fold_train_ids = train_data_no_imp[
        ~(train_data_no_imp.StudyId.isin(fold_val_ids.StudyId))]
    # Append tuple of (training row indices, validation row indices)
    cross_val_fold_indices.append((fold_train_ids.index, fold_val_ids.index))

# Create list of model features (everything except the id and target columns)
cols_to_drop = ['StudyId', 'ExacWithin3Months']
features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop]

# Train data
# Separate features from target for data with no imputation performed
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')

# Test data
# Separate features from target for data with no imputation performed
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')

# Check that the target in imputed and not imputed datasets is the same.
# If not, raise an error.
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
train_target = train_target_no_imp
test_target = test_target_no_imp

# Make sure all features are numeric (any non-numeric value becomes NaN)
for features in [train_features_no_imp, train_features_imp,
                 test_features_no_imp, test_features_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors='coerce')

##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)

# Set CV scoring strategies
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall',
           'roc_auc', 'average_precision', 'neg_brier_score']
# Ratio of negative to positive samples, used to up-weight the minority class
scale_pos_weight = train_target.value_counts()[0] / train_target.value_counts()[1]

# Set up models. Each tuple contains 4 elements:
# (model, model name, imputation status, type of model)
models = []
# Dummy classifier baseline. 'none' pads the tuple to the same length as the
# others (the original 3-tuple left model[3] a latent IndexError); the
# model-type element is never read for this model.
models.append((DummyClassifier(strategy='stratified'),
               'dummy_classifier', 'imputed', 'none'))
# Logistic regression
models.append((LogisticRegression(random_state=0, max_iter=200),
               'logistic_regression', 'imputed', 'linear'))
models.append((LogisticRegression(random_state=0, class_weight='balanced',
                                  max_iter=200),
               'logistic_regression_CW_balanced', 'imputed', 'linear'))
# Random forest
models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_CW_balanced', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest', 'imputed', 'tree'))
# Bagging
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging', 'imputed', 'tree'))
# XGBoost (sparsity aware, so trained on the non-imputed data)
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1),
               'xgb', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1,
                                 max_depth=4),
               'xgb_mdepth_4', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1),
               'xgb_spw', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1, max_depth=4),
               'xgb_spw_mdepth_4', 'not_imputed', 'tree'))
# Light GBM. `verbose=-1` silences training output; the original passed
# `verbose_eval=-1`, which is an argument of lgb.train()/lgb.cv(), not a valid
# sklearn-API constructor/booster parameter, so it had no effect.
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1, verbose=-1),
               'lgbm', 'not_imputed', 'tree'))
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1,
                                  scale_pos_weight=scale_pos_weight, verbose=-1),
               'lgbm_spw', 'not_imputed', 'tree'))
# CatBoost
models.append((CatBoostClassifier(random_state=0, learning_rate=0.1),
               'catboost', 'not_imputed', 'tree'))

##############################################################
# Run models
##############################################################
# In each MLflow run, perform K-fold cross validation and capture the mean
# score across folds.
# Number of cross-validation folds; used to size the SHAP bookkeeping below
# (the original hard-coded 5).
n_folds = len(fold_patients)

with mlflow.start_run(run_name='model_selection_less_features_3rd_iter_minus_sex'):
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            print(model[1])

            # Pick the feature set matching the model's imputation status once,
            # instead of branching at every use site. Non-sparsity-aware models
            # need the imputed data.
            if model[2] == 'imputed':
                train_features = train_features_imp
                test_features = test_features_imp
            else:
                train_features = train_features_no_imp
                test_features = test_features_no_imp

            # Create the artifacts directory if it doesn't exist, then empty it
            # so files from different runs are not mixed.
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            # Perform K-fold cross validation with the custom patient folds.
            crossval = cross_validate(model[0], train_features, train_target,
                                      cv=cross_val_fold_indices,
                                      return_estimator=True, scoring=scoring,
                                      return_indices=True)
            # Out-of-fold predicted probabilities for the positive class.
            probabilities_cv = cross_val_predict(model[0], train_features,
                                                 train_target,
                                                 cv=cross_val_fold_indices,
                                                 method='predict_proba')[:, 1]

            # Get the threshold that gives the best F1 score.
            precision, recall, pr_thresholds = precision_recall_curve(
                train_target, probabilities_cv)
            fscore = (2 * precision * recall) / (precision + recall)

            # The max F-score can be NaN (where precision == recall == 0), and
            # NaN sorts last; walk down the sorted scores until a non-NaN is
            # found. argsort is hoisted out of the loop (it is invariant).
            sorted_fscore_idx = np.argsort(fscore, axis=0)
            position = -1
            best_thres_idx = sorted_fscore_idx[position]
            while np.isnan(fscore[best_thres_idx]):
                position -= 1
                best_thres_idx = sorted_fscore_idx[position]
            # NOTE(review): precision_recall_curve returns one more
            # precision/recall point than thresholds; if the final point (whose
            # F-score is 0) were ever selected this would raise IndexError —
            # confirm scores always exceed 0 somewhere.
            best_threshold = pr_thresholds[best_thres_idx]
            print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (
                best_threshold, fscore[best_thres_idx],
                precision[best_thres_idx], recall[best_thres_idx]))

            # Save f1 score, precision and recall for the best threshold
            mlflow.log_metric('best_threshold', best_threshold)
            mlflow.log_metric('f1_best_thres', fscore[best_thres_idx])
            mlflow.log_metric('precision_best_thres', precision[best_thres_idx])
            mlflow.log_metric('recall_best_thres', recall[best_thres_idx])

            # Plot confusion matrix at different thresholds (named so it does
            # not shadow the precision-recall thresholds above)
            cm_thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold]
            for threshold in cm_thresholds:
                y_predicted = probabilities_cv > threshold
                model_h.plot_confusion_matrix(
                    train_target, y_predicted, model[1], threshold, file_suffix)

            # Generate calibration curves (not meaningful for the dummy baseline)
            if model[1] != 'dummy_classifier':
                # Calibrated model (sigmoid / Platt scaling)
                model_sig = CalibratedClassifierCV(
                    model[0], method='sigmoid', cv=cross_val_fold_indices)
                model_sig.fit(train_features, train_target)
                probs_sig = model_sig.predict_proba(test_features)[:, 1]

                # Calibrated model (isotonic regression)
                model_iso = CalibratedClassifierCV(
                    model[0], method='isotonic', cv=cross_val_fold_indices)
                model_iso.fit(train_features, train_target)
                probs_iso = model_iso.predict_proba(test_features)[:, 1]

                # Spline calibration: fitted on the out-of-fold train
                # probabilities, then applied to an uncalibrated model refit on
                # the full train set.
                spline_calib = mli.SplineCalib()
                spline_calib.fit(probabilities_cv, train_target)
                model[0].fit(train_features, train_target)
                preds_test_uncalib = model[0].predict_proba(test_features)[:, 1]
                probs_spline = spline_calib.calibrate(preds_test_uncalib)

                # Plot calibration curves for equal width bins ('uniform': each
                # bin has the same width) and equal frequency bins ('quantile':
                # each bin has the same number of observations)
                for strategy in ['uniform', 'quantile']:
                    for bin_num in [5, 10]:
                        if strategy == 'uniform':
                            print('--- Creating calibration curve with equal width bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        else:
                            print('--- Creating calibration curve with equal frequency bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        print('Uncalibrated model:')
                        prob_true_uncal, prob_pred_uncal = calibration_curve(
                            train_target, probabilities_cv, n_bins=bin_num,
                            strategy=strategy)
                        print('Calibrated model (sigmoid):')
                        prob_true_sig, prob_pred_sig = calibration_curve(
                            test_target, probs_sig, n_bins=bin_num,
                            strategy=strategy)
                        print('Calibrated model (isotonic):')
                        prob_true_iso, prob_pred_iso = calibration_curve(
                            test_target, probs_iso, n_bins=bin_num,
                            strategy=strategy)
                        print('Calibrated model (spline):')
                        prob_true_spline, prob_pred_spline = calibration_curve(
                            test_target, probs_spline, n_bins=bin_num,
                            strategy=strategy)

                        plt.figure(figsize=(8, 8))
                        plt.plot([0, 1], [0, 1], linestyle='--')
                        plt.plot(prob_pred_uncal, prob_true_uncal, marker='.',
                                 label='Uncalibrated\n' + model[1])
                        plt.plot(prob_pred_sig, prob_true_sig, marker='.',
                                 label='Calibrated (Sigmoid)\n' + model[1])
                        plt.plot(prob_pred_iso, prob_true_iso, marker='.',
                                 label='Calibrated (Isotonic)\n' + model[1])
                        plt.plot(prob_pred_spline, prob_true_spline, marker='.',
                                 label='Calibrated (Spline)\n' + model[1])
                        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                        plt.tight_layout()
                        plt.savefig(os.path.join(
                            artifact_dir, model[1] + '_uncal_' + strategy
                            + '_bins' + str(bin_num) + file_suffix + '.png'))
                        plt.close()

            # Get total gain and total cover for boosting machine models
            if model[1].startswith("xgb"):
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix)
            if model[1].startswith("lgbm"):
                feature_names = train_features_no_imp.columns.tolist()
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix,
                    feature_names=feature_names)
            # Save feature importance by total gain.
            # NOTE(review): this file is rewritten by every boosting model, so
            # after the loop it holds only the LAST boosting model's
            # importances — confirm that is intended.
            if model[1].startswith("xgb") or model[1].startswith("lgbm"):
                feat_importance_tot_gain_df.to_csv(
                    './data/feature_importance_tot_gain' + file_suffix + '.csv',
                    index=False)

            # SHAP (skipped for models without a suitable explainer)
            if model[1] not in ['dummy_classifier', 'balanced_bagging']:
                # shap_vals_per_cv maps sample index -> fold number -> SHAP
                # values. An entry left as the empty placeholder dict means the
                # sample was not in that fold's training set.
                shap_vals_per_cv = {}
                for idx in range(len(train_data_imp)):
                    shap_vals_per_cv[idx] = {fold: {} for fold in range(n_folds)}

                # Get SHAP values from each fold's fitted estimator. The
                # imputation choice follows model[2] (equivalent to the
                # original hard-coded list of imputed model names, minus the
                # models already excluded from this section).
                for fold_idx, estimator in enumerate(crossval['estimator']):
                    X_train = train_features.iloc[
                        crossval['indices']['train'][fold_idx]]

                    # Apply different explainers depending on type of model
                    if model[3] == 'linear':
                        explainer = shap.LinearExplainer(estimator, X_train)
                    if model[3] == 'tree':
                        explainer = shap.TreeExplainer(estimator)
                    shap_values_train = explainer.shap_values(X_train)
                    # Some explainers return (class, num samples, num
                    # features); keep the positive class so the shape is
                    # (num samples, num features).
                    if len(np.shape(shap_values_train)) == 3:
                        shap_values_train = shap_values_train[1]

                    # Plot SHAP summary for this CV fold (1-based in filename)
                    shap.summary_plot(np.array(shap_values_train), X_train,
                                      show=False)
                    plt.savefig(os.path.join(
                        artifact_dir, model[1] + '_shap_cv_fold_'
                        + str(fold_idx + 1) + file_suffix + '.png'))
                    plt.close()

                    # Record each sample's SHAP values under its fold
                    for n, train_idx in enumerate(X_train.index.tolist()):
                        shap_vals_per_cv[train_idx][fold_idx] = shap_values_train[n]

                # Average SHAP values for each sample across the folds whose
                # training set contained it.
                average_shap_values, stds, ranges = [], [], []
                for i in range(len(train_data_imp)):
                    for n in range(n_folds):
                        # Replace the empty placeholder with NaN so pandas
                        # treats that fold as missing. Filled entries are numpy
                        # arrays (truth value is ambiguous), hence the explicit
                        # type check instead of the original bare `except: pass`.
                        # np.nan replaces np.NaN (removed in NumPy 2.0).
                        if isinstance(shap_vals_per_cv[i][n], dict) \
                                and not shap_vals_per_cv[i][n]:
                            shap_vals_per_cv[i][n] = np.nan
                    # One column per fold, one row per feature for this sample
                    df_per_obs = pd.DataFrame.from_dict(shap_vals_per_cv[i])
                    average_shap_values.append(df_per_obs.mean(axis=1).values)
                    # Per-sample spread statistics (currently not persisted)
                    stds.append(df_per_obs.std(axis=1).values)
                    ranges.append(df_per_obs.max(axis=1).values
                                  - df_per_obs.min(axis=1).values)

                # Summary plot of the averaged SHAP values over all samples
                if model[2] == 'imputed':
                    shap.summary_plot(
                        np.array(average_shap_values),
                        train_data_imp.drop(
                            columns=['StudyId', 'ExacWithin3Months']),
                        show=False)
                else:
                    shap.summary_plot(
                        np.array(average_shap_values),
                        train_data_no_imp.drop(
                            columns=['StudyId', 'ExacWithin3Months']),
                        show=False)
                plt.savefig(os.path.join(
                    artifact_dir, model[1] + '_shap' + file_suffix + '.png'))
                plt.close()

                # Get list of most important features in order
                feat_importance_df = model_h.get_shap_feat_importance(
                    model[1], average_shap_values, features_list, file_suffix)
                feat_importance_df.to_csv(
                    './data/feature_importance_shap' + file_suffix + '.csv',
                    index=False)

            # Plot distribution of model scores (histogram plus KDE)
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': train_target})
            sns.displot(model_scores, x="model_score", hue="true_label", kde=True)
            plt.savefig(os.path.join(
                artifact_dir,
                model[1] + 'score_distribution' + file_suffix + '.png'))
            plt.close()

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())
                mlflow.log_metric(score + '_std', crossval['test_' + score].std())

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
            # The original called mlflow.end_run() here. Inside `with
            # mlflow.start_run(...)` that ends the nested run early, so the
            # context manager's exit then terminated the PARENT run instead,
            # breaking all subsequent nested runs. The `with` statement closes
            # the run itself, so no explicit end_run() is needed.

# Join shap feature importance and total gain (the latter comes from the last
# boosting model written in the loop above)
shap_feat_importance = pd.read_csv(
    './data/feature_importance_shap' + file_suffix + '.csv')
tot_gain_feat_importance = pd.read_csv(
    './data/feature_importance_tot_gain' + file_suffix + '.csv')
tot_gain_feat_importance = tot_gain_feat_importance.rename(
    columns={'index': 'col_name'})
feat_importance_hierarchy = shap_feat_importance.merge(
    tot_gain_feat_importance, on='col_name', how='left')
feat_importance_hierarchy.to_csv(
    './data/feat_importance_hierarchy' + file_suffix + '.csv', index=False)

# Restore stdout and close the log file opened at the top of the script
sys.stdout = sys.__stdout__
log.close()