# File size: 23,807 Bytes

# 000de75

import os
import sys
import numpy as np
import pandas as pd
import mlflow
import model_h

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Model training and evaluation
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import ml_insights as mli

# Explainability
import shap

##############################################################
# Specify which model to perform cross validation on
##############################################################
# Switch that picks the suffix appended to every data, log and output file name used in
# this script.
model_only_hosp = True
file_suffix = "_only_hosp" if model_only_hosp is True else "_hosp_comm"

##############################################################
# Load data
##############################################################
# Setup log file. Everything printed after this point is written to the log file
# instead of the console.
log_path = "./training/logging/modelling" + file_suffix + ".log"
# Create the logging directory if it is missing so open() does not fail on a fresh
# checkout
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# Line buffered (buffering=1) so that output printed before a crash is already on disk
log = open(log_path, "w", buffering=1)
sys.stdout = log

def _load_model_frame(name):
    """Load './data/model_data/<name><file_suffix>.pkl' without 'Sex_F' and 'Age_TEnc'."""
    frame = model_h.load_data_for_modelling(
        './data/model_data/' + name + file_suffix + '.pkl')
    return frame.drop(columns=['Sex_F', 'Age_TEnc'])


# Load CV folds (each entry is later matched against the StudyId column)
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)

# Train data, imputed then not imputed
train_data_imp = _load_model_frame('train_data_cv_imp')
train_data_no_imp = _load_model_frame('train_data_cv_no_imp')

# Test data, imputed then not imputed
test_data_imp = _load_model_frame('test_data_imp')
test_data_no_imp = _load_model_frame('test_data_no_imp')

# Create a tuple with training and validation indices for each fold. Can be done with
# either imputed or not imputed data as both have same patients.
# scikit-learn treats custom CV splits as positional row numbers, so the positions are
# derived from the boolean mask instead of being read from the DataFrame index labels
# (labels and positions only coincide when the index is a default 0..n-1 range).
cross_val_fold_indices = []
for fold in fold_patients:
    # True for rows whose patient belongs to this validation fold
    in_val_fold = train_data_no_imp.StudyId.isin(fold).to_numpy()

    # Append tuple of training and val row positions
    cross_val_fold_indices.append(
        (np.flatnonzero(~in_val_fold), np.flatnonzero(in_val_fold)))

# Every column except the patient identifier and the outcome is a model feature
cols_to_drop = ['StudyId', 'ExacWithin3Months']
features_list = [
    name for name in train_data_no_imp.columns if name not in cols_to_drop]

# Split each dataset into a float feature matrix and a float target vector
# Train data, not imputed
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp['ExacWithin3Months'].astype('float')
# Train data, imputed
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp['ExacWithin3Months'].astype('float')
# Test data, not imputed
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp['ExacWithin3Months'].astype('float')
# Test data, imputed
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp['ExacWithin3Months'].astype('float')

# A single target vector is shared by the imputed and not imputed features, so the two
# versions must be identical. Raise an error if they are not.
for split_name, target_plain, target_imputed in [
        ('train', train_target_no_imp, train_target_imp),
        ('test', test_target_no_imp, test_target_imp)]:
    if not target_plain.equals(target_imputed):
        raise ValueError(
            'Target variable is not the same in imputed and non imputed datasets in the '
            + split_name + ' set.')
train_target = train_target_no_imp
test_target = test_target_no_imp

# Coerce every feature column to a numeric dtype in place. With errors='coerce', values
# that cannot be parsed become NaN instead of raising.
for frame in (train_features_no_imp, train_features_imp,
              test_features_no_imp, test_features_imp):
    for column in frame.columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

##############################################################
# Specify which models to evaluate
##############################################################
# Point MLflow at the local SQLite store and select the experiment for this file suffix
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)

# Scorers computed on every cross validation fold
scoring = [
    'f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
    'average_precision', 'neg_brier_score',
]
# Number of rows with target 0 divided by number of rows with target 1; passed as
# scale_pos_weight to the boosting models below
class_counts = train_target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Set up models. Each tuple contains 4 elements:
#   model, model name, imputation status ('imputed' or 'not_imputed'),
#   type of model ('linear' or 'tree', used to pick the SHAP explainer; None when no
#   explainer applies)
models = []
# Dummy classifier. The 4th element is None so every tuple has the same length.
models.append((DummyClassifier(strategy='stratified'),
               'dummy_classifier', 'imputed', None))
# Logistic regression
models.append((LogisticRegression(random_state=0, max_iter=200),
               'logistic_regression', 'imputed', 'linear'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'logistic_regression_CW_balanced', 'imputed', 'linear'))
# Random forest
models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_CW_balanced', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest', 'imputed', 'tree'))
# Bagging
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging', 'imputed', 'tree'))
# XGBoost
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss', learning_rate=0.1),
               'xgb', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss', learning_rate=0.1, max_depth=4),
               'xgb_mdepth_4', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss', scale_pos_weight=scale_pos_weight, learning_rate=0.1),
               'xgb_spw', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss', scale_pos_weight=scale_pos_weight, learning_rate=0.1,
               max_depth=4),
               'xgb_spw_mdepth_4', 'not_imputed', 'tree'))
# Light GBM. 'verbose' (alias of LightGBM's 'verbosity') is the keyword that silences
# training output; 'verbose_eval' is not an estimator parameter and was only passed
# through as an unknown extra parameter.
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1, verbose=-1),
               'lgbm', 'not_imputed', 'tree'))
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1,
                                  scale_pos_weight=scale_pos_weight, verbose=-1),
               'lgbm_spw', 'not_imputed', 'tree'))
# CatBoost
models.append((CatBoostClassifier(random_state=0, learning_rate=0.1),
               'catboost', 'not_imputed', 'tree'))

# Convert features and target to a numpy array
# Train data
#train_features_no_imp = train_features_no_imp.to_numpy()
#train_features_imp = train_features_imp.to_numpy()
#train_target = train_target.to_numpy()
# Test data
#test_features_no_imp = test_features_no_imp.to_numpy()
#test_features_imp = test_features_imp.to_numpy()
#test_target = test_target.to_numpy()

##############################################################
# Run models
##############################################################
# One parent MLflow run for the whole comparison, with a nested run per model. Inside
# each nested run, K-fold cross validation is performed and scores across folds logged.
with mlflow.start_run(run_name='model_selection_less_features_3rd_iter_minus_sex'):
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            print(model[1])
            # Scratch directory for this model's artifacts; created on first use
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Empty it so files from the previous model are not logged with this one
            for stale_name in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, stale_name))
            
            # Non-sparsity aware models are cross validated on the imputed dataset,
            # all other models on the not imputed dataset
            if model[2] == 'imputed':
                cv_features = train_features_imp
            else:
                cv_features = train_features_no_imp

            # K-fold cross validation with the custom folds, keeping the fitted
            # estimator and the row indices of every fold
            crossval = cross_validate(model[0], cv_features, train_target,
                                      cv=cross_val_fold_indices, scoring=scoring,
                                      return_estimator=True, return_indices=True)

            # Out-of-fold predicted probabilities (column 1 of predict_proba)
            probabilities_cv = cross_val_predict(
                model[0], cv_features, train_target, cv=cross_val_fold_indices,
                method='predict_proba')[:, 1]

            # Get threshold that gives best F1 score
            precision, recall, thresholds = precision_recall_curve(
                train_target, probabilities_cv)
            # F1 is taken as 0 wherever precision + recall is 0. This avoids the
            # divide-by-zero warning and NaN entries, so a plain argmax can be used.
            pr_sum = precision + recall
            fscore = np.divide(2 * precision * recall, pr_sum,
                               out=np.zeros_like(pr_sum), where=pr_sum > 0)
            # precision and recall hold one more point than thresholds (the final
            # point has no threshold), so only positions with a threshold are searched
            best_thres_idx = int(np.argmax(fscore[:len(thresholds)]))
            best_threshold = thresholds[best_thres_idx]
            print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (
                best_threshold, fscore[best_thres_idx], precision[best_thres_idx],
                recall[best_thres_idx]))
            # Save f1 score, precision and recall for the best threshold
            mlflow.log_metric('best_threshold', best_threshold)
            mlflow.log_metric('f1_best_thres', fscore[best_thres_idx])
            mlflow.log_metric('precision_best_thres', precision[best_thres_idx])
            mlflow.log_metric('recall_best_thres', recall[best_thres_idx])

            # Plot confusion matrix at different thresholds. A separate name is used so
            # the threshold array returned by precision_recall_curve is not overwritten.
            cm_thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold]
            for threshold in cm_thresholds:
                y_predicted = probabilities_cv > threshold
                model_h.plot_confusion_matrix(
                    train_target, y_predicted, model[1], threshold, file_suffix)

            # Generate calibration curves
            if model[1] != 'dummy_classifier':
                # Train/test feature matrices matching the model's imputation status
                if model[2] == 'imputed':
                    calib_train_X, calib_test_X = train_features_imp, test_features_imp
                else:
                    calib_train_X, calib_test_X = (
                        train_features_no_imp, test_features_no_imp)

                # Calibrated model (Sigmoid), fitted on train and scored on test
                model_sig = CalibratedClassifierCV(
                    model[0], method='sigmoid', cv=cross_val_fold_indices)
                model_sig.fit(calib_train_X, train_target)
                probs_sig = model_sig.predict_proba(calib_test_X)[:, 1]

                # Calibrated model (Isotonic), fitted on train and scored on test
                model_iso = CalibratedClassifierCV(
                    model[0], method='isotonic', cv=cross_val_fold_indices)
                model_iso.fit(calib_train_X, train_target)
                probs_iso = model_iso.predict_proba(calib_test_X)[:, 1]

                # Spline calibration, fitted on the out-of-fold CV probabilities
                spline_calib = mli.SplineCalib()
                spline_calib.fit(probabilities_cv, train_target)

                # Fit the model itself on the full training data. Its test set
                # probabilities are then passed through the spline calibrator.
                model[0].fit(calib_train_X, train_target)
                preds_test_uncalib = model[0].predict_proba(calib_test_X)[:, 1]
                probs_spline = spline_calib.calibrate(preds_test_uncalib)

                # (printed heading, legend prefix, true labels, probabilities) for each
                # curve. The uncalibrated curve uses the CV predictions on the train
                # data, the calibrated curves use the test data.
                curve_inputs = [
                    ('Uncalibrated model:', 'Uncalibrated\n',
                     train_target, probabilities_cv),
                    ('Calibrated model (sigmoid):', 'Calibrated (Sigmoid)\n',
                     test_target, probs_sig),
                    ('Calibrated model (isotonic):', 'Calibrated (Isotonic)\n',
                     test_target, probs_iso),
                    ('Calibrated model (spline):', 'Calibrated (Spline)\n',
                     test_target, probs_spline),
                ]

                # Plot calibration curves for equal width bins ('uniform') and equal
                # frequency bins ('quantile'), each with 5 and 10 bins
                for strategy in ['uniform', 'quantile']:
                    bin_kind = 'width' if strategy == 'uniform' else 'frequency'
                    for bin_num in [5, 10]:
                        print('--- Creating calibration curve with equal ' + bin_kind
                              + ' bins ---')
                        print('-- Num bins:', bin_num, ' --')

                        curves = []
                        for heading, legend_prefix, y_true, y_prob in curve_inputs:
                            print(heading)
                            prob_true, prob_pred = calibration_curve(
                                y_true, y_prob, n_bins=bin_num, strategy=strategy)
                            curves.append((legend_prefix, prob_true, prob_pred))

                        plt.figure(figsize=(8, 8))
                        # Diagonal reference line
                        plt.plot([0, 1], [0, 1], linestyle='--')
                        for legend_prefix, prob_true, prob_pred in curves:
                            plt.plot(prob_pred, prob_true, marker='.',
                                     label=legend_prefix + model[1])
                        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                        plt.tight_layout()
                        plot_name = (model[1] + '_uncal_' + strategy + '_bins'
                                     + str(bin_num) + file_suffix + '.png')
                        plt.savefig(os.path.join(artifact_dir, plot_name))
                        plt.close()

            # Get total gain and total cover for boosting machine models
            is_xgb = model[1].startswith("xgb")
            is_lgbm = model[1].startswith("lgbm")
            if is_xgb:
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix)
            if is_lgbm:
                # For LightGBM the feature names are passed explicitly
                feature_names = train_features_no_imp.columns.tolist()
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix, feature_names=feature_names)
            # Save feature importance by total gain. The file name carries no model
            # name, so each xgb/lgbm model overwrites the previous one.
            if is_xgb or is_lgbm:
                feat_importance_tot_gain_df.to_csv(
                    './data/feature_importance_tot_gain' + file_suffix + '.csv', index=False)
            
            # SHAP
            if model[1] not in ['dummy_classifier', 'balanced_bagging']:
                # Use the feature matrix that matches the model's imputation status,
                # i.e. the one the CV estimators were trained on
                if model[2] == 'imputed':
                    shap_features = train_features_imp
                else:
                    shap_features = train_features_no_imp

                # SHAP values per sample and fold, shape (num samples, num folds,
                # num features). SHAP values are only computed on the training rows of
                # a fold, so a sample keeps NaN for folds where it was not trained on.
                n_folds = len(crossval['estimator'])
                shap_vals_per_cv = np.full(
                    (len(shap_features), n_folds, shap_features.shape[1]), np.nan)

                # Get SHAP values for each fold
                for i, estimator in enumerate(crossval['estimator']):
                    fold_num = i + 1
                    # Row positions used to train this fold's estimator
                    train_pos = np.asarray(crossval['indices']['train'][i])
                    X_train = shap_features.iloc[train_pos]

                    # Apply different explainers depending on type of model
                    if model[3] == 'linear':
                        explainer = shap.LinearExplainer(estimator, X_train)
                    elif model[3] == 'tree':
                        explainer = shap.TreeExplainer(estimator)
                    else:
                        # Fail loudly instead of reusing a previous model's explainer
                        raise ValueError(
                            'No SHAP explainer configured for model type: '
                            + str(model[3]))

                    # Get shap values
                    shap_values_train = np.asarray(explainer.shap_values(X_train))
                    # Some models return one set of values per class. Reduce these to
                    # (num samples, num features) by keeping the values for class 1.
                    if shap_values_train.ndim == 3:
                        if shap_values_train.shape[:2] == X_train.shape:
                            # Layout (num samples, num features, class)
                            # NOTE(review): believed to be the layout of newer SHAP
                            # releases - confirm against the installed version
                            shap_values_train = shap_values_train[:, :, 1]
                        else:
                            # Layout (class, num samples, num features)
                            shap_values_train = shap_values_train[1]

                    # Plot SHAP plots for each cv fold
                    shap.summary_plot(shap_values_train, X_train, show=False)
                    plt.savefig(os.path.join(artifact_dir, model[1] + '_shap_cv_fold_' +
                                             str(fold_num) + file_suffix + '.png'))
                    plt.close()

                    # Store this fold's values against the row positions trained on
                    shap_vals_per_cv[train_pos, i, :] = shap_values_train

                # Per-sample statistics across folds, ignoring the NaN folds. Kept as
                # lists with one array per sample. stds and ranges are not used further
                # in this script.
                average_shap_values = list(np.nanmean(shap_vals_per_cv, axis=1))
                stds = list(np.nanstd(shap_vals_per_cv, axis=1, ddof=1))
                ranges = list(np.nanmax(shap_vals_per_cv, axis=1)
                              - np.nanmin(shap_vals_per_cv, axis=1))

                # Plot SHAP plots. The same feature frame as above is used so that its
                # columns line up with the columns of the SHAP values.
                shap.summary_plot(
                    np.array(average_shap_values), shap_features, show=False)
                plt.savefig(
                    os.path.join(artifact_dir, model[1] + '_shap' + file_suffix + '.png'))
                plt.close()

                # Get list of most important features in order
                feat_importance_df = model_h.get_shap_feat_importance(
                    model[1], average_shap_values, features_list, file_suffix)
                feat_importance_df.to_csv(
                    './data/feature_importance_shap' + file_suffix + '.csv', index=False)

            # Plot distribution of the CV model scores (histogram plus KDE), coloured
            # by the true label
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': train_target})
            sns.displot(model_scores, x="model_score", hue="true_label", kde=True)
            score_plot_name = model[1] + 'score_distribution' + file_suffix + '.png'
            plt.savefig(os.path.join(artifact_dir, score_plot_name))
            plt.close()

            # Log mean and standard deviation across folds for every scorer
            for metric_name in scoring:
                fold_scores = crossval['test_' + metric_name]
                mlflow.log_metric(metric_name, fold_scores.mean())
                mlflow.log_metric(metric_name + '_std', fold_scores.std())
            # Log model parameters
            for param_name, param_value in model[0].get_params().items():
                mlflow.log_param(param_name, param_value)
            # Log everything saved to the artifact directory for this model
            mlflow.log_artifacts(artifact_dir)

mlflow.end_run()

# Join shap feature importance and total gain. Both CSV files are rewritten by every
# qualifying model in the loop above, so they hold the values of the last model that
# wrote them.
shap_feat_importance = pd.read_csv(
    './data/feature_importance_shap' + file_suffix + '.csv')
tot_gain_feat_importance = pd.read_csv(
    './data/feature_importance_tot_gain' + file_suffix + '.csv'
).rename(columns={'index': 'col_name'})
# Left join keeps every SHAP feature, with or without a total gain entry
feat_importance_hierarchy = pd.merge(
    shap_feat_importance, tot_gain_feat_importance, on='col_name', how='left')
feat_importance_hierarchy.to_csv(
    './data/feat_importance_hierarchy' + file_suffix + '.csv', index=False)