# copd-model-h / training / cross_val_first_models.py
# (initial upload, commit 000de75)
import os
import sys
import numpy as np
import pandas as pd
import mlflow
import model_h
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Model training and evaluation
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import ml_insights as mli
# Explainability
import shap
##############################################################
# Specify which model to perform cross validation on
##############################################################
# When True, model the hospitalised-only cohort; otherwise model the combined
# hospital + community cohort. The suffix is appended to every data, log and
# artifact path so the two cohorts' outputs never collide.
model_only_hosp = True
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
##############################################################
# Load data
##############################################################
# Setup log file: everything printed below is captured in a per-cohort log.
# Create the log directory first so open() cannot fail on a fresh checkout.
os.makedirs("./training/logging", exist_ok=True)
log = open("./training/logging/modelling" + file_suffix + ".log", "w")
sys.stdout = log
def _load_split(path):
    """Load one modelling dataframe and drop the unused sex/age-encoding columns."""
    frame = model_h.load_data_for_modelling(path)
    return frame.drop(columns=['Sex_F', 'Age_TEnc'])


# Load CV folds
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
# Load the four modelling datasets: train/test crossed with imputed/not-imputed
train_data_imp = _load_split(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
train_data_no_imp = _load_split(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
test_data_imp = _load_split(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')
test_data_no_imp = _load_split(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
# Build one (train_index, val_index) tuple per CV fold. Either the imputed or
# the non-imputed frame could be used here because both contain the same
# patients in the same row order.
cross_val_fold_indices = []
for fold in fold_patients:
    # Rows whose patient is in this fold form the validation split ...
    val_rows = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
    # ... and every row belonging to any other patient forms the train split.
    train_rows = train_data_no_imp[
        ~train_data_no_imp.StudyId.isin(val_rows.StudyId)]
    cross_val_fold_indices.append((train_rows.index, val_rows.index))
# Model features are every column except the patient id and the target.
cols_to_drop = ['StudyId', 'ExacWithin3Months']
features_list = [c for c in train_data_no_imp.columns if c not in cols_to_drop]
# Train data
# Separate features from target for data with no imputation performed
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')
# Test data
# Separate features from target for data with no imputation performed
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# Check that the target in imputed and not imputed datasets are the same. If not,
# raise an error
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
# The targets were proven identical above, so either copy can serve as the
# single target used for all models below.
train_target = train_target_no_imp
test_target = test_target_no_imp
# Make sure all features are numeric. errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
for features in [train_features_no_imp, train_features_imp,
                 test_features_no_imp, test_features_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors='coerce')
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow: local sqlite backend, one experiment per cohort suffix.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)
# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
# Ratio of negative to positive training examples, used to re-weight the
# minority (positive) class in the boosting models.
scale_pos_weight = train_target.value_counts()[0] / train_target.value_counts()[1]
# Set up models. Each tuple contains 4 elements:
#   (estimator, run name, imputation status, model type used to pick a SHAP explainer)
models = []
# Dummy classifier (baseline). SHAP is skipped for it, so its model type is
# never read, but 'none' is included to keep every tuple the same length.
models.append((DummyClassifier(strategy='stratified'),
               'dummy_classifier', 'imputed', 'none'))
# Logistic regression
models.append((LogisticRegression(random_state=0, max_iter=200),
               'logistic_regression', 'imputed', 'linear'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'logistic_regression_CW_balanced', 'imputed', 'linear'))
# Random forest
models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_CW_balanced', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest', 'imputed', 'tree'))
# Bagging
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging', 'imputed', 'tree'))
# XGBoost
# NOTE(review): use_label_encoder was deprecated and removed in XGBoost 2.x --
# confirm the pinned xgboost version still accepts it.
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1),
               'xgb', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1, max_depth=4),
               'xgb_mdepth_4', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1),
               'xgb_spw', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1, max_depth=4),
               'xgb_spw_mdepth_4', 'not_imputed', 'tree'))
# Light GBM
# NOTE(review): 'verbose_eval' is an argument of lgb.train()/lgb.cv(), not of
# LGBMClassifier (whose silencing parameter is verbose=-1) -- it likely has no
# effect here. Left unchanged so the parameters logged to MLflow stay
# comparable with earlier runs; verify intent.
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1, verbose_eval=-1),
               'lgbm', 'not_imputed', 'tree'))
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1,
                                  scale_pos_weight=scale_pos_weight, verbose_eval=-1),
               'lgbm_spw', 'not_imputed', 'tree'))
# CatBoost
models.append((CatBoostClassifier(random_state=0, learning_rate=0.1),
               'catboost', 'not_imputed', 'tree'))
##############################################################
# Run models
##############################################################
# In an MLflow parent run, perform K-fold cross validation for every candidate
# model and capture the mean score across folds. Model tuples are
# (estimator, run name, imputation status, model type) -- see the model
# definitions above.
with mlflow.start_run(run_name='model_selection_less_features_3rd_iter_minus_sex'):
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            print(model[1])
            # Create the artifacts directory if it doesn't exist
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove existing directory contents to not mix files between different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))
            # Perform K-fold cross validation with custom folds using imputed dataset for
            # non-sparsity aware models
            if model[2] == 'imputed':
                crossval = cross_validate(model[0], train_features_imp, train_target,
                                          cv=cross_val_fold_indices,
                                          return_estimator=True, scoring=scoring,
                                          return_indices=True)
                # Get the out-of-fold predicted probabilities (positive class)
                probabilities_cv = cross_val_predict(model[0], train_features_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]
            else:
                crossval = cross_validate(model[0], train_features_no_imp, train_target,
                                          cv=cross_val_fold_indices, return_estimator=True,
                                          scoring=scoring, return_indices=True)
                # Get the out-of-fold predicted probabilities (positive class)
                probabilities_cv = cross_val_predict(model[0], train_features_no_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]
            # Get threshold that gives best F1 score
            precision, recall, thresholds = precision_recall_curve(
                train_target, probabilities_cv)
            fscore = (2 * precision * recall) / (precision + recall)
            # F-score is NaN wherever precision + recall == 0, and NaN sorts as
            # the maximum. Walk backwards through the argsort positions until a
            # non-NaN maximum is found.
            fscore_zero = True
            position = -1
            while fscore_zero is True:
                best_thres_idx = np.argsort(fscore, axis=0)[position]
                if np.isnan(fscore[best_thres_idx]) == True:
                    position = position - 1
                else:
                    fscore_zero = False
            best_threshold = thresholds[best_thres_idx]
            print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (
                best_threshold, fscore[best_thres_idx], precision[best_thres_idx],
                recall[best_thres_idx]))
            # Save f1 score, precision and recall for the best threshold
            mlflow.log_metric('best_threshold', best_threshold)
            mlflow.log_metric('f1_best_thres', fscore[best_thres_idx])
            mlflow.log_metric('precision_best_thres', precision[best_thres_idx])
            mlflow.log_metric('recall_best_thres', recall[best_thres_idx])
            # Plot confusion matrix at different thresholds.
            # (Deliberately shadows the `thresholds` array from the PR curve
            # above; it is not needed again.)
            thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold]
            for threshold in thresholds:
                y_predicted = probabilities_cv > threshold
                model_h.plot_confusion_matrix(
                    train_target, y_predicted, model[1], threshold, file_suffix)
            # Generate calibration curves (skipped for the dummy baseline)
            if model[1] != 'dummy_classifier':
                # Calibrated model (Sigmoid / Platt scaling)
                model_sig = CalibratedClassifierCV(
                    model[0], method='sigmoid', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_sig.fit(train_features_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_imp)[:, 1]
                else:
                    model_sig.fit(train_features_no_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_no_imp)[:, 1]
                # Calibrated model (Isotonic)
                model_iso = CalibratedClassifierCV(
                    model[0], method='isotonic', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_iso.fit(train_features_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_imp)[:, 1]
                else:
                    model_iso.fit(train_features_no_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_no_imp)[:, 1]
                # Spline calibration: fitted on the out-of-fold CV
                # probabilities, then applied to the uncalibrated test-set
                # predictions of a model refit on the full train set.
                spline_calib = mli.SplineCalib()
                spline_calib.fit(probabilities_cv, train_target)
                if model[2] == 'imputed':
                    model[0].fit(train_features_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_imp)[:, 1]
                else:
                    model[0].fit(train_features_no_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_no_imp)[:, 1]
                probs_spline = spline_calib.calibrate(preds_test_uncalib)
                # Plot calibration curves for equal width bins (each bin has same width) and
                # equal frequency bins (each bin has same number of observations).
                # NOTE(review): the uncalibrated curve uses train-set CV
                # probabilities while the calibrated curves use test-set
                # probabilities -- confirm this mix is intentional.
                for strategy in ['uniform', 'quantile']:
                    for bin_num in [5, 10]:
                        if strategy == 'uniform':
                            print('--- Creating calibration curve with equal width bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        else:
                            print('--- Creating calibration curve with equal frequency bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        print('Uncalibrated model:')
                        prob_true_uncal, prob_pred_uncal = calibration_curve(
                            train_target, probabilities_cv, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (sigmoid):')
                        prob_true_sig, prob_pred_sig = calibration_curve(
                            test_target, probs_sig, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (isotonic):')
                        prob_true_iso, prob_pred_iso = calibration_curve(
                            test_target, probs_iso, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (spline):')
                        prob_true_spline, prob_pred_spline = calibration_curve(
                            test_target, probs_spline, n_bins=bin_num, strategy=strategy)
                        plt.figure(figsize=(8, 8))
                        plt.plot([0, 1], [0, 1], linestyle='--')
                        plt.plot(prob_pred_uncal, prob_true_uncal, marker='.',
                                 label='Uncalibrated\n' + model[1])
                        plt.plot(prob_pred_sig, prob_true_sig, marker='.',
                                 label='Calibrated (Sigmoid)\n' + model[1])
                        plt.plot(prob_pred_iso, prob_true_iso, marker='.',
                                 label='Calibrated (Isotonic)\n' + model[1])
                        plt.plot(prob_pred_spline, prob_true_spline, marker='.',
                                 label='Calibrated (Spline)\n' + model[1])
                        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                        plt.tight_layout()
                        plt.savefig(os.path.join(artifact_dir, model[1] + '_uncal_' +
                                                 strategy + '_bins' + str(bin_num) +
                                                 file_suffix + '.png'))
                        plt.close()
            # Get total gain and total cover for boosting machine models
            if model[1].startswith("xgb"):
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix)
            if (model[1].startswith("lgbm")):
                feature_names = train_features_no_imp.columns.tolist()
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix, feature_names=feature_names)
            # Save feature importance by total gain. The same file is rewritten
            # for every xgb/lgbm model, so it ends up holding the last one run.
            if (model[1].startswith("xgb")) | (model[1].startswith("lgbm")):
                feat_importance_tot_gain_df.to_csv(
                    './data/feature_importance_tot_gain' + file_suffix + '.csv', index=False)
            # SHAP (skipped for models without a supported explainer)
            if model[1] not in ['dummy_classifier', 'balanced_bagging']:
                # NOTE(review): shap_values_list_train is never used below.
                shap_values_list_train = []
                shap_vals_per_cv = {}
                # Create a dictionary to contain shap values. Dictionary is structured as
                # index : fold_num : shap_values
                for idx in range(0, len(train_data_imp)):
                    shap_vals_per_cv[idx] = {}
                    for n_fold in range(0, 5):
                        shap_vals_per_cv[idx][n_fold] = {}
                # Get SHAP values for each fold. fold_num (1-based) is only
                # used in artifact file names; i (0-based) keys the dictionary.
                fold_num = 0
                for i, estimator in enumerate(crossval['estimator']):
                    fold_num = fold_num + 1
                    # If imputation needed for model, use imputed features
                    if model[1] in ['logistic_regression',
                                    'logistic_regression_CW_balanced', 'random_forest',
                                    'random_forest_CW_balanced', 'balanced_bagging',
                                    'balanced_random_forest']:
                        X_train = train_features_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_imp.iloc[crossval['indices']['test'][i]]
                    else:
                        X_train = train_features_no_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_no_imp.iloc[crossval['indices']['test'][i]]
                    # Apply different explainers depending on type of model
                    if model[3] == 'linear':
                        explainer = shap.LinearExplainer(estimator, X_train)
                    if model[3] == 'tree':
                        explainer = shap.TreeExplainer(estimator)
                    # Get shap values
                    shap_values_train = explainer.shap_values(X_train)
                    # Output of shap values for some models is (class, num samples,
                    # num features). Get these in the format of (num samples, num
                    # features) by keeping index 1 (the positive class).
                    if len(np.shape(shap_values_train)) == 3:
                        shap_values_train = shap_values_train[1]
                    # Plot SHAP plots for each cv fold
                    shap.summary_plot(np.array(shap_values_train), X_train, show=False)
                    plt.savefig(os.path.join(artifact_dir, model[1] + '_shap_cv_fold_' +
                                             str(fold_num) + file_suffix + '.png'))
                    plt.close()
                    # Add shap values to the dictionary, replacing the empty
                    # placeholder dict for (row index, fold i).
                    train_idxs = X_train.index.tolist()
                    for n, train_idx in enumerate(train_idxs):
                        shap_vals_per_cv[train_idx][i] = shap_values_train[n]
                # Calculate average shap values across folds for every sample
                average_shap_values, stds, ranges = [], [], []
                for i in range(0, len(train_data_imp)):
                    for n in range(0, 5):
                        # If a cv fold is empty as that set has not been used in training,
                        # replace empty fold with NaN.
                        # NOTE(review): np.NaN was removed in NumPy 2.0 (use
                        # np.nan), and the bare except silently hides errors --
                        # both worth revisiting.
                        try:
                            if not shap_vals_per_cv[i][n]:
                                shap_vals_per_cv[i][n] = np.NaN
                        except:
                            pass
                    # Create a df for each index that contains all shap values for each cv
                    # fold
                    df_per_obs = pd.DataFrame.from_dict(shap_vals_per_cv[i])
                    # Get relevant statistics for every sample
                    average_shap_values.append(df_per_obs.mean(axis=1).values)
                    stds.append(df_per_obs.std(axis=1).values)
                    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)
                # Plot SHAP plots for the fold-averaged values
                if model[2] == 'imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                if model[2] == 'not_imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_no_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                plt.savefig(
                    os.path.join(artifact_dir, model[1] + '_shap' + file_suffix + '.png'))
                plt.close()
                # Get list of most important features in order (file is
                # rewritten per model; it ends up holding the last SHAP model)
                feat_importance_df = model_h.get_shap_feat_importance(
                    model[1], average_shap_values, features_list, file_suffix)
                feat_importance_df.to_csv(
                    './data/feature_importance_shap' + file_suffix + '.csv', index=False)
            # Plot distribution of model scores (histogram plus KDE)
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': train_target})
            sns.displot(model_scores, x="model_score", hue="true_label", kde=True)
            plt.savefig(os.path.join(artifact_dir, model[1] + 'score_distribution' +
                                     file_suffix + '.png'))
            plt.close()
            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())
                mlflow.log_metric(score + '_std', crossval['test_' + score].std())
            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])
            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
# Defensive clean-up: ensure no MLflow run is left active (a no-op when the
# context managers above exited normally).
mlflow.end_run()
# Merge the SHAP-ranked feature importances with the boosting total-gain
# importances so both rankings live in a single hierarchy file.
shap_feat_importance = pd.read_csv(
    './data/feature_importance_shap' + file_suffix + '.csv')
# Align the join key: the total-gain file names its feature column 'index'.
tot_gain_feat_importance = pd.read_csv(
    './data/feature_importance_tot_gain' + file_suffix + '.csv').rename(
        columns={'index': 'col_name'})
# Left join keeps every SHAP-ranked feature even when it has no gain entry.
feat_importance_hierarchy = shap_feat_importance.merge(
    tot_gain_feat_importance, how='left', on='col_name')
feat_importance_hierarchy.to_csv(
    './data/feat_importance_hierarchy' + file_suffix + '.csv', index=False)