# copd-model-h / training / cross_val_final_models.py
import os
import sys
import numpy as np
import pandas as pd
import model_h
import shutil
import pickle
import yaml
# Plotting
import matplotlib.pyplot as plt
# Model training and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import precision_recall_curve, auc
from sklearn.calibration import CalibratedClassifierCV
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
import ml_insights as mli
import mlflow
# Explainability
from sklearn.inspection import permutation_importance
with open("./training/config.yaml", "r") as config:
config = yaml.safe_load(config)
model_type = config['model_settings']['model_type']
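# model_type selects which outcome definition is being modelled; the
# hyperparameter-run lookup further down expects 'only_hosp' or 'hosp_comm'.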
##############################################################
# Load data
##############################################################
# Setup log file
log = open(
os.path.join(config['outputs']['logging_dir'], "modelling_" + model_type + ".log"), "w")
sys.stdout = log
# Load CV folds
fold_patients = np.load(os.path.join(config['outputs']['cohort_info_dir'],
'fold_patients_' + model_type + '.npy'), allow_pickle=True)
# Load imputed crossval data
train_data_imp = model_h.load_data_for_modelling(os.path.join(
config["outputs"]["model_input_data_dir"],
"train_imputed_cv_{}.pkl".format(model_type),
))
# Load not imputed crossval data
train_data_no_imp = model_h.load_data_for_modelling(os.path.join(
config["outputs"]["model_input_data_dir"],
"train_not_imputed_cv_{}.pkl".format(model_type),
))
# Load imputed test data
test_data_imp = model_h.load_data_for_modelling(os.path.join(
config["outputs"]["model_input_data_dir"],
"test_imputed_{}.pkl".format(model_type),
))
# Load not imputed test data
test_data_no_imp = model_h.load_data_for_modelling(os.path.join(
config["outputs"]["model_input_data_dir"],
"test_not_imputed_{}.pkl".format(model_type),
))
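# Both imputed and non-imputed variants are loaded because each estimator below
# is paired with the dataset it needs: XGBoost handles missing values natively
# (the 'not_imputed' data), while the random forests require imputed inputs.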
# Load exac data
#train_exac_data = pd.read_pickle('./data/train_exac_data_' + model_type + '.pkl')
#test_exac_data = pd.read_pickle('./data/test_exac_data_' + model_type + '.pkl')
# Print date ranges for train and test set
print('Train date range',
train_data_imp['IndexDate'].min(), train_data_imp['IndexDate'].max())
print('Test date range',
test_data_imp['IndexDate'].min(), test_data_imp['IndexDate'].max())
# Set tags
tags = {"prediction_window": config['model_settings']['prediction_window'],
"lookback_period": config['model_settings']['lookback_period'],
"min_index_date": train_data_imp['IndexDate'].min(),
"max_index_date": train_data_imp['IndexDate'].max(),
"1_row_per_length_in_service_days": config['model_settings']['one_row_per_days_in_service'],
}
# Create a tuple of training and validation indices for each fold. This can be
# built from either the imputed or the non-imputed data, as both contain the same patients.
cross_val_fold_indices = []
for fold in fold_patients:
fold_val_ids = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
fold_train_ids = train_data_no_imp[~(
train_data_no_imp.StudyId.isin(fold_val_ids.StudyId))]
# Get index of rows in val and train
fold_val_index = fold_val_ids.index
fold_train_index = fold_train_ids.index
# Append tuple of training and val indices
cross_val_fold_indices.append((fold_train_index, fold_val_index))
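# Optional sanity check (an added sketch, assuming the folds are intended to
# partition the training rows): cross_val_predict further down requires every
# row to appear in exactly one validation fold.
all_val_idx = np.concatenate([val_idx for _, val_idx in cross_val_fold_indices])
assert len(all_val_idx) == len(train_data_no_imp), \
    'Validation folds do not cover every training row'
assert len(np.unique(all_val_idx)) == len(all_val_idx), \
    'A training row appears in more than one validation fold'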
# Create list of model features
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate', 'HospExacWithin3Months',
'CommExacWithin3Months']
features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop]
### Train data ###
# Separate features from target for data with no imputation performed
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')
### Test data ###
# Separate features from target for data with no imputation performed
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# Check that the targets in the imputed and non-imputed datasets are identical.
# If not, raise an error
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable differs between imputed and non-imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable differs between imputed and non-imputed datasets in the test set.')
train_target = train_target_no_imp
test_target = test_target_no_imp
# Make sure all features are numeric
for features in [train_features_no_imp, train_features_imp,
test_features_no_imp, test_features_imp]:
for col in features:
features[col] = pd.to_numeric(features[col], errors='coerce')
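# errors='coerce' converts any residual non-numeric values to NaN; the imputed
# datasets should not contain any, and the non-imputed (XGBoost) path treats
# NaN as missing.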
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1_' + model_type)
# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
'average_precision', 'neg_brier_score']
# Set up models, each tuple contains 4 elements: model, model name, imputation status,
# type of model
models = []
# These models are run for both hospital exac model and hospital and community exac model
models.append((BalancedRandomForestClassifier(random_state=0),
'balanced_random_forest', 'imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
eval_metric='logloss'),
'xgb', 'not_imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
# Get the parent run where hyperparameter tuning was done
if model_type == 'only_hosp':
    parent_run_id = 'ba2d7244654c4b84a815932a3167648f'
elif model_type == 'hosp_comm':
    parent_run_id = 'f71edd4c72f14c0692431dca297ec131'
else:
    raise ValueError('Unknown model_type: {}'.format(model_type))
##############################################################
# Run models
##############################################################
# In each MLflow run, perform K-fold cross-validation and capture the mean score across folds.
with mlflow.start_run(run_name='hyperparameter_optimised_models_12'):
for model in models:
# Get parameters of best scoring models
best_params = model_h.get_mlflow_run_params(
model[1], parent_run_id, 'sqlite:///mlruns.db', model_type)
# Each model will have multiple best scores for different scoring metrics.
for n, scorer in enumerate(best_params):
params = best_params[scorer]
model[0].set_params(**params)
with mlflow.start_run(run_name=model[1] + '_tuning_scorer_' + scorer, nested=True):
print(model[1], scorer)
                # Reset the artifacts directory so files from different runs are not mixed
                shutil.rmtree(config['outputs']['artifact_dir'], ignore_errors=True)
                os.makedirs(config['outputs']['artifact_dir'], exist_ok=True)
# Select correct data based on whether model is using imputed or not imputed
# dataset
if model[2] == 'imputed':
train_features = train_features_imp
test_features = test_features_imp
train_data = train_data_imp
test_data = test_data_imp
else:
train_features = train_features_no_imp
test_features = test_features_no_imp
train_data = train_data_no_imp
test_data = test_data_no_imp
mlflow.set_tags(tags=tags)
# Perform K-fold cross validation with custom folds
crossval = cross_validate(model[0], train_features, train_target,
cv=cross_val_fold_indices,
return_estimator=True, scoring=scoring,
return_indices=True)
                # Get the predicted probabilities from each fold's model
probabilities_cv = cross_val_predict(model[0], train_features,
train_target,
cv=cross_val_fold_indices,
method='predict_proba')[:, 1]
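                # These are out-of-fold probabilities: each row is scored by the
                # fold model that never trained on it, so the threshold selection
                # and calibration below are free of training-set leakage.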
                # Evaluation for uncalibrated model - test set
                uncalib_metrics_test_rows = []
                for iter_num, estimator in enumerate(crossval['estimator']):
                    probs_test = estimator.predict_proba(test_features)[:, 1]
                    preds_test = estimator.predict(test_features)
                    uncalib_metrics_test = model_h.calc_eval_metrics_for_model(
                        test_target, preds_test, probs_test, 'uncalib_test')
                    uncalib_metrics_test_rows.append(
                        pd.DataFrame(uncalib_metrics_test, index=[iter_num]))
                uncalib_metrics_test_df = pd.concat(uncalib_metrics_test_rows)
                uncalib_metrics_test_mean = uncalib_metrics_test_df.mean().to_dict()
# Get threshold that gives best F1 score for uncalibrated model
best_thres_uncal, f1_bt, prec_bt, rec_bt = model_h.get_threshold_with_best_f1_score(
train_target, probabilities_cv)
# Save f1 score, precision and recall for the best threshold
mlflow.log_metric('best_thres_uncal', best_thres_uncal)
mlflow.log_metric('f1_best_thres', f1_bt)
mlflow.log_metric('precision_best_thres', prec_bt)
mlflow.log_metric('recall_best_thres', rec_bt)
#### Plot confusion matrix at different thresholds ####
model_h.plot_confusion_matrix(
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_uncal], probabilities_cv,
train_target, model[1], model_type, 'uncalib')
#### Calculate AUC-PR score ####
precision, recall, thresholds = precision_recall_curve(
train_target, probabilities_cv)
auc_pr = auc(recall, precision)
mlflow.log_metric('auc_pr', auc_pr)
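                # AUC-PR summarises precision/recall across all thresholds and is
                # generally more informative than ROC AUC when the positive class
                # (an exacerbation within 3 months) is rare.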
#### Generate calibration curves ####
                if model[1] != 'dummy_classifier':
                    ### Sigmoid calibration ###
                    # Perform calibration
                    model_sig = CalibratedClassifierCV(
                        model[0], method='sigmoid', cv=cross_val_fold_indices)
                    model_sig.fit(train_features, train_target)
                    probs_sig = model_sig.predict_proba(test_features)[:, 1]
                    preds_sig = model_sig.predict(test_features)
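                    # Note: sigmoid (Platt) scaling fits a logistic mapping from raw
                    # scores to probabilities. With cv=cross_val_fold_indices,
                    # CalibratedClassifierCV fits one calibrated classifier per fold
                    # and averages their probabilities at predict time.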
# Generate metrics for calibrated model
calib_metrics_sig = model_h.calc_eval_metrics_for_model(
test_target, preds_sig, probs_sig, 'sig')
# Get threshold with best f1 score for calibrated model
best_thres_sig, _, _, _ = model_h.get_threshold_with_best_f1_score(
test_target, probs_sig)
mlflow.log_metric('best_thres_sig', best_thres_sig)
# Plot confusion matrices for calibrated model
model_h.plot_confusion_matrix(
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_sig], probs_sig,
test_target, model[1], model_type, "sig")
# Plot score distribution for calibrated model
model_h.plot_score_distribution(
test_target, probs_sig, config['outputs']['artifact_dir'], model[1], model_type, 'sig')
# Calculate std of auc-pr between CV folds
model_h.calc_std_for_calibrated_classifiers(
model_sig, 'sig', test_features, test_target)
### Isotonic calibration ###
# Perform calibration
model_iso = CalibratedClassifierCV(
model[0], method='isotonic', cv=cross_val_fold_indices)
model_iso.fit(train_features, train_target)
probs_iso = model_iso.predict_proba(test_features)[:, 1]
preds_iso = model_iso.predict(test_features)
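                    # Isotonic regression fits a monotone step function, more
                    # flexible than sigmoid scaling but prone to overfitting when
                    # the calibration folds are small.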
# Generate metrics for calibrated model
calib_metrics_iso = model_h.calc_eval_metrics_for_model(
test_target, preds_iso, probs_iso, 'iso')
# Get threshold with best f1 score for calibrated model
best_thres_iso, _, _, _ = model_h.get_threshold_with_best_f1_score(
test_target, probs_iso)
mlflow.log_metric('best_thres_iso', best_thres_iso)
# Plot confusion matrices for calibrated model
model_h.plot_confusion_matrix(
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_iso], probs_iso,
test_target, model[1], model_type, "iso")
# Plot score distribution for calibrated model
model_h.plot_score_distribution(
test_target, probs_iso, config['outputs']['artifact_dir'], model[1], model_type, 'iso')
# Calculate std of auc-pr between CV folds
model_h.calc_std_for_calibrated_classifiers(
model_iso, 'iso', test_features, test_target)
### Spline calibration ###
# Perform calibration
spline_calib = mli.SplineCalib()
spline_calib.fit(probabilities_cv, train_target)
model[0].fit(train_features, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features)[:, 1]
                    probs_spline = spline_calib.calibrate(preds_test_uncalib)
                    preds_spline = (probs_spline > 0.5).astype(int)
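                    # SplineCalib (ml_insights) learns a smooth spline mapping from
                    # the out-of-fold scores to probabilities; the base model is then
                    # refit on the full training set and its raw test scores are
                    # passed through the fitted calibrator. 0.5 is used here as the
                    # default classification threshold.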
# Generate metrics for calibrated model
calib_metrics_spline = model_h.calc_eval_metrics_for_model(
test_target, preds_spline, probs_spline, 'spline')
# Get threshold with best f1 score for calibrated model
best_thres_spline, _, _, _ = model_h.get_threshold_with_best_f1_score(
test_target, probs_spline)
mlflow.log_metric('best_thres_spline', best_thres_spline)
# Plot confusion matrices for calibrated model
model_h.plot_confusion_matrix(
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_spline], probs_spline,
test_target, model[1], model_type, "spline")
# Plot score distribution for calibrated model
model_h.plot_score_distribution(
test_target, probs_spline, config['outputs']['artifact_dir'], model[1], model_type, 'spline')
### Plot calibration curves ###
# Plot calibration curves for equal width bins (each bin has same width)
# and equal frequency bins (each bin has same number of observations)
for strategy in ['uniform', 'quantile']:
for bins in [5, 6, 10]:
plt.figure(figsize=(8,8))
plt.plot([0, 1], [0, 1], linestyle='--')
model_h.plot_calibration_curve(
train_target, probabilities_cv, bins, strategy, 'Uncalibrated')
                            model_h.plot_calibration_curve(
                                test_target, probs_sig, bins, strategy, 'Sigmoid')
model_h.plot_calibration_curve(
test_target, probs_iso, bins, strategy, 'Isotonic')
model_h.plot_calibration_curve(
test_target, probs_spline, bins, strategy, 'Spline')
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title(model[1])
plt.tight_layout()
plt.savefig(
os.path.join(config['outputs']['artifact_dir'], model[1] +
'_' + strategy + '_bins' + str(bins) + '_' +
model_type + '.png'))
plt.close()
# Plot uncalibrated model calibration curve at different bins and
# strategies
                    fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, figsize=(15, 10))
for ax in [ax1, ax2]:
ax.plot([0, 1], [0, 1], linestyle='--')
                    for bins in [5, 6, 7, 8, 9]:
                        model_h.plot_calibration_curve(
                            train_target, probabilities_cv, bins, 'quantile',
                            'Bins=' + str(bins), ax1)
                        model_h.plot_calibration_curve(
                            train_target, probabilities_cv, bins, 'uniform',
                            'Bins=' + str(bins), ax2)
ax1.title.set_text(model[1] + ' uncalibrated model quantile bins')
ax2.title.set_text(model[1] + ' uncalibrated model uniform bins')
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.tight_layout()
plt.savefig(
os.path.join(config['outputs']['artifact_dir'], model[1] + '_uncal_'
+ model_type + '.png'))
plt.close()
# Plot calibration curves with error bars
model_h.plot_calibration_plot_with_error_bars(
probabilities_cv, probs_sig, probs_iso, probs_spline, train_target,
test_target, model[1])
plt.close()
                #### Get total gain and total cover for boosting machine models ####
                if model[1].startswith("xgb"):
                    feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                        model[0], model[1], model_type)
                    # Save feature importance by total gain
                    feat_importance_tot_gain_df.to_csv(
                        './data/feature_importance_tot_gain_' + model_type + '.csv',
                        index=False)
#### Calculate model performance by event type ####
                if model[1] != 'dummy_classifier':
# Create df to contain prediction data and event type data
preds_event_df_uncalib = model_h.create_df_probabilities_and_predictions(
probabilities_cv, best_thres_uncal,
train_data['StudyId'].tolist(),
train_target,
train_data[['ExacWithin3Months','HospExacWithin3Months','CommExacWithin3Months']],
model[1], model_type, output_dir='./data/prediction_and_events/')
preds_events_df_sig = model_h.create_df_probabilities_and_predictions(
probs_sig, best_thres_sig, test_data['StudyId'].tolist(),
test_target,
test_data[['ExacWithin3Months', 'HospExacWithin3Months','CommExacWithin3Months']],
model[1], model_type, output_dir='./data/prediction_and_events/',
calib_type='sig')
preds_events_df_iso = model_h.create_df_probabilities_and_predictions(
probs_iso, best_thres_iso, test_data['StudyId'].tolist(),
test_target,
test_data[['ExacWithin3Months', 'HospExacWithin3Months','CommExacWithin3Months']],
model[1], model_type, output_dir='./data/prediction_and_events/',
calib_type='iso')
preds_events_df_spline = model_h.create_df_probabilities_and_predictions(
probs_spline, best_thres_spline, test_data['StudyId'].tolist(),
test_target,
test_data[['ExacWithin3Months', 'HospExacWithin3Months','CommExacWithin3Months']],
model[1], model_type, output_dir='./data/prediction_and_events/',
calib_type='spline')
# Subset to each event type and calculate metrics
metrics_by_event_type_uncalib = model_h.calc_metrics_by_event_type(
preds_event_df_uncalib, calib_type="uncalib")
metrics_by_event_type_sig = model_h.calc_metrics_by_event_type(
preds_events_df_sig, calib_type='sig')
metrics_by_event_type_iso = model_h.calc_metrics_by_event_type(
preds_events_df_iso, calib_type='iso')
metrics_by_event_type_spline = model_h.calc_metrics_by_event_type(
preds_events_df_spline, calib_type='spline')
# Subset to each event type and plot ROC curve
model_h.plot_roc_curve_by_event_type(
preds_event_df_uncalib, model[1], 'uncalib')
model_h.plot_roc_curve_by_event_type(
preds_events_df_sig, model[1], 'sig')
model_h.plot_roc_curve_by_event_type(
preds_events_df_iso, model[1], 'iso')
model_h.plot_roc_curve_by_event_type(
preds_events_df_spline, model[1], 'spline')
# Subset to each event type and plot PR curve
model_h.plot_prec_recall_by_event_type(
preds_event_df_uncalib, model[1], 'uncalib')
model_h.plot_prec_recall_by_event_type(
preds_events_df_sig, model[1], 'sig')
model_h.plot_prec_recall_by_event_type(
preds_events_df_iso, model[1], 'iso')
model_h.plot_prec_recall_by_event_type(
preds_events_df_spline, model[1], 'spline')
#### SHAP ####
                if model[1] != 'dummy_classifier':
### Uncalibrated model ###
# Get the average SHAP values from CV folds for uncalibrated model
shap_values_v_uncal, shap_values_t_uncal = model_h.get_uncalibrated_shap(
crossval['estimator'], test_features, train_features,
train_data[features_list].columns,
model[1], model_type)
## Plot SHAP summary plots ##
model_h.plot_averaged_summary_plot(
shap_values_t_uncal,
train_data[features_list],
model[1], 'uncalib', model_type)
## Plot SHAP interaction heatmap ##
model_h.plot_shap_interaction_value_heatmap(
crossval['estimator'], train_features,
train_data[features_list].columns,
model[1], model_type)
### Calibrated models ###
                    calib_models = {'sig': model_sig, 'iso': model_iso}
for calib_model_name in calib_models:
                        # Get the average SHAP values from CV folds for calibrated model
                        shap_values_v, shap_values_t = model_h.get_calibrated_shap_by_classifier(
                            calib_models[calib_model_name], test_features, train_features,
                            train_data[features_list].columns,
                            calib_model_name, model[1], model_type)
                        ## Plot SHAP summary plots ##
                        model_h.plot_averaged_summary_plot(
                            shap_values_t,
                            train_data[features_list],
                            model[1], calib_model_name, model_type)
                        ## Get feature importance for local SHAP values ##
                        feature_imp_df = model_h.get_local_shap_values(
                            model[1], model_type, shap_values_v, test_features,
                            calib_model_name, shap_ids_dir='./data/prediction_and_events/')
                        feature_imp_df.to_csv(
                            './data/prediction_and_events/local_feature_imp_df_' + model[1] +
                            '_' + calib_model_name + '.csv')
## Plot local SHAP plots ##
test_feat_enc_conv = model_h.plot_local_shap(
model[1], model_type, shap_values_v, test_features, train_features,
calib_model_name,
row_ids_to_plot=['missed', 'incorrect', 'correct'],
artifact_dir=config['outputs']['artifact_dir'],
shap_ids_dir='./data/prediction_and_events/',
reverse_scaling_flag=False,
convert_target_encodings=True, imputation=model[2],
target_enc_path="./data/artifacts/target_encodings_" + model_type + ".json",
return_enc_converted_df=False)
"""
### Plot SHAP dependency plots ###
os.makedirs( "./tmp/dependence_plots", exist_ok=True)
categorical_cols = [
"DaysSinceLastExac_te", "FEV1PercentPredicted_te"]
for categorical_col in categorical_cols:
shap.dependence_plot(
categorical_col, shap_values_v, test_feat_enc_conv,
interaction_index=None, show=False)
plt.tight_layout()
plt.savefig(
"./tmp/dependence_plots/dependence_plot_" + categorical_col
+ "_" + model[1] + "_" + calib_model_name + file_suffix + ".png")
plt.close()
"""
### Plot distribution of model scores for uncalibrated model ###
model_h.plot_score_distribution(
train_target, probabilities_cv, config['outputs']['artifact_dir'],
model[1], model_type)
"""
### Permutation feature importance ###
def calc_permutation_importance(model, features, target, scoring, n_repeats):
permutation_imp = permutation_importance(model, features, target, random_state=0, scoring=scoring, n_repeats=n_repeats)
for n, score in enumerate(permutation_imp):
if n == 0:
df = pd.DataFrame(data=permutation_imp[score]['importances_mean'], index=features.columns)
df = df.rename(columns={0:score})
else:
df[score] = permutation_imp[score]['importances_mean']
return df, permutation_imp
def plot_permutation_feature_importance(permutation_imp_full, metric, col_names, n_repeats, train_or_test):
os.makedirs("./tmp/permutation_feat_imp", exist_ok=True)
sorted_importances_idx = permutation_imp_full[metric].importances_mean.argsort()
importances = pd.DataFrame(
permutation_imp_full[metric].importances[sorted_importances_idx].T,
columns=col_names[sorted_importances_idx],
)
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances(" + train_or_test + ")")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()
plt.savefig('./tmp/permutation_feat_imp/' + train_or_test + '_' + metric + '_repeats' + str(n_repeats) +'.png')
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
full_dataset_feat = pd.concat([train_features, test_features], axis=0)
print(train_features)
print(full_dataset_feat)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
corr = spearmanr(full_dataset_feat).correlation
# Ensure the correlation matrix is symmetric
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)
# We convert the correlation matrix to a distance matrix before performing
# hierarchical clustering using Ward's linkage.
distance_matrix = 1 - np.abs(corr)
dist_linkage = hierarchy.ward(squareform(distance_matrix))
dendro = hierarchy.dendrogram(
dist_linkage, labels=full_dataset_feat.columns.to_list(), ax=ax1, leaf_rotation=90
)
dendro_idx = np.arange(0, len(dendro["ivl"]))
ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
ax2.set_yticklabels(dendro["ivl"])
_ = fig.tight_layout()
plt.show()
plt.close()
#features_to_drop = ["TotalEngagementMRC", "NumCommExacPrior6mo", "WeekAvgCATQ2", "WeekAvgCATQ4"]
#X_train_sel = train_features.drop(columns=features_to_drop)
#X_test_sel = test_features.drop(columns=features_to_drop)
from collections import defaultdict
cluster_ids = hierarchy.fcluster(dist_linkage, 0.5, criterion="distance")
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
cluster_id_to_feature_ids[cluster_id].append(idx)
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
selected_features_names = full_dataset_feat.columns[selected_features]
X_train_sel = train_features[selected_features_names]
X_test_sel = test_features[selected_features_names]
print(selected_features_names)
# retrain
# Perform calibration
model_sig_perm = CalibratedClassifierCV(
model[0], method='sigmoid',cv=cross_val_fold_indices)
model_sig_perm.fit(X_train_sel, train_target)
probs_sig = model_sig_perm.predict_proba(X_test_sel)[:, 1]
probs_sig_2 = model_sig_perm.predict_proba(X_test_sel)
preds_sig = model_sig_perm.predict(X_test_sel)
print('before')
print(calib_metrics_sig)
# Generate metrics for calibrated model
calib_metrics_sig = model_h.calc_eval_metrics_for_model(
    test_target, preds_sig, probs_sig, 'sig')
print(calib_metrics_sig)
def plot_permutation_importance(clf, X, y, ax):
result = permutation_importance(clf, X, y, n_repeats=10, random_state=42, n_jobs=2,scoring='average_precision')
perm_sorted_idx = result.importances_mean.argsort()
ax.boxplot(
result.importances[perm_sorted_idx].T,
vert=False,
labels=X.columns[perm_sorted_idx],
)
ax.axvline(x=0, color="k", linestyle="--")
return ax
fig, ax = plt.subplots(figsize=(7, 6))
plot_permutation_importance(model_sig_perm, X_test_sel, test_target, ax)
ax.set_title("Permutation Importances on selected subset of features\n(test set)")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()
plt.savefig('./tmp/permutation_feat_imp.png')
#for metric in ['f1', 'average_precision', 'roc_auc']:
# for n_repeats in [5,10, 50]:
# permutation_imp_train_df, permutation_imp_train_dict = calc_permutation_importance(model_sig, train_features, train_target, scoring=scoring, n_repeats=n_repeats)
# plot_permutation_feature_importance(permutation_imp_train_dict, metric, train_features.columns, n_repeats, 'train')
# for n_repeats in [5,10, 50]:
# permutation_imp_test_df, permutation_imp_test_dict = calc_permutation_importance(model_sig, test_features, test_target, scoring=scoring, n_repeats=n_repeats)
# plot_permutation_feature_importance(permutation_imp_test_dict, metric, test_features.columns, n_repeats, 'test')
"""
### Log metrics, parameters, and artifacts ###
# Log metrics averaged across folds
for score in scoring:
mlflow.log_metric(score, crossval['test_' + score].mean())
mlflow.log_metric(score + '_std', crossval['test_' + score].std())
            # Log test-set metrics for the uncalibrated and calibrated models
            if model[1] != 'dummy_classifier':
mlflow.log_metrics(uncalib_metrics_test_mean)
mlflow.log_metrics(calib_metrics_sig)
mlflow.log_metrics(calib_metrics_iso)
mlflow.log_metrics(calib_metrics_spline)
mlflow.log_metrics(metrics_by_event_type_uncalib)
mlflow.log_metrics(metrics_by_event_type_sig)
mlflow.log_metrics(metrics_by_event_type_iso)
mlflow.log_metrics(metrics_by_event_type_spline)
# Log model parameters
params = model[0].get_params()
for param in params:
mlflow.log_param(param, params[param])
# Log artifacts
mlflow.log_artifacts(config['outputs']['artifact_dir'])
                # Save calibrated models (ensure the output directory exists first)
                os.makedirs('./data/model', exist_ok=True)
                with open('./data/model/trained_sig_' + model[1] + '.pkl', 'wb') as f:
                    pickle.dump(model_sig, f)
                with open('./data/model/trained_iso_' + model[1] + '.pkl', 'wb') as f:
                    pickle.dump(model_iso, f)
                with open('./data/model/trained_spline_' + model[1] + '.pkl', 'wb') as f:
                    pickle.dump(spline_calib, f)
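                # To reload a saved calibrator later (a hedged sketch; the path
                # mirrors the dumps above, e.g. for the 'xgb' model):
                #     with open('./data/model/trained_sig_xgb.pkl', 'rb') as f:
                #         model_sig = pickle.load(f)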
mlflow.end_run()