"""Perform model calibration in CV on different algorithms and log to mlflow. Nests runs for different algos under parent run and logs the following artifacts as well as metrics and parameters: 1. Calibration curves for each child algo run (calibration in CV and calibration on holdout test after applying isotonic and sigmoid calibration) 2. Calibration curve under parent run to compare all algos in CV and post calibration 3. Cumulative gains curve 4. Lift curve 5. Probability distributions with KDE (CV) """ import matplotlib.pyplot as plt import matplotlib.lines as mlines from lenusml import splits, plots import numpy as np import os import pandas as pd from sklearn.model_selection import cross_val_predict, cross_validate from sklearn.calibration import calibration_curve, CalibratedClassifierCV from sklearn.linear_model import LogisticRegression from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier from sklearn.ensemble import RandomForestClassifier import xgboost as xgb import lightgbm as lgb from interpret.glassbox import ExplainableBoostingClassifier import mlflow from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID data_dir = '../data/models/model1/' cohort_info_dir = '../data/cohort_info/' output_dir = '../data/models/model1/output' # Load CV folds and train data fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'), allow_pickle=True) train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl')) test_data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl')) # Cross check fold patients with train data cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients, train_data=train_data, id_column='StudyId') mlflow.set_tracking_uri("sqlite:///mlruns.sqlite") mlflow.set_experiment('model_drop2') # Set CV scoring strategies and any model parameters scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'neg_brier_score'] def plot_calibration_curves(calibration_curves, savefig=True, output_dir=None, figname=None, figsize=(8, 7)): fig, ax = plt.subplots(figsize=figsize) # reference line, legends, and axis labels line = mlines.Line2D([0, 1], [0, 1], color='black') transform = ax.transAxes line.set_transform(transform) ax.add_line(line) fig.suptitle('Calibration plot') ax.set_xlabel('Predicted probability') ax.set_ylabel('True probability in each bin') color = iter(plt.cm.rainbow(np.linspace(0, 1, len(calibration_curves)))) for cal_curve in calibration_curves: c = next(color) plt.plot(cal_curve[0][1], cal_curve[0][0], marker='o', c=c, linewidth=1, label=cal_curve[1]) plt.xlim(0, 1) plt.ylim(0, 1) plt.legend(frameon=False, bbox_to_anchor=(1, 1), loc="upper left") plt.tight_layout() if savefig: plt.savefig(os.path.join(output_dir, figname)) def plot_calibration_curves_algo(calibration_curves, savefig=True, output_dir=None, figname=None, figsize=(8, 7)): fig, ax = plt.subplots(figsize=figsize) # reference line, legends, and axis labels line = mlines.Line2D([0, 1], [0, 1], color='black') transform = ax.transAxes line.set_transform(transform) ax.add_line(line) fig.suptitle('Calibration plot') ax.set_xlabel('Predicted probability') ax.set_ylabel('True probability in each bin') for cal_curve in calibration_curves: plt.plot(cal_curve[0][1], cal_curve[0][0], marker='o', linewidth=1, label=cal_curve[1]) plt.xlim(0, 1) plt.ylim(0, 1) plt.legend(frameon=False) plt.tight_layout() if savefig: plt.savefig(os.path.join(output_dir, figname)) # Create list of 
# Create list of model features
cols_to_drop = ['StudyId', 'IsExac']  # id and target columns (features are read from the artifact below)

# Get the features list from the preferred model
with open('./mlruns/2/7ebf60a5d17f49d9a79e41dd72dda858/artifacts/features.txt') as f:
    features_list = f.read().splitlines()

# Separate features from target
features_train = train_data[features_list].astype('float')
target_train = train_data.IsExac.astype('float')
features_test = test_data[features_list].astype('float')
target_test = test_data.IsExac.astype('float')

artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents so files from different runs don't mix
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

# Ratio of negative to positive samples, used to reweight XGBoost below
scale_pos_weight = target_train.value_counts()[0] / target_train.value_counts()[1]

# Create list of (estimator, name) algos to try
models = []
models.append((LogisticRegression(random_state=0, max_iter=200), 'LR'))
models.append((LogisticRegression(random_state=0, class_weight='balanced',
                                  max_iter=200), 'LR_CW_balanced'))
models.append((lgb.LGBMClassifier(random_state=0), 'LGBM'))
models.append((BalancedBaggingClassifier(random_state=0), 'Balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'Balanced_RF'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'XGB'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'XGB_SPW'))
models.append((ExplainableBoostingClassifier(random_state=0), 'EBM'))
models.append((RandomForestClassifier(random_state=0), 'RF'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'RF_CW_balanced'))

calibration_curves_cv = []
calibration_curves_sigmoid = []
calibration_curves_isotonic = []
cal_curve_strategy = 'uniform'

with mlflow.start_run(run_name='sklearn_calibration_in_cv_uniform_bins'):
    runid = mlflow.active_run().info.run_id
    # Perform K-fold cross validation for each algo in its own nested run
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            # Remove any existing directory contents so files from different
            # runs don't mix
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            calibration_curves_algo = []
            crossval = cross_validate(model[0], features_train, target_train,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring,
                                      error_score='raise')
            # Out-of-fold probabilities for the positive class
            probabilities_cv = cross_val_predict(
                model[0], features_train, target_train,
                cv=cross_validation_fold_indices,
                method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target_train})
            model_scores = model_scores.sort_values(by='model_score',
                                                    ascending=False)

            # Extract calibration curve for the uncalibrated CV predictions
            calibration_curves_cv.append(
                (calibration_curve(target_train, probabilities_cv, n_bins=10,
                                   strategy=cal_curve_strategy), model[1]))

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Calibrate model in CV
            calibrated_sigmoid = CalibratedClassifierCV(
                model[0], method='sigmoid', cv=cross_validation_fold_indices)
            calibrated_sigmoid.fit(features_train, target_train)
            probabilities_sigmoid = calibrated_sigmoid.predict_proba(
                features_test)[:, 1]
            calibrated_isotonic = CalibratedClassifierCV(
                model[0], method='isotonic', cv=cross_validation_fold_indices)
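            # Note: sigmoid (Platt) calibration fits a two-parameter logistic
            # mapping and is usually robust with limited data, while isotonic
            # regression is non-parametric and can overfit small calibration
            # sets. With cv=cross_validation_fold_indices,
            # CalibratedClassifierCV refits a clone of the base estimator on
            # each fold and averages the per-fold calibrated probabilities.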
            calibrated_isotonic.fit(features_train, target_train)
            probabilities_isotonic = calibrated_isotonic.predict_proba(
                features_test)[:, 1]

            # Extract calibration curves on the holdout test set
            calibration_curves_sigmoid.append(
                (calibration_curve(target_test, probabilities_sigmoid,
                                   n_bins=10, strategy=cal_curve_strategy),
                 model[1] + ' sigmoid'))
            calibration_curves_isotonic.append(
                (calibration_curve(target_test, probabilities_isotonic,
                                   n_bins=10, strategy=cal_curve_strategy),
                 model[1] + ' isotonic'))
            calibration_curves_algo.append(
                (calibration_curve(target_train, probabilities_cv, n_bins=10,
                                   strategy=cal_curve_strategy),
                 model[1] + ' uncalibrated'))
            calibration_curves_algo.append(
                (calibration_curve(target_test, probabilities_sigmoid,
                                   n_bins=10, strategy=cal_curve_strategy),
                 model[1] + ' sigmoid'))
            calibration_curves_algo.append(
                (calibration_curve(target_test, probabilities_isotonic,
                                   n_bins=10, strategy=cal_curve_strategy),
                 model[1] + ' isotonic'))

            # Plot cumulative gains curve
            plots.plot_cumulative_gains_curve(
                scores=model_scores, savefig=True, output_dir=artifact_dir,
                figname='cumulative_gains_curve.png')

            # Plot lift curve
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir,
                                  figname='lift_curve.png')

            # Plot distribution of model scores (histogram plus KDE)
            plots.plot_score_distribution(
                scores=model_scores, postive_class_name='Exac',
                negative_class_name='No exac', savefig=True,
                output_dir=artifact_dir,
                figname='model_score_distribution.png')

            # Plot calibration curves for this algo (uncalibrated vs. calibrated)
            plot_calibration_curves_algo(
                calibration_curves=calibration_curves_algo, savefig=True,
                output_dir=artifact_dir, figname='calibration_curves.png',
                figsize=(8, 7))

            # Log artifacts under the child run
            mlflow.log_artifacts(artifact_dir)

# Log artifacts under parent run
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))
plot_calibration_curves(calibration_curves=calibration_curves_cv, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_cv.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_sigmoid,
                        savefig=True, output_dir=artifact_dir,
                        figname='calibration_curves_sigmoid.png',
                        figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_isotonic,
                        savefig=True, output_dir=artifact_dir,
                        figname='calibration_curves_isotonic.png',
                        figsize=(15, 10))

# Reattach to the parent run by id to log the comparison plots under it
with mlflow.start_run(run_id=runid):
    mlflow.log_artifacts(artifact_dir)
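# Optional follow-up sketch (commented out; relies on the variables left in
# scope by the final loop iteration, so it is illustrative only):
#
#   from sklearn.metrics import brier_score_loss
#   print('sigmoid Brier:',
#         brier_score_loss(target_test, probabilities_sigmoid))
#   print('isotonic Brier:',
#         brier_score_loss(target_test, probabilities_isotonic))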