| | """Perform model calibration in CV on different algorithms and log to mlflow. |
| | |
| | Nests runs for different algos under parent run and logs the following |
| | artifacts as well as metrics and parameters: |
| | 1. Calibration curves for each child algo run (calibration in CV and calibration on |
| | holdout test after applying isotonic and sigmoid calibration) |
| | 2. Calibration curve under parent run to compare all algos in CV and post calibration |
| | 3. Cumulative gains curve |
| | 4. Lift curve |
| | 5. Probability distributions with KDE (CV) |
| | """ |
| | import matplotlib.pyplot as plt |
| | import matplotlib.lines as mlines |
| | from lenusml import splits, plots |
| | import numpy as np |
| | import os |
| | import pandas as pd |
| |
|
| | from sklearn.model_selection import cross_val_predict, cross_validate |
| | from sklearn.calibration import calibration_curve, CalibratedClassifierCV |
| |
|
| | from sklearn.linear_model import LogisticRegression |
| | from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier |
| | from sklearn.ensemble import RandomForestClassifier |
| | import xgboost as xgb |
| | import lightgbm as lgb |
| | from interpret.glassbox import ExplainableBoostingClassifier |
| |
|
| | import mlflow |
| | from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID |
| |
|
| |
|
# --- Input/output locations -------------------------------------------------
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Patient IDs per CV fold, produced by an upstream cohort-definition step.
# allow_pickle is required because the array holds Python objects (ID lists).
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))
test_data = pd.read_pickle(os.path.join(data_dir, 'test_data.pkl'))

# Translate patient-level fold membership into row-index folds usable as the
# `cv` argument of sklearn's cross-validation utilities, so all rows of one
# patient stay in the same fold.
cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients,
                                                              train_data=train_data,
                                                              id_column='StudyId')

# Local SQLite-backed mlflow tracking store.
mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Metric names passed to cross_validate; each is logged (CV mean) per model.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
| |
|
| |
|
def plot_calibration_curves(calibration_curves, savefig=True, output_dir=None,
                            figname=None, figsize=(8, 7)):
    """Plot several calibration curves on one axis, with a y=x reference line.

    Parameters
    ----------
    calibration_curves : list of ((prob_true, prob_pred), label) tuples, i.e.
        the return value of ``sklearn.calibration.calibration_curve`` paired
        with a legend label.
    savefig : bool, save the figure to ``output_dir/figname`` when True.
    output_dir : str or None, target directory (required when ``savefig``).
    figname : str or None, file name (required when ``savefig``).
    figsize : tuple, matplotlib figure size in inches.

    Raises
    ------
    ValueError
        If ``savefig`` is True but ``output_dir`` or ``figname`` is missing
        (previously this crashed inside ``os.path.join`` with a TypeError).
    """
    if savefig and (output_dir is None or figname is None):
        raise ValueError("output_dir and figname are required when savefig=True")

    fig, ax = plt.subplots(figsize=figsize)

    # Reference diagonal: a perfectly calibrated model lies on y = x.
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)

    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')

    # One distinct rainbow colour per curve so many models remain readable.
    colors = plt.cm.rainbow(np.linspace(0, 1, len(calibration_curves)))
    for (curve, label), c in zip(calibration_curves, colors):
        # calibration_curve returns (prob_true, prob_pred): x = predicted.
        ax.plot(curve[1], curve[0], marker='o', c=c, linewidth=1, label=label)

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False, bbox_to_anchor=(1, 1), loc="upper left")
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
    # Close explicitly: this script calls the function several times and
    # never shows figures, so leaving them open only accumulates memory.
    plt.close(fig)
| |
|
| |
|
def plot_calibration_curves_algo(calibration_curves, savefig=True, output_dir=None,
                                 figname=None, figsize=(8, 7)):
    """Plot the calibration curves of one algorithm (uncalibrated / sigmoid /
    isotonic) on a single axis with a y=x reference line.

    Parameters
    ----------
    calibration_curves : list of ((prob_true, prob_pred), label) tuples, i.e.
        the return value of ``sklearn.calibration.calibration_curve`` paired
        with a legend label.
    savefig : bool, save the figure to ``output_dir/figname`` when True.
    output_dir : str or None, target directory (required when ``savefig``).
    figname : str or None, file name (required when ``savefig``).
    figsize : tuple, matplotlib figure size in inches.

    Raises
    ------
    ValueError
        If ``savefig`` is True but ``output_dir`` or ``figname`` is missing
        (previously this crashed inside ``os.path.join`` with a TypeError).
    """
    if savefig and (output_dir is None or figname is None):
        raise ValueError("output_dir and figname are required when savefig=True")

    fig, ax = plt.subplots(figsize=figsize)

    # Reference diagonal: a perfectly calibrated model lies on y = x.
    diagonal = mlines.Line2D([0, 1], [0, 1], color='black')
    diagonal.set_transform(ax.transAxes)
    ax.add_line(diagonal)

    fig.suptitle('Calibration plot')
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')

    for curve, label in calibration_curves:
        # calibration_curve returns (prob_true, prob_pred): x = predicted.
        ax.plot(curve[1], curve[0], marker='o', linewidth=1, label=label)

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(frameon=False)
    fig.tight_layout()
    if savefig:
        fig.savefig(os.path.join(output_dir, figname))
    # Close explicitly: called once per model in the CV loop; figures are
    # never shown, so leaving them open only accumulates memory.
    plt.close(fig)
| |
|
| |
|
| | |
# Identifier / label columns that must not enter the feature matrix.
# NOTE(review): not referenced anywhere in this script — presumably used by a
# sibling script; confirm before removing.
cols_to_drop = ['StudyId', 'IsExac']

# Feature names were logged as an artifact by an earlier mlflow run.
# NOTE(review): hard-coded artifact path into ./mlruns — breaks if the
# tracking store moves or the run is deleted; consider resolving via the
# mlflow client instead.
with open('./mlruns/2/7ebf60a5d17f49d9a79e41dd72dda858/artifacts/features.txt') as f:
    features_list = f.read().splitlines()

# Feature matrices and binary target (IsExac) as floats, for train and the
# holdout test set.
features_train = train_data[features_list].astype('float')
target_train = train_data.IsExac.astype('float')
features_test = test_data[features_list].astype('float')
target_test = test_data.IsExac.astype('float')

# Scratch directory for per-run artifacts; created once, emptied before each
# model run and before the parent-level plots.
artifact_dir = './tmp'

os.makedirs(artifact_dir, exist_ok=True)

for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

# Negative/positive class ratio for XGBoost's scale_pos_weight.
# value_counts() here is indexed by label VALUE (0.0 and 1.0), not position.
scale_pos_weight = target_train.value_counts()[0] / target_train.value_counts()[1]

# (estimator, short_name) pairs; the short name becomes the nested mlflow run
# name and the curve label.
models = []
models.append((LogisticRegression(random_state=0, max_iter=200), 'LR'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'LR_CW_balanced'))
models.append((lgb.LGBMClassifier(random_state=0), 'LGBM'))
models.append((BalancedBaggingClassifier(random_state=0),
               'Balanced_bagging'))
models.append((BalancedRandomForestClassifier(random_state=0), 'Balanced_RF'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss'), 'XGB'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
               eval_metric='logloss', scale_pos_weight=scale_pos_weight), 'XGB_SPW'))
models.append((ExplainableBoostingClassifier(random_state=0), 'EBM'))
models.append((RandomForestClassifier(random_state=0), 'RF'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'RF_CW_balanced'))

# Accumulators for the parent-level comparison plots, filled once per model.
calibration_curves_cv = []
calibration_curves_sigmoid = []
calibration_curves_isotonic = []

# Binning strategy passed to every calibration_curve call.
cal_curve_strategy = 'uniform'
| |
|
# Parent mlflow run; one nested child run per algorithm is created inside.
# NOTE(review): the source's indentation was lost — the nesting below is
# reconstructed from context (e.g. the re-attach via run_id at the end only
# works once the parent `with` has exited); confirm against the original.
with mlflow.start_run(run_name='sklearn_calibration_in_cv_uniform_bins'):

    runid = mlflow.active_run().info.run_id
    for model in models:
        # model is an (estimator, short_name) tuple.
        with mlflow.start_run(run_name=model[1], nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):

            # Empty the scratch dir of artifacts left by the previous model.
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            calibration_curves_algo = []
            # CV metrics on the custom patient-level folds; error_score='raise'
            # fails fast instead of silently recording NaN scores.
            crossval = cross_validate(model[0], features_train, target_train,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring,
                                      error_score='raise')
            # Out-of-fold predicted probabilities for the positive class.
            probabilities_cv = cross_val_predict(model[0], features_train, target_train,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]

            # Scores sorted descending, as expected by the gains/lift plots.
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target_train})
            model_scores = model_scores.sort_values(by='model_score', ascending=False)

            # Uncalibrated CV curve, kept for the parent-level comparison plot.
            calibration_curves_cv.append((calibration_curve(target_train,
                                          probabilities_cv, n_bins=10,
                                          strategy=cal_curve_strategy), model[1]))

            # Log the CV mean of every metric to the child run.
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log all estimator hyperparameters to the child run.
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            # Platt scaling (sigmoid) fitted with the same CV folds; evaluated
            # on the holdout test set.
            calibrated_sigmoid = CalibratedClassifierCV(model[0], method='sigmoid',
                                                        cv=cross_validation_fold_indices)
            calibrated_sigmoid.fit(features_train, target_train)
            probabilities_sigmoid = calibrated_sigmoid.predict_proba(features_test)[:, 1]

            # Isotonic regression calibration, same protocol.
            calibrated_isotonic = CalibratedClassifierCV(model[0], method='isotonic',
                                                         cv=cross_validation_fold_indices)
            calibrated_isotonic.fit(features_train, target_train)
            probabilities_isotonic = calibrated_isotonic.predict_proba(
                features_test)[:, 1]

            # Curves for the parent-level sigmoid/isotonic comparison plots.
            calibration_curves_sigmoid.append((calibration_curve(target_test,
                                               probabilities_sigmoid, n_bins=10,
                                               strategy=cal_curve_strategy),
                                               model[1] + ' sigmoid'))
            calibration_curves_isotonic.append((calibration_curve(target_test,
                                                probabilities_isotonic, n_bins=10,
                                                strategy=cal_curve_strategy),
                                                model[1] + ' isotonic'))
            # Per-algorithm plot: uncalibrated (CV) vs both calibrations (test).
            calibration_curves_algo.append((calibration_curve(target_train,
                                            probabilities_cv, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' uncalibrated'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_sigmoid, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' sigmoid'))
            calibration_curves_algo.append((calibration_curve(target_test,
                                            probabilities_isotonic, n_bins=10,
                                            strategy=cal_curve_strategy),
                                            model[1] + ' isotonic'))

            # Child-run artifact plots, written into the scratch dir.
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')

            # NOTE(review): 'postive_class_name' spelling matches the helper's
            # signature in lenusml.plots — leave as-is unless fixed there too.
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            plot_calibration_curves_algo(calibration_curves=calibration_curves_algo,
                                         savefig=True, output_dir=artifact_dir,
                                         figname='calibration_curves.png',
                                         figsize=(8, 7))

            # Upload everything in the scratch dir to the child run.
            mlflow.log_artifacts(artifact_dir)
            # Redundant: the nested `with` context manager already ends the run.
            mlflow.end_run()

# Reset the scratch dir before writing the parent-level comparison plots.
for f in os.listdir(artifact_dir):
    os.remove(os.path.join(artifact_dir, f))

plot_calibration_curves(calibration_curves=calibration_curves_cv, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_cv.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_sigmoid, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_sigmoid.png', figsize=(15, 10))
plot_calibration_curves(calibration_curves=calibration_curves_isotonic, savefig=True,
                        output_dir=artifact_dir,
                        figname='calibration_curves_isotonic.png', figsize=(15, 10))

# Re-attach to the (now finished) parent run to log the comparison plots at
# the parent level.
with mlflow.start_run(run_id=runid):
    mlflow.log_artifacts(artifact_dir)
# Redundant: the `with` context manager already ended the run.
mlflow.end_run()
| |
|