# copd-model-c / training / cross_validation_algorithms.py
# Initial release: 72-hour COPD exacerbation prediction model (commit e69d4e4)
"""Perform cross validation using a variety of algorithms."""
import os
import pandas as pd
import numpy as np
from lenusml import splits, plots
# Model training and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from interpret.glassbox import ExplainableBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
import mlflow
# Directory layout for this model's inputs and outputs.
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Pre-computed patient-to-fold assignments plus the CV training table.
fold_patients = np.load(
    os.path.join(cohort_info_dir, 'fold_patients.npy'), allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Translate patient-level fold membership into row-level CV index splits.
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, train_data=train_data, id_column='StudyId')

# Every column except the patient identifier and the label is a feature.
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Split the table into the feature matrix and the binary target.
features = train_data[features_list].astype('float')
target = train_data['IsExac'].astype('float')
# Ratio of negative to positive examples, used to re-weight the
# imbalanced classes in the boosted models below.
counts = target.value_counts()
scale_pos_weight = counts[0] / counts[1]

# Track experiment runs in a local SQLite-backed MLflow store.
mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')
# Set CV scoring strategies and any model parameters.
# All metrics are oriented so that higher is better (`neg_brier_score`
# is the negated Brier score for exactly that reason).
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall',
           'roc_auc', 'average_precision', 'neg_brier_score']
# Candidate (estimator, run_name) pairs, all seeded for reproducibility
# and evaluated on identical CV folds so their scores are comparable.
# (`scale_pos_weight` was already computed above; a duplicated
# recomputation of it has been removed here.)
models = [
    (RandomForestClassifier(random_state=0), 'random_forest'),
    (RandomForestClassifier(random_state=0, class_weight='balanced'),
     'random_forest_class_weight'),
    (BalancedBaggingClassifier(random_state=0), 'balanced_bagging'),
    (BalancedRandomForestClassifier(random_state=0),
     'balanced_random_forest'),
    (xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                       eval_metric='logloss'), 'xgb'),
    (lgb.LGBMClassifier(random_state=0), 'lgbm'),
    (xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                       eval_metric='logloss',
                       scale_pos_weight=scale_pos_weight), 'xgb_spw'),
    (ExplainableBoostingClassifier(random_state=0), 'ebm'),
]
with mlflow.start_run(run_name='model_selection'):
    # One nested MLflow run per candidate model, each evaluated with
    # K-fold cross validation on the same custom folds.
    for estimator, model_name in models:
        with mlflow.start_run(run_name=model_name, nested=True):
            # Scratch directory for plot artifacts, recreated per run.
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing contents so artifacts from different
            # runs never mix. (Assumes the directory holds only flat
            # files; os.remove would fail on a subdirectory.)
            for leftover in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, leftover))
            # Per-fold CV metrics (one fit of the estimator per fold).
            crossval = cross_validate(estimator, features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True,
                                      scoring=scoring)
            # Out-of-fold predicted probabilities for the positive class.
            # NOTE(review): this refits the model once per fold on top of
            # the fits already done by cross_validate above.
            probabilities_cv = cross_val_predict(
                estimator, features, target,
                cv=cross_validation_fold_indices,
                method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})
            # Log each metric averaged across folds.
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())
            # Log every hyper-parameter of the estimator.
            for param, value in estimator.get_params().items():
                mlflow.log_param(param, value)
            # Evaluation plots, written into the artifact directory.
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir,
                                  figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(
                scores=model_scores, savefig=True,
                output_dir=artifact_dir,
                figname='cumulative_gains_curve.png')
            # Distribution of model scores (histogram plus KDE).
            # NOTE: `postive_class_name` spelling matches the lenusml API
            # as called in the original code — do not "fix" it here.
            plots.plot_score_distribution(
                scores=model_scores, postive_class_name='Exac',
                negative_class_name='No exac', savefig=True,
                output_dir=artifact_dir,
                figname='model_score_distribution.png')
            # Attach all generated plots to this nested run.
            mlflow.log_artifacts(artifact_dir)
# Both runs are closed automatically when their `with` blocks exit;
# the trailing explicit mlflow.end_run() was a redundant no-op and
# has been removed.