"""Perform cross validation using a variety of algorithms.""" import os import pandas as pd import numpy as np from lenusml import splits, plots # Model training and evaluation from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_validate, cross_val_predict from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier from interpret.glassbox import ExplainableBoostingClassifier import lightgbm as lgb import xgboost as xgb import mlflow data_dir = '../data/models/model1/' cohort_info_dir = '../data/cohort_info/' output_dir = '../data/models/model1/output' # Load CV folds and train data fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'), allow_pickle=True) train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl')) # Cross check fold patients with train data cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients, train_data=train_data, id_column='StudyId') # Create list of model features cols_to_drop = ['StudyId', 'IsExac'] features_list = [col for col in train_data.columns if col not in cols_to_drop] # Separate features from target features = train_data[features_list].astype('float') target = train_data.IsExac.astype('float') scale_pos_weight = target.value_counts()[0] / target.value_counts()[1] mlflow.set_tracking_uri("sqlite:///mlruns.sqlite") mlflow.set_experiment('model_drop2') # Set CV scoring strategies and any model parameters scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'neg_brier_score'] scale_pos_weight = target.value_counts()[0] / target.value_counts()[1] models = [] models.append((RandomForestClassifier(random_state=0), 'random_forest')) models.append((RandomForestClassifier(random_state=0, class_weight='balanced'), 'random_forest_class_weight')) models.append((BalancedBaggingClassifier(random_state=0), 'balanced_bagging')) 
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss'), 'xgb'))
models.append((lgb.LGBMClassifier(random_state=0), 'lgbm'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss',
                                 scale_pos_weight=scale_pos_weight), 'xgb_spw'))
models.append((ExplainableBoostingClassifier(random_state=0), 'ebm'))

with mlflow.start_run(run_name='model_selection'):
    # Perform K-fold cross validation with custom folds; each candidate model
    # is recorded in its own nested MLflow run under the parent run above.
    for estimator, model_name in models:
        with mlflow.start_run(run_name=model_name, nested=True):
            # Create the artifacts directory if it doesn't exist
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs (assumes ./tmp contains only plain files).
            for stale_file in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, stale_file))

            crossval = cross_validate(estimator, features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True,
                                      scoring=scoring)

            # Out-of-fold predicted probabilities for the positive class.
            # NOTE: this refits every fold a second time on top of
            # cross_validate; cross_val_predict cannot reuse those estimators.
            probabilities_cv = cross_val_predict(
                estimator, features, target,
                cv=cross_validation_fold_indices,
                method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())

            # Log model parameters
            for param_name, param_value in estimator.get_params().items():
                mlflow.log_param(param_name, param_value)

            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir,
                                  figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(
                scores=model_scores, savefig=True, output_dir=artifact_dir,
                figname='cumulative_gains_curve.png')
            # Plot distribution of model scores (histogram plus KDE).
            # 'postive_class_name' spelling matches the lenusml.plots API.
            plots.plot_score_distribution(
                scores=model_scores, postive_class_name='Exac',
                negative_class_name='No exac', savefig=True,
                output_dir=artifact_dir,
                figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
# The `with` blocks above already terminate both the nested and the parent
# MLflow runs on exit, so the original trailing mlflow.end_run() was dropped:
# outside the `with` it was a no-op, and inside the nested run it would have
# ended the run early and caused the context-manager exit to close the parent.