"""Perform CV (with explainability) on different feature sets and log to mlflow. Includes functionality to nest runs under parent run (e.g. different feature sets under a main run) and set a decision threshold for model scores. Logs the following artifacts as well as metrics and parameters: 1. List of model features 2. Feature correlation matrix 3. Global explainability (averaged over K folds) 4. Cumulative gains curve 5. Lift curve 6. Probability distributions with KDE """ from imblearn.ensemble import BalancedRandomForestClassifier from lenusml import splits, crossvalidation, plots import numpy as np import os import pandas as pd import mlflow from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID def get_crossvalidation_importance(*, feature_names, crossval): """ Create dataframe of mean global feature importance for all EBMs used in CV. Args: feature_names (list): list of model feature names crossval (dict): output of cross_validation_return_estimator_and_scores Returns: pd.DataFrame: contains feature names, global importance for each of the K estimators, mean importance across the estimators and scaled mean importance relative to the most important feature. """ # Obtain global importance from each EBM used in cross validation for i, est in enumerate(crossval['estimator']): exp_global = crossval['estimator'][i].feature_importances_ explanations = pd.DataFrame([feature_names, exp_global]).T explanations.columns = ['Feature', 'Score_{}'.format(i)] # Create dataframe with global feature importances for all K estimators if i == 0: explanations_all = explanations.copy() else: explanations_all = explanations_all.merge(explanations, on='Feature') # Average the importances across all models explanations_all['Mean'] = explanations_all.drop(columns=['Feature']).mean(axis=1) explanations_all = explanations_all.sort_values('Mean', ascending=False) # Create a scaled mean importance relative to the most imprtant feature explanations_all['Mean_scaled'] = explanations_all['Mean'] /\ explanations_all['Mean'].abs().max() return explanations_all data_dir = '../data/models/model1/' cohort_info_dir = '../data/cohort_info/' output_dir = '../data/models/model1/output' # Load CV folds and train data fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'), allow_pickle=True) train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl')) # Cross check fold patients with train data cross_validation_fold_indices = splits.custom_cv_fold_indices(fold_patients=fold_patients, id_column='StudyId', train_data=train_data) mlflow.set_tracking_uri("sqlite:///mlruns.sqlite") mlflow.set_experiment('model_drop2') # Set CV scoring strategies and any model parameters scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc', 'average_precision'] # Load comorbidity data and get list of conditions captured in COPD service comorbidities = pd.read_csv('/copd-dataset/CopdDatasetCoMorbidityDetails.txt', delimiter='|') comorbidity_list = list(comorbidities.columns) comorbidity_list.remove('Id') comorbidity_list.remove('PatientId') comorbidity_list.remove('Created') # Add the StudyId column for merging with the train data patient_details = pd.read_pickle(os.path.join('/copd-dataset', 'patient_details.pkl')) comorbidities = comorbidities.merge(patient_details[['PatientId', 'StudyId']], on='PatientId', how='left') # Map the True/False columns to 1/0 bool_mapping = {True: 1, False: 0} comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace( bool_mapping) with mlflow.start_run(run_name='individual_comorbidities_no_binned'): runid = mlflow.active_run().info.run_id # Merge each comorbidity separately and train a model nested under the parent run for comorbidity in comorbidity_list: print(comorbidity) # Merge comorb and fill missing data with 0 train_data = train_data.merge(comorbidities[['StudyId', comorbidity]], on='StudyId', how='left') train_data[comorbidity] = train_data[comorbidity].fillna(0) with mlflow.start_run(run_name=comorbidity, nested=True, tags={MLFLOW_PARENT_RUN_ID: runid}): #### # Feature addition/drop out here ##### # Create list of model features cols_to_drop = ['StudyId', 'IsExac', 'Comorbidities_te'] features_list = [col for col in train_data.columns if col not in cols_to_drop] # Separate features from target features = train_data[features_list].astype('float') target = train_data.IsExac.astype('float') # Save the list of features and a correlation heatmap to the artifacts # directory (to be logged in mlflow) artifact_dir = './tmp' # Create the artifacts directory if it doesn't exist os.makedirs(artifact_dir, exist_ok=True) # Remove any existing directory contents to not mix files between different # runs for f in os.listdir(artifact_dir): os.remove(os.path.join(artifact_dir, f)) np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list, delimiter=",", fmt='%s') plots.plot_feature_correlations( features=features, figsize=(len(features_list) // 2, len(features_list) // 2), savefig=True, output_dir=artifact_dir, figname="feature_correlations.png") # Use the parameters from the best model in previous cross validation model = BalancedRandomForestClassifier(random_state=0) # crossval = cross_validate(model, features, target, # cv=cross_validation_fold_indices, # return_estimator=True, scoring=scoring) # Perform K-fold cross validation with custom folds # Set the probability threshold here if required crossval, model_scores =\ crossvalidation.cross_validation_return_estimator_and_scores( model=model, features=features, target=target, fold_indices=cross_validation_fold_indices) # Log metrics averaged across folds for score in scoring: mlflow.log_metric(score, np.mean(crossval['test_' + score])) # Log model parameters params = model.get_params() for param in params: mlflow.log_param(param, params[param]) # Calculate average global feature importances across K models explainability = get_crossvalidation_importance(feature_names=features_list, crossval=crossval) explainability.to_csv(os.path.join(artifact_dir, 'global_feature_importances.csv'), index=False) plots.plot_global_explainability_cv(importances=explainability, scaled=True, figsize=( len(features_list) // 2.5, len(features_list) // 6), savefig=True, output_dir=artifact_dir) # Plot lift and cumulative gains curves plots.plot_lift_curve(scores=model_scores, savefig=True, output_dir=artifact_dir, figname='lift_curve.png') plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True, output_dir=artifact_dir, figname='cumulative_gains_curve.png') # Plot distribution of model scores (histogram plus KDE) plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac', negative_class_name='No exac', savefig=True, output_dir=artifact_dir, figname='model_score_distribution.png') # Log artifacts mlflow.log_artifacts(artifact_dir) mlflow.end_run() # Drop the comorbidity column train_data = train_data.drop(columns=[comorbidity]) # mlflow.end_run()