"""Perform CV (with explainability) on different feature sets and log to mlflow.
Includes functionality to nest runs under parent run (e.g. different feature sets
under a main run) and set a decision threshold for model scores. Logs the following
artifacts as well as metrics and parameters:
1. List of model features
2. Feature correlation matrix
3. Global explainability (averaged over K folds)
4. Cumulative gains curve
5. Lift curve
6. Probability distributions with KDE
"""
from imblearn.ensemble import BalancedRandomForestClassifier
from lenusml import splits, crossvalidation, plots
import numpy as np
import os
import pandas as pd
import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Create dataframe of mean global feature importance for all estimators used in CV.

    Importances are aligned with ``feature_names`` by position, so repeated feature
    names do not multiply rows, and the score columns are numeric.

    Args:
        feature_names (list): list of model feature names. Assumed to be in the same
            order as each estimator's ``feature_importances_``.
        crossval (dict): output of cross_validation_return_estimator_and_scores. Only
            the 'estimator' entry is read; every estimator in it must expose
            ``feature_importances_``.

    Returns:
        pd.DataFrame: contains feature names ('Feature'), global importance for each
        of the K estimators ('Score_0' ... 'Score_{K-1}'), mean importance across the
        estimators ('Mean') and scaled mean importance relative to the most important
        feature ('Mean_scaled'). Rows are sorted by 'Mean' in descending order.

    Raises:
        ValueError: if ``crossval['estimator']`` is empty, or if an estimator does not
            provide exactly one importance per feature name.
    """
    feature_names = list(feature_names)
    estimators = crossval['estimator']
    if len(estimators) == 0:
        raise ValueError("crossval['estimator'] is empty; nothing to average.")

    # Collect one numeric column of global importances per estimator
    columns = {'Feature': feature_names}
    score_columns = []
    for i, est in enumerate(estimators):
        importances = np.asarray(est.feature_importances_, dtype=float)
        if importances.shape != (len(feature_names),):
            raise ValueError(
                'Estimator {} has importances of shape {} but {} feature names were '
                'given.'.format(i, importances.shape, len(feature_names)))
        column_name = 'Score_{}'.format(i)
        columns[column_name] = importances
        score_columns.append(column_name)
    explanations_all = pd.DataFrame(columns)

    # Average the importances across all models
    explanations_all['Mean'] = explanations_all[score_columns].mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)

    # Create a scaled mean importance relative to the most important feature.
    # If every mean importance is 0 the division yields NaN.
    explanations_all['Mean_scaled'] = (
        explanations_all['Mean'] / explanations_all['Mean'].abs().max())
    return explanations_all
# Input/output locations used by this script
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data.
# NOTE: both loads unpickle Python objects, so these files must come from a trusted
# source.
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients, id_column='StudyId', train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Names of the CV metrics that are averaged across folds and logged to mlflow.
# Each one is read from the cross validation output under the key 'test_<name>'.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']

# Load comorbidity data and get list of conditions captured in COPD service
comorbidities = pd.read_csv(
    '<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt', delimiter='|')
comorbidity_list = list(comorbidities.columns)
# Every column except these bookkeeping columns is treated as a comorbidity.
# list.remove raises ValueError if an expected column is missing from the file.
for non_comorbidity_column in ('Id', 'PatientId', 'Created'):
    comorbidity_list.remove(non_comorbidity_column)

# Add the StudyId column for merging with the train data
patient_details = pd.read_pickle(os.path.join('<YOUR_DATA_PATH>/copd-dataset',
                                              'patient_details.pkl'))
comorbidities = comorbidities.merge(patient_details[['PatientId', 'StudyId']],
                                    on='PatientId', how='left')

# Map the True/False columns to 1/0
bool_mapping = {True: 1, False: 0}
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping)

# Directory used to stage artifacts before they are logged to mlflow. It is emptied
# at the start of every nested run so files from different runs are not mixed.
artifact_dir = './tmp'
# Columns of the train data that are not model features
cols_to_drop = ['StudyId', 'IsExac', 'Comorbidities_te']

# The context managers own the run lifetimes: each nested run ends when its `with`
# block exits and the parent run ends after the last comorbidity. No explicit
# mlflow.end_run() is needed (an extra call can terminate the parent run early).
with mlflow.start_run(run_name='individual_comorbidities_no_binned') as parent_run:
    runid = parent_run.info.run_id
    # Merge each comorbidity separately and train a model nested under the parent run
    for comorbidity in comorbidity_list:
        print(comorbidity)
        # Merge comorb into a per-run copy and fill missing data with 0. The shared
        # train_data is left untouched, so no column has to be dropped afterwards.
        # NOTE(review): assumes at most one comorbidity row per StudyId; duplicate
        # StudyIds would add rows and presumably misalign the CV fold indices -
        # confirm against the data.
        run_data = train_data.merge(comorbidities[['StudyId', comorbidity]],
                                    on='StudyId', how='left')
        run_data[comorbidity] = run_data[comorbidity].fillna(0)

        with mlflow.start_run(run_name=comorbidity, nested=True,
                              tags={MLFLOW_PARENT_RUN_ID: runid}):
            # Feature addition/drop out here

            # Create list of model features
            features_list = [col for col in run_data.columns
                             if col not in cols_to_drop]

            # Separate features from target
            features = run_data[features_list].astype('float')
            target = run_data.IsExac.astype('float')

            # Save the list of features and a correlation heatmap to the artifacts
            # directory (to be logged in mlflow).
            # Create the artifacts directory if it doesn't exist
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove any existing directory contents to not mix files between
            # different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))
            np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
                       delimiter=",", fmt='%s')
            plots.plot_feature_correlations(
                features=features,
                figsize=(len(features_list) // 2, len(features_list) // 2),
                savefig=True, output_dir=artifact_dir,
                figname="feature_correlations.png")

            # Default hyperparameters; only the random seed is fixed
            model = BalancedRandomForestClassifier(random_state=0)

            # Perform K-fold cross validation with custom folds
            # Set the probability threshold here if required
            crossval, model_scores = \
                crossvalidation.cross_validation_return_estimator_and_scores(
                    model=model, features=features, target=target,
                    fold_indices=cross_validation_fold_indices)

            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, np.mean(crossval['test_' + score]))

            # Log model parameters
            for param, value in model.get_params().items():
                mlflow.log_param(param, value)

            # Calculate average global feature importances across K models
            explainability = get_crossvalidation_importance(
                feature_names=features_list, crossval=crossval)
            explainability.to_csv(
                os.path.join(artifact_dir, 'global_feature_importances.csv'),
                index=False)
            plots.plot_global_explainability_cv(
                importances=explainability, scaled=True,
                figsize=(len(features_list) // 2.5, len(features_list) // 6),
                savefig=True, output_dir=artifact_dir)

            # Plot lift and cumulative gains curves
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # Plot distribution of model scores (histogram plus KDE).
            # NOTE(review): 'postive_class_name' is kept as written because it has to
            # match the keyword accepted by lenusml.plots - confirm the spelling there.
            plots.plot_score_distribution(scores=model_scores,
                                          postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
|