File size: 8,968 Bytes
e69d4e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""Perform CV (with explainability) on different feature sets and log to mlflow.

Includes functionality to nest runs under a parent run (e.g. different feature sets
under a main run) and set a decision threshold for model scores. Logs the following
artifacts as well as metrics and parameters:
1. List of model features
2. Feature correlation matrix
3. Global explainability (averaged over K folds)
4. Cumulative gains curve
5. Lift curve
6. Probability distributions with KDE
7. Confusion matrices at several decision thresholds
8. ROC curve
9. Precision-recall curve
"""
from imblearn.ensemble import BalancedRandomForestClassifier
from lenusml import splits, crossvalidation, plots
import numpy as np
import os
import pandas as pd
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import mlflow
import matplotlib.pyplot as plt
# from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID


def get_crossvalidation_importance(*, feature_names, crossval):
    """
    Create dataframe of mean global feature importance for all estimators used in CV.

    Each fitted estimator must expose a ``feature_importances_`` attribute with one
    entry per feature, in the same order as ``feature_names``.

    Args:
        feature_names (list): list of model feature names
        crossval (dict): output of cross_validation_return_estimator_and_scores. Only
            the 'estimator' entry (the fitted estimators) is read.

    Returns:
        pd.DataFrame: one row per feature, sorted by descending mean importance, with
            columns 'Feature', 'Score_<i>' (global importance for the i-th of the K
            estimators), 'Mean' (mean importance across the estimators) and
            'Mean_scaled' (mean importance relative to the most important feature).

    Raises:
        ValueError: if crossval['estimator'] is empty, or if the importances of an
            estimator do not have exactly one entry per feature name.
    """
    estimators = list(crossval['estimator'])
    if not estimators:
        raise ValueError("crossval['estimator'] is empty: no importances to average")

    # One numeric column of global importances per estimator used in cross validation
    importance_by_fold = {
        'Score_{}'.format(i): list(est.feature_importances_)
        for i, est in enumerate(estimators)
    }
    score_columns = list(importance_by_fold)

    # Force a float dtype so the averages below are computed numerically rather than
    # on generic Python objects
    explanations_all = pd.DataFrame(importance_by_fold, dtype=float)
    explanations_all.insert(0, 'Feature', list(feature_names))

    # Average the importances across all models
    explanations_all['Mean'] = explanations_all[score_columns].mean(axis=1)
    explanations_all = explanations_all.sort_values('Mean', ascending=False)
    # Create a scaled mean importance relative to the most important feature
    explanations_all['Mean_scaled'] = (
        explanations_all['Mean'] / explanations_all['Mean'].abs().max()
    )
    return explanations_all


# Input/output locations, relative to the directory the script is run from
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Load CV folds and train data.
# NOTE: both loaders unpickle their input (allow_pickle=True / read_pickle), which can
# execute arbitrary code. Only point them at trusted, locally generated files.
fold_patients = np.load(os.path.join(cohort_info_dir, 'fold_patients.npy'),
                        allow_pickle=True)
train_data = pd.read_pickle(os.path.join(data_dir, 'train_data_cv.pkl'))

# Cross check fold patients with train data
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients,
    id_column='StudyId',
    train_data=train_data)

mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision']

####
# Feature drop out here
#####

# Create list of model features (everything except the patient id and the target)
cols_to_drop = ['StudyId', 'IsExac']
features_list = [col for col in train_data.columns if col not in cols_to_drop]

# Separate features from target
features = train_data[features_list].astype('float')
target = train_data.IsExac.astype('float')

# Save the list of features and a correlation heatmap to the artifacts directory (to be
# logged in mlflow)
artifact_dir = './tmp'
# Create the artifacts directory if it doesn't exist
os.makedirs(artifact_dir, exist_ok=True)
# Remove any existing directory contents to not mix files between different runs.
# Walk bottom-up so that leftover sub-directories are emptied before being removed;
# a plain os.remove() on each entry would fail on a directory.
for root, dirs, files in os.walk(artifact_dir, topdown=False):
    for name in files:
        os.remove(os.path.join(root, name))
    for name in dirs:
        path = os.path.join(root, name)
        # A symlink to a directory is listed under dirs but has to be unlinked
        if os.path.islink(path):
            os.remove(path)
        else:
            os.rmdir(path)

np.savetxt(os.path.join(artifact_dir, 'features.txt'), features_list,
           delimiter=",", fmt='%s')

# Scale the heatmap with the number of features so the labels stay readable
plots.plot_feature_correlations(
    features=features,
    figsize=(len(features_list) // 2, len(features_list) // 2),
    savefig=True, output_dir=artifact_dir,
    figname='features_correlations.png')

# # Get the run_id of the best model from hyperparameter tuning and its parameters
# best_run = mlflow.search_runs([8], order_by=["metrics.precision DESC"]).iloc[0].run_id
# best_params = mlflow.get_run(best_run).data.params
# best_params

# params = {'inner_bags': 1,
#           'interactions': 4,
#           'learning_rate': 0.0012416471483555312,
#           'max_leaves': 12,
#           'max_rounds': 5000,
#           'min_samples_leaf': 5,
#           'outer_bags': 3,
#           'random_state': 0}

with mlflow.start_run(run_name='eosinophil_count_0.3_threshold'):
    # To nest this run under a parent run, use the following instead:
    # runid = mlflow.active_run().info.run_id
    # with mlflow.start_run(run_name='simplified_with_nanox', nested=True,
    #                       tags={MLFLOW_PARENT_RUN_ID: runid}):

    # Default hyperparameters with a fixed random seed
    model = BalancedRandomForestClassifier(random_state=0)
    # crossval = cross_validate(model, features, target,
    #                           cv=cross_validation_fold_indices,
    #                           return_estimator=True, scoring=scoring)

    # Perform K-fold cross validation with custom folds
    # Set the probability threshold here if required
    crossval, model_scores =\
        crossvalidation.cross_validation_return_estimator_and_scores(
            model=model, features=features,
            target=target,
            fold_indices=cross_validation_fold_indices)

    # Log metrics averaged across folds
    for score in scoring:
        mlflow.log_metric(score, np.mean(crossval['test_' + score]))

    # Log model parameters
    params = model.get_params()
    for param, value in params.items():
        mlflow.log_param(param, value)

    # Calculate average global feature importances across K models
    explainability = get_crossvalidation_importance(feature_names=features_list,
                                                    crossval=crossval)
    explainability.to_csv(os.path.join(artifact_dir,
                          'global_feature_importances.csv'), index=False)
    plots.plot_global_explainability_cv(importances=explainability,
                                        scaled=True,
                                        figsize=(
                                            len(features_list) // 2.5,
                                            len(features_list) // 6),
                                        savefig=True, output_dir=artifact_dir)

    # Plot lift and cumulative gains curves, each saved under its own name
    plots.plot_lift_curve(scores=model_scores, savefig=True, output_dir=artifact_dir,
                          figname='lift_curve.png')
    plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                      output_dir=artifact_dir,
                                      figname='cumulative_gains_curve.png')

    # Plot distribution of model scores (histogram plus KDE)
    plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                  negative_class_name='No exac', savefig=True,
                                  output_dir=artifact_dir,
                                  figname='model_score_distribution.png')

    # Plot CV confusion matrices with different decision thresholds
    for threshold in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        plots.plot_confusion_matrix(
            target_true=model_scores.true_label,
            # Scores strictly above the threshold are predicted as the positive class
            target_predicted=np.where(model_scores.model_score > threshold, 1, 0),
            classes=['No exac', 'Exac'], savefig=True,
            output_dir=artifact_dir,
            figname='confusion_matrix_{}.png'.format(threshold))

    # Plot the ROC and Precision-Recall curves
    fig, ax = plt.subplots(figsize=(8, 6))
    RocCurveDisplay.from_predictions(y_true=model_scores.true_label,
                                     y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(os.path.join(artifact_dir, 'roc_curve.png'), dpi=150)
    plt.close(fig)

    fig, ax = plt.subplots(figsize=(8, 6))
    PrecisionRecallDisplay.from_predictions(y_true=model_scores.true_label,
                                            y_pred=model_scores.model_score, ax=ax)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(os.path.join(artifact_dir, 'precision_recall_curve.png'), dpi=150)
    plt.close(fig)

    # Log everything written to the artifacts directory. The run itself is ended by
    # the `with` block on exit, so no explicit end_run() call is needed.
    mlflow.log_artifacts(artifact_dir)
# mlflow.end_run()