import os
import numpy as np
import pandas as pd
import mlflow
import shutil
import model_h
# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer
##############################################################
# Specify which model to perform cross validation on
##############################################################
# True  -> model trained only on the hospitalised cohort
# False -> model trained on the combined hospital + community cohort
model_only_hosp = False
# Suffix appended to every data/artifact filename so the two parallel
# models never overwrite each other's files.
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
##############################################################
# Load data
##############################################################
# Patient IDs assigned to each cross-validation fold.
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
# Train and test sets each come in an imputed and a non-imputed variant
# (the non-imputed variant is later fed to XGBoost, which tolerates NaNs).
_load = model_h.load_data_for_modelling
train_data_imp = _load('./data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
train_data_no_imp = _load('./data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
test_data_imp = _load('./data/model_data/test_data_imp' + file_suffix + '.pkl')
test_data_no_imp = _load('./data/model_data/test_data_no_imp' + file_suffix + '.pkl')
# Build one (train_index, val_index) tuple per fold. Either the imputed or
# the non-imputed frame works here, since both contain the same patients.
cross_val_fold_indices = []
for fold in fold_patients:
    # Rows whose patient belongs to this fold form the validation split;
    # every remaining row is used for training.
    val_mask = train_data_no_imp.StudyId.isin(fold)
    train_index = train_data_no_imp.index[~val_mask]
    val_index = train_data_no_imp.index[val_mask]
    cross_val_fold_indices.append((train_index, val_index))
# Model features are every column except the identifier, outcome, and date.
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate']
features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop]


def _split_features_target(data):
    """Return (features, target) from *data*, both cast to float."""
    return (data[features_list].astype('float'),
            data.ExacWithin3Months.astype('float'))


# Train data: non-imputed and imputed variants.
train_features_no_imp, train_target_no_imp = _split_features_target(train_data_no_imp)
train_features_imp, train_target_imp = _split_features_target(train_data_imp)
# Test data: non-imputed and imputed variants.
test_features_no_imp, test_target_no_imp = _split_features_target(test_data_no_imp)
test_features_imp, test_target_imp = _split_features_target(test_data_imp)
# The imputed and non-imputed variants must carry identical targets;
# abort immediately if they ever diverge.
for split_name, target_a, target_b in (
        ('train', train_target_no_imp, train_target_imp),
        ('test', test_target_no_imp, test_target_imp)):
    if not target_a.equals(target_b):
        raise ValueError(
            'Target variable is not the same in imputed and non imputed '
            f'datasets in the {split_name} set.')
# The two variants agree, so either one can serve as the canonical target.
train_target = train_target_no_imp
test_target = test_target_no_imp
# Force every feature column to a numeric dtype, in place;
# values that cannot be parsed become NaN.
for frame in (train_features_no_imp, train_features_imp,
              test_features_no_imp, test_features_imp):
    for column in frame.columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow: local SQLite backend, one experiment per parallel model.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)
# Set CV scoring strategies and any model parameters
scoring_methods = ['average_precision']
# Negative/positive class ratio in the train set, used to bound XGBoost's
# scale_pos_weight search space. Compute value_counts once, not twice.
target_counts = train_target.value_counts()
scale_pos_weight = target_counts[0] / target_counts[1]
# Set up models, each tuple contains 4 elements: model, model name, imputation status,
# type of model.
# Three of the four models are shared by both parallel models; only the
# plain random forest is restricted to the hospital+community cohort, so
# build the list once instead of duplicating it in two branches.
models = []
models.append((LogisticRegression(),
               'logistic_regression', 'imputed', 'linear'))
if not model_only_hosp:
    # Random forest is evaluated only for the hospital+community model.
    models.append((RandomForestClassifier(),
                   'random_forest', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(),
               'balanced_random_forest', 'imputed', 'tree'))
models.append((xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
               'xgb', 'not_imputed', 'tree'))
# Hyperparameter search spaces for BayesSearchCV. Key order is kept exactly
# as before so the optimiser explores dimensions in the same order.
log_reg_search_spaces = dict(
    penalty=['l2', None],
    class_weight=['balanced', None],
    max_iter=Integer(50, 300),
    C=Real(0.001, 10),
)
# Shared by both the plain and the balanced random forest.
rf_search_spaces = dict(
    max_depth=Integer(4, 10),
    n_estimators=Integer(70, 850),
    min_samples_split=Integer(2, 10),
    class_weight=['balanced', None],
)
xgb_search_spaces = dict(
    max_depth=Integer(4, 10),
    n_estimators=Integer(70, 850),
    subsample=Real(0.55, 0.95),
    colsample_bytree=Real(0.55, 0.95),
    learning_rate=Real(0.05, 0.14),
    # Upper bound is the observed negative/positive class ratio.
    scale_pos_weight=Real(1, scale_pos_weight),
)
##############################################################
# Run models
##############################################################
# skopt still references the deprecated np.int alias, which was removed in
# NumPy 1.24; restore it once here instead of before every .fit() call.
np.int = int
# Map each model name to the search space and training features it uses, so
# BayesSearchCV is configured in one place instead of three near-identical
# branches. XGBoost is the only model fitted on the non-imputed features.
tuning_config = {
    'logistic_regression': (log_reg_search_spaces, train_features_imp),
    'random_forest': (rf_search_spaces, train_features_imp),
    'balanced_random_forest': (rf_search_spaces, train_features_imp),
    'xgb': (xgb_search_spaces, train_features_no_imp),
}
# In MLflow run, perform K-fold cross validation and capture mean score across folds.
with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'):
    for scoring_method in scoring_methods:
        for estimator, model_name, imputation_status, model_type in models:
            with mlflow.start_run(run_name=model_name, nested=True):
                print(model_name)
                # Start each run with an empty artifacts directory so files
                # from different runs are never mixed.
                # Bug fix: the original called makedirs and then rmtree,
                # leaving no directory at all; remove first, then recreate.
                artifact_dir = './tmp'
                shutil.rmtree(artifact_dir, ignore_errors=True)
                os.makedirs(artifact_dir, exist_ok=True)
                # Run Bayesian hyperparameter optimization over the CV folds.
                search_space, train_features = tuning_config[model_name]
                opt = BayesSearchCV(estimator,
                                    search_spaces=search_space,
                                    n_iter=200,
                                    random_state=0,
                                    cv=cross_val_fold_indices,
                                    scoring=scoring_method,
                                    )
                opt.fit(train_features, train_target)
                # Report and log best hyperparameters and CV score.
                print(opt.best_params_)
                print(opt.best_score_)
                mlflow.log_param('opt_scorer', scoring_method)
                mlflow.log_params(opt.best_params_)
                mlflow.log_metric("opt_best_score", opt.best_score_)
mlflow.end_run()