# copd-model-h / training / perform_hyper_param_tuning.py
import os
import numpy as np
import pandas as pd
import mlflow
import shutil
import model_h
# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer
##############################################################
# Specify which model to perform cross validation on
##############################################################
# Toggle between the two parallel cohorts: True restricts tuning to the
# hospital-only patients, False uses the combined hospital + community cohort.
model_only_hosp = False
# Suffix appended to every data/artifact filename so the two cohorts never
# read or write each other's files.
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
##############################################################
# Load data
##############################################################
# Pre-computed CV fold membership: one array of patient StudyIds per fold.
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
# Training data with imputation applied.
train_data_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
# Training data without imputation.
train_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
# Test data with imputation applied.
test_data_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')
# Test data without imputation.
test_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
# Build one (train_index, val_index) tuple per fold for use as a CV iterator.
# Either dataset can provide the indices: imputed and non-imputed versions
# contain the same patients in the same row order.
cross_val_fold_indices = []
for fold in fold_patients:
    # Rows whose patient belongs to this fold form the validation split.
    val_rows = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
    # Every remaining patient's rows form the training split.
    train_rows = train_data_no_imp[
        ~train_data_no_imp.StudyId.isin(val_rows.StudyId)]
    cross_val_fold_indices.append((train_rows.index, val_rows.index))
# Model features are every column except the patient id, the target, and the
# index date.
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate']
features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop]
# Split each of the four datasets into a float feature matrix and a float
# target vector (ExacWithin3Months).
# Train data, no imputation:
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Train data, imputed:
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')
# Test data, no imputation:
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Test data, imputed:
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# The imputation step must never alter the target; fail loudly if the imputed
# and non-imputed targets disagree.
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
# The targets are interchangeable after the check above; pick one of each.
train_target = train_target_no_imp
test_target = test_target_no_imp
# Coerce every feature column to numeric; anything unparseable becomes NaN.
for frame in (train_features_no_imp, train_features_imp,
              test_features_no_imp, test_features_imp):
    for column in frame:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow: local SQLite tracking backend, one experiment per cohort.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)
# CV scoring strategies to evaluate each model under.
scoring_methods = ['average_precision']
# Ratio of negative to positive samples; upper bound for XGBoost's
# scale_pos_weight search space. Computed once instead of calling
# value_counts() twice.
class_counts = train_target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
# Set up models. Each tuple holds 4 elements:
# (estimator, run name, imputation status, type of model).
# Both cohorts share logistic regression, balanced random forest and XGBoost;
# the combined hospital + community cohort additionally evaluates a plain
# random forest. (The original duplicated the whole list in two branches.)
models = [(LogisticRegression(),
           'logistic_regression', 'imputed', 'linear')]
if not model_only_hosp:
    models.append((RandomForestClassifier(),
                   'random_forest', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(),
               'balanced_random_forest', 'imputed', 'tree'))
models.append((xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
               'xgb', 'not_imputed', 'tree'))
# Define Bayesian-optimization search spaces per model family.
log_reg_search_spaces = {'penalty': ['l2', None],
                         'class_weight': ['balanced', None],
                         'max_iter': Integer(50, 300),
                         'C': Real(0.001, 10),
                         }
rf_search_spaces = {'max_depth': Integer(4, 10),
                    'n_estimators': Integer(70, 850),
                    'min_samples_split': Integer(2, 10),
                    'class_weight': ['balanced', None],
                    }
xgb_search_spaces = {'max_depth': Integer(4, 10),
                     'n_estimators': Integer(70, 850),
                     'subsample': Real(0.55, 0.95),
                     'colsample_bytree': Real(0.55, 0.95),
                     'learning_rate': Real(0.05, 0.14),
                     'scale_pos_weight': Real(1, scale_pos_weight),
                     }
##############################################################
# Run models
##############################################################
# Map each model name to (search space, training features). XGBoost handles
# missing values natively, so it trains on the non-imputed features; the
# other models require the imputed features. This replaces three nearly
# identical BayesSearchCV branches with a single construction.
search_config = {
    'logistic_regression': (log_reg_search_spaces, train_features_imp),
    'random_forest': (rf_search_spaces, train_features_imp),
    'balanced_random_forest': (rf_search_spaces, train_features_imp),
    'xgb': (xgb_search_spaces, train_features_no_imp),
}
# skopt still references the alias np.int, which recent numpy versions
# removed; restore it once up front so BayesSearchCV.fit does not crash.
np.int = int
# In an MLflow run, perform K-fold cross validation and capture the mean
# score across folds for every scoring method / model combination.
with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'):
    for scoring_method in scoring_methods:
        for estimator, model_name, _imp_status, _model_type in models:
            with mlflow.start_run(run_name=model_name, nested=True):
                print(model_name)
                # Reset the artifacts directory so files from different runs
                # never mix. Bug fix: the original created './tmp' and then
                # rmtree'd it without recreating it, leaving no directory
                # for artifacts; clear first, then (re)create.
                artifact_dir = './tmp'
                shutil.rmtree(artifact_dir, ignore_errors=True)
                os.makedirs(artifact_dir, exist_ok=True)
                # Run hyperparameter tuning with Bayesian optimization over
                # the pre-computed patient-level CV folds.
                search_spaces, train_features = search_config[model_name]
                opt = BayesSearchCV(estimator,
                                    search_spaces=search_spaces,
                                    n_iter=200,
                                    random_state=0,
                                    cv=cross_val_fold_indices,
                                    scoring=scoring_method,
                                    )
                opt.fit(train_features, train_target)
                # Report and log the best score and hyperparameters found.
                print(opt.best_params_)
                print(opt.best_score_)
                mlflow.log_param('opt_scorer', scoring_method)
                mlflow.log_params(opt.best_params_)
                mlflow.log_metric("opt_best_score", opt.best_score_)
mlflow.end_run()