File size: 9,198 Bytes

000de75

import os
import numpy as np
import pandas as pd
import mlflow
import shutil
import model_h

# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer

##############################################################
# Choose the model variant that cross validation is run for
##############################################################
# The flag selects the suffix that is appended to every data file name and to the
# MLflow experiment name further down:
#   True  -> "_only_hosp"
#   False -> "_hosp_comm"
model_only_hosp = False
file_suffix = "_only_hosp" if model_only_hosp is True else "_hosp_comm"

##############################################################
# Load data
##############################################################
# Patient groups that define the cross validation folds (one entry per fold).
# NOTE: allow_pickle=True lets NumPy unpickle object arrays, so only load trusted files.
fold_patients = np.load(
    f'./data/cohort_info/fold_patients{file_suffix}.npy', allow_pickle=True)

# Train data used for cross validation: imputed and not imputed versions
train_data_imp = model_h.load_data_for_modelling(
    f'./data/model_data/train_data_cv_imp{file_suffix}.pkl')
train_data_no_imp = model_h.load_data_for_modelling(
    f'./data/model_data/train_data_cv_no_imp{file_suffix}.pkl')

# Test data: imputed and not imputed versions
test_data_imp = model_h.load_data_for_modelling(
    f'./data/model_data/test_data_imp{file_suffix}.pkl')
test_data_no_imp = model_h.load_data_for_modelling(
    f'./data/model_data/test_data_no_imp{file_suffix}.pkl')

# Create a (train indices, validation indices) tuple for each fold. Can be done with
# either imputed or not imputed data as both have same patients.
#
# The indices are row POSITIONS (0 .. n_rows - 1), not dataframe index labels:
# scikit-learn style `cv` iterables are applied positionally, and index labels only
# coincide with positions when the dataframe has a default RangeIndex. Deriving the
# positions from a boolean mask is correct regardless of how the data is indexed.
cross_val_fold_indices = []
for fold in fold_patients:
    # True for every row that belongs to a patient held out for validation in this fold
    in_val_fold = train_data_no_imp.StudyId.isin(fold).to_numpy()

    # Validation rows are the patients of this fold, training rows are all the others
    fold_val_index = np.flatnonzero(in_val_fold)
    fold_train_index = np.flatnonzero(~in_val_fold)

    # Append tuple of training and val indices
    cross_val_fold_indices.append((fold_train_index, fold_val_index))

# Model features are every column except the patient identifier, the target and the
# index date
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate']
features_list = [
    column for column in train_data_no_imp.columns if column not in cols_to_drop
]


def _split_features_target(data):
    """Return the (features, target) pair of a modelling dataset, both cast to float."""
    return (data[features_list].astype('float'),
            data.ExacWithin3Months.astype('float'))


# Train data: not imputed, then imputed
train_features_no_imp, train_target_no_imp = _split_features_target(train_data_no_imp)
train_features_imp, train_target_imp = _split_features_target(train_data_imp)

# Test data: not imputed, then imputed
test_features_no_imp, test_target_no_imp = _split_features_target(test_data_no_imp)
test_features_imp, test_target_imp = _split_features_target(test_data_imp)

# The target has to be identical in the imputed and not imputed datasets, because a
# single target series is used for both from here on. Abort on the first mismatch.
target_checks = (
    (train_target_no_imp, train_target_imp,
     'Target variable is not the same in imputed and non imputed datasets in the train set.'),
    (test_target_no_imp, test_target_imp,
     'Target variable is not the same in imputed and non imputed datasets in the test set.'),
)
for target_no_imp, target_imp, mismatch_message in target_checks:
    if not target_no_imp.equals(target_imp):
        raise ValueError(mismatch_message)

# Both versions agree, so keep one target per split
train_target = train_target_no_imp
test_target = test_target_no_imp

# Coerce every feature column to a numeric dtype. With errors='coerce', values that
# cannot be parsed as numbers become NaN instead of raising.
feature_frames = (train_features_no_imp, train_features_imp,
                  test_features_no_imp, test_features_imp)
for frame in feature_frames:
    for column_name in frame.columns:
        frame[column_name] = pd.to_numeric(frame[column_name], errors='coerce')

##############################################################
# Experiment tracking and scoring configuration
##############################################################
# Record runs in the SQLite backed MLflow store, one experiment per model variant
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)

# Scoring strategies used during cross validation
scoring_methods = ['average_precision']

# Ratio of class 0 to class 1 rows in the training target. Used as the upper bound of
# the XGBoost `scale_pos_weight` search range.
class_counts = train_target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Models to tune. Every entry is a 4-tuple:
#   (estimator, model name, imputation status, type of model)
# The set of models depends on which parallel model is being used.
models = []
if model_only_hosp is True:
    models = [
        (LogisticRegression(),
         'logistic_regression', 'imputed', 'linear'),
        (BalancedRandomForestClassifier(),
         'balanced_random_forest', 'imputed', 'tree'),
        (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
         'xgb', 'not_imputed', 'tree'),
    ]
elif model_only_hosp is False:
    # Same as above plus a plain random forest
    models = [
        (LogisticRegression(),
         'logistic_regression', 'imputed', 'linear'),
        (RandomForestClassifier(),
         'random_forest', 'imputed', 'tree'),
        (BalancedRandomForestClassifier(),
         'balanced_random_forest', 'imputed', 'tree'),
        (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
         'xgb', 'not_imputed', 'tree'),
    ]

# Hyperparameter search spaces, one per model family. Lists are categorical choices,
# Integer/Real are numeric ranges with the given lower and upper bounds.
log_reg_search_spaces = dict(
    penalty=['l2', None],
    class_weight=['balanced', None],
    max_iter=Integer(50, 300),
    C=Real(0.001, 10),
)
# Shared by the random forest and the balanced random forest
rf_search_spaces = dict(
    max_depth=Integer(4, 10),
    n_estimators=Integer(70, 850),
    min_samples_split=Integer(2, 10),
    class_weight=['balanced', None],
)
# The positive class weight is searched between 1 and the class 0 / class 1 ratio
xgb_search_spaces = dict(
    max_depth=Integer(4, 10),
    n_estimators=Integer(70, 850),
    subsample=Real(0.55, 0.95),
    colsample_bytree=Real(0.55, 0.95),
    learning_rate=Real(0.05, 0.14),
    scale_pos_weight=Real(1, scale_pos_weight),
)

##############################################################
# Run models
##############################################################
# Bayesian hyperparameter search for every model using the predefined CV folds. One
# parent MLflow run groups a nested run per model, where the best parameters and the
# best mean cross validation score are logged.

# HACK: restore the `np.int` alias, which no longer exists in recent NumPy releases.
# Presumably required by the installed scikit-optimize version - TODO confirm and drop
# once the dependency no longer needs it.
np.int = int

# Search space and training features for each model name. Models tuned on imputed data
# use the imputed features, XGBoost uses the not imputed features.
tuning_setup = {
    'logistic_regression': (log_reg_search_spaces, train_features_imp),
    'random_forest': (rf_search_spaces, train_features_imp),
    'balanced_random_forest': (rf_search_spaces, train_features_imp),
    'xgb': (xgb_search_spaces, train_features_no_imp),
}
artifact_dir = './tmp'

with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'):
    for scoring_method in scoring_methods:
        for estimator, model_name, _imputation_status, _model_type in models:
            with mlflow.start_run(run_name=model_name, nested=True):
                print(model_name)

                # Start every run with an empty artifacts directory so files are not
                # mixed between runs. The old directory is removed first and then
                # recreated, so that the directory exists afterwards.
                if os.path.isdir(artifact_dir):
                    shutil.rmtree(artifact_dir)
                os.makedirs(artifact_dir, exist_ok=True)

                # Run hyperparameter tuning. An unknown model name raises a KeyError
                # here instead of silently reusing the previous model's results.
                search_spaces, train_features = tuning_setup[model_name]
                opt = BayesSearchCV(estimator,
                                    search_spaces=search_spaces,
                                    n_iter=200,
                                    random_state=0,
                                    cv=cross_val_fold_indices,
                                    scoring=scoring_method,
                                    )
                # Execute bayesian optimization
                opt.fit(train_features, train_target)

                # Get scores from hyperparameter tuning
                print(opt.best_params_)
                print(opt.best_score_)

                # Log scores from hyperparameter tuning
                mlflow.log_param('opt_scorer', scoring_method)
                mlflow.log_params(opt.best_params_)
                mlflow.log_metric("opt_best_score", opt.best_score_)
# No explicit mlflow.end_run() is needed: leaving the `with` block ends the parent run.