"""Hyperparameter tuning via Bayesian-optimised cross validation.

Loads the pre-built CV folds plus imputed / non-imputed train and test
sets, then (further down the script) tunes several classifiers with
BayesSearchCV and logs results to MLflow.
"""
import os
import shutil

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from skopt.space import Real, Integer

import model_h  # Model training and evaluation (project-local helpers)

##############################################################
# Specify which model to perform cross validation on
##############################################################

# True  -> tune the hospital-only cohort model.
# False -> tune the combined hospital + community cohort model.
model_only_hosp = False
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"

##############################################################
# Load data
##############################################################

# Patient IDs belonging to each CV fold (pickled object array).
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy',
    allow_pickle=True)

# Imputed train data
train_data_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')

# Non-imputed train data
train_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')

# Imputed test data
test_data_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')

# Non-imputed test data
test_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
##############################################################
# Build CV fold indices and feature/target matrices
##############################################################

# Create a tuple of (train indices, validation indices) for each fold.
# Can be done with either imputed or not imputed data as both have the
# same patients.
cross_val_fold_indices = []
for fold in fold_patients:
    # One membership test per fold; the original computed the same mask
    # twice (once on `fold`, then again on the validation StudyIds).
    val_mask = train_data_no_imp.StudyId.isin(fold)
    fold_val_index = train_data_no_imp.index[val_mask]
    fold_train_index = train_data_no_imp.index[~val_mask]
    # Append tuple of training and validation indices.
    cross_val_fold_indices.append((fold_train_index, fold_val_index))

# Create list of model features: everything except identifier/target
# columns.
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate']
features_list = [col for col in train_data_no_imp.columns
                 if col not in cols_to_drop]

# Train data
# Separate features from target for data with no imputation performed.
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')

# Separate features from target for the imputed data.
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')

# Test data
# Separate features from target for data with no imputation performed.
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')

# Separate features from target for the imputed data.
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# Check that the target in imputed and not imputed datasets are the same.
# If not, raise an error — a mismatch means the two datasets no longer
# describe the same patients in the same order.
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
train_target = train_target_no_imp
test_target = test_target_no_imp

# Make sure all features are numeric; values that cannot be parsed
# become NaN (mutates the feature DataFrames in place).
for features in [train_features_no_imp, train_features_imp,
                 test_features_no_imp, test_features_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors='coerce')

##############################################################
# Specify which models to evaluate
##############################################################

# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)

# Set CV scoring strategies and any model parameters
scoring_methods = ['average_precision']
# Negative/positive class ratio for weighting the XGBoost loss.
# Explicit label comparison replaces `value_counts()[0]`, whose integer
# key takes a deprecated, ambiguous (label vs. position) lookup path in
# pandas; result is identical for the 0/1 target used here.
scale_pos_weight = (train_target == 0).sum() / (train_target == 1).sum()
# Set up models. Each tuple contains 4 elements:
# (estimator, model name, imputation status, type of model).
# The original duplicated three of the four registrations across both
# branches of model_only_hosp; only the plain random forest differs, so
# it is the only conditional entry (list order preserved in both cases).
models = []

# Logistic regression
models.append((LogisticRegression(), 'logistic_regression',
               'imputed', 'linear'))
if not model_only_hosp:
    # Random forest — evaluated on the combined cohort only.
    models.append((RandomForestClassifier(), 'random_forest',
                   'imputed', 'tree'))
# Balanced random forest
models.append((BalancedRandomForestClassifier(), 'balanced_random_forest',
               'imputed', 'tree'))
# XGBoost (handles missing values natively, hence 'not_imputed')
models.append((xgb.XGBClassifier(use_label_encoder=False,
                                 eval_metric='logloss'),
               'xgb', 'not_imputed', 'tree'))

# Define search spaces for Bayesian optimisation.
log_reg_search_spaces = {'penalty': ['l2', None],
                         'class_weight': ['balanced', None],
                         'max_iter': Integer(50, 300),
                         'C': Real(0.001, 10),
                         }

# Shared by both random forest variants.
rf_search_spaces = {'max_depth': Integer(4, 10),
                    'n_estimators': Integer(70, 850),
                    'min_samples_split': Integer(2, 10),
                    'class_weight': ['balanced', None],
                    }

xgb_search_spaces = {'max_depth': Integer(4, 10),
                     'n_estimators': Integer(70, 850),
                     'subsample': Real(0.55, 0.95),
                     'colsample_bytree': Real(0.55, 0.95),
                     'learning_rate': Real(0.05, 0.14),
                     'scale_pos_weight': Real(1, scale_pos_weight),
                     }
##############################################################
# Run models
##############################################################

# Compatibility shim: skopt still references the np.int alias, which was
# removed in numpy 1.24. Set it once here (the original re-assigned it
# before every fit). TODO: drop when scikit-optimize is upgraded.
np.int = int

# Lookup tables so one BayesSearchCV construction serves every model:
# model[1] (name) selects the search space, model[2] (imputation status)
# selects the feature matrix.
search_spaces_by_model = {
    'logistic_regression': log_reg_search_spaces,
    'random_forest': rf_search_spaces,
    'balanced_random_forest': rf_search_spaces,
    'xgb': xgb_search_spaces,
}
train_features_by_status = {
    'imputed': train_features_imp,
    'not_imputed': train_features_no_imp,
}

# In MLflow run, perform K-fold cross validation and capture mean score
# across folds for each model / scoring-method combination.
with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'):
    for scoring_method in scoring_methods:
        for model in models:
            with mlflow.start_run(run_name=model[1], nested=True):
                print(model[1])

                # Reset the artifacts directory so files from different
                # runs are not mixed. BUGFIX: the original created the
                # directory and then rmtree'd it, leaving nothing behind;
                # remove first, then recreate.
                artifact_dir = './tmp'
                shutil.rmtree(artifact_dir, ignore_errors=True)
                os.makedirs(artifact_dir, exist_ok=True)

                # Run hyperparameter tuning.
                opt = BayesSearchCV(
                    model[0],
                    search_spaces=search_spaces_by_model[model[1]],
                    n_iter=200,
                    random_state=0,
                    cv=cross_val_fold_indices,
                    scoring=scoring_method,
                )
                # Execute bayesian optimization on the feature set
                # matching this model's imputation requirement.
                opt.fit(train_features_by_status[model[2]], train_target)

                # Get scores from hyperparameter tuning
                print(opt.best_params_)
                print(opt.best_score_)

                # Log scores from hyperparameter tuning
                mlflow.log_param('opt_scorer', scoring_method)
                mlflow.log_params(opt.best_params_)
                mlflow.log_metric("opt_best_score", opt.best_score_)

mlflow.end_run()