# copd-model-h / training / perform_hyper_param_tuning.py
import os
import numpy as np
import pandas as pd
import mlflow
import shutil
import model_h
# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer
##############################################################
# Specify which model to perform cross validation on
##############################################################
# Toggle between the two parallel cohorts: True restricts tuning to the
# hospital-only patients, False uses the combined hospital + community cohort.
model_only_hosp = False
# Suffix appended to every data/artifact filename so the two cohorts never
# read or write each other's files.
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
##############################################################
# Load data
##############################################################
# Pre-computed CV fold membership: one array of patient StudyIds per fold.
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
# Training data with imputation applied.
train_data_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
# Training data without imputation.
train_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
# Test data with imputation applied.
test_data_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')
# Test data without imputation.
test_data_no_imp = model_h.load_data_for_modelling(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
# Build one (train_index, val_index) tuple per fold for use as a CV iterator.
# Either dataset can provide the indices: imputed and non-imputed versions
# contain the same patients in the same row order.
cross_val_fold_indices = []
for fold in fold_patients:
    # Rows whose patient belongs to this fold form the validation split.
    val_rows = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
    # Every remaining patient's rows form the training split.
    train_rows = train_data_no_imp[
        ~train_data_no_imp.StudyId.isin(val_rows.StudyId)]
    cross_val_fold_indices.append((train_rows.index, val_rows.index))
# Model features are every column except the patient id, the target, and the
# index date.
cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate']
features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop]
# Split each of the four datasets into a float feature matrix and a float
# target vector (ExacWithin3Months).
# Train data, no imputation:
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Train data, imputed:
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')
# Test data, no imputation:
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Test data, imputed:
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# The imputation step must never alter the target; fail loudly if the imputed
# and non-imputed targets disagree.
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
# The targets are interchangeable after the check above; pick one of each.
train_target = train_target_no_imp
test_target = test_target_no_imp
# Coerce every feature column to numeric; anything unparseable becomes NaN.
for frame in (train_features_no_imp, train_features_imp,
              test_features_no_imp, test_features_imp):
    for column in frame:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow: local SQLite tracking backend, one experiment per cohort.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)
# CV scoring strategies to evaluate each model under.
scoring_methods = ['average_precision']
# Ratio of negative to positive samples; upper bound for XGBoost's
# scale_pos_weight search space. Computed once instead of calling
# value_counts() twice.
class_counts = train_target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
# Set up models. Each tuple holds 4 elements:
# (estimator, run name, imputation status, type of model).
# Both cohorts share logistic regression, balanced random forest and XGBoost;
# the combined hospital + community cohort additionally evaluates a plain
# random forest. (The original duplicated the whole list in two branches.)
models = [(LogisticRegression(),
           'logistic_regression', 'imputed', 'linear')]
if not model_only_hosp:
    models.append((RandomForestClassifier(),
                   'random_forest', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(),
               'balanced_random_forest', 'imputed', 'tree'))
models.append((xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
               'xgb', 'not_imputed', 'tree'))
# Define Bayesian-optimization search spaces per model family.
log_reg_search_spaces = {'penalty': ['l2', None],
                         'class_weight': ['balanced', None],
                         'max_iter': Integer(50, 300),
                         'C': Real(0.001, 10),
                         }
rf_search_spaces = {'max_depth': Integer(4, 10),
                    'n_estimators': Integer(70, 850),
                    'min_samples_split': Integer(2, 10),
                    'class_weight': ['balanced', None],
                    }
xgb_search_spaces = {'max_depth': Integer(4, 10),
                     'n_estimators': Integer(70, 850),
                     'subsample': Real(0.55, 0.95),
                     'colsample_bytree': Real(0.55, 0.95),
                     'learning_rate': Real(0.05, 0.14),
                     'scale_pos_weight': Real(1, scale_pos_weight),
                     }
##############################################################
# Run models
##############################################################
# Map each model name to (search space, training features). XGBoost handles
# missing values natively, so it trains on the non-imputed features; the
# other models require the imputed features. This replaces three nearly
# identical BayesSearchCV branches with a single construction.
search_config = {
    'logistic_regression': (log_reg_search_spaces, train_features_imp),
    'random_forest': (rf_search_spaces, train_features_imp),
    'balanced_random_forest': (rf_search_spaces, train_features_imp),
    'xgb': (xgb_search_spaces, train_features_no_imp),
}
# skopt still references the alias np.int, which recent numpy versions
# removed; restore it once up front so BayesSearchCV.fit does not crash.
np.int = int
# In an MLflow run, perform K-fold cross validation and capture the mean
# score across folds for every scoring method / model combination.
with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'):
    for scoring_method in scoring_methods:
        for estimator, model_name, _imp_status, _model_type in models:
            with mlflow.start_run(run_name=model_name, nested=True):
                print(model_name)
                # Reset the artifacts directory so files from different runs
                # never mix. Bug fix: the original created './tmp' and then
                # rmtree'd it without recreating it, leaving no directory
                # for artifacts; clear first, then (re)create.
                artifact_dir = './tmp'
                shutil.rmtree(artifact_dir, ignore_errors=True)
                os.makedirs(artifact_dir, exist_ok=True)
                # Run hyperparameter tuning with Bayesian optimization over
                # the pre-computed patient-level CV folds.
                search_spaces, train_features = search_config[model_name]
                opt = BayesSearchCV(estimator,
                                    search_spaces=search_spaces,
                                    n_iter=200,
                                    random_state=0,
                                    cv=cross_val_fold_indices,
                                    scoring=scoring_method,
                                    )
                opt.fit(train_features, train_target)
                # Report and log the best score and hyperparameters found.
                print(opt.best_params_)
                print(opt.best_score_)
                mlflow.log_param('opt_scorer', scoring_method)
                mlflow.log_params(opt.best_params_)
                mlflow.log_metric("opt_best_score", opt.best_score_)
mlflow.end_run()