# copd-model-h / training / cross_val_first_models.py
# (initial upload, commit 000de75)
import os
import sys
import numpy as np
import pandas as pd
import mlflow
import model_h
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Model training and evaluation
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import ml_insights as mli
# Explainability
import shap
##############################################################
# Specify which model to perform cross validation on
##############################################################
# When True, model the hospitalised-only cohort; otherwise model the combined
# hospital + community cohort. The suffix is appended to every data, log and
# artifact path so the two cohorts' outputs never collide.
model_only_hosp = True
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
##############################################################
# Load data
##############################################################
# Setup log file: everything printed below is captured in a per-cohort log.
# Create the log directory first so open() cannot fail on a fresh checkout.
os.makedirs("./training/logging", exist_ok=True)
log = open("./training/logging/modelling" + file_suffix + ".log", "w")
sys.stdout = log
def _load_split(path):
    """Load one modelling dataframe and drop the unused sex/age-encoding columns."""
    frame = model_h.load_data_for_modelling(path)
    return frame.drop(columns=['Sex_F', 'Age_TEnc'])


# Load CV folds
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
# Load the four modelling datasets: train/test crossed with imputed/not-imputed
train_data_imp = _load_split(
    './data/model_data/train_data_cv_imp' + file_suffix + '.pkl')
train_data_no_imp = _load_split(
    './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl')
test_data_imp = _load_split(
    './data/model_data/test_data_imp' + file_suffix + '.pkl')
test_data_no_imp = _load_split(
    './data/model_data/test_data_no_imp' + file_suffix + '.pkl')
# Build one (train_index, val_index) tuple per CV fold. Either the imputed or
# the non-imputed frame could be used here because both contain the same
# patients in the same row order.
cross_val_fold_indices = []
for fold in fold_patients:
    # Rows whose patient is in this fold form the validation split ...
    val_rows = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)]
    # ... and every row belonging to any other patient forms the train split.
    train_rows = train_data_no_imp[
        ~train_data_no_imp.StudyId.isin(val_rows.StudyId)]
    cross_val_fold_indices.append((train_rows.index, val_rows.index))
# Model features are every column except the patient id and the target.
cols_to_drop = ['StudyId', 'ExacWithin3Months']
features_list = [c for c in train_data_no_imp.columns if c not in cols_to_drop]
# Train data
# Separate features from target for data with no imputation performed
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp.ExacWithin3Months.astype('float')
# Test data
# Separate features from target for data with no imputation performed
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float')
# Separate features from target for the imputed data
test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp.ExacWithin3Months.astype('float')
# Check that the target in imputed and not imputed datasets are the same. If not,
# raise an error
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')
# The targets were proven identical above, so either copy can serve as the
# single target used for all models below.
train_target = train_target_no_imp
test_target = test_target_no_imp
# Make sure all features are numeric. errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
for features in [train_features_no_imp, train_features_imp,
                 test_features_no_imp, test_features_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors='coerce')
##############################################################
# Specify which models to evaluate
##############################################################
# Set up MLflow: local sqlite backend, one experiment per cohort suffix.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)
# Set CV scoring strategies and any model parameters
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']
# Ratio of negative to positive training examples, used to re-weight the
# minority (positive) class in the boosting models.
scale_pos_weight = train_target.value_counts()[0] / train_target.value_counts()[1]
# Set up models. Each tuple contains 4 elements:
#   (estimator, run name, imputation status, model type used to pick a SHAP explainer)
models = []
# Dummy classifier (baseline). SHAP is skipped for it, so its model type is
# never read, but 'none' is included to keep every tuple the same length.
models.append((DummyClassifier(strategy='stratified'),
               'dummy_classifier', 'imputed', 'none'))
# Logistic regression
models.append((LogisticRegression(random_state=0, max_iter=200),
               'logistic_regression', 'imputed', 'linear'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'logistic_regression_CW_balanced', 'imputed', 'linear'))
# Random forest
models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_CW_balanced', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest', 'imputed', 'tree'))
# Bagging
models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging', 'imputed', 'tree'))
# XGBoost
# NOTE(review): use_label_encoder was deprecated and removed in XGBoost 2.x --
# confirm the pinned xgboost version still accepts it.
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1),
               'xgb', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1, max_depth=4),
               'xgb_mdepth_4', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1),
               'xgb_spw', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1, max_depth=4),
               'xgb_spw_mdepth_4', 'not_imputed', 'tree'))
# Light GBM
# NOTE(review): 'verbose_eval' is an argument of lgb.train()/lgb.cv(), not of
# LGBMClassifier (whose silencing parameter is verbose=-1) -- it likely has no
# effect here. Left unchanged so the parameters logged to MLflow stay
# comparable with earlier runs; verify intent.
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1, verbose_eval=-1),
               'lgbm', 'not_imputed', 'tree'))
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1,
                                  scale_pos_weight=scale_pos_weight, verbose_eval=-1),
               'lgbm_spw', 'not_imputed', 'tree'))
# CatBoost
models.append((CatBoostClassifier(random_state=0, learning_rate=0.1),
               'catboost', 'not_imputed', 'tree'))
##############################################################
# Run models
##############################################################
# In an MLflow parent run, perform K-fold cross validation for every candidate
# model and capture the mean score across folds. Model tuples are
# (estimator, run name, imputation status, model type) -- see the model
# definitions above.
with mlflow.start_run(run_name='model_selection_less_features_3rd_iter_minus_sex'):
    for model in models:
        with mlflow.start_run(run_name=model[1], nested=True):
            print(model[1])
            # Create the artifacts directory if it doesn't exist
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            # Remove existing directory contents to not mix files between different runs
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))
            # Perform K-fold cross validation with custom folds using imputed dataset for
            # non-sparsity aware models
            if model[2] == 'imputed':
                crossval = cross_validate(model[0], train_features_imp, train_target,
                                          cv=cross_val_fold_indices,
                                          return_estimator=True, scoring=scoring,
                                          return_indices=True)
                # Get the out-of-fold predicted probabilities (positive class)
                probabilities_cv = cross_val_predict(model[0], train_features_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]
            else:
                crossval = cross_validate(model[0], train_features_no_imp, train_target,
                                          cv=cross_val_fold_indices, return_estimator=True,
                                          scoring=scoring, return_indices=True)
                # Get the out-of-fold predicted probabilities (positive class)
                probabilities_cv = cross_val_predict(model[0], train_features_no_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]
            # Get threshold that gives best F1 score
            precision, recall, thresholds = precision_recall_curve(
                train_target, probabilities_cv)
            fscore = (2 * precision * recall) / (precision + recall)
            # F-score is NaN wherever precision + recall == 0, and NaN sorts as
            # the maximum. Walk backwards through the argsort positions until a
            # non-NaN maximum is found.
            fscore_zero = True
            position = -1
            while fscore_zero is True:
                best_thres_idx = np.argsort(fscore, axis=0)[position]
                if np.isnan(fscore[best_thres_idx]) == True:
                    position = position - 1
                else:
                    fscore_zero = False
            best_threshold = thresholds[best_thres_idx]
            print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (
                best_threshold, fscore[best_thres_idx], precision[best_thres_idx],
                recall[best_thres_idx]))
            # Save f1 score, precision and recall for the best threshold
            mlflow.log_metric('best_threshold', best_threshold)
            mlflow.log_metric('f1_best_thres', fscore[best_thres_idx])
            mlflow.log_metric('precision_best_thres', precision[best_thres_idx])
            mlflow.log_metric('recall_best_thres', recall[best_thres_idx])
            # Plot confusion matrix at different thresholds.
            # (Deliberately shadows the `thresholds` array from the PR curve
            # above; it is not needed again.)
            thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold]
            for threshold in thresholds:
                y_predicted = probabilities_cv > threshold
                model_h.plot_confusion_matrix(
                    train_target, y_predicted, model[1], threshold, file_suffix)
            # Generate calibration curves (skipped for the dummy baseline)
            if model[1] != 'dummy_classifier':
                # Calibrated model (Sigmoid / Platt scaling)
                model_sig = CalibratedClassifierCV(
                    model[0], method='sigmoid', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_sig.fit(train_features_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_imp)[:, 1]
                else:
                    model_sig.fit(train_features_no_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_no_imp)[:, 1]
                # Calibrated model (Isotonic)
                model_iso = CalibratedClassifierCV(
                    model[0], method='isotonic', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_iso.fit(train_features_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_imp)[:, 1]
                else:
                    model_iso.fit(train_features_no_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_no_imp)[:, 1]
                # Spline calibration: fitted on the out-of-fold CV
                # probabilities, then applied to the uncalibrated test-set
                # predictions of a model refit on the full train set.
                spline_calib = mli.SplineCalib()
                spline_calib.fit(probabilities_cv, train_target)
                if model[2] == 'imputed':
                    model[0].fit(train_features_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_imp)[:, 1]
                else:
                    model[0].fit(train_features_no_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_no_imp)[:, 1]
                probs_spline = spline_calib.calibrate(preds_test_uncalib)
                # Plot calibration curves for equal width bins (each bin has same width) and
                # equal frequency bins (each bin has same number of observations).
                # NOTE(review): the uncalibrated curve uses train-set CV
                # probabilities while the calibrated curves use test-set
                # probabilities -- confirm this mix is intentional.
                for strategy in ['uniform', 'quantile']:
                    for bin_num in [5, 10]:
                        if strategy == 'uniform':
                            print('--- Creating calibration curve with equal width bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        else:
                            print('--- Creating calibration curve with equal frequency bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        print('Uncalibrated model:')
                        prob_true_uncal, prob_pred_uncal = calibration_curve(
                            train_target, probabilities_cv, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (sigmoid):')
                        prob_true_sig, prob_pred_sig = calibration_curve(
                            test_target, probs_sig, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (isotonic):')
                        prob_true_iso, prob_pred_iso = calibration_curve(
                            test_target, probs_iso, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (spline):')
                        prob_true_spline, prob_pred_spline = calibration_curve(
                            test_target, probs_spline, n_bins=bin_num, strategy=strategy)
                        plt.figure(figsize=(8, 8))
                        plt.plot([0, 1], [0, 1], linestyle='--')
                        plt.plot(prob_pred_uncal, prob_true_uncal, marker='.',
                                 label='Uncalibrated\n' + model[1])
                        plt.plot(prob_pred_sig, prob_true_sig, marker='.',
                                 label='Calibrated (Sigmoid)\n' + model[1])
                        plt.plot(prob_pred_iso, prob_true_iso, marker='.',
                                 label='Calibrated (Isotonic)\n' + model[1])
                        plt.plot(prob_pred_spline, prob_true_spline, marker='.',
                                 label='Calibrated (Spline)\n' + model[1])
                        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                        plt.tight_layout()
                        plt.savefig(os.path.join(artifact_dir, model[1] + '_uncal_' +
                                                 strategy + '_bins' + str(bin_num) +
                                                 file_suffix + '.png'))
                        plt.close()
            # Get total gain and total cover for boosting machine models
            if model[1].startswith("xgb"):
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix)
            if (model[1].startswith("lgbm")):
                feature_names = train_features_no_imp.columns.tolist()
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix, feature_names=feature_names)
            # Save feature importance by total gain. The same file is rewritten
            # for every xgb/lgbm model, so it ends up holding the last one run.
            if (model[1].startswith("xgb")) | (model[1].startswith("lgbm")):
                feat_importance_tot_gain_df.to_csv(
                    './data/feature_importance_tot_gain' + file_suffix + '.csv', index=False)
            # SHAP (skipped for models without a supported explainer)
            if model[1] not in ['dummy_classifier', 'balanced_bagging']:
                # NOTE(review): shap_values_list_train is never used below.
                shap_values_list_train = []
                shap_vals_per_cv = {}
                # Create a dictionary to contain shap values. Dictionary is structured as
                # index : fold_num : shap_values
                for idx in range(0, len(train_data_imp)):
                    shap_vals_per_cv[idx] = {}
                    for n_fold in range(0, 5):
                        shap_vals_per_cv[idx][n_fold] = {}
                # Get SHAP values for each fold. fold_num (1-based) is only
                # used in artifact file names; i (0-based) keys the dictionary.
                fold_num = 0
                for i, estimator in enumerate(crossval['estimator']):
                    fold_num = fold_num + 1
                    # If imputation needed for model, use imputed features
                    if model[1] in ['logistic_regression',
                                    'logistic_regression_CW_balanced', 'random_forest',
                                    'random_forest_CW_balanced', 'balanced_bagging',
                                    'balanced_random_forest']:
                        X_train = train_features_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_imp.iloc[crossval['indices']['test'][i]]
                    else:
                        X_train = train_features_no_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_no_imp.iloc[crossval['indices']['test'][i]]
                    # Apply different explainers depending on type of model
                    if model[3] == 'linear':
                        explainer = shap.LinearExplainer(estimator, X_train)
                    if model[3] == 'tree':
                        explainer = shap.TreeExplainer(estimator)
                    # Get shap values
                    shap_values_train = explainer.shap_values(X_train)
                    # Output of shap values for some models is (class, num samples,
                    # num features). Get these in the format of (num samples, num
                    # features) by keeping index 1 (the positive class).
                    if len(np.shape(shap_values_train)) == 3:
                        shap_values_train = shap_values_train[1]
                    # Plot SHAP plots for each cv fold
                    shap.summary_plot(np.array(shap_values_train), X_train, show=False)
                    plt.savefig(os.path.join(artifact_dir, model[1] + '_shap_cv_fold_' +
                                             str(fold_num) + file_suffix + '.png'))
                    plt.close()
                    # Add shap values to the dictionary, replacing the empty
                    # placeholder dict for (row index, fold i).
                    train_idxs = X_train.index.tolist()
                    for n, train_idx in enumerate(train_idxs):
                        shap_vals_per_cv[train_idx][i] = shap_values_train[n]
                # Calculate average shap values across folds for every sample
                average_shap_values, stds, ranges = [], [], []
                for i in range(0, len(train_data_imp)):
                    for n in range(0, 5):
                        # If a cv fold is empty as that set has not been used in training,
                        # replace empty fold with NaN.
                        # NOTE(review): np.NaN was removed in NumPy 2.0 (use
                        # np.nan), and the bare except silently hides errors --
                        # both worth revisiting.
                        try:
                            if not shap_vals_per_cv[i][n]:
                                shap_vals_per_cv[i][n] = np.NaN
                        except:
                            pass
                    # Create a df for each index that contains all shap values for each cv
                    # fold
                    df_per_obs = pd.DataFrame.from_dict(shap_vals_per_cv[i])
                    # Get relevant statistics for every sample
                    average_shap_values.append(df_per_obs.mean(axis=1).values)
                    stds.append(df_per_obs.std(axis=1).values)
                    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)
                # Plot SHAP plots for the fold-averaged values
                if model[2] == 'imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                if model[2] == 'not_imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_no_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                plt.savefig(
                    os.path.join(artifact_dir, model[1] + '_shap' + file_suffix + '.png'))
                plt.close()
                # Get list of most important features in order (file is
                # rewritten per model; it ends up holding the last SHAP model)
                feat_importance_df = model_h.get_shap_feat_importance(
                    model[1], average_shap_values, features_list, file_suffix)
                feat_importance_df.to_csv(
                    './data/feature_importance_shap' + file_suffix + '.csv', index=False)
            # Plot distribution of model scores (histogram plus KDE)
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': train_target})
            sns.displot(model_scores, x="model_score", hue="true_label", kde=True)
            plt.savefig(os.path.join(artifact_dir, model[1] + 'score_distribution' +
                                     file_suffix + '.png'))
            plt.close()
            # Log metrics averaged across folds
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())
                mlflow.log_metric(score + '_std', crossval['test_' + score].std())
            # Log model parameters
            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])
            # Log artifacts
            mlflow.log_artifacts(artifact_dir)
# Defensive clean-up: ensure no MLflow run is left active (a no-op when the
# context managers above exited normally).
mlflow.end_run()
# Merge the SHAP-ranked feature importances with the boosting total-gain
# importances so both rankings live in a single hierarchy file.
shap_feat_importance = pd.read_csv(
    './data/feature_importance_shap' + file_suffix + '.csv')
# Align the join key: the total-gain file names its feature column 'index'.
tot_gain_feat_importance = pd.read_csv(
    './data/feature_importance_tot_gain' + file_suffix + '.csv').rename(
        columns={'index': 'col_name'})
# Left join keeps every SHAP-ranked feature even when it has no gain entry.
feat_importance_hierarchy = shap_feat_importance.merge(
    tot_gain_feat_importance, how='left', on='col_name')
feat_importance_hierarchy.to_csv(
    './data/feat_importance_hierarchy' + file_suffix + '.csv', index=False)