| | import os |
| | import sys |
| | import numpy as np |
| | import pandas as pd |
| | import mlflow |
| | import model_h |
| |
|
| | |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| |
|
| | |
| | from sklearn.dummy import DummyClassifier |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.model_selection import cross_validate, cross_val_predict |
| | from sklearn.metrics import confusion_matrix, precision_recall_curve |
| | from sklearn.calibration import calibration_curve, CalibratedClassifierCV |
| | from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier |
| | import lightgbm as lgb |
| | import xgboost as xgb |
| | from catboost import CatBoostClassifier |
| | import ml_insights as mli |
| |
|
| | |
| | import shap |
| |
|
| | |
| | |
| | |
# Select which cohort this pipeline runs on: hospital-only patients, or
# hospital + community.  The suffix tags every input/output artefact below
# (data pickles, logs, plots, CSVs) so the two cohorts never overwrite
# each other's files.
model_only_hosp = True
file_suffix = "_only_hosp" if model_only_hosp else "_hosp_comm"
| |
|
| | |
| | |
| | |
| | |
# Capture all of this script's verbose stdout (per-model prints, threshold
# reports, calibration labels) into a per-cohort log file.
# FIX: create the log directory first — on a fresh checkout open() would
# raise FileNotFoundError before any work is done.
# NOTE(review): the handle is deliberately left open for the lifetime of
# the process and sys.stdout is never restored; this is a batch script.
os.makedirs("./training/logging", exist_ok=True)
log = open("./training/logging/modelling" + file_suffix + ".log", "w")
sys.stdout = log
| |
|
| | |
# Pre-computed cross-validation folds; each entry is a collection of
# StudyId values (see the StudyId.isin(fold) lookup further down), so the
# split is at patient level rather than row level — presumably to keep all
# rows of one patient in the same fold (TODO confirm against the script
# that writes this file).
# allow_pickle=True is required because the array holds Python objects;
# only load trusted, locally produced .npy files with this flag.
fold_patients = np.load(
    './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True)
| |
|
| | |
def _load_split(basename):
    """Load one pickled modelling dataframe and drop the columns excluded
    in this iteration of the feature set (Sex_F, Age_TEnc).

    basename: file stem under ./data/model_data/, without cohort suffix.
    Returns the trimmed pandas DataFrame.
    """
    df = model_h.load_data_for_modelling(
        './data/model_data/' + basename + file_suffix + '.pkl')
    return df.drop(columns=['Sex_F', 'Age_TEnc'])


# Train/test splits, each in an imputed and a non-imputed variant.  The
# models list below tags each estimator with the variant it consumes
# ('imputed' for the sklearn models, 'not_imputed' for the boosters).
train_data_imp = _load_split('train_data_cv_imp')
train_data_no_imp = _load_split('train_data_cv_no_imp')
test_data_imp = _load_split('test_data_imp')
test_data_no_imp = _load_split('test_data_no_imp')
| |
|
| | |
| | |
# Translate the patient-level folds into (train_index, val_index) pairs
# usable as an sklearn `cv` argument: a row belongs to a fold's validation
# split iff its StudyId is in that fold's patient list.
cross_val_fold_indices = []
for fold in fold_patients:
    in_fold = train_data_no_imp.StudyId.isin(fold)
    val_index = train_data_no_imp.index[in_fold]
    train_index = train_data_no_imp.index[~in_fold]
    cross_val_fold_indices.append((train_index, val_index))
| |
|
| | |
# Columns that must never be fed to a model: the patient identifier and
# the prediction target itself.
cols_to_drop = ['StudyId', 'ExacWithin3Months']
# Index.drop preserves column order, so feature order matches the frames.
features_list = train_data_no_imp.columns.drop(cols_to_drop).tolist()
| |
|
| | |
| | |
def _to_float_xy(df):
    """Split a modelling frame into a float-typed (features, target) pair."""
    feature_frame = df[features_list].astype('float')
    target_series = df.ExacWithin3Months.astype('float')
    return feature_frame, target_series


# Features and target for every split/variant combination.
train_features_no_imp, train_target_no_imp = _to_float_xy(train_data_no_imp)
train_features_imp, train_target_imp = _to_float_xy(train_data_imp)

test_features_no_imp, test_target_no_imp = _to_float_xy(test_data_no_imp)
test_features_imp, test_target_imp = _to_float_xy(test_data_imp)

# Imputation must only touch features, never the label: if the targets of
# the two variants differ, the datasets diverged upstream and no comparison
# between model families would be valid.
if not train_target_no_imp.equals(train_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')
if not test_target_no_imp.equals(test_target_imp):
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')

# Once equality is established, either copy serves as the canonical target.
train_target = train_target_no_imp
test_target = test_target_no_imp
| |
|
| | |
# Force every feature column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
# NOTE: mutates the four feature frames in place, column by column.
for features in [train_features_no_imp, train_features_imp,
                 test_features_no_imp, test_features_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors='coerce')
| |
|
| | |
| | |
| | |
| | |
# Track everything in a local SQLite-backed MLflow store; one experiment
# per cohort keeps the two cohorts' runs apart.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment('model_h_drop_1' + file_suffix)

# Metric set computed on every cross-validation fold.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']

# Negative-to-positive class ratio, used by the boosters below to
# re-weight the minority (positive) class.
class_counts = train_target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
| |
|
| | |
| | |
# Candidate models, each as a tuple:
#   (estimator, run_name, which_features, model_family)
# which_features ('imputed'/'not_imputed') selects the training frames in
# the loop below; model_family ('linear'/'tree') selects the SHAP
# explainer.  The dummy classifier carries no family entry — it is
# excluded from the SHAP section, so model[3] is never read for it.
models = []

models.append((DummyClassifier(strategy='stratified'),
               'dummy_classifier', 'imputed'))

models.append((LogisticRegression(random_state=0, max_iter=200),
               'logistic_regression', 'imputed', 'linear'))
models.append((LogisticRegression(random_state=0, class_weight='balanced', max_iter=200),
               'logistic_regression_CW_balanced', 'imputed', 'linear'))

models.append((RandomForestClassifier(random_state=0),
               'random_forest', 'imputed', 'tree'))
models.append((RandomForestClassifier(random_state=0, class_weight='balanced'),
               'random_forest_CW_balanced', 'imputed', 'tree'))
models.append((BalancedRandomForestClassifier(random_state=0),
               'balanced_random_forest', 'imputed', 'tree'))

models.append((BalancedBaggingClassifier(random_state=0),
               'balanced_bagging', 'imputed', 'tree'))

models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1),
               'xgb', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', learning_rate=0.1, max_depth=4),
               'xgb_mdepth_4', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1),
               'xgb_spw', 'not_imputed', 'tree'))
models.append((xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                                 eval_metric='logloss', scale_pos_weight=scale_pos_weight,
                                 learning_rate=0.1, max_depth=4),
               'xgb_spw_mdepth_4', 'not_imputed', 'tree'))

# FIX: `verbose_eval` is an argument of lgb.train()/lgb.cv(), not a valid
# LGBMClassifier/booster parameter, so it never silenced training output.
# `verbose=-1` is the sklearn-API way to suppress LightGBM logging.
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1, verbose=-1),
               'lgbm', 'not_imputed', 'tree'))
models.append((lgb.LGBMClassifier(random_state=0, learning_rate=0.1,
                                  scale_pos_weight=scale_pos_weight, verbose=-1),
               'lgbm_spw', 'not_imputed', 'tree'))

models.append((CatBoostClassifier(random_state=0, learning_rate=0.1),
               'catboost', 'not_imputed', 'tree'))
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# Model-selection loop: one parent MLflow run, one nested run per candidate.
# For each model this
#   1. cross-validates on the pre-built patient folds and collects
#      out-of-fold probabilities,
#   2. finds the F1-optimal decision threshold and plots confusion matrices,
#   3. fits sigmoid / isotonic / spline calibrators and plots calibration
#      curves against the test set,
#   4. plots booster feature importance and per-fold + fold-averaged SHAP
#      values,
#   5. logs CV metrics, estimator params and all plot artifacts to MLflow.
# ---------------------------------------------------------------------------
with mlflow.start_run(run_name='model_selection_less_features_3rd_iter_minus_sex'):
    for model in models:
        # model = (estimator, run_name, 'imputed'|'not_imputed'[, family])
        with mlflow.start_run(run_name=model[1], nested=True):
            print(model[1])

            # Scratch directory for this model's plots; emptied up front so
            # mlflow.log_artifacts() at the bottom uploads only this model's
            # files, not leftovers from the previous iteration.
            artifact_dir = './tmp'
            os.makedirs(artifact_dir, exist_ok=True)
            for f in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, f))

            # Cross-validate on the frame variant this model consumes.
            # return_estimator/return_indices are kept for the SHAP section,
            # which re-visits each fitted fold estimator and its row indices.
            # cross_val_predict re-runs the fits to obtain out-of-fold
            # probabilities for thresholding, calibration and plotting.
            if model[2] == 'imputed':
                crossval = cross_validate(model[0], train_features_imp, train_target,
                                          cv=cross_val_fold_indices,
                                          return_estimator=True, scoring=scoring,
                                          return_indices=True)
                probabilities_cv = cross_val_predict(model[0], train_features_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]
            else:
                crossval = cross_validate(model[0], train_features_no_imp, train_target,
                                          cv=cross_val_fold_indices, return_estimator=True,
                                          scoring=scoring, return_indices=True)
                probabilities_cv = cross_val_predict(model[0], train_features_no_imp,
                                                     train_target, cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]

            # Pick the probability threshold that maximises F1 on the
            # out-of-fold predictions.  Where precision + recall == 0 the
            # division yields NaN; argsort places NaNs last, so the while
            # loop walks backwards from the end of the sort order until it
            # reaches the largest non-NaN F-score.
            precision, recall, thresholds = precision_recall_curve(
                train_target, probabilities_cv)
            fscore = (2 * precision * recall) / (precision + recall)

            fscore_zero = True
            position = -1
            while fscore_zero is True:
                best_thres_idx = np.argsort(fscore, axis=0)[position]
                if np.isnan(fscore[best_thres_idx]) == True:
                    position = position - 1
                else:
                    fscore_zero = False
            # NOTE(review): precision/recall have one more element than
            # thresholds; best_thres_idx is assumed never to hit that last
            # slot — confirm if fscore's maximum can sit at the final entry.
            best_threshold = thresholds[best_thres_idx]
            print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (
                best_threshold, fscore[best_thres_idx], precision[best_thres_idx],
                recall[best_thres_idx]))

            mlflow.log_metric('best_threshold', best_threshold)
            mlflow.log_metric('f1_best_thres', fscore[best_thres_idx])
            mlflow.log_metric('precision_best_thres', precision[best_thres_idx])
            mlflow.log_metric('recall_best_thres', recall[best_thres_idx])

            # Confusion matrices at fixed thresholds plus the F1-optimal one.
            # NOTE(review): this rebinds `thresholds`, shadowing the
            # precision_recall_curve array above — intentional but fragile.
            thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_threshold]
            for threshold in thresholds:
                y_predicted = probabilities_cv > threshold
                model_h.plot_confusion_matrix(
                    train_target, y_predicted, model[1], threshold, file_suffix)

            # --- Calibration (skipped for the dummy baseline) ------------
            if model[1] != 'dummy_classifier':

                # Platt scaling: CalibratedClassifierCV clones model[0] and
                # calibrates over the same patient folds; probabilities are
                # then produced for the held-out test set.
                model_sig = CalibratedClassifierCV(
                    model[0], method='sigmoid', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_sig.fit(train_features_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_imp)[:, 1]
                else:
                    model_sig.fit(train_features_no_imp, train_target)
                    probs_sig = model_sig.predict_proba(test_features_no_imp)[:, 1]

                # Isotonic regression calibration, same scheme as above.
                model_iso = CalibratedClassifierCV(
                    model[0], method='isotonic', cv=cross_val_fold_indices)
                if model[2] == 'imputed':
                    model_iso.fit(train_features_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_imp)[:, 1]
                else:
                    model_iso.fit(train_features_no_imp, train_target)
                    probs_iso = model_iso.predict_proba(test_features_no_imp)[:, 1]

                # Spline calibration (ml_insights): fitted on the
                # out-of-fold probabilities, then applied to the test-set
                # probabilities of a model refit on the full training data.
                # NOTE: this fit mutates model[0] in place; the SHAP section
                # below uses the per-fold estimators from `crossval`, not
                # this refit copy.
                spline_calib = mli.SplineCalib()
                spline_calib.fit(probabilities_cv, train_target)

                if model[2] == 'imputed':
                    model[0].fit(train_features_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_imp)[:, 1]
                else:
                    model[0].fit(train_features_no_imp, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features_no_imp)[:, 1]
                probs_spline = spline_calib.calibrate(preds_test_uncalib)

                # Reliability diagrams for all four probability sources, at
                # equal-width and equal-frequency binning, 5 and 10 bins.
                # The print() labels only annotate the redirected log; the
                # uncalibrated curve uses out-of-fold TRAIN probabilities
                # while the calibrated curves use TEST probabilities.
                for strategy in ['uniform', 'quantile']:
                    for bin_num in [5, 10]:
                        if strategy == 'uniform':
                            print('--- Creating calibration curve with equal width bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        else:
                            print('--- Creating calibration curve with equal frequency bins ---')
                            print('-- Num bins:', bin_num, ' --')
                        print('Uncalibrated model:')
                        prob_true_uncal, prob_pred_uncal = calibration_curve(
                            train_target, probabilities_cv, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (sigmoid):')
                        prob_true_sig, prob_pred_sig = calibration_curve(
                            test_target, probs_sig, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (isotonic):')
                        prob_true_iso, prob_pred_iso = calibration_curve(
                            test_target, probs_iso, n_bins=bin_num, strategy=strategy)
                        print('Calibrated model (spline):')
                        prob_true_spline, prob_pred_spline = calibration_curve(
                            test_target, probs_spline, n_bins=bin_num, strategy=strategy)

                        plt.figure(figsize=(8, 8))
                        plt.plot([0, 1], [0, 1], linestyle='--')
                        plt.plot(prob_pred_uncal, prob_true_uncal, marker='.',
                                 label='Uncalibrated\n' + model[1])
                        plt.plot(prob_pred_sig, prob_true_sig, marker='.',
                                 label='Calibrated (Sigmoid)\n' + model[1])
                        plt.plot(prob_pred_iso, prob_true_iso, marker='.',
                                 label='Calibrated (Isotonic)\n' + model[1])
                        plt.plot(prob_pred_spline, prob_true_spline, marker='.',
                                 label='Calibrated (Spline)\n' + model[1])
                        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                        plt.tight_layout()
                        plt.savefig(os.path.join(artifact_dir, model[1] + '_uncal_' +
                                                 strategy + '_bins' + str(bin_num) +
                                                 file_suffix + '.png'))
                        plt.close()

            # --- Booster feature importance ------------------------------
            # Uses model[0], which at this point has been refit on the full
            # training data in the calibration section above.
            if model[1].startswith("xgb"):
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix)
            if (model[1].startswith("lgbm")):
                feature_names = train_features_no_imp.columns.tolist()
                feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                    model[0], model[1], file_suffix=file_suffix, feature_names=feature_names)

            # NOTE(review): this CSV is rewritten by every xgb/lgbm model in
            # the loop, so on disk it ends up holding only the LAST one.
            if (model[1].startswith("xgb")) | (model[1].startswith("lgbm")):
                feat_importance_tot_gain_df.to_csv(
                    './data/feature_importance_tot_gain' + file_suffix + '.csv', index=False)

            # --- SHAP values per CV fold and averaged across folds -------
            # balanced_bagging is excluded (no direct SHAP support here);
            # dummy is excluded because it has no model[3] family tag.
            if model[1] not in ['dummy_classifier', 'balanced_bagging']:
                shap_values_list_train = []  # NOTE(review): never used below
                shap_vals_per_cv = {}

                # shap_vals_per_cv[row_index][fold] starts as an empty dict
                # placeholder; folds where the row was in the training split
                # overwrite it with the row's SHAP vector.  Hard-codes 5 folds.
                for idx in range(0, len(train_data_imp)):
                    shap_vals_per_cv[idx] = {}
                    for n_fold in range(0, 5):
                        shap_vals_per_cv[idx][n_fold] = {}

                # Walk the fitted per-fold estimators returned by
                # cross_validate and explain each fold's TRAIN rows.
                fold_num = 0  # 1-based counter for file names (i is 0-based)
                for i, estimator in enumerate(crossval['estimator']):
                    fold_num = fold_num + 1

                    # Pick the frame variant each named model was trained on.
                    # NOTE(review): duplicates the model[2] tag as a name
                    # list — keep the two in sync.  X_test is assigned but
                    # never used.
                    if model[1] in ['logistic_regression',
                                    'logistic_regression_CW_balanced', 'random_forest',
                                    'random_forest_CW_balanced', 'balanced_bagging',
                                    'balanced_random_forest']:
                        X_train = train_features_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_imp.iloc[crossval['indices']['test'][i]]
                    else:
                        X_train = train_features_no_imp.iloc[crossval['indices']['train'][i]]
                        X_test = train_features_no_imp.iloc[crossval['indices']['test'][i]]

                    # Explainer choice follows the family tag set in `models`.
                    if model[3] == 'linear':
                        explainer = shap.LinearExplainer(estimator, X_train)
                    if model[3] == 'tree':
                        explainer = shap.TreeExplainer(estimator)

                    shap_values_train = explainer.shap_values(X_train)

                    # Some explainers return one array per class (3-D shape);
                    # keep only the positive class (index 1).
                    if len(np.shape(shap_values_train)) == 3:
                        shap_values_train = shap_values_train[1]

                    # Per-fold SHAP summary plot.
                    shap.summary_plot(np.array(shap_values_train), X_train, show=False)
                    plt.savefig(os.path.join(artifact_dir, model[1] + '_shap_cv_fold_' +
                                             str(fold_num) + file_suffix + '.png'))
                    plt.close()

                    # Store each row's SHAP vector under its row label and
                    # this fold's 0-based index.
                    train_idxs = X_train.index.tolist()
                    for n, train_idx in enumerate(train_idxs):
                        shap_vals_per_cv[train_idx][i] = shap_values_train[n]

                # Aggregate per-row SHAP vectors across folds.
                average_shap_values, stds, ranges = [], [], []
                for i in range(0, len(train_data_imp)):
                    for n in range(0, 5):
                        # Placeholders that no fold overwrote are still empty
                        # dicts (falsy) -> replace with NaN so the DataFrame
                        # below can skip them.  `not <ndarray>` raises
                        # ValueError for filled entries; the bare except
                        # deliberately skips those.
                        try:
                            if not shap_vals_per_cv[i][n]:
                                shap_vals_per_cv[i][n] = np.NaN
                        except:
                            pass

                    # One column per fold, one row per feature; NaN columns
                    # are ignored by mean/std/max/min (skipna default).
                    df_per_obs = pd.DataFrame.from_dict(shap_vals_per_cv[i])
                    average_shap_values.append(df_per_obs.mean(axis=1).values)
                    stds.append(df_per_obs.std(axis=1).values)
                    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)

                # Fold-averaged SHAP summary plot over the full training set.
                if model[2] == 'imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                if model[2] == 'not_imputed':
                    shap.summary_plot(np.array(average_shap_values), train_data_no_imp.drop(
                        columns=['StudyId', 'ExacWithin3Months']), show=False)
                plt.savefig(
                    os.path.join(artifact_dir, model[1] + '_shap' + file_suffix + '.png'))
                plt.close()

                # NOTE(review): like the total-gain CSV, this file is
                # rewritten by every qualifying model in the loop.
                feat_importance_df = model_h.get_shap_feat_importance(
                    model[1], average_shap_values, features_list, file_suffix)
                feat_importance_df.to_csv(
                    './data/feature_importance_shap' + file_suffix + '.csv', index=False)

            # Distribution of out-of-fold scores, split by true label.
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': train_target})
            sns.displot(model_scores, x="model_score", hue="true_label", kde=True)
            plt.savefig(os.path.join(artifact_dir, model[1] + 'score_distribution' +
                                     file_suffix + '.png'))
            plt.close()

            # Log fold-mean and fold-std of every CV metric, the estimator's
            # hyperparameters, and all plots produced above.
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())
                mlflow.log_metric(score + '_std', crossval['test_' + score].std())

            params = model[0].get_params()
            for param in params:
                mlflow.log_param(param, params[param])

            mlflow.log_artifacts(artifact_dir)

# Redundant after the with-blocks (they end their runs on exit), but harmless.
mlflow.end_run()
| |
|
| | |
# Combine the two feature-importance views produced during the loop above:
# the mean-SHAP ranking and the booster total-gain ranking.
# NOTE(review): both CSVs are overwritten by each qualifying model, so this
# merge reflects whichever model wrote them last.
def _read_importance(stem):
    """Load one of the importance CSVs written during model selection."""
    return pd.read_csv('./data/' + stem + file_suffix + '.csv')


shap_feat_importance = _read_importance('feature_importance_shap')
tot_gain_feat_importance = _read_importance('feature_importance_tot_gain')

# The total-gain frame stores feature names in an 'index' column; align it
# with the SHAP frame's 'col_name' key before joining.
tot_gain_feat_importance = tot_gain_feat_importance.rename(
    columns={'index': 'col_name'})

# Left join keeps every SHAP-ranked feature, even ones without a gain entry.
feat_importance_hierarchy = shap_feat_importance.merge(
    tot_gain_feat_importance, on='col_name', how='left')
feat_importance_hierarchy.to_csv(
    './data/feat_importance_hierarchy' + file_suffix + '.csv', index=False)