| | import os |
| | import sys |
| | import numpy as np |
| | import pandas as pd |
| | import model_h |
| | import shutil |
| | import pickle |
| | import yaml |
| |
|
| | |
| | import matplotlib.pyplot as plt |
| |
|
| | |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.model_selection import cross_validate, cross_val_predict |
| | from sklearn.metrics import precision_recall_curve, auc |
| | from sklearn.calibration import CalibratedClassifierCV |
| | from imblearn.ensemble import BalancedRandomForestClassifier |
| | import xgboost as xgb |
| | import ml_insights as mli |
| | import mlflow |
| |
|
| | |
| | from sklearn.inspection import permutation_importance |
| |
|
# Load the YAML pipeline configuration; every path and model setting used
# below is read from this dictionary.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Model variant being trained; it is embedded in input/output file names and
# in the MLflow experiment name further down.
model_type = config['model_settings']['model_type']
| |
|
| | |
| | |
| | |
| | |
# Redirect everything printed by this script into a per-model-type log file.
# The handle is intentionally kept open for the lifetime of the process
# because sys.stdout points at it.
log_path = os.path.join(
    config['outputs']['logging_dir'], "modelling_" + model_type + ".log")
log = open(log_path, "w")
sys.stdout = log
| |
|
| | |
# Pre-computed cross-validation folds; each entry is later matched against
# the StudyId column to build the train/validation splits.
# NOTE(review): allow_pickle=True unpickles the file contents - only load
# files produced by this pipeline.
fold_patients_path = os.path.join(
    config['outputs']['cohort_info_dir'],
    'fold_patients_' + model_type + '.npy')
fold_patients = np.load(fold_patients_path, allow_pickle=True)
| |
|
| | |
# All model-ready datasets live in the same directory; the file name encodes
# the split (train/test), whether missing values were imputed, and the model
# type.
_model_input_dir = config["outputs"]["model_input_data_dir"]

# Training data, imputed.
train_data_imp = model_h.load_data_for_modelling(
    os.path.join(_model_input_dir, "train_imputed_cv_{}.pkl".format(model_type)))

# Training data, not imputed.
train_data_no_imp = model_h.load_data_for_modelling(
    os.path.join(_model_input_dir, "train_not_imputed_cv_{}.pkl".format(model_type)))

# Test data, imputed.
test_data_imp = model_h.load_data_for_modelling(
    os.path.join(_model_input_dir, "test_imputed_{}.pkl".format(model_type)))

# Test data, not imputed.
test_data_no_imp = model_h.load_data_for_modelling(
    os.path.join(_model_input_dir, "test_not_imputed_{}.pkl".format(model_type)))
| |
|
| | |
| | |
| | |
| |
|
| | |
# Report the index-date span covered by each split (goes to the log file).
train_index_dates = train_data_imp['IndexDate']
test_index_dates = test_data_imp['IndexDate']
earliest_train_date = train_index_dates.min()
latest_train_date = train_index_dates.max()

print('Train date range', earliest_train_date, latest_train_date)
print('Test date range', test_index_dates.min(), test_index_dates.max())

# Tags attached to every nested MLflow run so runs can be traced back to the
# cohort settings and the training date range they were produced with.
model_settings = config['model_settings']
tags = {
    "prediction_window": model_settings['prediction_window'],
    "lookback_period": model_settings['lookback_period'],
    "min_index_date": earliest_train_date,
    "max_index_date": latest_train_date,
    "1_row_per_length_in_service_days": model_settings['one_row_per_days_in_service'],
}
| |
|
| | |
| | |
# Build the (train, validation) splits passed as `cv` to scikit-learn.
# Each fold lists the StudyIds held out for validation; every row whose
# StudyId is not in the fold is used for training, so no StudyId appears on
# both sides of a split.
#
# scikit-learn interprets the arrays of a custom `cv` iterable as POSITIONAL
# row indices, so positions are derived from a boolean mask rather than from
# the DataFrame's index labels (the two only coincide for a default
# RangeIndex).
cross_val_fold_indices = []
_study_ids = train_data_no_imp.StudyId
for fold in fold_patients:
    in_validation = _study_ids.isin(fold).to_numpy()
    fold_train_index = np.flatnonzero(~in_validation)
    fold_val_index = np.flatnonzero(in_validation)
    cross_val_fold_indices.append((fold_train_index, fold_val_index))
| |
|
| | |
# Identifier, date and outcome columns that must never be fed to a model.
cols_to_drop = [
    'StudyId',
    'ExacWithin3Months',
    'IndexDate',
    'HospExacWithin3Months',
    'CommExacWithin3Months',
]

# Everything else in the (non-imputed) training frame is a model feature;
# column order is preserved.
features_list = []
for column_name in train_data_no_imp.columns:
    if column_name in cols_to_drop:
        continue
    features_list.append(column_name)
| |
|
| | |
| | |
# Split each dataset into a float feature matrix and a float target vector.
# The target is the ExacWithin3Months column.

# Training split.
train_features_no_imp = train_data_no_imp[features_list].astype('float')
train_target_no_imp = train_data_no_imp['ExacWithin3Months'].astype('float')

train_features_imp = train_data_imp[features_list].astype('float')
train_target_imp = train_data_imp['ExacWithin3Months'].astype('float')

# Test split.
test_features_no_imp = test_data_no_imp[features_list].astype('float')
test_target_no_imp = test_data_no_imp['ExacWithin3Months'].astype('float')

test_features_imp = test_data_imp[features_list].astype('float')
test_target_imp = test_data_imp['ExacWithin3Months'].astype('float')
| |
|
| | |
| | |
# Imputation must only touch features: the target has to be identical in the
# imputed and non-imputed versions of each split, otherwise abort.
train_targets_match = train_target_no_imp.equals(train_target_imp)
if not train_targets_match:
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the train set.')

test_targets_match = test_target_no_imp.equals(test_target_imp)
if not test_targets_match:
    raise ValueError(
        'Target variable is not the same in imputed and non imputed datasets in the test set.')

# With the check passed, one target vector per split is enough.
train_target = train_target_no_imp
test_target = test_target_no_imp
| |
|
| | |
# Force every feature column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
_feature_frames = (
    train_features_no_imp,
    train_features_imp,
    test_features_no_imp,
    test_features_imp,
)
for feature_frame in _feature_frames:
    for column_name in feature_frame.columns:
        feature_frame[column_name] = pd.to_numeric(
            feature_frame[column_name], errors='coerce')
| |
|
| | |
| | |
| | |
| | |
# Track runs in the local SQLite-backed MLflow store, under one experiment
# per model type.
experiment_name = 'model_h_drop_1_' + model_type
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment(experiment_name)
| |
|
| | |
# scikit-learn scorer names evaluated on every cross-validation fold; the
# mean and standard deviation of each are logged to MLflow.
scoring = [
    'f1',
    'balanced_accuracy',
    'accuracy',
    'precision',
    'recall',
    'roc_auc',
    'average_precision',
    'neg_brier_score',
]
| |
|
| | |
| | |
# Candidate models. Each entry is
#   (estimator, model name, 'imputed' | 'not_imputed', model family)
# where the third field selects which version of the datasets the model is
# trained and evaluated on.
# Every estimator is seeded (random_state=0) so that repeated runs are
# reproducible; tuned hyperparameters are applied later via set_params.
models = [
    (BalancedRandomForestClassifier(random_state=0),
     'balanced_random_forest', 'imputed', 'tree'),
    (xgb.XGBClassifier(random_state=0, use_label_encoder=False,
                       eval_metric='logloss'),
     'xgb', 'not_imputed', 'tree'),
    (RandomForestClassifier(random_state=0),
     'random_forest', 'imputed', 'tree'),
]
| |
|
| | |
# MLflow run that holds the hyperparameter-tuning results for this model
# type; the best parameters are read back from it in the training loop.
if model_type == 'only_hosp':
    parent_run_id = 'ba2d7244654c4b84a815932a3167648f'
elif model_type == 'hosp_comm':
    parent_run_id = 'f71edd4c72f14c0692431dca297ec131'
else:
    # Fail fast with a clear message instead of a NameError on first use of
    # parent_run_id.
    raise ValueError(
        "Unsupported model_type {!r}: no hyperparameter tuning run is "
        "registered for it (expected 'only_hosp' or 'hosp_comm').".format(model_type))
| |
|
| | |
| | |
| | |
| | |
# Main training / evaluation loop.
#
# One parent MLflow run groups a nested child run per (model, tuning scorer)
# pair. For every pair the script:
#   1. applies the tuned hyperparameters read from the tuning run,
#   2. cross-validates the uncalibrated model on the training folds,
#   3. fits sigmoid, isotonic and spline calibrators and scores them on the
#      test set,
#   4. produces calibration, confusion-matrix, ROC/PR and SHAP plots,
#   5. logs metrics, parameters and artifacts to MLflow and pickles the
#      calibrated models.
#
# The 'dummy_classifier' guards are kept from the original script; no model
# with that name is configured in `models`, so they are always true here.
with mlflow.start_run(run_name='hyperparameter_optimised_models_12'):
    for model in models:
        # model = (estimator, name, 'imputed' | 'not_imputed', family)
        best_params = model_h.get_mlflow_run_params(
            model[1], parent_run_id, 'sqlite:///mlruns.db', model_type)

        # One set of tuned hyperparameters per scorer used during tuning.
        for scorer in best_params:
            params = best_params[scorer]
            model[0].set_params(**params)
            with mlflow.start_run(run_name=model[1] + '_tuning_scorer_' + scorer, nested=True):
                print(model[1], scorer)

                # Start every child run from an EMPTY artifact directory so
                # files from different runs are never mixed. The directory is
                # removed first and then recreated, so it is guaranteed to
                # exist for the plt.savefig calls below.
                artifact_dir = config['outputs']['artifact_dir']
                shutil.rmtree(artifact_dir, ignore_errors=True)
                os.makedirs(artifact_dir, exist_ok=True)

                # Pick the dataset version this model was configured for.
                if model[2] == 'imputed':
                    train_features = train_features_imp
                    test_features = test_features_imp
                    train_data = train_data_imp
                    test_data = test_data_imp
                else:
                    train_features = train_features_no_imp
                    test_features = test_features_no_imp
                    train_data = train_data_no_imp
                    test_data = test_data_no_imp

                mlflow.set_tags(tags=tags)

                ### Cross-validation of the uncalibrated model ###
                crossval = cross_validate(model[0], train_features, train_target,
                                          cv=cross_val_fold_indices,
                                          return_estimator=True, scoring=scoring,
                                          return_indices=True)

                # Out-of-fold probabilities of the positive class.
                probabilities_cv = cross_val_predict(model[0], train_features,
                                                     train_target,
                                                     cv=cross_val_fold_indices,
                                                     method='predict_proba')[:, 1]

                # Score each fold's estimator on the test set and average the
                # metrics across folds.
                fold_metric_frames = []
                for iter_num, estimator in enumerate(crossval['estimator']):
                    probs_test = estimator.predict_proba(test_features)[:, 1]
                    preds_test = estimator.predict(test_features)
                    uncalib_metrics_test = model_h.calc_eval_metrics_for_model(
                        test_target, preds_test, probs_test, 'uncalib_test')
                    fold_metric_frames.append(
                        pd.DataFrame(uncalib_metrics_test, index=[iter_num]))
                uncalib_metrics_test_df = pd.concat(fold_metric_frames)
                uncalib_metrics_test_mean = uncalib_metrics_test_df.mean().to_dict()

                # Threshold that maximises F1 on the out-of-fold predictions.
                best_thres_uncal, f1_bt, prec_bt, rec_bt = model_h.get_threshold_with_best_f1_score(
                    train_target, probabilities_cv)
                mlflow.log_metric('best_thres_uncal', best_thres_uncal)
                mlflow.log_metric('f1_best_thres', f1_bt)
                mlflow.log_metric('precision_best_thres', prec_bt)
                mlflow.log_metric('recall_best_thres', rec_bt)

                model_h.plot_confusion_matrix(
                    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_uncal], probabilities_cv,
                    train_target, model[1], model_type, 'uncalib')

                # Area under the precision-recall curve (out-of-fold).
                precision, recall, _ = precision_recall_curve(
                    train_target, probabilities_cv)
                auc_pr = auc(recall, precision)
                mlflow.log_metric('auc_pr', auc_pr)

                if model[1] != 'dummy_classifier':
                    # NOTE(review): the best-F1 thresholds of the calibrated
                    # models below are selected on the TEST set and then used
                    # to report test performance - confirm this is intended.

                    ### Sigmoid calibration ###
                    model_sig = CalibratedClassifierCV(
                        model[0], method='sigmoid', cv=cross_val_fold_indices)
                    model_sig.fit(train_features, train_target)
                    probs_sig = model_sig.predict_proba(test_features)[:, 1]
                    preds_sig = model_sig.predict(test_features)

                    calib_metrics_sig = model_h.calc_eval_metrics_for_model(
                        test_target, preds_sig, probs_sig, 'sig')

                    best_thres_sig, _, _, _ = model_h.get_threshold_with_best_f1_score(
                        test_target, probs_sig)
                    mlflow.log_metric('best_thres_sig', best_thres_sig)

                    model_h.plot_confusion_matrix(
                        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_sig], probs_sig,
                        test_target, model[1], model_type, "sig")

                    model_h.plot_score_distribution(
                        test_target, probs_sig, config['outputs']['artifact_dir'],
                        model[1], model_type, 'sig')

                    model_h.calc_std_for_calibrated_classifiers(
                        model_sig, 'sig', test_features, test_target)

                    ### Isotonic calibration ###
                    model_iso = CalibratedClassifierCV(
                        model[0], method='isotonic', cv=cross_val_fold_indices)
                    model_iso.fit(train_features, train_target)
                    probs_iso = model_iso.predict_proba(test_features)[:, 1]
                    preds_iso = model_iso.predict(test_features)

                    calib_metrics_iso = model_h.calc_eval_metrics_for_model(
                        test_target, preds_iso, probs_iso, 'iso')

                    best_thres_iso, _, _, _ = model_h.get_threshold_with_best_f1_score(
                        test_target, probs_iso)
                    mlflow.log_metric('best_thres_iso', best_thres_iso)

                    model_h.plot_confusion_matrix(
                        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_iso], probs_iso,
                        test_target, model[1], model_type, "iso")

                    model_h.plot_score_distribution(
                        test_target, probs_iso, config['outputs']['artifact_dir'],
                        model[1], model_type, 'iso')

                    model_h.calc_std_for_calibrated_classifiers(
                        model_iso, 'iso', test_features, test_target)

                    ### Spline calibration ###
                    # The calibrator is fitted on the out-of-fold
                    # probabilities; the base model is then refitted on the
                    # full training set and its test probabilities are
                    # calibrated.
                    spline_calib = mli.SplineCalib()
                    spline_calib.fit(probabilities_cv, train_target)
                    model[0].fit(train_features, train_target)
                    preds_test_uncalib = model[0].predict_proba(test_features)[:, 1]
                    probs_spline = spline_calib.calibrate(preds_test_uncalib)
                    preds_spline = (probs_spline > 0.5).astype(int)

                    calib_metrics_spline = model_h.calc_eval_metrics_for_model(
                        test_target, preds_spline, probs_spline, 'spline')

                    best_thres_spline, _, _, _ = model_h.get_threshold_with_best_f1_score(
                        test_target, probs_spline)
                    mlflow.log_metric('best_thres_spline', best_thres_spline)

                    model_h.plot_confusion_matrix(
                        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, best_thres_spline], probs_spline,
                        test_target, model[1], model_type, "spline")

                    model_h.plot_score_distribution(
                        test_target, probs_spline, config['outputs']['artifact_dir'],
                        model[1], model_type, 'spline')

                    ### Calibration curves: uncalibrated vs. each calibrator ###
                    for strategy in ['uniform', 'quantile']:
                        for bins in [5, 6, 10]:
                            plt.figure(figsize=(8, 8))
                            # Diagonal = perfectly calibrated reference.
                            plt.plot([0, 1], [0, 1], linestyle='--')
                            model_h.plot_calibration_curve(
                                train_target, probabilities_cv, bins, strategy, 'Uncalibrated')
                            model_h.plot_calibration_curve(
                                test_target, probs_sig, bins, strategy, 'Sigmoid')
                            model_h.plot_calibration_curve(
                                test_target, probs_iso, bins, strategy, 'Isotonic')
                            model_h.plot_calibration_curve(
                                test_target, probs_spline, bins, strategy, 'Spline')
                            plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                            plt.title(model[1])
                            plt.tight_layout()
                            plt.savefig(
                                os.path.join(config['outputs']['artifact_dir'], model[1] +
                                             '_' + strategy + '_bins' + str(bins) + '_' +
                                             model_type + '.png'))
                            plt.close()

                    ### Uncalibrated curves for several bin counts ###
                    # Left panel: quantile bins, right panel: uniform bins.
                    fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, figsize=(15, 10))
                    for ax in [ax1, ax2]:
                        ax.plot([0, 1], [0, 1], linestyle='--')
                    for bins in [5, 6, 7, 8, 9]:
                        model_h.plot_calibration_curve(
                            train_target, probabilities_cv, bins, 'quantile',
                            'Bins=' + str(bins), ax1)
                    for bins in [5, 6, 7, 8, 9]:
                        model_h.plot_calibration_curve(
                            train_target, probabilities_cv, bins, 'uniform',
                            'Bins=' + str(bins), ax2)
                    ax1.title.set_text(model[1] + ' uncalibrated model quantile bins')
                    ax2.title.set_text(model[1] + ' uncalibrated model uniform bins')
                    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
                    plt.tight_layout()
                    plt.savefig(
                        os.path.join(config['outputs']['artifact_dir'], model[1] + '_uncal_'
                                     + model_type + '.png'))
                    plt.close()

                    ### Calibration plot with error bars ###
                    model_h.plot_calibration_plot_with_error_bars(
                        probabilities_cv, probs_sig, probs_iso, probs_spline, train_target,
                        test_target, model[1])
                    plt.close()

                ### Feature importance (xgboost models only) ###
                if model[1].startswith("xgb"):
                    feat_importance_tot_gain_df = model_h.plot_feat_importance_model(
                        model[0], model[1], model_type)
                    feat_importance_tot_gain_df.to_csv(
                        './data/feature_importance_tot_gain_' + model_type + '.csv',
                        index=False)

                ### Performance broken down by event type ###
                if model[1] not in ['dummy_classifier']:
                    event_cols = ['ExacWithin3Months', 'HospExacWithin3Months',
                                  'CommExacWithin3Months']
                    test_study_ids = test_data['StudyId'].tolist()

                    # Uncalibrated predictions come from the training folds,
                    # calibrated ones from the test set.
                    preds_event_df_uncalib = model_h.create_df_probabilities_and_predictions(
                        probabilities_cv, best_thres_uncal,
                        train_data['StudyId'].tolist(),
                        train_target,
                        train_data[event_cols],
                        model[1], model_type, output_dir='./data/prediction_and_events/')
                    preds_events_df_sig = model_h.create_df_probabilities_and_predictions(
                        probs_sig, best_thres_sig, test_study_ids,
                        test_target,
                        test_data[event_cols],
                        model[1], model_type, output_dir='./data/prediction_and_events/',
                        calib_type='sig')
                    preds_events_df_iso = model_h.create_df_probabilities_and_predictions(
                        probs_iso, best_thres_iso, test_study_ids,
                        test_target,
                        test_data[event_cols],
                        model[1], model_type, output_dir='./data/prediction_and_events/',
                        calib_type='iso')
                    preds_events_df_spline = model_h.create_df_probabilities_and_predictions(
                        probs_spline, best_thres_spline, test_study_ids,
                        test_target,
                        test_data[event_cols],
                        model[1], model_type, output_dir='./data/prediction_and_events/',
                        calib_type='spline')

                    metrics_by_event_type_uncalib = model_h.calc_metrics_by_event_type(
                        preds_event_df_uncalib, calib_type="uncalib")
                    metrics_by_event_type_sig = model_h.calc_metrics_by_event_type(
                        preds_events_df_sig, calib_type='sig')
                    metrics_by_event_type_iso = model_h.calc_metrics_by_event_type(
                        preds_events_df_iso, calib_type='iso')
                    metrics_by_event_type_spline = model_h.calc_metrics_by_event_type(
                        preds_events_df_spline, calib_type='spline')

                    preds_by_calib_type = (
                        ('uncalib', preds_event_df_uncalib),
                        ('sig', preds_events_df_sig),
                        ('iso', preds_events_df_iso),
                        ('spline', preds_events_df_spline),
                    )
                    for calib_label, preds_df in preds_by_calib_type:
                        model_h.plot_roc_curve_by_event_type(
                            preds_df, model[1], calib_label)
                    for calib_label, preds_df in preds_by_calib_type:
                        model_h.plot_prec_recall_by_event_type(
                            preds_df, model[1], calib_label)

                ### SHAP explanations ###
                if model[1] not in ['dummy_classifier']:
                    # Uncalibrated model: SHAP values from the CV estimators.
                    _, shap_values_t_uncal = model_h.get_uncalibrated_shap(
                        crossval['estimator'], test_features, train_features,
                        train_data[features_list].columns,
                        model[1], model_type)

                    model_h.plot_averaged_summary_plot(
                        shap_values_t_uncal,
                        train_data[features_list],
                        model[1], 'uncalib', model_type)

                    model_h.plot_shap_interaction_value_heatmap(
                        crossval['estimator'], train_features,
                        train_data[features_list].columns,
                        model[1], model_type)

                    # Calibrated models: same analysis for the sigmoid and
                    # isotonic calibrated classifiers.
                    calib_feature_frame = train_data.drop(
                        columns=['StudyId', 'ExacWithin3Months', 'IndexDate',
                                 'HospExacWithin3Months', 'CommExacWithin3Months'])
                    calib_models = {'sig': model_sig, 'iso': model_iso}
                    for calib_model_name, calib_model in calib_models.items():
                        shap_values_v, shap_values_t = model_h.get_calibrated_shap_by_classifier(
                            calib_model, test_features, train_features,
                            calib_feature_frame.columns,
                            calib_model_name, model[1], model_type)

                        model_h.plot_averaged_summary_plot(
                            shap_values_t,
                            calib_feature_frame,
                            model[1], calib_model_name, model_type)

                        # Per-row (local) SHAP values for the test set.
                        feature_imp_df = model_h.get_local_shap_values(
                            model[1], model_type, shap_values_v, test_features,
                            calib_model_name, shap_ids_dir='./data/prediction_and_events/')
                        feature_imp_df.to_csv(
                            './data/prediction_and_events/local_feature_imp_df' + model[1] +
                            '_' + calib_model_name + '.csv')

                        model_h.plot_local_shap(
                            model[1], model_type, shap_values_v, test_features, train_features,
                            calib_model_name,
                            row_ids_to_plot=['missed', 'incorrect', 'correct'],
                            artifact_dir=config['outputs']['artifact_dir'],
                            shap_ids_dir='./data/prediction_and_events/',
                            reverse_scaling_flag=False,
                            convert_target_encodings=True, imputation=model[2],
                            target_enc_path="./data/artifacts/target_encodings_" + model_type + ".json",
                            return_enc_converted_df=False)

                # Score distribution of the uncalibrated out-of-fold predictions.
                model_h.plot_score_distribution(
                    train_target, probabilities_cv, config['outputs']['artifact_dir'],
                    model[1], model_type)

                ### MLflow logging ###
                # Mean and spread of every CV scorer across folds.
                for score in scoring:
                    mlflow.log_metric(score, crossval['test_' + score].mean())
                    mlflow.log_metric(score + '_std', crossval['test_' + score].std())

                if model[1] != 'dummy_classifier':
                    mlflow.log_metrics(uncalib_metrics_test_mean)
                    mlflow.log_metrics(calib_metrics_sig)
                    mlflow.log_metrics(calib_metrics_iso)
                    mlflow.log_metrics(calib_metrics_spline)
                    mlflow.log_metrics(metrics_by_event_type_uncalib)
                    mlflow.log_metrics(metrics_by_event_type_sig)
                    mlflow.log_metrics(metrics_by_event_type_iso)
                    mlflow.log_metrics(metrics_by_event_type_spline)

                # Full parameter set of the estimator as actually trained.
                params = model[0].get_params()
                for param in params:
                    mlflow.log_param(param, params[param])

                mlflow.log_artifacts(config['outputs']['artifact_dir'])

                ### Persist the calibrated models ###
                # NOTE(review): the file names do not include the tuning
                # scorer, so each scorer iteration overwrites the previous
                # pickle for the same model name.
                os.makedirs('./data/model', exist_ok=True)
                fitted_calibrators = (
                    ('sig', model_sig),
                    ('iso', model_iso),
                    ('spline', spline_calib),
                )
                for calib_label, fitted_calibrator in fitted_calibrators:
                    with open('./data/model/trained_' + calib_label + '_' + model[1] + '_pkl',
                              'wb') as model_file:
                        pickle.dump(fitted_calibrator, model_file)

# The `with` block above already closes the parent run; this call is kept as
# a harmless safety net in case a run is still active.
mlflow.end_run()