| | import numpy as np |
| | import pandas as pd |
| | import pickle |
| | import model_h |
| | import mlflow |
| | import os |
| | import shutil |
| | import matplotlib.pyplot as plt |
| | import sys |
| | import scipy |
| | import yaml |
| |
|
# Load the pipeline configuration once at module import time; the rest of the
# script reads output paths and model settings from this dict.
# Fix: the original bound the open file handle to `config` and then rebound the
# same name to the parsed dict — use a distinct name for the handle.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
| |
|
| |
|
def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test per feature.

    Args:
        train_data (pd.DataFrame): data used to train the model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue`` (rounded to 4 d.p.), ``KS_TestStatistic`` and
        ``KS_DistributionsIdentical`` (1 when p >= 0.05, else 0).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    # Build the frame once from the collected rows instead of growing it with
    # pd.concat inside the loop (which copies the frame on every iteration).
    df_ks = pd.DataFrame(rows)
    # p < 0.05 -> reject "same distribution" -> flag drift with 0.
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks
| |
|
| |
|
def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein distance per feature.

    Args:
        train_data (pd.DataFrame): data used to train the model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: columns ``FeatureName`` and ``WassersteinDistance``,
        sorted by distance in ascending order (least-drifted feature first).
    """
    # Build all rows first and construct the frame once; the original grew the
    # frame with pd.concat per feature, copying it on every iteration.
    rows = [
        {
            "FeatureName": feature_name,
            "WassersteinDistance": scipy.stats.wasserstein_distance(
                train_data[feature_name], forward_val_data[feature_name]
            ),
        }
        for feature_name in train_data.columns
    ]
    df_wd = pd.DataFrame(rows)
    return df_wd.sort_values(by="WassersteinDistance", ascending=True)
| |
|
| |
|
| | |
| | |
| | |
# Model variant selected in the YAML config (used in every output filename).
model_type = config["model_settings"]["model_type"]

# Redirect all prints from this script into a per-model-type log file.
# NOTE(review): the handle is never closed and sys.stdout is never restored;
# tolerable for a one-shot batch script, but worth confirming that is intended.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log
| |
|
| | |
# Forward-validation datasets: one copy with missing values imputed, one
# without, both pickled by an upstream preparation step.
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)
| |
|
| | |
| | |
| |
|
| | |
# Cross-validation (training) data, loaded through the shared project helper;
# used below only as the reference distribution for the drift tests.
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)
| |
|
| | |
| | |
| | |
# Data drift: compare per-feature distributions of the training data against
# the forward-validation data, with identifier columns excluded.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(columns=["StudyId", "IndexDate"])

df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
# One row per feature: Wasserstein distance plus the K-S test results
# (left join keeps the ascending-distance ordering from df_wd).
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)
| |
|
| | |
| | |
| | |
| | |
# Class balance of the forward-validation target, then the hospital- and
# community-exacerbation breakdown among the positive cases only.
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
exac_cases = forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1]
print(exac_cases["HospExacWithin3Months"].value_counts())
print(exac_cases["CommExacWithin3Months"].value_counts())
| |
|
| | |
# Split features from target for both dataset variants; identifiers and all
# outcome columns are removed from the feature matrices.
_non_feature_cols = [
    "StudyId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
]
forward_val_features_imp = forward_val_data_imputed.drop(columns=_non_feature_cols)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(columns=_non_feature_cols)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]
| |
|
| | |
| | |
# Sanity check: both dataset variants must carry the exact same target vector,
# otherwise the single shared `test_target` used below would be wrong for one
# of them.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp
| |
|
| | |
# Coerce every feature column to numeric in place; unparsable values become
# NaN (errors="coerce") so the models receive a purely numeric matrix.
for features in [forward_val_features_imp, forward_val_features_no_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors="coerce")
| |
|
| | |
| | |
# Models to evaluate: (model name, which feature variant it was trained on
# — "imputed" or not — and its tuned decision threshold).
models = [
    ("balanced_random_forest", "imputed", 0.27),
]
| |
|
| | |
| | |
| | |
# Track results in a local SQLite-backed MLflow store under one experiment.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")
| |
|
# One parent MLflow run for the whole forward validation, with one nested run
# per model being evaluated.
with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_info in models:
        # model_info = (model name, feature variant, decision threshold)
        print(model_info[0])
        with mlflow.start_run(run_name=model_info[0], nested=True):
            # Reset the artifact directory: ensure it exists, then delete it
            # wholesale. NOTE(review): nothing visible here recreates the
            # directory before plt.savefig below writes into it — presumably
            # one of the model_h plotting helpers does; confirm, otherwise
            # the order should be rmtree-then-makedirs.
            os.makedirs(config["outputs"]["artifact_dir"], exist_ok=True)
            shutil.rmtree(config["outputs"]["artifact_dir"])

            # Load the trained, calibrated model from disk.
            # NOTE(review): the "_pkl" suffix (no dot) looks like a typo for
            # ".pkl" but must match whatever the training script wrote —
            # verify against the producer before changing it.
            with open("./data/model/trained_iso_" + model_info[0] + "_pkl", "rb") as f:
                model = pickle.load(f)

            # Pick the feature matrix matching the variant the model expects.
            if model_info[1] == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp

            # Positive-class probabilities and hard class predictions.
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)

            # Headline evaluation metrics at the model's tuned threshold.
            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=model_info[2],
            )

            # Confusion matrices over a sweep of thresholds plus the tuned one.
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, model_info[2]],
                test_probs,
                test_target,
                model_info[0],
                model_type,
                "forward_val",
            )

            # Calibration curves with quantile binning at two granularities;
            # the dashed diagonal is the perfectly-calibrated reference line.
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_info[0])
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        config["outputs"]["artifact_dir"],
                        model_info[0]
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()

            # Per-subject probabilities/predictions joined with the observed
            # event-type columns, persisted by the helper, then metrics and
            # ROC / precision-recall curves broken down by event type.
            preds_events_df_forward_val = model_h.create_df_probabilities_and_predictions(
                test_probs,
                model_info[2],
                forward_val_data_imputed["StudyId"].tolist(),
                test_target,
                forward_val_data_imputed[["ExacWithin3Months", 'HospExacWithin3Months',
                                          'CommExacWithin3Months']],
                model_info[0],
                model_type,
                output_dir="./data/prediction_and_events/",
                calib_type="forward_val",
            )

            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )

            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_info[0], "forward_val"
            )

            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_info[0], "forward_val"
            )

            # Distribution of predicted scores split by true class.
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                config["outputs"]["artifact_dir"],
                model_info[0],
                model_type,
            )

            # Log metrics and every file in the artifact dir to MLflow.
            # NOTE(review): end_run() inside the `with` block is redundant —
            # the context manager already closes the nested run on exit.
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(config["outputs"]["artifact_dir"])
            mlflow.end_run()
| |
|