# copd-model-h / training / perform_forward_validation.py
# (Hugging Face upload metadata: uploaded by IamGrooooot, "Initial Upload", commit 000de75)
import numpy as np
import pandas as pd
import pickle
import model_h
import mlflow
import os
import shutil
import matplotlib.pyplot as plt
import sys
import scipy
import yaml
# Load the project configuration once at import time; `config` is used by the
# rest of this script for all paths and model settings.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test for every feature.

    Compares the distribution of each column in ``train_data`` against the
    same-named column in ``forward_val_data`` to detect data drift.

    Args:
        train_data (pd.DataFrame): data used to train the model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue`` (rounded to 4 d.p.), ``KS_TestStatistic`` and
        ``KS_DistributionsIdentical`` (1 when p >= 0.05, i.e. no evidence
        the distributions differ; 0 otherwise).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    # Build the frame once instead of pd.concat inside the loop (the original
    # pattern is quadratic), and declare the columns explicitly so a frame with
    # zero feature columns no longer raises (df_ks was previously unbound).
    df_ks = pd.DataFrame(rows, columns=["FeatureName", "KS_PValue", "KS_TestStatistic"])
    # p < 0.05 -> reject the "same distribution" hypothesis -> flag 0.
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks
def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein (earth mover's) distance for every feature.

    Args:
        train_data (pd.DataFrame): data used to train the model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName`` and
        ``WassersteinDistance``, sorted by distance ascending (least drifted
        features first).
    """
    # Build all rows first and construct the frame once — the original
    # pd.concat-inside-the-loop pattern is quadratic. Declaring the columns
    # explicitly also makes a zero-column input return an empty frame instead
    # of raising on the unbound df_wd.
    rows = [
        {
            "FeatureName": feature_name,
            "WassersteinDistance": scipy.stats.wasserstein_distance(
                train_data[feature_name], forward_val_data[feature_name]
            ),
        }
        for feature_name in train_data.columns
    ]
    df_wd = pd.DataFrame(rows, columns=["FeatureName", "WassersteinDistance"])
    return df_wd.sort_values(by="WassersteinDistance", ascending=True)
##############################################################
# Load data
##############################################################
model_type = config["model_settings"]["model_type"]
# Setup log file; redirecting sys.stdout sends all later print() output here.
# NOTE(review): stdout is never restored and the file is never closed —
# acceptable for a one-shot script, but worth confirming it is intentional.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log
# Load forward-validation data: both the imputed and non-imputed variants,
# since the models below were trained on one or the other.
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)
# Load exac event type data
#test_exac_data = pd.read_pickle("./data/forward_val_exac_data.pkl")
# Load data the model was trained on (used only for the drift comparison)
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)
##############################################################
# Check for data drift
##############################################################
# Drop identifier columns so only model features are compared between
# the training data and the forward-validation data.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(columns=["StudyId", "IndexDate"])
df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
# One row per feature: Wasserstein distance joined with the K-S test results.
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)
##############################################################
# Prepare data for running model
##############################################################
# Value counts for the overall target, then hospital / community
# exacerbations among the rows that had any exacerbation.
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
exac_rows = forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1]
print(exac_rows["HospExacWithin3Months"].value_counts())
print(exac_rows["CommExacWithin3Months"].value_counts())

# Separate features and target: identifiers and all target columns are
# excluded from the feature matrices.
non_feature_cols = [
    "StudyId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
]
forward_val_features_imp = forward_val_data_imputed.drop(columns=non_feature_cols)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(columns=non_feature_cols)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]

# Check that the target in imputed and not imputed datasets are the same.
# If not, raise an error.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp

# Make sure all features are numeric (non-numeric values become NaN).
for features in (forward_val_features_imp, forward_val_features_no_imp):
    for col in features.columns:
        features[col] = pd.to_numeric(features[col], errors="coerce")

# Models to forward-validate: (model name, whether imputation was performed,
# decision threshold used by the original model).
models = [
    ("balanced_random_forest", "imputed", 0.27),
    # ("xgb", "not_imputed", 0.44),
    # ("random_forest", "imputed", 0.30),
]
##############################################################
# Run models
##############################################################
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")
# One parent run grouping a nested child run per forward-validated model.
with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_info in models:
        # model_info = (model_name, "imputed"/"not_imputed", decision threshold)
        print(model_info[0])
        with mlflow.start_run(run_name=model_info[0], nested=True):
            # Start with an empty artifacts directory so files from different
            # runs are never mixed.
            # BUG FIX: the original called makedirs() and then rmtree() without
            # recreating the directory, so every later savefig / artifact write
            # into artifact_dir raised FileNotFoundError.
            artifact_dir = config["outputs"]["artifact_dir"]
            shutil.rmtree(artifact_dir, ignore_errors=True)
            os.makedirs(artifact_dir, exist_ok=True)
            #### Load model ####
            # NOTE(review): the filename ends in "_pkl" (no dot) — kept as-is,
            # but confirm it matches the artifact name written by training.
            # pickle.load is safe here only because the file is produced by our
            # own training pipeline; never load untrusted pickles.
            with open("./data/model/trained_iso_" + model_info[0] + "_pkl", "rb") as f:
                model = pickle.load(f)
            # Select the feature set matching how this model was trained
            if model_info[1] == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp
            #### Run model and get predictions for forward validation data ####
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)
            #### Calculate metrics ####
            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=model_info[2],
            )
            #### Plot confusion matrix ####
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, model_info[2]],
                test_probs,
                test_target,
                model_info[0],
                model_type,
                "forward_val",
            )
            #### Plot calibration curves ####
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                # Diagonal = perfectly calibrated reference line
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_info[0])
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        artifact_dir,
                        model_info[0]
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()
            #### Calculate model performance by event type ####
            # Create df containing prediction data and event type data
            preds_events_df_forward_val = model_h.create_df_probabilities_and_predictions(
                test_probs,
                model_info[2],
                forward_val_data_imputed["StudyId"].tolist(),
                test_target,
                forward_val_data_imputed[
                    ["ExacWithin3Months", "HospExacWithin3Months", "CommExacWithin3Months"]
                ],
                model_info[0],
                model_type,
                output_dir="./data/prediction_and_events/",
                calib_type="forward_val",
            )
            # Subset to each event type and calculate metrics
            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )
            # Subset to each event type and plot ROC curve
            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_info[0], "forward_val"
            )
            # Subset to each event type and plot PR curve
            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_info[0], "forward_val"
            )
            #### Plot distribution of model scores for uncalibrated model ####
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                artifact_dir,
                model_info[0],
                model_type,
            )
            #### Log to MLFlow ####
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(artifact_dir)
# Redundant once the context managers above have closed the runs, but harmless.
mlflow.end_run()