import os
import pickle
import shutil
import sys

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy
import yaml

import model_h

# Load the pipeline configuration once; all paths and model settings below
# come from this file.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)


def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue``, ``KS_TestStatistic`` and ``KS_DistributionsIdentical``
        (1 when p >= 0.05, i.e. no evidence the distributions differ).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    df_ks = pd.DataFrame(rows)
    # p < 0.05 -> reject the null hypothesis that both samples come from the
    # same distribution, so flag the feature as NOT identical (0).
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks


def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein distance for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: columns ``FeatureName`` and ``WassersteinDistance``,
        sorted ascending (most stable features first).
    """
    rows = []
    for feature_name in train_data.columns:
        w_distance = scipy.stats.wasserstein_distance(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append({"FeatureName": feature_name, "WassersteinDistance": w_distance})
    return pd.DataFrame(rows).sort_values(by="WassersteinDistance", ascending=True)


##############################################################
# Load data
##############################################################
model_type = config["model_settings"]["model_type"]

# Redirect all prints below into a per-model-type log file. The handle is
# deliberately kept open for the lifetime of the script so stdout stays valid.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log

# Load the forward validation data (imputed and non-imputed variants)
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)

# Load data the model was trained on
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)

##############################################################
# Check for data drift
##############################################################
# Identifier columns carry no signal for drift detection.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(
    columns=["StudyId", "IndexDate"]
)

df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)

##############################################################
# Prepare data for running model
##############################################################
# Value counts for hospital and community exacerbations
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "HospExacWithin3Months"
    ].value_counts()
)
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "CommExacWithin3Months"
    ].value_counts()
)

# Separate features and target
non_feature_columns = [
    "StudyId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
]
forward_val_features_imp = forward_val_data_imputed.drop(columns=non_feature_columns)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(
    columns=non_feature_columns
)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]

# The imputed and non-imputed datasets must describe the same rows; if the
# targets differ, something upstream is misaligned, so fail loudly.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp

# Make sure all features are numeric (non-numeric values become NaN)
for features in [forward_val_features_imp, forward_val_features_no_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors="coerce")

# Models to perform forward validation on: (model name, whether imputation
# was performed, threshold used in the original model).
models = [
    ("balanced_random_forest", "imputed", 0.27),
    # ("xgb", "not_imputed", 0.44),
    # ("random_forest", "imputed", 0.30),
]

##############################################################
# Run models
##############################################################
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")

with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_name, imputation, threshold in models:
        print(model_name)
        with mlflow.start_run(run_name=model_name, nested=True):
            # Reset the artifacts directory so files from different runs are
            # not mixed: remove any previous contents, THEN recreate it empty.
            # (Removing after creating, as before, left the directory missing
            # and broke the plt.savefig calls below.)
            shutil.rmtree(config["outputs"]["artifact_dir"], ignore_errors=True)
            os.makedirs(config["outputs"]["artifact_dir"], exist_ok=True)

            #### Load model ####
            # NOTE(review): the filename ends in "_pkl" (underscore), not
            # ".pkl" -- confirm this matches the trained model files on disk.
            with open("./data/model/trained_iso_" + model_name + "_pkl", "rb") as f:
                model = pickle.load(f)

            # Select the correct data based on how the model was trained
            if imputation == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp

            #### Run model and get predictions for forward validation data ####
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)

            #### Calculate metrics ####
            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=threshold,
            )

            #### Plot confusion matrix ####
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, threshold],
                test_probs,
                test_target,
                model_name,
                model_type,
                "forward_val",
            )

            #### Plot calibration curves ####
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                # Diagonal reference line for perfect calibration.
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_name)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        config["outputs"]["artifact_dir"],
                        model_name
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()

            #### Calculate model performance by event type ####
            # Create df to contain prediction data and event type data
            preds_events_df_forward_val = (
                model_h.create_df_probabilities_and_predictions(
                    test_probs,
                    threshold,
                    forward_val_data_imputed["StudyId"].tolist(),
                    test_target,
                    forward_val_data_imputed[
                        [
                            "ExacWithin3Months",
                            "HospExacWithin3Months",
                            "CommExacWithin3Months",
                        ]
                    ],
                    model_name,
                    model_type,
                    output_dir="./data/prediction_and_events/",
                    calib_type="forward_val",
                )
            )

            # Subset to each event type and calculate metrics
            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )

            # Subset to each event type and plot ROC curve
            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            # Subset to each event type and plot PR curve
            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            #### Plot distribution of model scores for uncalibrated model ####
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                config["outputs"]["artifact_dir"],
                model_name,
                model_type,
            )

            #### Log to MLFlow ####
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(config["outputs"]["artifact_dir"])
            # Explicitly end the nested run (kept from the original); the
            # context-manager exit afterwards is then a no-op.
            mlflow.end_run()