import os
import pickle
import shutil
import sys

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy
import yaml

import model_h

# Load the pipeline configuration once; all paths and model settings below
# come from this file.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)


def perform_ks_test(train_data, forward_val_data):
    """Perform a two-sample Kolmogorov-Smirnov test for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: one row per feature with columns ``FeatureName``,
        ``KS_PValue``, ``KS_TestStatistic`` and ``KS_DistributionsIdentical``
        (1 when p >= 0.05, i.e. no evidence the distributions differ).
    """
    rows = []
    for feature_name in train_data.columns:
        statistic, pvalue = scipy.stats.ks_2samp(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append(
            {
                "FeatureName": feature_name,
                "KS_PValue": round(pvalue, 4),
                "KS_TestStatistic": statistic,
            }
        )
    df_ks = pd.DataFrame(rows)
    # p < 0.05 -> reject the null hypothesis that both samples come from the
    # same distribution, so flag the feature as NOT identical (0).
    df_ks["KS_DistributionsIdentical"] = np.where(df_ks["KS_PValue"] < 0.05, 0, 1)
    return df_ks


def compute_wasserstein_distance(train_data, forward_val_data):
    """Calculate the Wasserstein distance for every feature.

    Args:
        train_data (pd.DataFrame): data used to train model.
        forward_val_data (pd.DataFrame): data used for the forward validation.

    Returns:
        pd.DataFrame: columns ``FeatureName`` and ``WassersteinDistance``,
        sorted ascending (most stable features first).
    """
    rows = []
    for feature_name in train_data.columns:
        w_distance = scipy.stats.wasserstein_distance(
            train_data[feature_name], forward_val_data[feature_name]
        )
        rows.append({"FeatureName": feature_name, "WassersteinDistance": w_distance})
    return pd.DataFrame(rows).sort_values(by="WassersteinDistance", ascending=True)


##############################################################
# Load data
##############################################################
model_type = config["model_settings"]["model_type"]

# Redirect all prints below into a per-model-type log file. The handle is
# deliberately kept open for the lifetime of the script so stdout stays valid.
log = open(
    os.path.join(
        config["outputs"]["logging_dir"], "run_forward_val_" + model_type + ".log"
    ),
    "w",
)
sys.stdout = log

# Load the forward validation data (imputed and non-imputed variants)
forward_val_data_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_imputed_{}.pkl".format(model_type),
    )
)
forward_val_data_not_imputed = pd.read_pickle(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "forward_val_not_imputed_{}.pkl".format(model_type),
    )
)

# Load data the model was trained on
train_data = model_h.load_data_for_modelling(
    os.path.join(
        config["outputs"]["model_input_data_dir"],
        "crossval_imputed_{}.pkl".format(model_type),
    )
)

##############################################################
# Check for data drift
##############################################################
# Identifier columns carry no signal for drift detection.
train_data_for_data_drift = train_data.drop(columns=["StudyId", "IndexDate"])
forward_val_data_for_data_drift = forward_val_data_imputed.drop(
    columns=["StudyId", "IndexDate"]
)

df_ks = perform_ks_test(train_data_for_data_drift, forward_val_data_for_data_drift)
df_wd = compute_wasserstein_distance(
    train_data_for_data_drift, forward_val_data_for_data_drift
)
df_data_drift = df_wd.merge(df_ks, on="FeatureName", how="left")
print(df_data_drift)

##############################################################
# Prepare data for running model
##############################################################
# Value counts for hospital and community exacerbations
print(forward_val_data_imputed["ExacWithin3Months"].value_counts())
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "HospExacWithin3Months"
    ].value_counts()
)
print(
    forward_val_data_imputed[forward_val_data_imputed["ExacWithin3Months"] == 1][
        "CommExacWithin3Months"
    ].value_counts()
)

# Separate features and target
non_feature_columns = [
    "StudyId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
]
forward_val_features_imp = forward_val_data_imputed.drop(columns=non_feature_columns)
forward_val_target_imp = forward_val_data_imputed["ExacWithin3Months"]
forward_val_features_no_imp = forward_val_data_not_imputed.drop(
    columns=non_feature_columns
)
forward_val_target_no_imp = forward_val_data_not_imputed["ExacWithin3Months"]

# The imputed and non-imputed datasets must describe the same rows; if the
# targets differ, something upstream is misaligned, so fail loudly.
if not forward_val_target_no_imp.equals(forward_val_target_imp):
    raise ValueError(
        "Target variable is not the same in imputed and non imputed datasets in the test set."
    )
test_target = forward_val_target_no_imp

# Make sure all features are numeric (non-numeric values become NaN)
for features in [forward_val_features_imp, forward_val_features_no_imp]:
    for col in features:
        features[col] = pd.to_numeric(features[col], errors="coerce")

# Models to perform forward validation on: (model name, whether imputation
# was performed, threshold used in the original model).
models = [
    ("balanced_random_forest", "imputed", 0.27),
    # ("xgb", "not_imputed", 0.44),
    # ("random_forest", "imputed", 0.30),
]

##############################################################
# Run models
##############################################################
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("model_h_drop_1_hosp_comm")

with mlflow.start_run(run_name="sig_forward_val_models_10_2023"):
    for model_name, imputation, threshold in models:
        print(model_name)
        with mlflow.start_run(run_name=model_name, nested=True):
            # Reset the artifacts directory so files from different runs are
            # not mixed: remove any previous contents, THEN recreate it empty.
            # (Removing after creating, as before, left the directory missing
            # and broke the plt.savefig calls below.)
            shutil.rmtree(config["outputs"]["artifact_dir"], ignore_errors=True)
            os.makedirs(config["outputs"]["artifact_dir"], exist_ok=True)

            #### Load model ####
            # NOTE(review): the filename ends in "_pkl" (underscore), not
            # ".pkl" -- confirm this matches the trained model files on disk.
            with open("./data/model/trained_iso_" + model_name + "_pkl", "rb") as f:
                model = pickle.load(f)

            # Select the correct data based on how the model was trained
            if imputation == "imputed":
                test_features = forward_val_features_imp
            else:
                test_features = forward_val_features_no_imp

            #### Run model and get predictions for forward validation data ####
            test_probs = model.predict_proba(test_features)[:, 1]
            test_preds = model.predict(test_features)

            #### Calculate metrics ####
            metrics = model_h.calc_eval_metrics_for_model(
                test_target,
                test_preds,
                test_probs,
                "forward_val",
                best_threshold=threshold,
            )

            #### Plot confusion matrix ####
            model_h.plot_confusion_matrix(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, threshold],
                test_probs,
                test_target,
                model_name,
                model_type,
                "forward_val",
            )

            #### Plot calibration curves ####
            for bins in [6, 10]:
                plt.figure(figsize=(8, 8))
                # Diagonal reference line for perfect calibration.
                plt.plot([0, 1], [0, 1], linestyle="--")
                model_h.plot_calibration_curve(
                    test_target, test_probs, bins, "quantile", "Forward Validation"
                )
                plt.legend(bbox_to_anchor=(1.05, 1.0), loc="upper left")
                plt.title(model_name)
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        config["outputs"]["artifact_dir"],
                        model_name
                        + "_"
                        + "quantile"
                        + "_bins"
                        + str(bins)
                        + model_type
                        + ".png",
                    )
                )
                plt.close()

            #### Calculate model performance by event type ####
            # Create df to contain prediction data and event type data
            preds_events_df_forward_val = (
                model_h.create_df_probabilities_and_predictions(
                    test_probs,
                    threshold,
                    forward_val_data_imputed["StudyId"].tolist(),
                    test_target,
                    forward_val_data_imputed[
                        [
                            "ExacWithin3Months",
                            "HospExacWithin3Months",
                            "CommExacWithin3Months",
                        ]
                    ],
                    model_name,
                    model_type,
                    output_dir="./data/prediction_and_events/",
                    calib_type="forward_val",
                )
            )

            # Subset to each event type and calculate metrics
            metrics_by_event_type_forward_val = model_h.calc_metrics_by_event_type(
                preds_events_df_forward_val, calib_type="forward_val"
            )

            # Subset to each event type and plot ROC curve
            model_h.plot_roc_curve_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            # Subset to each event type and plot PR curve
            model_h.plot_prec_recall_by_event_type(
                preds_events_df_forward_val, model_name, "forward_val"
            )

            #### Plot distribution of model scores for uncalibrated model ####
            model_h.plot_score_distribution(
                test_target,
                test_probs,
                config["outputs"]["artifact_dir"],
                model_name,
                model_type,
            )

            #### Log to MLFlow ####
            mlflow.log_metrics(metrics)
            mlflow.log_artifacts(config["outputs"]["artifact_dir"])
            # Explicitly end the nested run (kept from the original); the
            # context-manager exit afterwards is then a no-op.
            mlflow.end_run()