"""Script that combines features, performs encoding of categorical features and imputation.

Demographics, exacerbation history, comorbidities, spirometry, labs, and pro datasets
are combined. The dataset is split into training and test sets unless the
data_to_process specified in config.yaml is forward_val. Performs encoding of
categorical features, and imputation of missing values. Two versions of the data are
saved: imputed and not imputed dataframes.
"""

import pandas as pd
import numpy as np
import os
import sys
import yaml
import json
import joblib
import encoding
import imputation


# Read the pipeline configuration from the YAML file
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

model_settings = config["model_settings"]

# Specify which model to generate features for
model_type = model_settings["model_type"]

# Setup log file: everything printed from here on is written to a per-model log
log_path = "./training/logging/combine_features_" + model_type + ".log"
log = open(log_path, "w")
sys.stdout = log

# Dataset to process - set through config file
data_to_process = model_settings["data_to_process"]

############################################################################
# Combine features
############################################################################

# Load cohort data
# Forward validation inputs carry an extra "forward_val" tag in their file names;
# otherwise the file name is just the table name followed by the model type.
if data_to_process == "forward_val":
    feature_file_template = "{}_forward_val_{}.pkl"
else:
    feature_file_template = "{}_{}.pkl"


def _read_feature_table(table_name):
    """Load one pickled feature table from the processed data directory.

    The file name is built from ``table_name`` and the configured model type
    using the template selected for the dataset being processed.
    """
    file_name = feature_file_template.format(table_name, model_type)
    return pd.read_pickle(
        os.path.join(config["outputs"]["processed_data_dir"], file_name)
    )


demographics = _read_feature_table("demographics")
exac_history = _read_feature_table("exac_history")
comorbidities = _read_feature_table("comorbidities")
spirometry = _read_feature_table("spirometry")
labs = _read_feature_table("labs")
pros = _read_feature_table("pros")

# Left-join each feature table onto demographics in turn, so every demographics
# row is kept even when a feature table has no matching StudyId/IndexDate.
merge_keys = ["StudyId", "IndexDate"]
data_combined = demographics
for feature_table in (exac_history, comorbidities, spirometry, labs, pros):
    data_combined = data_combined.merge(feature_table, on=merge_keys, how="left")

# Print dataset info (goes to wherever stdout currently points)
index_dates = data_combined["IndexDate"]
print("Data date range", index_dates.min(), index_dates.max())

mean_age = data_combined["Age"].mean()
print("Mean age", mean_age)

sex_f_counts = data_combined["Sex_F"].value_counts()
print("Sex Female:", sex_f_counts)

if data_to_process == "forward_val":
    # Forward validation data is not split: save the combined frame as-is
    forward_val_path = os.path.join(
        config["outputs"]["processed_data_dir"],
        "forward_val_combined_{}.pkl".format(model_type),
    )
    data_combined.to_pickle(forward_val_path)
else:
    # Load training and test ids
    cohort_info_dir = config["outputs"]["cohort_info_dir"]
    train_ids = pd.read_pickle(
        os.path.join(cohort_info_dir, "train_ids_{}.pkl".format(model_type))
    )
    test_ids = pd.read_pickle(
        os.path.join(cohort_info_dir, "test_ids_{}.pkl".format(model_type))
    )
    # NOTE(review): fold_patients is not used in this part of the script;
    # presumably consumed further down - confirm before removing.
    # allow_pickle=True unpickles arbitrary objects, so the file must be trusted.
    fold_patients = np.load(
        os.path.join(cohort_info_dir, "fold_patients_{}.npy".format(model_type)),
        allow_pickle=True,
    )

    # Split data into training and test sets by StudyId membership
    in_train = data_combined["StudyId"].isin(train_ids)
    in_test = data_combined["StudyId"].isin(test_ids)
    train_data = data_combined[in_train]
    test_data = data_combined[in_test]

    # Order each split by StudyId then IndexDate and renumber the rows from zero
    sort_keys = ["StudyId", "IndexDate"]
    train_data = train_data.sort_values(by=sort_keys).reset_index(drop=True)
    test_data = test_data.sort_values(by=sort_keys).reset_index(drop=True)

    # Save data
    train_path = os.path.join(
        config["outputs"]["processed_data_dir"],
        "train_combined_{}.pkl".format(model_type),
    )
    train_data.to_pickle(train_path)
    test_path = os.path.join(
        config["outputs"]["processed_data_dir"],
        "test_combined_{}.pkl".format(model_type),
    )
    test_data.to_pickle(test_path)