"""
Validation process
"""
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from matplotlib import rcParams
from tableone import TableOne


# Set-up figures: wide canvas, top and right axes spines hidden
rcParams.update({
    'figure.figsize': (20, 5),
    'axes.spines.top': False,
    'axes.spines.right': False,
})


def plot_cluster_size(df, data_type):
    """
    Produce a horizontal bar plot of cluster size and log it to MLflow
    --------
    :param df: dataframe to plot, must contain a 'cluster' column
    :param data_type: type of data - train, test, val, rec, sup
    """
    title = "Patient Cohorts"
    fig, ax = plt.subplots()
    try:
        # Number of patients per cluster
        df.groupby('cluster').size().plot(ax=ax, kind='barh')
        ax.set_title(title)
        ax.set_xlabel("Number of Patients", size=20)
        ax.set_ylabel("Cluster")
        fig.tight_layout()
        mlflow.log_figure(fig, 'fig/' + title.replace(' ', '_') + '_' + data_type + '.png')
    finally:
        # pyplot keeps every figure it creates alive until it is closed
        plt.close(fig)


def plot_feature_hist(df, col, data_type):
    """
    Produce overlaid histograms of a chosen feature, one per cluster,
    and log the figure to MLflow
    --------
    :param df: dataframe to plot, must contain a 'cluster' column
    :param col: feature column to plot
    :param data_type: type of data - train, test, val, rec, sup
    """
    title = col + ' Histogram'
    fig, ax = plt.subplots()
    try:
        # Each cluster is drawn on the same axes; alpha keeps overlaps visible
        df.groupby('cluster')[col].plot(ax=ax, kind='hist', alpha=0.5)
        ax.set_xlabel(col)
        ax.set_title(title, size=20)
        ax.legend()
        fig.tight_layout()
        mlflow.log_figure(fig, 'fig/' + title.replace(' ', '_') + '_' + data_type + '.png')
    finally:
        # pyplot keeps every figure it creates alive until it is closed
        plt.close(fig)


def plot_feature_bar(data, col, typ, data_type):
    """
    Produce a horizontal bar plot of a chosen feature per cluster and log
    it to MLflow
    --------
    :param data: dataframe to plot, must contain a 'cluster' column
    :param col: feature column to plot
    :param typ: 'count' for the number of patients per feature value;
        any other value (e.g. 'percentage') gives the percentage of the
        cluster per feature value
    :param data_type: type of data - train, test, val, rec, sup
    """
    if typ == 'count':
        to_plot = data.groupby(['cluster']).apply(
            lambda x: x.groupby(col).size())
        x_label = "Number"
    else:
        # Percentages are relative to the size of each cluster
        to_plot = data.groupby(['cluster']).apply(
            lambda x: 100 * x.groupby(col).size() / len(x))
        x_label = "Percentage"

    title = "Patient Cohorts"
    fig, ax = plt.subplots()
    try:
        to_plot.plot(ax=ax, kind='barh')
        ax.set_title(title, size=20)
        ax.set_xlabel(x_label + " of patients")
        ax.set_ylabel("Cluster")
        fig.tight_layout()
        mlflow.log_figure(fig, 'fig/' + '_'.join((title.replace(' ', '_'), col, data_type + '.png')))
    finally:
        # pyplot keeps every figure it creates alive until it is closed
        plt.close(fig)


def plot_cluster_bar(data, typ, data_type):
    """
    Produce a vertical bar plot of per-cluster percentages and log it to
    MLflow
    --------
    :param data: data to plot; the y axis is fixed to the range 0-100
    :param typ: name of the metric being plotted, used as the plot title
        and as the stem of the logged file name
    :param data_type: type of data - train, test, val, rec, sup
    """
    fig, ax = plt.subplots()
    try:
        data.plot(ax=ax, kind='bar')
        ax.set_title(typ, size=20)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Percentage")
        ax.set_ylim(0, 100)
        fig.tight_layout()
        mlflow.log_figure(fig, 'fig/' + typ + '_' + data_type + '.png')
    finally:
        # pyplot keeps every figure it creates alive until it is closed
        plt.close(fig)


def plot_events(df, data_type):
    """
    Plot, per cluster, the percentage of rows flagged with 1 in each
    column of the metric table
    --------
    :param df: metric table with 'SafeHavenID' and 'cluster' columns
    :param data_type: type of data - train, test, val, rec, sup
    """
    def pct_flagged(group):
        # Share of rows equal to 1 in every column of the group, in percent
        return 100 * group.apply(lambda s: len(s[s == 1]) / len(s))

    # The patient id is not a metric; the cluster becomes the grouping index
    flags = df.drop('SafeHavenID', axis=1)
    flags = flags.set_index('cluster')
    events = flags.groupby('cluster').apply(pct_flagged)
    plot_cluster_bar(events, 'events', data_type)


def process_deceased_metrics(col):
    """
    Process deceased column for plotting
    -------
    :param col: column of time-to-death labels; '12+' is the value main()
        fills in for patients with no recorded death
    :return: single-row dataframe with 'alive' and 'deceased' percentages
    """
    # Every recorded label other than '12+' counts as deceased. An ordering
    # test such as col < '12+' compares strings lexicographically and would
    # drop labels like '3' and '6'.
    deceased_mask = col.notna() & (col != '12+')
    pct_deceased = 100 * deceased_mask.sum() / len(col)
    res = pd.DataFrame({'alive': [100 - pct_deceased], 'deceased': [pct_deceased]})

    return res


def plot_deceased(df, data_type):
    """
    Plot the alive / deceased percentages per cluster
    --------
    :param df: metric table with 'cluster' and 'time_to_death' columns
    :param data_type: type of data - train, test, val, rec, sup
    """
    # One single-row alive/deceased frame per cluster
    per_cluster = df.groupby('cluster')['time_to_death'].apply(
        process_deceased_metrics)

    # Flatten the (cluster, row) index and keep only the cluster as index
    flat = per_cluster.reset_index()
    flat = flat.drop('level_1', axis=1)
    survival = flat.set_index('cluster')

    plot_cluster_bar(survival, 'survival', data_type)


def plot_therapies(df_year, results, data_type):
    """
    Plot the percentage of patients per cluster with each inhaler therapy
    --------
    :param df_year: unscaled data for current year
    :param results: cluster results and safehaven id
    :param data_type: type of data - train, test, val, rec, sup
    """
    inhaler_cols = ['single_inhaler', 'double_inhaler', 'triple_inhaler']

    def pct_with_therapy(group):
        # Percentage of rows with a positive value, per inhaler column
        return group.apply(lambda s: 100 * (s[s > 0].count()) / len(s))

    def pct_without_therapy(group):
        # Percentage of rows where every inhaler column is exactly 0
        none_mask = (group[inhaler_cols] == 0).all(axis=1)
        return 100 * len(group[none_mask]) / len(group)

    # Attach the cluster label to each patient's inhaler data
    therapies = df_year[['SafeHavenID'] + inhaler_cols]
    res_therapies = pd.merge(therapies, results, on='SafeHavenID', how='inner')

    # Percentage per cluster for each inhaler type
    by_cluster = res_therapies[['cluster'] + inhaler_cols].set_index('cluster')
    in_res = by_cluster.groupby('cluster').apply(pct_with_therapy)

    # Percentage per cluster without any inhaler prescription
    no_in = res_therapies.groupby('cluster').apply(pct_without_therapy).values

    # Shorten column names for plotting, e.g. 'single_inhaler' -> 'single'
    in_res.columns = [name.split('_')[0] for name in in_res.columns]

    # Added by position: both results come from grouping on 'cluster'
    in_res['no_inhaler'] = no_in

    plot_cluster_bar(in_res, 'therapies', data_type)


def main():
    """
    Validate a clustering run: build summary tables and plots per cluster
    and log them to an existing MLflow run
    --------
    Command line arguments:
        1. data_type - type of data - train, test, val, rec, sup
        2. run_name - prefix of the saved column and cluster files
        3. run_id - id of the MLflow run to log to
    """
    # Fail early with a usage hint instead of an IndexError
    if len(sys.argv) < 4:
        sys.exit('Usage: ' + sys.argv[0] + ' <data_type> <run_name> <run_id>')
    data_type, run_name, run_id = sys.argv[1:4]

    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)
    data_path = config['model_data_path']

    # Set MLFlow parameters
    model_type = 'hierarchical'
    experiment_name = 'Model E - Date Specific: ' + model_type
    mlflow.set_tracking_uri('http://127.0.0.1:5000/')
    mlflow.set_experiment(experiment_name)

    # The context manager ends the run even when a step below raises
    with mlflow.start_run(run_id=run_id):

        # Read in unscaled data, results and column names used to train model
        columns = np.load(data_path + run_name + '_cols.npy', allow_pickle=True)
        df_clusters = pd.read_pickle(
            data_path + "_".join((run_name, data_type, 'clusters.pkl')))
        df_reduced = df_clusters[list(columns) + ['cluster']]

        # Number of patients
        plot_cluster_size(df_reduced, data_type)

        # Generate mean/std table
        t1_year = TableOne(df_reduced, categorical=[], groupby='cluster', pval=True)
        t1yr_file = data_path + 't1_year_' + run_name + '_' + data_type + '.html'
        t1_year.to_html(t1yr_file)
        mlflow.log_artifact(t1yr_file)

        # Histogram feature plots
        plot_feature_hist(df_clusters, 'age', data_type)
        plot_feature_hist(df_clusters, 'albumin_med_2yr', data_type)

        # Bar plots
        df_clusters['sex'] = df_clusters['sex_bin'].map({0: 'Male', 1: 'Female'})
        plot_feature_bar(df_clusters, 'sex', 'percentage', data_type)
        plot_feature_bar(df_clusters, 'simd_decile', 'percentage', data_type)

        # Metrics for following 12 months
        df_events = pd.read_pickle(data_path + 'metric_table_events.pkl')
        df_counts = pd.read_pickle(data_path + 'metric_table_counts.pkl')
        df_next = pd.read_pickle(data_path + 'metric_table_next.pkl')

        # Merge cluster number with SafeHavenID and metrics; patients missing
        # from a metric table get 0 (events/counts) or '12+' (time to next)
        clusters = df_clusters[['SafeHavenID', 'cluster']]
        df_events = clusters.merge(df_events, on='SafeHavenID', how='left').fillna(0)
        df_counts = clusters.merge(df_counts, on='SafeHavenID', how='left').fillna(0)
        df_next = clusters.merge(df_next, on='SafeHavenID', how='left').fillna('12+')

        # Generate TableOne for events. The first two columns are
        # SafeHavenID and cluster; the dicts cover every remaining column
        # rather than a fixed number of them.
        cat_cols = df_events.columns[2:]
        df_events[cat_cols] = df_events[cat_cols].astype('int')
        event_limit = {c: 1 for c in cat_cols}
        event_order = {c: [1, 0] for c in cat_cols}
        t1_events = TableOne(df_events[df_events.columns[1:]], groupby='cluster',
                             limit=event_limit, order=event_order)
        t1_events_file = data_path + '_'.join(('t1', data_type, 'events', run_name + '.html'))
        t1_events.to_html(t1_events_file)
        mlflow.log_artifact(t1_events_file)

        # Generate TableOne for event counts
        count_cols = df_counts.columns[2:]
        df_counts[count_cols] = df_counts[count_cols].astype('int')
        t1_counts = TableOne(df_counts[df_counts.columns[1:]], categorical=[],
                             groupby='cluster')
        t1_counts_file = data_path + '_'.join(('t1', data_type, 'counts', run_name + '.html'))
        t1_counts.to_html(t1_counts_file)
        mlflow.log_artifact(t1_counts_file)

        # Generate TableOne for time to next events
        next_cols = df_next.columns[2:]
        next_event_order = {c: ['1', '3', '6', '12', '12+'] for c in next_cols}
        t1_next = TableOne(df_next[df_next.columns[1:]], groupby='cluster',
                           order=next_event_order)
        t1_next_file = data_path + '_'.join(('t1', data_type, 'next', run_name + '.html'))
        t1_next.to_html(t1_next_file)
        mlflow.log_artifact(t1_next_file)

        # Plot metrics
        plot_events(df_events, data_type)
        plot_deceased(df_next, data_type)
        plot_therapies(df_clusters, clusters, data_type)


# Run the validation only when executed as a script, not on import
if __name__ == '__main__':
    main()