# app.py
# ===== 1) Install deps (Colab) =====
# !pip -q install kagglehub[pandas-datasets] scikit-learn matplotlib gradio pillow


"""
Feature/Depth/Sample Explorer

Dataset: Customer Shopping Trends (Kaggle)
URL: https://www.kaggle.com/datasets/iamsouravbanerjee/customer-shopping-trends-dataset

Purpose: Educational tool to visualize how model complexity (tree depth),
training sample size, and data dimensionality affect generalization
(under/overfitting) via F1 on a held-out test set.
"""

# ===== 2) App (launch inline) =====
# import io, re
from typing import List, Sequence #, Tuple
import numpy as np
import pandas as pd
import os
# import matplotlib.pyplot as plt
# from matplotlib.ticker import MaxNLocator

import plotly.graph_objects as go
# from plotly.subplots import make_subplots

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# import kagglehub
# from kagglehub import KaggleDatasetAdapter
import gradio as gr

# Apply global styling
custom_css = """
/* === Base font and readability === */
.gradio-container label, 
.gradio-container h1, 
.gradio-container h2, 
.gradio-container h3, 
.gradio-container p, 
.gradio-container button, 
.gradio-container span, 
.gradio-container div {
    font-weight: 600 !important;
    line-height: 1.2 !important;
    word-break: normal !important;
    overflow-wrap: normal !important;
    white-space: normal !important;
}
"""


# ---- App metadata ----
APP_NAME = "Feature/Depth/Sample Explorer"
# DATASET_NAME = "Customer Shopping Trends (Kaggle)"
# DATASET_URL = "https://www.kaggle.com/datasets/iamsouravbanerjee/customer-shopping-trends-dataset"
DATASET_NAME = "UCI Irvine - Predict Students' Dropout and Academic Success"
DATASET_URL = "https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success"
TOOL_DESC = (
    "Explore how decision tree depth (model complexity), training sample size, "
    "and feature count affect performance (e.g., F1)."
)

ABOUT_MD = f"""
### {APP_NAME}
{TOOL_DESC}

**Dataset:** [{DATASET_NAME}]({DATASET_URL})  
**Target:** Student is a Dropout vs. Non-dropout.

This tool is for education only.
"""
# **Target:** `Discount Applied` (binary)

# ---------------- Config ----------------
TARGET_COL = "Target"
TEST_SIZE = 700
TRAIN_FOLD_STEP=300
MIN_TRAIN_SIZE = 800
N_SPLITS_K_FOLD = 4
RANDOM_SEEDS = [42, 43, 44, 45, 46]
DEFAULT_DEPTH_GRID = list(range(1, 51, 5))
# NUMERIC_CANDIDATES = ["Age", "Purchase Amount (USD)", "Review Rating", "Previous Purchases"]
NUMERIC_CANDIDATES = ["Application order", "Previous qualification (grade)", "Admission grade", "Age", "Curricular units 1st sem (credited)"
                      "Curricular units 1st sem (enrolled)", "Curricular units 1st sem (evaluations)", "Curricular units 1st sem (approved)", "Curricular units 1st sem (grade)",
                      "Curricular units 1st sem (without evaluations)", "Curricular units 2nd sem (credited)", "Curricular units 2nd sem (enrolled)", "Curricular units 2nd sem (evaluations)", "Curricular units 2nd sem (approved)"
                      "Curricular units 2nd sem (grade)", "Curricular units 2nd sem (without evaluations)", "Unemployment rate", "Inflation rate", "GDP"]

# Growing training set
FIXED_SEED = 7

# ---------------- Data loading ----------------
def load_data() -> pd.DataFrame:
    from ucimlrepo import fetch_ucirepo 
    
    # fetch dataset 
    predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
    
    df = predict_students_dropout_and_academic_success.data.features.copy()
    target = predict_students_dropout_and_academic_success.data.targets
    df[TARGET_COL] = target[TARGET_COL].str.strip().map({"Dropout": 1, "Enrolled":0, "Graduate": 0}).astype(int)
    return df


    # file_path = "shopping_trends_updated.csv"
    # df = kagglehub.dataset_load(
    #     KaggleDatasetAdapter.PANDAS,
    #     "iamsouravbanerjee/customer-shopping-trends-dataset",
    #     file_path,
    # ).copy()
    # df[TARGET_COL] = df[TARGET_COL].astype(str).str.strip().str.lower().map({"yes": 1, "no": 0}).astype(int)
    # return df

DF = load_data()
# ALL_FEATURES = DF.columns.drop([TARGET_COL, "Promo Code Used", "Customer ID"]).to_list()
ALL_FEATURES = DF.columns.drop([TARGET_COL]).to_list()
DEFAULT_SELECTED = ALL_FEATURES[:4]

def make_kfold_buckets(df: pd.DataFrame, target_col: str, k: int, seed: int = FIXED_SEED):
    """Return (train_folds, test_fold) where test_fold is fixed (e.g., fold 0)."""
    y = df[target_col].to_numpy()
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    # Collect per-fold indices
    folds = []
    for _, test_idx in skf.split(np.zeros_like(y), y):
        folds.append(test_idx)

    return folds

# ---------------- Helpers ----------------
def stratified_fixed_sample(df: pd.DataFrame, train_total: int, seed: int = FIXED_SEED) -> pd.DataFrame:
    """Return a stratified fixed-size sample (same per n_total if seed fixed)."""
    if train_total < len(df):
        sample, test_sample = train_test_split(
            df, train_size=train_total, stratify=df[TARGET_COL], random_state=seed  # ← fixed seed
        )
    else:
        sample = df
        test_sample = None
    return sample.reset_index(), test_sample.reset_index()

TRAIN_DF, TEST_DF = stratified_fixed_sample(DF, train_total=len(DF) - TEST_SIZE)  # fixed once
num_folds = int(np.floor(len(TRAIN_DF)/TRAIN_FOLD_STEP))
TRAIN_FOLDS = make_kfold_buckets(TRAIN_DF, TARGET_COL, num_folds)
TRAIN_POOL = TRAIN_DF.copy()
BUCKET_SIZES = [len(b) for b in TRAIN_FOLDS]
CUM_BUCKET_SIZES = np.cumsum(BUCKET_SIZES)
TOTAL_TRAIN = len(TRAIN_POOL)
# print("TOTAL_TRAIN", TOTAL_TRAIN)
# print("CUM_BUCKET_SIZES", CUM_BUCKET_SIZES)
# print("BUCKET_SIZES", BUCKET_SIZES)
# print("TRAIN_FOLDS", TRAIN_FOLDS)
# print("TRAIN_POOL", TRAIN_POOL)

def get_train_indices_for_n(n_total: int) -> np.ndarray:
    # print("n_total", n_total)
    """Return nested indices for a requested training size using merged folds;
       subsample from the last bucket if needed to match n_total."""
    n = min(n_total, TOTAL_TRAIN)
    # print("n", n)
    # Find how many full buckets we need
    full = int(np.searchsorted(CUM_BUCKET_SIZES, n, side='right'))
    # print("full", full)
    if full == 0:
        # take a prefix of the first bucket
        idx = TRAIN_FOLDS[0][:n]
        # print("idx", idx)
    else:
        idx = np.concatenate(TRAIN_FOLDS[:(full)])
        # print("idx", idx)
        extra = n - len(idx)
        # print("extra", extra)
        # print(" TRAIN_FOLDS[full]",  TRAIN_FOLDS[full])
        # print(" len(TRAIN_FOLDS[full]",  len(TRAIN_FOLDS[full]))
        if extra > 0:
            idx = np.concatenate([idx, TRAIN_FOLDS[full][:extra]])
            # print("idx", idx)
    # print("-----")
    return idx

def get_train_df_for_n(n_total: int) -> pd.DataFrame:
    idx = get_train_indices_for_n(n_total)
    return TRAIN_POOL.loc[idx]


def split_features(feats: Sequence[str]):
    numeric = [c for c in feats if c in NUMERIC_CANDIDATES]
    categorical = [c for c in feats if c not in numeric]
    return numeric, categorical

def build_preprocessor(feats: Sequence[str]) -> ColumnTransformer:
    numeric, categorical = split_features(feats)
    return ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ])

def one_run(feats, max_depth, n_total, seed, auto_depth, depth_grid=DEFAULT_DEPTH_GRID):
    assert len(feats) > 0, "Select at least one feature."

    train_df = get_train_df_for_n(n_total)
    X_train, y_train = train_df[feats], train_df[TARGET_COL]
    X_test,  y_test  = TEST_DF[feats],  TEST_DF[TARGET_COL]

    prep = build_preprocessor(feats)
    base_clf = DecisionTreeClassifier(random_state=seed, class_weight="balanced")
    if auto_depth:
        pipe = Pipeline([("prep", prep), ("clf", base_clf)])
        cv = StratifiedKFold(n_splits=N_SPLITS_K_FOLD, shuffle=True, random_state=seed)
        grid = GridSearchCV(pipe, {"clf__max_depth": list(depth_grid)}, scoring="f1", cv=cv, refit=True, verbose=0)
        grid.fit(X_train, y_train)
        model = grid.best_estimator_
        chosen_depth = int(model.named_steps["clf"].get_depth())  # actual depth
    else:
        clf = DecisionTreeClassifier(random_state=seed, class_weight="balanced", max_depth=max_depth)
        model = Pipeline([("prep", prep), ("clf", clf)]).fit(X_train, y_train)
        chosen_depth = int(model.named_steps["clf"].get_depth())
    yhat_tr = model.predict(X_train)
    yhat_te = model.predict(X_test)
    return f1_score(y_train, yhat_tr), f1_score(y_test, yhat_te), chosen_depth

def percentile_band(arr: np.ndarray):
    means = arr.mean(axis=0)
    p10, p90 = np.percentile(arr, [10, 90], axis=0)
    return means, p10, p90


def line_and_band(fig, x, mean, lo, hi, name, color, dash="solid"):
    fill = color.replace("1)", "0.15)")
    fig.add_trace(go.Scatter(x=x, y=mean, mode="lines+markers",
                             name=name, line=dict(color=color, dash=dash)))
    fig.add_trace(go.Scatter(x=x, y=lo, mode="lines", line=dict(width=0),
                             showlegend=False, hoverinfo="skip"))
    fig.add_trace(go.Scatter(x=x, y=hi, mode="lines", line=dict(width=0),
                             fill="tonexty", fillcolor=fill,
                             name=f"{name} 10–90%", hoverinfo="skip"))

# ---------------- Plots ----------------

def plot_f1_vs_features(selected_feats: List[str], max_depth: int, n_total: int, auto_depth: bool):
    if not selected_feats:
        raise gr.Error("Please select at least one feature.")

    ks = list(range(1, len(selected_feats) + 1))
    tr_runs, te_runs, depth_runs = [], [], []

    for k in ks:
        tr_scores, te_scores, depths = [], [], []
        feats_k = selected_feats[:k]
        for s in RANDOM_SEEDS:
            tr, te, d = one_run(feats_k, max_depth, n_total, s, auto_depth)
            tr_scores.append(tr); te_scores.append(te); depths.append(d)
        tr_runs.append(tr_scores); te_runs.append(te_scores); depth_runs.append(depths)

    tr_arr, te_arr = np.array(tr_runs).T, np.array(te_runs).T
    tr_m, tr_lo, tr_hi = percentile_band(tr_arr)
    te_m, te_lo, te_hi = percentile_band(te_arr)

    x_labels = [selected_feats[i-1] for i in ks]

    # --- Figure 1: F1 vs #features ---
    fig_f1 = go.Figure()
    line_and_band(fig_f1, ks, tr_m, tr_lo, tr_hi, "Train F1", "rgba(31,119,180,1)")
    line_and_band(fig_f1, ks, te_m, te_lo, te_hi, "Test F1", "rgba(255,127,14,1)")


    mode = "auto-depth (grid search)" if auto_depth else f"max_depth={max_depth}"
    fig_f1.update_layout(
        title=f"F1 vs Features ({mode}; n={n_total})",
        template="plotly_white",
        height=600,
        margin=dict(l=40, r=10, t=60, b=60),
        legend=dict(orientation="h", y=-0.2),
        uirevision="keep-zoom"
    )
    fig_f1.update_xaxes(tickmode="array", tickvals=ks, ticktext=x_labels, tickangle=-30)
    fig_f1.update_yaxes(title_text="F1 Score", range=[0, 1])

    # --- Figure 2: Depth vs #features (only when auto_depth) ---
    if auto_depth:
        depth_arr = np.array(depth_runs).T
        d_m, d_lo, d_hi = percentile_band(depth_arr)

        fig_depth = go.Figure()
        line_and_band(fig_depth, ks, d_m, d_lo, d_hi, "Depth", "rgba(44,160,44,1)", dash="dot")

        fig_depth.update_layout(
            title=f"Depth vs Features (n={n_total})",
            template="plotly_white",
            height=600,
            margin=dict(l=40, r=10, t=60, b=60),
            legend=dict(orientation="h", y=-0.2),
            uirevision="keep-zoom"
        )
        fig_depth.update_xaxes(tickmode="array",
                               tickvals=ks, ticktext=x_labels, tickangle=-30)
        y_min = max(0, np.nanmin(d_lo) if np.isfinite(np.nanmin(d_lo)) else 0)
        y_max = np.nanmax(d_hi) if np.isfinite(np.nanmax(d_hi)) else None
        fig_depth.update_yaxes(title_text="Depth", dtick=1, range=[y_min, y_max])
    else:
        # Return a valid (empty) figure so Gradio Plot doesn't choke
        fig_depth = go.Figure()

    return fig_f1, gr.update(value=fig_depth, visible=auto_depth)


def plot_f1_vs_depth(selected_feats: List[str], n_total: int):
    if not selected_feats:
        raise gr.Error("Please select at least one feature.")
    depths = list(range(1, 51, 5))

    tr_runs, te_runs = [], []
    for d in depths:
        tr_scores, te_scores = [], []
        for s in RANDOM_SEEDS:
            tr, te, _ = one_run(selected_feats, d, n_total, s, auto_depth=False)

            tr_scores.append(tr); te_scores.append(te)
        tr_runs.append(tr_scores); te_runs.append(te_scores)

    tr_arr, te_arr = np.array(tr_runs).T, np.array(te_runs).T
    tr_m, tr_lo, tr_hi = percentile_band(tr_arr)
    te_m, te_lo, te_hi = percentile_band(te_arr)

    fig_f1 = go.Figure()
    line_and_band(fig_f1, depths, tr_m, tr_lo, tr_hi, "Train F1", "rgba(31,119,180,1)")
    line_and_band(fig_f1, depths, te_m, te_lo, te_hi, "Test F1", "rgba(255,127,14,1)")
    
    fig_f1.update_layout(
        title=f"F1 vs Tree Depth (n={n_total}; #features={len(selected_feats)})",
        template="plotly_white",
        height=600,
        margin=dict(l=40, r=10, t=60, b=60),
        legend=dict(orientation="h", y=-0.2),
        uirevision="keep-zoom"
    )
    fig_f1.update_yaxes(title_text="F1 Score", range=[0, 1])
    fig_f1.update_xaxes(title_text="max_depth", dtick=5)

    # IMPORTANT: return a single figure (not a tuple)
    return fig_f1


def plot_f1_vs_samplesize(selected_feats: List[str], max_depth: int, auto_depth: bool):
    if not selected_feats:
        raise gr.Error("Please select at least one feature.")
    sample_sizes = list(range(MIN_TRAIN_SIZE, len(DF) - TEST_SIZE + 1, TRAIN_FOLD_STEP)) # 600, 3401, 200
    # print(sample_sizes, MIN_TRAIN_SIZE, len(DF) - TEST_SIZE + 1, TRAIN_FOLD_STEP)

    tr_runs, te_runs, depth_runs = [], [], []
    for n_total in sample_sizes:
        tr_scores, te_scores, depths = [], [], []
        for s in RANDOM_SEEDS:
            tr, te, d = one_run(selected_feats, max_depth, n_total, s, auto_depth)
            tr_scores.append(tr); te_scores.append(te); depths.append(d)
        tr_runs.append(tr_scores); te_runs.append(te_scores); depth_runs.append(depths)

    tr_arr, te_arr, d_arr = np.array(tr_runs).T, np.array(te_runs).T, np.array(depth_runs).T
    tr_m, tr_lo, tr_hi = percentile_band(tr_arr)
    te_m, te_lo, te_hi = percentile_band(te_arr)

    # ---- Figure 1: F1 vs Sample Size ----
    fig_f1 = go.Figure()
    line_and_band(fig_f1, sample_sizes, tr_m, tr_lo, tr_hi, "Train F1", "rgba(31,119,180,1)")
    line_and_band(fig_f1, sample_sizes, te_m, te_lo, te_hi, "Test F1", "rgba(255,127,14,1)")

    mode = "auto-depth (grid search)" if auto_depth else f"max_depth={max_depth}"
    fig_f1.update_layout(
        title=f"F1 vs Sample Size ({mode}; #features={len(selected_feats)})",
        template="plotly_white",
        height=600,
        margin=dict(l=40, r=10, t=60, b=60),
        legend=dict(orientation="h", y=-0.2),
        uirevision="keep-zoom"
    )
    fig_f1.update_xaxes(title_text="Number of samples (n)")
    fig_f1.update_yaxes(title_text="F1 Score", range=[0, 1])

    # ---- Figure 2: Depth vs Sample Size ----
    if auto_depth:
        d_m, d_lo, d_hi = percentile_band(d_arr)
        fig_depth = go.Figure()
        fig_depth.add_trace(go.Scatter(x=sample_sizes, y=d_m, mode="lines+markers",
                                       name="Depth (mean)", line=dict(dash="dot")))
        fig_depth.add_trace(go.Scatter(x=sample_sizes, y=d_lo, mode="lines", line=dict(width=0),
                                       showlegend=False, hoverinfo="skip"))
        fig_depth.add_trace(go.Scatter(x=sample_sizes, y=d_hi, mode="lines", line=dict(width=0),
                                       fill="tonexty", name="Depth 10–90%", hoverinfo="skip"))
        fig_depth.update_layout(
            title=f"Depth vs Sample Size",
            template="plotly_white",
            height=600,
            margin=dict(l=40, r=10, t=60, b=60),
            legend=dict(orientation="h", y=-0.2),
            uirevision="keep-zoom"
        )
        fig_depth.update_xaxes(title_text="Number of samples (n)")
        y_min = max(0, np.nanmin(d_lo) if np.isfinite(np.nanmin(d_lo)) else 0)
        y_max = np.nanmax(d_hi) if np.isfinite(np.nanmax(d_hi)) else None
        fig_depth.update_yaxes(title_text="Depth", dtick=1, range=[y_min, y_max])

    else:
        fig_depth = go.Figure()

    return fig_f1, gr.update(value=fig_depth, visible=auto_depth)


# ---------------- Gradio UI ----------------


with gr.Blocks(title="Feature/Depth/Sample Explorer", css=custom_css) as demo:
    with gr.Accordion("About this tool", open=False):
        gr.Markdown(ABOUT_MD)    

    with gr.Row():
        with gr.Column(scale=1):
            feat_choices = gr.CheckboxGroup(
                label="Select features (order is preserved):",
                choices=ALL_FEATURES,
                value=DEFAULT_SELECTED,
            )
            gr.Markdown(
                f"**Dataset size:** {len(DF):,} rows • **Test size/run:** {TEST_SIZE} • **Seeds:** {len(RANDOM_SEEDS)}"
            )

        with gr.Column(scale=2):
            # -------- Tab: F1 vs Features --------
            with gr.Tab("F1 vs Features"):
                with gr.Row():
                    auto_depth_feat = gr.Checkbox(value=False, label="Auto-depth (grid 1..50 step 5)")
                    depth_feat = gr.Slider(1, 50, value=5, step=1, label="max_depth (used when auto-depth is OFF)")
                    n_total_feat = gr.Slider(minimum=MIN_TRAIN_SIZE, maximum=len(DF)-TEST_SIZE, value=min(MIN_TRAIN_SIZE, len(DF)),
                                             step=TRAIN_FOLD_STEP, label="Sample size (n)")
                btn_feat = gr.Button("Run")
                # Two plots: main F1 + depth
                plt_feat_main = gr.Plot(label="F1 vs Features", visible=True)
                plt_feat_depth = gr.Plot(label="Depth vs #Features", visible=False)

            # -------- Tab: F1 vs Depth --------
            with gr.Tab("F1 vs Depth"):
                n_total_depth = gr.Slider(minimum=MIN_TRAIN_SIZE, maximum=len(DF)-TEST_SIZE, value=min(MIN_TRAIN_SIZE, len(DF)),
                                          step=TRAIN_FOLD_STEP, label="Sample size (n)")
                btn_depth = gr.Button("Run")
                plt_depth = gr.Plot(label="F1 vs Depth")

            # -------- Tab: F1 vs Sample Size --------
            with gr.Tab("F1 vs Sample Size"):
                with gr.Row():
                    auto_depth_samp = gr.Checkbox(value=False, label="Auto-depth (grid 1..50 step 5)")
                    depth_samp = gr.Slider(1, 50, value=5, step=1, label="max_depth (used when auto-depth is OFF)")
                btn_size = gr.Button("Run")
                # Two plots: main F1 + depth
                plt_size_main = gr.Plot(label="F1 vs Sample Size")
                plt_size_depth = gr.Plot(label="Depth vs Sample Size")
    
    def toggle_depth_and_plot(checked: bool):
        return gr.update(visible=not checked)

    auto_depth_feat.change(
        fn=toggle_depth_and_plot,
        inputs=auto_depth_feat,
        outputs=[depth_feat],
    )

    auto_depth_samp.change(
        fn=toggle_depth_and_plot,
        inputs=auto_depth_samp,
        outputs=[depth_samp],
    )

    # Wiring
    btn_feat.click(
        fn=plot_f1_vs_features,
        inputs=[feat_choices, depth_feat, n_total_feat, auto_depth_feat],
        outputs=[plt_feat_main, plt_feat_depth],
    )
    btn_depth.click(
        fn=plot_f1_vs_depth,
        inputs=[feat_choices, n_total_depth],
        outputs=plt_depth,  # single figure
    )
    btn_size.click(
        fn=plot_f1_vs_samplesize,
        inputs=[feat_choices, depth_samp, auto_depth_samp],
        outputs=[plt_size_main, plt_size_depth],
    )

    # AUTO-RUN on load with default values (return exactly 5 figures)
    demo.load(
        fn=lambda feats, d_feat, n_feat, auto_feat, n_depth, d_samp, auto_samp: (
            *plot_f1_vs_features(feats, d_feat, n_feat, auto_feat),   # -> 2 figs
            plot_f1_vs_depth(feats, n_depth),                         # -> 1 fig
            *plot_f1_vs_samplesize(feats, d_samp, auto_samp),         # -> 2 figs
        ),
        inputs=[feat_choices, depth_feat, n_total_feat, auto_depth_feat, n_total_depth, depth_samp, auto_depth_samp],
        outputs=[plt_feat_main, plt_feat_depth, plt_depth, plt_size_main, plt_size_depth],
    )


workers = int(os.getenv("WORKERS", "4"))

# set a global default concurrency for all events
demo.queue(
    default_concurrency_limit=workers,
    max_size=100,
    status_update_rate="auto"  # or a number of seconds
)
demo.launch(
    server_name="0.0.0.0",
    server_port=int(os.getenv("PORT", "7860")),
    show_error=True,
)