"""Pre-compute Plotly figure dicts for every (industry × metric × view) combo
and write them to figures_cache.json next to app.py.

Run this whenever the underlying results or metadata change.
Usage:
    cd hf/tabbench && python build_figures_cache.py
"""
import json, os, sys
import pandas as pd

# We need the plot functions plus the data-loading and Elo/improvement helpers
# from app.py. Approach: exec app.py's source up to the gr.Blocks() boundary in
# a scratch namespace, then pull out the needed symbols. This avoids importing
# app.py (which would launch the Gradio server).
HERE = os.path.dirname(os.path.abspath(__file__))
# The relative paths used by this script ("app.py", "public_datasets_info.json")
# are resolved against the script's own directory from here on.
os.chdir(HERE)

# Read through a context manager so the handle is closed deterministically, and
# decode explicitly as UTF-8 (Python's default source encoding) rather than
# whatever the platform locale happens to be.
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()
END_MARK = "with gr.Blocks("
idx = src.find(END_MARK)
if idx < 0:
    print("ERROR: couldn't find 'with gr.Blocks(' in app.py", file=sys.stderr)
    sys.exit(1)
# Everything before the UI definition: imports, data loading, helper functions.
prefix = src[:idx]
# __file__ points at app.py so path logic inside the exec'd code sees app.py's
# location; the exec'd code runs with __name__ set to "__main__".
ns = {"__file__": os.path.join(HERE, "app.py"), "__name__": "__main__"}
exec(compile(prefix, "app.py", "exec"), ns)

# Required symbols: a missing name raises KeyError here, before any figure
# is built.
plot_global_model_ranking_plotly = ns["plot_global_model_ranking_plotly"]
plot_winrate_matrix = ns["plot_winrate_matrix"]
plot_significance_forest = ns["plot_significance_forest"]
# Use the FULL per-dataset frame as input. We can't use public_enter_per_dataset
# (which hides single-fit GBDTs) because compute_pct_improvement_over_baseline
# needs the `xgboost_ensemble` rows present as the baseline, or every %↗ cell
# turns NaN. The hidden display models are filtered out at the agg step below.
public_per_dataset = ns["public_per_dataset"]
# Not consumed further down in this script; the lookup still fails fast if
# app.py stops defining it.
public_enter_per_dataset = ns["public_enter_per_dataset"]
compute_elo_for_subset = ns["compute_elo_for_subset"]
compute_pct_improvement_over_baseline = ns["compute_pct_improvement_over_baseline"]
# Optional symbol: falls back to an empty set, which disables display filtering.
HIDDEN_DISPLAY_MODELS = ns.get("_HIDDEN_DISPLAY_MODELS", set())

# Same buckets as the app. "All" means "no industry filter".
INDUSTRY_BUCKETS = [
    "All", "Healthcare", "Behavioral", "Computer Vision", "Industry & Science",
    "Finance/insurance", "Games & Synthetic", "Social/Public", "Other",
]

# Every metric a figure is cached for, in build order.
METRICS = [
    "Accuracy",
    "AUC",
    "F1_score",
    "Precision",
    "Recall",
    "Cross_entropy",
    "Elo_score",
    "%↗ over XGBoost",
]

# External-benchmark dataset filters. Must stay in sync with the
# _BENCHMARK_DATASET_IDS dict in app.py (single source of truth: the app).
BENCHMARK_DATASET_IDS = {
    "TabArena": {
        46905, 46906, 46908,
        46910, 46911, 46912, 46916, 46919,
        46920, 46922, 46924, 46927, 46929,
        46930, 46932, 46933, 46935, 46937, 46938,
        46940, 46941, 46947,
        46950, 46955, 46956, 46958,
        46960, 46962, 46963, 46969,
        46979, 46980,
        # equivalents we have under different IDs:
        1464, 40701, 31, 37, 1494,
        # kddcup09_appetency
        46939,
    },
}

# "All" (no benchmark filter) followed by every named benchmark above.
BENCHMARKS = ["All", *BENCHMARK_DATASET_IDS]

# Same metadata merge as the app does at boot.
# Load through a context manager (the bare open() used before never closed its
# handle) and decode explicitly as UTF-8 instead of the platform locale.
with open("public_datasets_info.json", encoding="utf-8") as _meta_file:
    meta_df = pd.DataFrame(json.load(_meta_file))
# Normalise the join key to str on both sides so the merge matches regardless
# of how each source typed its dataset ids.
meta_df['dataset_id'] = meta_df['dataset_id'].astype(str)
public_per_dataset['dataset_id'] = public_per_dataset['dataset_id'].astype(str)
# Left join: every result row is kept; rows without metadata get NaN in the
# added columns.
public_per_dataset_with_meta = public_per_dataset.merge(
    meta_df[['dataset_id', 'dataset_industry', 'rows', 'features']],
    on='dataset_id', how='left',
)

def _apply_benchmark(df, benchmark):
    """Restrict ``df`` to the datasets belonging to ``benchmark``.

    A falsy ``benchmark`` or the literal ``"All"`` means "no filter" and
    returns ``df`` itself. A name missing from ``BENCHMARK_DATASET_IDS``
    matches no ids, so the result has no rows.
    """
    if benchmark and benchmark != "All":
        # Compare as strings on both sides: the id table holds ints while the
        # frame's dataset_id column may already be stringified.
        wanted = set(map(str, BENCHMARK_DATASET_IDS.get(benchmark, set())))
        keep = df['dataset_id'].astype(str).isin(wanted)
        return df[keep]
    return df

def build_one(benchmark, industry, metric):
    """Build every cached artefact for one (benchmark, industry, metric) combo.

    Returns the 6-tuple
    ``(bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg_records, n_datasets)``.
    If the benchmark/industry filter leaves no rows, the result is
    ``(None, None, None, None, [], 0)``. ``forest_dict`` / ``forest_kpi_html``
    are ``None`` when the forest plot raises, and ``winrate_dict`` is ``None``
    for the ``"Elo_score"`` and ``"%↗ over XGBoost"`` metrics.
    """
    subset = _apply_benchmark(public_per_dataset_with_meta, benchmark)
    if industry != "All":
        subset = subset[subset['dataset_industry'] == industry]
    if subset.empty:
        return None, None, None, None, [], 0

    # Recompute Elo + %↗ on the filtered subset. The `xgboost_ensemble`
    # baseline rows must still be present at this point; hidden display models
    # are dropped only AFTER the per-dataset comparison is done.
    # NOTE(review): both helpers are always driven by Accuracy, independent of
    # `metric` — presumably intentional; confirm against app.py.
    scored = compute_elo_for_subset(subset, metric="Accuracy")
    scored = compute_pct_improvement_over_baseline(
        scored, baseline_model="xgboost_ensemble", metric="Accuracy",
    )
    if HIDDEN_DISPLAY_MODELS:
        visible = scored[~scored['model'].isin(HIDDEN_DISPLAY_MODELS)].copy()
    else:
        visible = scored

    # Per-model mean of every score column; feeds the bar chart and the table.
    score_columns = [
        'Accuracy', 'AUC', 'F1_score', 'Precision', 'Recall',
        'Cross_entropy', 'Elo_score', '%↗ over XGBoost',
    ]
    per_model = visible.groupby('model')[score_columns].mean().reset_index()
    bars_fig = plot_global_model_ranking_plotly(
        per_model, metric=metric, per_dataset_df=visible,
    )

    # Forest plot (Wilcoxon-Holm leading-group highlight + paired-bootstrap CI).
    # This is the slow figure on live render (B=1000 bootstrap × all model
    # pairs), so pre-caching it is the whole point of this build script for the
    # Bars view. The helper returns (figure, kpi_html) — both go in the cache.
    # A failure is reported and tolerated so one bad combo doesn't stop the build.
    try:
        forest_fig, kpi_html = plot_significance_forest(visible, metric=metric)
    except Exception as exc:
        print(f"  [warn] forest plot failed for bench={benchmark} ind={industry} metric={metric}: {exc}")
        forest_fig, kpi_html = None, None

    # The win-rate matrix also uses the display-filtered frame so the heatmap
    # doesn't show duplicate display names (single-fit vs ensemble). No matrix
    # is built for the two derived metrics.
    if metric in ("Elo_score", "%↗ over XGBoost"):
        winrate_fig = None
    else:
        winrate_fig = plot_winrate_matrix(visible, metric=metric)

    return (
        bars_fig.to_plotly_json(),
        None if forest_fig is None else forest_fig.to_plotly_json(),
        kpi_html,
        None if winrate_fig is None else winrate_fig.to_plotly_json(),
        per_model.round(6).to_dict(orient="records"),
        int(visible['dataset_id'].nunique()),
    )

# Cache shape:
#   { benchmark_key: { industry: { "metrics":    {metric: fig},
#                                  "forest":     {metric: fig},
#                                  "forest_kpi": {metric: html},
#                                  "winrate":    {metric: fig},
#                                  "n_datasets": int,
#                                  "table": [...] } } }
# Backward compat: the "All" benchmark is also written at the top level so
# existing app.py readers that index `_figures_cache[industry]` keep working.
cache = {}
total = len(BENCHMARKS) * len(INDUSTRY_BUCKETS) * len(METRICS)
i = 0
for benchmark in BENCHMARKS:
    cache[benchmark] = {}
    for industry in INDUSTRY_BUCKETS:
        entry = {
            "metrics": {}, "forest": {}, "forest_kpi": {}, "winrate": {},
            "n_datasets": 0, "table": [],
        }
        cache[benchmark][industry] = entry
        for metric in METRICS:
            i += 1
            bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg, n = build_one(benchmark, industry, metric)
            # The bars entry is always recorded (None for an empty subset);
            # the other views are only recorded when they were produced.
            entry["metrics"][metric] = bars_dict
            if forest_dict is not None:
                entry["forest"][metric] = forest_dict
            if forest_kpi_html is not None:
                entry["forest_kpi"][metric] = forest_kpi_html
            if winrate_dict is not None:
                entry["winrate"][metric] = winrate_dict
            entry["n_datasets"] = n
            # Keep the first non-empty table produced for this industry.
            if not entry["table"]:
                entry["table"] = agg
            print(f"[{i:3d}/{total}] bench={benchmark:10s} industry={industry!r:25s} metric={metric:20s} n_datasets={n}")

# Backward-compat alias: top-level keys mirror the All-benchmark slice so
# legacy `cache[industry]` reads still resolve. Done as a separate dict
# snapshot first to avoid mutating-while-iterating issues.
_all_slice = {industry: cache["All"][industry] for industry in INDUSTRY_BUCKETS}
for industry, payload in _all_slice.items():
    if industry in BENCHMARKS:
        # "All" is both a benchmark key and an industry bucket. Plain
        # assignment would replace the nested {industry: payload} mapping with
        # a single payload, so `cache["All"][<industry>]` would vanish. Merge
        # instead: the nested industry keys survive and the legacy payload keys
        # ("metrics", "forest", ...) are readable from the same dict.
        cache[industry] = {**cache[industry], **payload}
    else:
        cache[industry] = payload

import numpy as np

def _default(o):
    """``json.dump`` fallback that converts NumPy values to plain Python types.

    Handles arrays (nested lists) and boolean, integer and floating scalars.

    Raises:
        TypeError: for any other type, so unexpected objects fail loudly
            instead of being silently stringified.
    """
    if isinstance(o, np.ndarray):
        return o.tolist()
    # np.bool_ is neither np.integer nor np.floating, so it needs its own
    # branch; without it a NumPy boolean aborts the whole dump.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    raise TypeError(f"not serializable: {type(o).__name__}")

out_path = os.path.join(HERE, "figures_cache.json")
# Serialise fully in memory BEFORE opening the target for writing. If
# `_default` rejects a value, the TypeError surfaces here and the existing
# figures_cache.json is left untouched rather than truncated mid-stream.
serialized = json.dumps(cache, default=_default)  # NB: no indent — keeps file small
with open(out_path, "w", encoding="utf-8") as f:
    f.write(serialized)
size_kb = os.path.getsize(out_path) / 1024
print(f"\nWrote {out_path} ({size_kb:.1f} KB)")

# Summary: count the metric entries stored under each benchmark key. The
# isinstance guards skip non-dict values (e.g. "n_datasets" ints, "table"
# lists) that can sit alongside the per-industry payloads.
_n_combos = 0
for k in BENCHMARKS:
    bench = cache.get(k, {})
    if not isinstance(bench, dict):
        continue
    for ind_v in bench.values():
        if isinstance(ind_v, dict):
            _n_combos += len(ind_v.get("metrics", {}))
print(f"Combos cached: {_n_combos}")