"""Pre-compute Plotly figure dicts for every (industry × metric × view) combo and write them to figures_cache.json next to app.py. Run this whenever the underlying results or metadata change. Usage: cd hf/tabbench && python build_figures_cache.py """ import json, os, sys import pandas as pd # We need the plot function + the data-loading + Elo/improvement helpers from app.py. # Approach: exec app.py's source up to the gr.Blocks() boundary in a namespace, # pulling out the needed symbols. This avoids importing app.py (which would # launch the Gradio server). HERE = os.path.dirname(os.path.abspath(__file__)) os.chdir(HERE) src = open("app.py").read() END_MARK = "with gr.Blocks(" idx = src.find(END_MARK) if idx < 0: print("ERROR: couldn't find 'with gr.Blocks(' in app.py", file=sys.stderr) sys.exit(1) prefix = src[:idx] ns = {"__file__": os.path.join(HERE, "app.py"), "__name__": "__main__"} exec(compile(prefix, "app.py", "exec"), ns) plot_global_model_ranking_plotly = ns["plot_global_model_ranking_plotly"] plot_winrate_matrix = ns["plot_winrate_matrix"] plot_significance_forest = ns["plot_significance_forest"] # Use the FULL per-dataset frame as input. We can't use public_enter_per_dataset # (which hides single-fit GBDTs) because compute_pct_improvement_over_baseline # needs the `xgboost_ensemble` rows present as the baseline, or every %↗ cell # turns NaN. The hidden display models are filtered out at the agg step below. 
# Data frames and helper functions pulled out of the exec'd app.py namespace.
# Each lookup raises KeyError if app.py no longer defines the name.
(
    public_per_dataset,
    public_enter_per_dataset,
    compute_elo_for_subset,
    compute_pct_improvement_over_baseline,
) = (
    ns[_symbol]
    for _symbol in (
        "public_per_dataset",
        "public_enter_per_dataset",
        "compute_elo_for_subset",
        "compute_pct_improvement_over_baseline",
    )
)
# Optional in app.py: falls back to an empty set when it is not defined.
HIDDEN_DISPLAY_MODELS = ns.get("_HIDDEN_DISPLAY_MODELS", set())

# Same buckets as the app
INDUSTRY_BUCKETS = [
    "All",
    "Healthcare",
    "Behavioral",
    "Computer Vision",
    "Industry & Science",
    "Finance/insurance",
    "Games & Synthetic",
    "Social/Public",
    "Other",
]

# One cached figure set is built per entry in this list.
METRICS = [
    "Accuracy",
    "AUC",
    "F1_score",
    "Precision",
    "Recall",
    "Cross_entropy",
    "Elo_score",
    "%↗ over XGBoost",
]

# External-benchmark dataset filters. Must stay in sync with the
# _BENCHMARK_DATASET_IDS dict in app.py (single source of truth: the app).
BENCHMARK_DATASET_IDS = {
    "TabArena": {
        46905, 46906, 46908, 46910,
        46911, 46912, 46916, 46919,
        46920, 46922, 46924, 46927,
        46929, 46930, 46932, 46933,
        46935, 46937, 46938, 46940,
        46941, 46947, 46950, 46955,
        46956, 46958, 46960, 46962,
        46963, 46969, 46979, 46980,
        # equivalents we have under different IDs:
        1464, 40701, 31, 37, 1494,
        # kddcup09_appetency
        46939,
    },
}
BENCHMARKS = ["All", "TabArena"]

# Same metadata merge as the app does at boot.
# Load the dataset metadata through a context manager so the handle is closed
# promptly, and decode explicitly as UTF-8 rather than the locale default.
with open("public_datasets_info.json", encoding="utf-8") as _meta_file:
    meta_df = pd.DataFrame(json.load(_meta_file))

# Both sides of the join are cast to str so the key dtypes match.
meta_df['dataset_id'] = meta_df['dataset_id'].astype(str)
public_per_dataset['dataset_id'] = public_per_dataset['dataset_id'].astype(str)
public_per_dataset_with_meta = public_per_dataset.merge(
    meta_df[['dataset_id', 'dataset_industry', 'rows', 'features']],
    on='dataset_id',
    how='left',  # keep every result row even when metadata is missing
)


def _apply_benchmark(df, benchmark):
    """Restrict ``df`` to the datasets of one external benchmark.

    Args:
        df: Frame with a ``dataset_id`` column.
        benchmark: Key of ``BENCHMARK_DATASET_IDS``. A falsy value or
            ``"All"`` disables filtering.

    Returns:
        ``df`` itself when no filtering applies; otherwise the rows whose
        ``dataset_id`` (compared as a string) is in the benchmark's id set.
        A benchmark name with no entry in ``BENCHMARK_DATASET_IDS`` yields
        an empty selection.
    """
    if not benchmark or benchmark == "All":
        return df
    ids = {str(i) for i in BENCHMARK_DATASET_IDS.get(benchmark, set())}
    return df[df['dataset_id'].astype(str).isin(ids)]


def build_one(benchmark, industry, metric):
    """Build every cached artefact for one (benchmark, industry, metric) combo.

    Args:
        benchmark: Benchmark filter name passed to ``_apply_benchmark``.
        industry: Value matched against the ``dataset_industry`` column;
            ``"All"`` skips the industry filter.
        metric: Metric name forwarded to the plot functions.

    Returns:
        A 6-tuple ``(bars_dict, forest_dict, forest_kpi_html, winrate_dict,
        agg_records, n_datasets)``.

        * ``forest_dict`` and ``forest_kpi_html`` are ``None`` when the
          forest plot raised.
        * ``winrate_dict`` is ``None`` for the ``"Elo_score"`` and
          ``"%↗ over XGBoost"`` metrics, for which no matrix is drawn.
        * When the filtered subset is empty, the result is
          ``(None, None, None, None, [], 0)``.
    """
    df = _apply_benchmark(public_per_dataset_with_meta, benchmark)
    if industry != "All":
        df = df[df['dataset_industry'] == industry]
    if df.empty:
        return None, None, None, None, [], 0

    # Recompute Elo + %↗ on the filtered subset. Must include the
    # `xgboost_ensemble` baseline rows; we drop the hidden display models
    # only AFTER the per-dataset comparison is done.
    # Both derived columns are computed from Accuracy, whatever `metric` is.
    df = compute_elo_for_subset(df, metric="Accuracy")
    df = compute_pct_improvement_over_baseline(
        df, baseline_model="xgboost_ensemble", metric="Accuracy"
    )
    df_display = (
        df[~df['model'].isin(HIDDEN_DISPLAY_MODELS)].copy()
        if HIDDEN_DISPLAY_MODELS
        else df
    )

    # Per-model mean of every metric column over the subset's rows.
    agg = df_display.groupby('model')[
        ['Accuracy', 'AUC', 'F1_score', 'Precision', 'Recall',
         'Cross_entropy', 'Elo_score', '%↗ over XGBoost']
    ].mean().reset_index()

    bars = plot_global_model_ranking_plotly(agg, metric=metric, per_dataset_df=df_display)

    # Forest plot (Wilcoxon-Holm leading-group highlight + paired-bootstrap CI).
    # This is the slow figure on live render (B=1000 bootstrap × all model pairs),
    # so pre-caching it is the whole point of this build script for the Bars view.
    # Returns (figure, kpi_html) — both go in the cache.
    # Deliberately best-effort: a failing forest plot is reported and skipped
    # so the remaining figures for this combo are still cached.
    try:
        forest, forest_kpi = plot_significance_forest(df_display, metric=metric)
    except Exception as _e:
        print(f"  [warn] forest plot failed for bench={benchmark} ind={industry} metric={metric}: {_e}")
        forest, forest_kpi = None, None

    # Win-rate matrix should ALSO use the display-filtered frame so the
    # heatmap doesn't show duplicate display names (single-fit vs ensemble).
    winrate = (
        plot_winrate_matrix(df_display, metric=metric)
        if metric not in ("Elo_score", "%↗ over XGBoost")
        else None
    )

    return (
        bars.to_plotly_json(),
        forest.to_plotly_json() if forest is not None else None,
        forest_kpi,
        winrate.to_plotly_json() if winrate is not None else None,
        agg.round(6).to_dict(orient="records"),
        int(df_display['dataset_id'].nunique()),
    )


# Cache shape:
#   { benchmark_key: { industry: { "metrics":    {metric: fig},
#                                  "forest":     {metric: fig},
#                                  "forest_kpi": {metric: html},
#                                  "winrate":    {metric: fig},
#                                  "n_datasets": int,
#                                  "table":      [...] } } }
# Backward compat: the "All" benchmark is also written at the top level so
# existing app.py readers that index `_figures_cache[industry]` keep working.
# Build the nested cache: one payload per (benchmark, industry), filled in
# metric by metric. `i` is only a progress counter for the log line.
cache = {}
total = len(BENCHMARKS) * len(INDUSTRY_BUCKETS) * len(METRICS)
i = 0
for benchmark in BENCHMARKS:
    cache[benchmark] = {}
    for industry in INDUSTRY_BUCKETS:
        # Same dict object as cache[benchmark][industry]; the alias just
        # keeps the assignments below short.
        _slot = {
            "metrics": {},
            "forest": {},
            "forest_kpi": {},
            "winrate": {},
            "n_datasets": 0,
            "table": [],
        }
        cache[benchmark][industry] = _slot
        for metric in METRICS:
            i += 1
            (
                bars_dict,
                forest_dict,
                forest_kpi_html,
                winrate_dict,
                agg,
                n,
            ) = build_one(benchmark, industry, metric)

            # The bars figure is always recorded, even when it is None.
            _slot["metrics"][metric] = bars_dict
            # The optional artefacts are recorded only when they were produced.
            for _section, _artefact in (
                ("forest", forest_dict),
                ("forest_kpi", forest_kpi_html),
                ("winrate", winrate_dict),
            ):
                if _artefact is not None:
                    _slot[_section][metric] = _artefact

            _slot["n_datasets"] = n
            # Keep the first non-empty table; later metrics do not replace it.
            if not _slot["table"]:
                _slot["table"] = agg
            print(f"[{i:3d}/{total}] bench={benchmark:10s} industry={industry!r:25s} metric={metric:20s} n_datasets={n}")

# Backward-compat alias: top-level keys mirror the All-benchmark slice so
# legacy `cache[industry]` reads still resolve. Done as a separate dict
# snapshot first to avoid mutating-while-iterating issues.
# Snapshot the All-benchmark payloads before touching top-level keys.
_all_slice = {industry: cache["All"][industry] for industry in INDUSTRY_BUCKETS}
for industry, payload in _all_slice.items():
    if industry in BENCHMARKS:
        # Key collision: "All" is both an industry bucket and a benchmark
        # name. Plain assignment would replace the whole benchmark slice with
        # a single industry payload, so nested `cache["All"][industry]` reads
        # would break. Merge instead: the benchmark's industry entries are
        # kept and the payload keys ("metrics", "forest", ...) sit beside
        # them for legacy readers. Benchmark entries win on any key clash.
        cache[industry] = {**payload, **cache[industry]}
    else:
        cache[industry] = payload

import numpy as np


def _default(o):
    """Convert NumPy values that ``json`` cannot encode to builtin types.

    Used as the ``default=`` hook of ``json.dump``.

    Args:
        o: The object the encoder could not serialise.

    Returns:
        A list for arrays, or a builtin ``bool`` / ``int`` / ``float`` for
        NumPy scalars.

    Raises:
        TypeError: For any other type, naming the offending class.
    """
    if isinstance(o, np.ndarray):
        return o.tolist()
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    raise TypeError(f"not serializable: {type(o).__name__}")


out_path = os.path.join(HERE, "figures_cache.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(cache, f, default=_default)  # NB: no indent — keeps file small
size_kb = os.path.getsize(out_path) / 1024
print(f"\nWrote {out_path} ({size_kb:.1f} KB)")

# Count the cached (benchmark, industry, metric) entries by walking the
# documented nested shape directly, so the legacy top-level aliases are
# never mistaken for industry payloads.
_n_combos = sum(
    len(cache[_bench][_industry].get("metrics", {}))
    for _bench in BENCHMARKS
    for _industry in INDUSTRY_BUCKETS
)
print(f"Combos cached: {_n_combos}")