Spaces:
Running
Running
| """Pre-compute Plotly figure dicts for every (industry × metric × view) combo | |
| and write them to figures_cache.json next to app.py. | |
| Run this whenever the underlying results or metadata change. | |
| Usage: | |
| cd hf/tabbench && python build_figures_cache.py | |
| """ | |
| import json, os, sys | |
| import pandas as pd | |
# We need the plot functions plus the data-loading and Elo/improvement helpers
# from app.py. Approach: exec app.py's source up to the gr.Blocks() boundary in
# a fresh namespace and pull out the needed symbols. This avoids importing
# app.py (which would launch the Gradio server).
HERE = os.path.dirname(os.path.abspath(__file__))
# Relative paths used by this script resolve against the script's own
# directory, regardless of where it is invoked from.
os.chdir(HERE)
# Read with an explicit encoding and close the handle deterministically: the
# locale default encoding is not guaranteed to be UTF-8 on every platform, and
# a bare open(...).read() leaves the file object to the garbage collector.
with open("app.py", encoding="utf-8") as _app_source:
    src = _app_source.read()
END_MARK = "with gr.Blocks("
idx = src.find(END_MARK)
if idx < 0:
    # Without the marker we cannot tell where the UI code starts, so there is
    # no safe prefix to execute.
    print("ERROR: couldn't find 'with gr.Blocks(' in app.py", file=sys.stderr)
    sys.exit(1)
prefix = src[:idx]
# The prefix is executed with app.py's own __file__ and with __name__ set to
# "__main__", i.e. as if app.py itself were being run as a script.
# NOTE(review): exec() runs arbitrary code from app.py; this is only acceptable
# because app.py is a trusted file that lives next to this script.
ns = {"__file__": os.path.join(HERE, "app.py"), "__name__": "__main__"}
exec(compile(prefix, "app.py", "exec"), ns)
plot_global_model_ranking_plotly = ns["plot_global_model_ranking_plotly"]
plot_winrate_matrix = ns["plot_winrate_matrix"]
plot_significance_forest = ns["plot_significance_forest"]
# Use the FULL per-dataset frame as input. We can't use public_enter_per_dataset
# (which hides single-fit GBDTs) because compute_pct_improvement_over_baseline
# needs the `xgboost_ensemble` rows present as the baseline, or every
# percent-improvement-over-XGBoost cell turns NaN. The hidden display models
# are filtered out at the aggregation step instead.
public_per_dataset = ns["public_per_dataset"]
public_enter_per_dataset = ns["public_enter_per_dataset"]
compute_elo_for_subset = ns["compute_elo_for_subset"]
compute_pct_improvement_over_baseline = ns["compute_pct_improvement_over_baseline"]
# Optional in app.py: fall back to "nothing hidden" when the name is absent.
HIDDEN_DISPLAY_MODELS = ns.get("_HIDDEN_DISPLAY_MODELS", set())
# Industry filter values; these mirror the buckets offered by the app.
INDUSTRY_BUCKETS = [
    "All",
    "Healthcare", "Behavioral", "Computer Vision", "Industry & Science",
    "Finance/insurance", "Games & Synthetic", "Social/Public",
    "Other",
]

# One cached figure set is produced per entry, in this order.
METRICS = [
    "Accuracy",
    "AUC",
    "F1_score",
    "Precision",
    "Recall",
    "Cross_entropy",
    "Elo_score",
    "%β over XGBoost",
]

# External-benchmark dataset filters. Must stay in sync with the
# _BENCHMARK_DATASET_IDS dict in app.py (single source of truth: the app).
_TABARENA_NATIVE_IDS = {
    46905, 46906, 46908, 46910, 46911, 46912, 46916, 46919,
    46920, 46922, 46924, 46927, 46929, 46930, 46932, 46933,
    46935, 46937, 46938, 46940, 46941, 46947, 46950, 46955,
    46956, 46958, 46960, 46962, 46963, 46969, 46979, 46980,
}
# Equivalents we have under different IDs.
_TABARENA_EQUIVALENT_IDS = {1464, 40701, 31, 37, 1494}
# kddcup09_appetency
_TABARENA_KDDCUP09_APPETENCY_IDS = {46939}

BENCHMARK_DATASET_IDS = {
    "TabArena": (
        _TABARENA_NATIVE_IDS
        | _TABARENA_EQUIVALENT_IDS
        | _TABARENA_KDDCUP09_APPETENCY_IDS
    ),
}

# Benchmark filter values; "All" means no benchmark restriction.
BENCHMARKS = ["All", "TabArena"]
# Same metadata merge as the app does at boot.
# The file is opened in a context manager with an explicit encoding so the
# handle is closed promptly and the read does not depend on the locale.
with open("public_datasets_info.json", encoding="utf-8") as _meta_file:
    meta_df = pd.DataFrame(json.load(_meta_file))
# Both join keys are cast to str so the merge compares like with like.
meta_df['dataset_id'] = meta_df['dataset_id'].astype(str)
# NOTE: this assigns into the frame taken from app.py's namespace (in place).
public_per_dataset['dataset_id'] = public_per_dataset['dataset_id'].astype(str)
# Left join: every result row is kept; rows without metadata get NaN in the
# industry / rows / features columns.
public_per_dataset_with_meta = public_per_dataset.merge(
    meta_df[['dataset_id', 'dataset_industry', 'rows', 'features']],
    on='dataset_id', how='left',
)
def _apply_benchmark(df, benchmark):
    """Restrict *df* to the datasets of one external benchmark.

    Args:
        df: frame with a ``dataset_id`` column.
        benchmark: benchmark name; a falsy value or ``"All"`` disables
            filtering.

    Returns:
        *df* itself when no filtering is requested, otherwise the rows whose
        ``dataset_id`` (compared as a string) is listed for *benchmark* in
        ``BENCHMARK_DATASET_IDS``. A name missing from that dict matches no
        rows.
    """
    if benchmark and benchmark != "All":
        # IDs are stored as ints but compared as strings.
        wanted = {
            str(dataset_id)
            for dataset_id in BENCHMARK_DATASET_IDS.get(benchmark, set())
        }
        in_benchmark = df['dataset_id'].astype(str).isin(wanted)
        return df[in_benchmark]
    return df
def build_one(benchmark, industry, metric):
    """Build every cached artefact for one (benchmark, industry, metric) combo.

    Args:
        benchmark: benchmark filter passed to ``_apply_benchmark``.
        industry: value matched against ``dataset_industry``; ``"All"`` keeps
            every industry.
        metric: metric name forwarded to the plot functions.

    Returns:
        ``(bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg_records,
        n_datasets)``. When the filtered frame is empty the result is
        ``(None, None, None, None, [], 0)``. ``forest_dict`` and
        ``forest_kpi_html`` are ``None`` when the forest plot raised, and
        ``winrate_dict`` is ``None`` for the Elo and improvement-over-XGBoost
        metrics.
    """
    subset = _apply_benchmark(public_per_dataset_with_meta, benchmark)
    if industry != "All":
        subset = subset[subset['dataset_industry'] == industry]
    if subset.empty:
        return None, None, None, None, [], 0

    # Elo and the improvement-over-XGBoost column are recomputed on the
    # filtered subset, which still contains the `xgboost_ensemble` baseline
    # rows; hidden display models are dropped only after this step.
    # NOTE: both depend only on (benchmark, industry) and are always based on
    # Accuracy, yet they are recomputed for every metric.
    scored = compute_elo_for_subset(subset, metric="Accuracy")
    scored = compute_pct_improvement_over_baseline(
        scored, baseline_model="xgboost_ensemble", metric="Accuracy",
    )
    if HIDDEN_DISPLAY_MODELS:
        visible = scored[~scored['model'].isin(HIDDEN_DISPLAY_MODELS)].copy()
    else:
        visible = scored

    value_columns = [
        'Accuracy', 'AUC', 'F1_score', 'Precision', 'Recall',
        'Cross_entropy', 'Elo_score', '%β over XGBoost',
    ]
    agg = visible.groupby('model')[value_columns].mean().reset_index()
    bars = plot_global_model_ranking_plotly(agg, metric=metric, per_dataset_df=visible)

    # Forest plot (Wilcoxon-Holm leading-group highlight + paired-bootstrap CI).
    # This is the slow figure on live render (B=1000 bootstrap × all model
    # pairs), so pre-caching it is the whole point of this build script for
    # the Bars view. It returns (figure, kpi_html) — both go in the cache.
    # A failure here is reported and the remaining figures are still built.
    try:
        forest, forest_kpi = plot_significance_forest(visible, metric=metric)
    except Exception as _e:
        print(f" [warn] forest plot failed for bench={benchmark} ind={industry} metric={metric}: {_e}")
        forest, forest_kpi = None, None

    # The win-rate matrix also uses the display-filtered frame so the heatmap
    # doesn't show duplicate display names (single-fit vs ensemble).
    if metric in ("Elo_score", "%β over XGBoost"):
        winrate = None
    else:
        winrate = plot_winrate_matrix(visible, metric=metric)

    bars_json = bars.to_plotly_json()
    forest_json = None if forest is None else forest.to_plotly_json()
    winrate_json = None if winrate is None else winrate.to_plotly_json()
    table_records = agg.round(6).to_dict(orient="records")
    dataset_count = int(visible['dataset_id'].nunique())
    return bars_json, forest_json, forest_kpi, winrate_json, table_records, dataset_count
# Cache shape:
#   { benchmark_key: { industry: { "metrics":    {metric: fig},
#                                  "forest":     {metric: fig},
#                                  "forest_kpi": {metric: html},
#                                  "winrate":    {metric: fig},
#                                  "n_datasets": int,
#                                  "table":      [...] } } }
# Backward compat: the "All" benchmark is also written at the top level so
# existing app.py readers that index `_figures_cache[industry]` keep working.
cache = {}
total = len(BENCHMARKS) * len(INDUSTRY_BUCKETS) * len(METRICS)
i = 0
for benchmark in BENCHMARKS:
    cache[benchmark] = {}
    for industry in INDUSTRY_BUCKETS:
        entry = {
            "metrics": {}, "forest": {}, "forest_kpi": {}, "winrate": {},
            "n_datasets": 0, "table": [],
        }
        cache[benchmark][industry] = entry
        for metric in METRICS:
            i += 1
            bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg, n = build_one(benchmark, industry, metric)
            # The bars entry is always written (None for an empty subset);
            # the optional figures are stored only when they exist.
            entry["metrics"][metric] = bars_dict
            if forest_dict is not None:
                entry["forest"][metric] = forest_dict
            if forest_kpi_html is not None:
                entry["forest_kpi"][metric] = forest_kpi_html
            if winrate_dict is not None:
                entry["winrate"][metric] = winrate_dict
            entry["n_datasets"] = n
            # The aggregate table is kept from the first metric that yields a
            # non-empty one.
            if not entry["table"]:
                entry["table"] = agg
            print(f"[{i:3d}/{total}] bench={benchmark:10s} industry={industry!r:25s} metric={metric:20s} n_datasets={n}")
# Backward-compat alias: top-level keys mirror the All-benchmark slice so
# legacy `cache[industry]` reads still resolve. The slice is snapshotted
# first because the loop below rebinds cache["All"].
_all_slice = {industry: cache["All"][industry] for industry in INDUSTRY_BUCKETS}
for industry, payload in _all_slice.items():
    if industry in cache:
        # Key collision: "All" is both a benchmark key and an industry key.
        # Plain assignment would replace the benchmark-level
        # {industry: payload} mapping with a single payload, breaking
        # `cache["All"][industry]` lookups. Merge instead, so the entry
        # answers both the new-style read (`cache["All"]["Healthcare"]`) and
        # the legacy read (`cache["All"]["metrics"]`).
        cache[industry] = {**cache[industry], **payload}
    else:
        cache[industry] = payload
| import numpy as np | |
def _default(o):
    """``json.dump`` fallback that converts NumPy values to plain Python.

    Args:
        o: an object the standard JSON encoder could not serialize.

    Returns:
        A list for ``np.ndarray``, or a ``bool`` / ``int`` / ``float`` for the
        matching NumPy scalar.

    Raises:
        TypeError: for any other type, so unexpected objects fail loudly
            instead of being written in some arbitrary form.
    """
    if isinstance(o, np.ndarray):
        return o.tolist()
    # np.bool_ is neither np.integer nor np.floating, so it needs its own
    # branch; without it a single NumPy boolean aborts the whole dump.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    raise TypeError(f"not serializable: {type(o).__name__}")
# Serialize the cache next to this script and report the resulting size.
out_path = os.path.join(HERE, "figures_cache.json")
with open(out_path, "w") as f:
    # NB: no indent — keeps the file small.
    json.dump(cache, f, default=_default)
size_kb = os.path.getsize(out_path) / 1024
print(f"\nWrote {out_path} ({size_kb:.1f} KB)")

# Count the cached (benchmark, industry, metric) entries: for every benchmark
# key, add up the size of each industry payload's "metrics" mapping. Values
# that are not dicts are skipped at both levels.
_benchmark_slices = (cache.get(name, {}) for name in BENCHMARKS)
_n_combos = sum(
    len(payload.get("metrics", {}))
    for bench_slice in _benchmark_slices
    if isinstance(bench_slice, dict)
    for payload in bench_slice.values()
    if isinstance(payload, dict)
)
print(f"Combos cached: {_n_combos}")