tabbench / build_figures_cache.py
alexandreabraham's picture
%β†— over XGBoost: baseline is xgboost_ensemble (the canonical "XGBoost")
52d607e
Raw
History Blame Contribute Delete
8.44 kB
"""Pre-compute Plotly figure dicts for every (industry × metric × view) combo
and write them to figures_cache.json next to app.py.

Run this whenever the underlying results or metadata change.

Usage:
    cd hf/tabbench && python build_figures_cache.py
"""
import json, os, sys
import pandas as pd
# We need the plot function + the data-loading + Elo/improvement helpers from app.py.
# Approach: exec app.py's source up to the gr.Blocks() boundary in a namespace,
# pulling out the needed symbols. This avoids importing app.py (which would
# launch the Gradio server).
# Work from this script's own directory so the relative paths used below
# ("app.py" here, the JSON inputs later) resolve regardless of the caller's cwd.
HERE = os.path.dirname(os.path.abspath(__file__))
os.chdir(HERE)

# Read app.py as UTF-8 (Python's default source encoding) instead of the
# platform locale encoding, and close the handle deterministically.
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()

# Everything before the first `with gr.Blocks(` is executed; the UI block that
# follows it is deliberately left out.
END_MARK = "with gr.Blocks("
idx = src.find(END_MARK)
if idx < 0:
    print("ERROR: couldn't find 'with gr.Blocks(' in app.py", file=sys.stderr)
    sys.exit(1)
prefix = src[:idx]

# Execute the prefix in a private namespace that mimics running app.py as a
# script (`__file__` points at app.py, `__name__` is "__main__").
ns = {"__file__": os.path.join(HERE, "app.py"), "__name__": "__main__"}
exec(compile(prefix, "app.py", "exec"), ns)

# Plot helpers pulled out of the executed namespace.
plot_global_model_ranking_plotly = ns["plot_global_model_ranking_plotly"]
plot_winrate_matrix = ns["plot_winrate_matrix"]
plot_significance_forest = ns["plot_significance_forest"]

# Use the FULL per-dataset frame as input. We can't use public_enter_per_dataset
# (which hides single-fit GBDTs) because compute_pct_improvement_over_baseline
# needs the `xgboost_ensemble` rows present as the baseline, or every
# percent-improvement cell turns NaN. The hidden display models are filtered
# out at the aggregation step later in this script.
public_per_dataset = ns["public_per_dataset"]
public_enter_per_dataset = ns["public_enter_per_dataset"]
compute_elo_for_subset = ns["compute_elo_for_subset"]
compute_pct_improvement_over_baseline = ns["compute_pct_improvement_over_baseline"]

# Optional in app.py: fall back to "hide nothing" when the symbol is absent.
HIDDEN_DISPLAY_MODELS = ns.get("_HIDDEN_DISPLAY_MODELS", set())
# Industry filter values, mirroring the buckets offered by the app.
# "All" means "no industry filter".
INDUSTRY_BUCKETS = [
    "All",
    "Healthcare", "Behavioral", "Computer Vision",
    "Industry & Science", "Finance/insurance", "Games & Synthetic",
    "Social/Public", "Other",
]

# One cache entry is produced per metric listed here, in this order.
METRICS = [
    "Accuracy",
    "AUC",
    "F1_score",
    "Precision",
    "Recall",
    "Cross_entropy",
    "Elo_score",
    "%β†— over XGBoost",
]
# Dataset-ID filters for external benchmarks, keyed by benchmark name.
# The app's _BENCHMARK_DATASET_IDS dict in app.py is the single source of
# truth; this copy must be kept in sync with it by hand.
BENCHMARK_DATASET_IDS = {
    "TabArena": {
        # Equivalents we have under different IDs.
        31, 37, 1464, 1494, 40701,
        # Main TabArena ID list.
        46905, 46906, 46908, 46910,
        46911, 46912, 46916, 46919,
        46920, 46922, 46924, 46927,
        46929, 46930, 46932, 46933,
        46935, 46937, 46938, 46940,
        46941, 46947, 46950, 46955,
        46956, 46958, 46960, 46962,
        46963, 46969, 46979, 46980,
        # kddcup09_appetency
        46939,
    },
}

# Benchmark filter values iterated by the build; "All" applies no ID filter.
BENCHMARKS = ["All", "TabArena"]
# Same metadata merge as the app does at boot.
# The JSON file is opened in a context manager (so the handle is closed) and
# decoded as UTF-8 rather than the platform locale encoding.
with open("public_datasets_info.json", encoding="utf-8") as _meta_file:
    meta_df = pd.DataFrame(json.load(_meta_file))

# Normalise the join key to str on both sides so the merge does not silently
# miss rows because of an int-vs-str mismatch. Note that this rewrites the
# `dataset_id` column of `public_per_dataset` in place.
meta_df['dataset_id'] = meta_df['dataset_id'].astype(str)
public_per_dataset['dataset_id'] = public_per_dataset['dataset_id'].astype(str)

# Left join: every result row is kept; rows without metadata get NaN in the
# three metadata columns.
public_per_dataset_with_meta = public_per_dataset.merge(
    meta_df[['dataset_id', 'dataset_industry', 'rows', 'features']],
    on='dataset_id', how='left',
)
def _apply_benchmark(df, benchmark):
    """Restrict *df* to the datasets that belong to *benchmark*.

    A falsy benchmark or the literal "All" returns *df* itself, unfiltered.
    Any other name is looked up in BENCHMARK_DATASET_IDS; a name that is not
    registered there yields an empty selection. IDs are compared as strings.
    """
    if benchmark and benchmark != "All":
        wanted_ids = set(map(str, BENCHMARK_DATASET_IDS.get(benchmark, set())))
        in_benchmark = df['dataset_id'].astype(str).isin(wanted_ids)
        return df[in_benchmark]
    return df
def build_one(benchmark, industry, metric):
    """Build every cached artefact for one (benchmark, industry, metric) combo.

    Returns the 6-tuple
    ``(bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg_records, n_datasets)``.
    When the filtered frame is empty the result is
    ``(None, None, None, None, [], 0)``. ``forest_dict`` and ``forest_kpi_html``
    are ``None`` when the forest plot raises, and ``winrate_dict`` is ``None``
    for the two derived metrics that get no win-rate matrix.
    """
    subset = _apply_benchmark(public_per_dataset_with_meta, benchmark)
    if industry != "All":
        subset = subset[subset['dataset_industry'] == industry]
    if subset.empty:
        return None, None, None, None, [], 0

    # Elo and percent-improvement are recomputed on the filtered subset. The
    # `xgboost_ensemble` baseline rows must still be present at this point, so
    # hidden display models are dropped only after both helpers have run.
    scored = compute_elo_for_subset(subset, metric="Accuracy")
    scored = compute_pct_improvement_over_baseline(
        scored, baseline_model="xgboost_ensemble", metric="Accuracy"
    )
    if HIDDEN_DISPLAY_MODELS:
        visible = scored[~scored['model'].isin(HIDDEN_DISPLAY_MODELS)].copy()
    else:
        visible = scored

    # Per-model mean of every metric column, used for the bar chart and table.
    value_columns = [
        'Accuracy', 'AUC', 'F1_score', 'Precision', 'Recall',
        'Cross_entropy', 'Elo_score', '%β†— over XGBoost',
    ]
    per_model = visible.groupby('model')[value_columns].mean().reset_index()
    bars = plot_global_model_ranking_plotly(per_model, metric=metric, per_dataset_df=visible)

    # Forest plot: per the original author's note this is the slow figure on
    # live render, which is why it is pre-cached. The helper returns a
    # (figure, kpi_html) pair. A failure is reported and skipped rather than
    # aborting the whole build.
    try:
        forest, forest_kpi = plot_significance_forest(visible, metric=metric)
    except Exception as err:
        print(f" [warn] forest plot failed for bench={benchmark} ind={industry} metric={metric}: {err}")
        forest, forest_kpi = None, None

    # The win-rate matrix also uses the display-filtered frame so the heatmap
    # does not show duplicate display names (single-fit vs ensemble). It is
    # skipped for the two derived metrics.
    if metric in ("Elo_score", "%β†— over XGBoost"):
        winrate = None
    else:
        winrate = plot_winrate_matrix(visible, metric=metric)

    bars_json = bars.to_plotly_json()
    forest_json = None if forest is None else forest.to_plotly_json()
    winrate_json = None if winrate is None else winrate.to_plotly_json()
    records = per_model.round(6).to_dict(orient="records")
    dataset_count = int(visible['dataset_id'].nunique())
    return bars_json, forest_json, forest_kpi, winrate_json, records, dataset_count
# Cache shape:
#   { benchmark_key: { industry: { "metrics":    {metric: fig},
#                                  "forest":     {metric: fig},
#                                  "forest_kpi": {metric: html},
#                                  "winrate":    {metric: fig},
#                                  "n_datasets": int,
#                                  "table":      [...] } } }
# Backward compat: the "All" benchmark is also written at the top level so
# existing app.py readers that index `_figures_cache[industry]` keep working.
cache = {}
total = len(BENCHMARKS) * len(INDUSTRY_BUCKETS) * len(METRICS)
i = 0
for benchmark in BENCHMARKS:
    cache[benchmark] = {}
    for industry in INDUSTRY_BUCKETS:
        slot = cache[benchmark][industry] = {
            "metrics": {}, "forest": {}, "forest_kpi": {}, "winrate": {},
            "n_datasets": 0, "table": [],
        }
        for metric in METRICS:
            i += 1
            bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg, n = build_one(benchmark, industry, metric)
            # The bar figure is always recorded (None for an empty subset);
            # the optional figures are recorded only when they exist.
            slot["metrics"][metric] = bars_dict
            if forest_dict is not None:
                slot["forest"][metric] = forest_dict
            if forest_kpi_html is not None:
                slot["forest_kpi"][metric] = forest_kpi_html
            if winrate_dict is not None:
                slot["winrate"][metric] = winrate_dict
            slot["n_datasets"] = n
            # Keep the first non-empty table produced for this slot.
            if not slot["table"]:
                slot["table"] = agg
            print(f"[{i:3d}/{total}] bench={benchmark:10s} industry={industry!r:25s} metric={metric:20s} n_datasets={n}")

# Backward-compat alias: top-level keys mirror the All-benchmark slice so
# legacy `cache[industry]` reads still resolve. The slice is snapshotted first
# so the assignments below never touch the dict being read.
_all_slice = {industry: cache["All"][industry] for industry in INDUSTRY_BUCKETS}
for industry, payload in _all_slice.items():
    # "All" is both a benchmark key and an industry bucket. Assigning
    # cache["All"] = payload here would replace the benchmark-level industry
    # map with a single payload and break cache["All"][industry] lookups, so
    # that key is handled separately below.
    if industry != "All":
        cache[industry] = payload

# Make cache["All"] answer both access patterns: it keeps every industry key
# (new `cache[benchmark][industry]` readers) and additionally exposes the
# All/All payload keys ("metrics", "forest", ...) for legacy `cache["All"]`
# readers.
_merged_all = dict(_all_slice)
_merged_all.update(_all_slice.get("All", {}))
cache["All"] = _merged_all
import numpy as np
def _default(o):
    """Convert NumPy values to plain Python objects for ``json.dump``.

    Used as the ``default=`` hook: arrays become nested lists, and NumPy
    boolean, integer and floating scalars become ``bool``, ``int`` and
    ``float``.

    Raises:
        TypeError: for any other object, naming its type so the offending
            value is easy to find.
    """
    if isinstance(o, np.ndarray):
        return o.tolist()
    # np.bool_ is neither np.integer nor np.floating, so without this branch
    # a NumPy boolean would make the dump fail.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    raise TypeError(f"not serializable: {type(o).__name__}")
# Write the cache next to this script. No indent is passed to json.dump so
# the file stays compact.
out_path = os.path.join(HERE, "figures_cache.json")
with open(out_path, "w") as f:
    json.dump(cache, f, default=_default)

size_kb = os.stat(out_path).st_size / 1024
print(f"\nWrote {out_path} ({size_kb:.1f} KB)")

# Summary: count the metric entries stored under each benchmark key. Values
# that are not dicts are skipped at both nesting levels.
_n_combos = sum(
    len(per_industry.get("metrics", {}))
    for bench_key in BENCHMARKS
    if isinstance(cache.get(bench_key, {}), dict)
    for per_industry in cache.get(bench_key, {}).values()
    if isinstance(per_industry, dict)
)
print(f"Combos cached: {_n_combos}")