Spaces:

Neuralk-AI
/

tabbench

Running

App Files Files Community

tabbench / build_figures_cache.py

alexandreabraham

%↗ over XGBoost: baseline is xgboost_ensemble (the canonical "XGBoost")

52d607e 16 days ago

Raw

History Blame Contribute Delete

8.44 kB

	"""Pre-compute Plotly figure dicts for every (industry × metric × view) combo
	and write them to figures_cache.json next to app.py.

	Run this whenever the underlying results or metadata change.
	Usage:
	cd hf/tabbench && python build_figures_cache.py
	"""
	import json, os, sys
	import pandas as pd

	# We need the plot functions, the per-dataset frames and the Elo/improvement
	# helpers from app.py. Approach: exec app.py's source up to the gr.Blocks()
	# boundary in a private namespace and pull the needed symbols out of it.
	# This avoids importing app.py (which would launch the Gradio server).
	HERE = os.path.dirname(os.path.abspath(__file__))
	# Every relative path below ("app.py", "public_datasets_info.json") is
	# resolved against the script's own directory, whatever the caller's cwd is.
	os.chdir(HERE)

	# Decode as UTF-8 (Python's default source encoding) instead of the locale
	# default, so non-ASCII text in app.py is read correctly on every platform
	# (this script relies on a column literally named "%↗ over XGBoost").
	# The `with` block also closes the handle deterministically.
	with open("app.py", encoding="utf-8") as _app_file:
	    src = _app_file.read()
	END_MARK = "with gr.Blocks("
	idx = src.find(END_MARK)
	if idx < 0:
	    # Without the marker we cannot tell where the UI definition starts, so
	    # there is no safe prefix to execute.
	    print("ERROR: couldn't find 'with gr.Blocks(' in app.py", file=sys.stderr)
	    sys.exit(1)
	prefix = src[:idx]
	# `__file__` points at app.py so path logic inside the executed prefix keeps
	# working; `__name__` is set to "__main__" exactly as when app.py is run directly.
	ns = {"__file__": os.path.join(HERE, "app.py"), "__name__": "__main__"}
	exec(compile(prefix, "app.py", "exec"), ns)

	plot_global_model_ranking_plotly = ns["plot_global_model_ranking_plotly"]
	plot_winrate_matrix = ns["plot_winrate_matrix"]
	plot_significance_forest = ns["plot_significance_forest"]
	# Use the FULL per-dataset frame as input. We can't use public_enter_per_dataset
	# (which hides single-fit GBDTs) because compute_pct_improvement_over_baseline
	# needs the `xgboost_ensemble` rows present as the baseline, or every %↗ cell
	# turns NaN. The hidden display models are filtered out at the agg step below.
	public_per_dataset = ns["public_per_dataset"]
	public_enter_per_dataset = ns["public_enter_per_dataset"]
	compute_elo_for_subset = ns["compute_elo_for_subset"]
	compute_pct_improvement_over_baseline = ns["compute_pct_improvement_over_baseline"]
	# Optional in app.py: fall back to "hide nothing" when the symbol is absent.
	HIDDEN_DISPLAY_MODELS = ns.get("_HIDDEN_DISPLAY_MODELS", set())

	# Same buckets as the app. "All" means "do not filter by industry".
	INDUSTRY_BUCKETS = [
	    "All",
	    "Healthcare",
	    "Behavioral",
	    "Computer Vision",
	    "Industry & Science",
	    "Finance/insurance",
	    "Games & Synthetic",
	    "Social/Public",
	    "Other",
	]

	# Metric columns a figure is pre-rendered for. The first group is averaged
	# straight from the per-dataset results; the second group is recomputed on
	# each filtered subset before aggregation (see build_one).
	_STORED_METRICS = [
	    "Accuracy",
	    "AUC",
	    "F1_score",
	    "Precision",
	    "Recall",
	    "Cross_entropy",
	]
	_RECOMPUTED_METRICS = [
	    "Elo_score",
	    "%↗ over XGBoost",
	]
	METRICS = _STORED_METRICS + _RECOMPUTED_METRICS

	# External-benchmark dataset filters. Must stay in sync with the
	# _BENCHMARK_DATASET_IDS dict in app.py (single source of truth: the app).
	_TABARENA_IDS = {
	    46905, 46906, 46908, 46910, 46911, 46912, 46916, 46919,
	    46920, 46922, 46924, 46927, 46929, 46930, 46932, 46933,
	    46935, 46937, 46938, 46940, 46941, 46947, 46950, 46955,
	    46956, 46958, 46960, 46962, 46963, 46969, 46979, 46980,
	}
	# Equivalents we have under different IDs.
	_TABARENA_EQUIVALENT_IDS = {1464, 40701, 31, 37, 1494}
	# kddcup09_appetency
	_TABARENA_KDDCUP09_APPETENCY_ID = 46939

	BENCHMARK_DATASET_IDS = {
	    "TabArena": (
	        _TABARENA_IDS
	        | _TABARENA_EQUIVALENT_IDS
	        | {_TABARENA_KDDCUP09_APPETENCY_ID}
	    ),
	}
	# Selectable benchmark filters: "All" (no dataset filtering) followed by
	# every named benchmark, in declaration order.
	BENCHMARKS = ["All", *BENCHMARK_DATASET_IDS]

	# Same metadata merge as the app does at boot.
	# Read the JSON through a context manager with an explicit UTF-8 encoding so
	# the handle is closed promptly and decoding does not depend on the locale.
	with open("public_datasets_info.json", encoding="utf-8") as _meta_file:
	    meta_df = pd.DataFrame(json.load(_meta_file))
	# Normalise the join key to str on both sides so the merge below compares
	# like with like.
	meta_df['dataset_id'] = meta_df['dataset_id'].astype(str)
	public_per_dataset['dataset_id'] = public_per_dataset['dataset_id'].astype(str)
	# Left join: every result row is kept; rows without metadata get NaN in the
	# industry / rows / features columns.
	public_per_dataset_with_meta = public_per_dataset.merge(
	    meta_df[['dataset_id', 'dataset_industry', 'rows', 'features']],
	    on='dataset_id', how='left',
	)

	def _apply_benchmark(df, benchmark):
	    """Restrict *df* to the datasets that belong to *benchmark*.

	    A falsy *benchmark* or the literal "All" returns *df* itself, unfiltered.
	    Otherwise only rows whose ``dataset_id`` (compared as a string) appears in
	    ``BENCHMARK_DATASET_IDS[benchmark]`` are kept; an unknown benchmark name
	    has no IDs and therefore yields an empty frame.
	    """
	    if benchmark and benchmark != "All":
	        wanted_ids = set(map(str, BENCHMARK_DATASET_IDS.get(benchmark, set())))
	        in_benchmark = df['dataset_id'].astype(str).isin(wanted_ids)
	        return df[in_benchmark]
	    return df

	def build_one(benchmark, industry, metric):
	    """Build every cached artefact for one (benchmark, industry, metric) combo.

	    Returns the 6-tuple
	    ``(bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg_records, n_datasets)``.
	    When the benchmark/industry filters leave no rows the result is
	    ``(None, None, None, None, [], 0)``. ``forest_dict`` / ``forest_kpi_html``
	    are ``None`` when the forest plot raised, and ``winrate_dict`` is ``None``
	    for the "Elo_score" and "%↗ over XGBoost" metrics.
	    """
	    subset = _apply_benchmark(public_per_dataset_with_meta, benchmark)
	    if industry != "All":
	        subset = subset[subset['dataset_industry'] == industry]
	    if subset.empty:
	        return None, None, None, None, [], 0

	    # Elo and %↗ are recomputed on the filtered subset. The `xgboost_ensemble`
	    # baseline rows must still be present at this point, so the hidden display
	    # models are dropped only AFTER the per-dataset comparison is done.
	    scored = compute_elo_for_subset(subset, metric="Accuracy")
	    scored = compute_pct_improvement_over_baseline(
	        scored, baseline_model="xgboost_ensemble", metric="Accuracy",
	    )
	    if HIDDEN_DISPLAY_MODELS:
	        df_display = scored[~scored['model'].isin(HIDDEN_DISPLAY_MODELS)].copy()
	    else:
	        df_display = scored

	    # Per-model mean of every metric column; feeds both the bar chart and the
	    # cached table.
	    metric_columns = [
	        'Accuracy', 'AUC', 'F1_score', 'Precision', 'Recall',
	        'Cross_entropy', 'Elo_score', '%↗ over XGBoost',
	    ]
	    agg = df_display.groupby('model')[metric_columns].mean().reset_index()
	    bars = plot_global_model_ranking_plotly(agg, metric=metric, per_dataset_df=df_display)

	    # Forest plot (Wilcoxon-Holm leading-group highlight + paired-bootstrap CI).
	    # Per the original author this is the slow figure on live render (B=1000
	    # bootstrap × all model pairs), which is why it is pre-cached. The helper
	    # returns (figure, kpi_html); a failure is reported and tolerated so the
	    # other figures of this combo are still produced.
	    try:
	        forest, forest_kpi = plot_significance_forest(df_display, metric=metric)
	    except Exception as _e:
	        print(f" [warn] forest plot failed for bench={benchmark} ind={industry} metric={metric}: {_e}")
	        forest, forest_kpi = None, None

	    # The win-rate matrix also uses the display-filtered frame so the heatmap
	    # doesn't show duplicate display names (single-fit vs ensemble). It is not
	    # drawn for the two recomputed metrics.
	    if metric in ("Elo_score", "%↗ over XGBoost"):
	        winrate = None
	    else:
	        winrate = plot_winrate_matrix(df_display, metric=metric)

	    bars_dict = bars.to_plotly_json()
	    forest_dict = None if forest is None else forest.to_plotly_json()
	    winrate_dict = None if winrate is None else winrate.to_plotly_json()
	    agg_records = agg.round(6).to_dict(orient="records")
	    n_datasets = int(df_display['dataset_id'].nunique())
	    return bars_dict, forest_dict, forest_kpi, winrate_dict, agg_records, n_datasets

	# Cache shape:
	#   { benchmark_key: { industry: { "metrics":    {metric: fig},
	#                                  "forest":     {metric: fig},
	#                                  "forest_kpi": {metric: html},
	#                                  "winrate":    {metric: fig},
	#                                  "n_datasets": int,
	#                                  "table":      [...] } } }
	# Backward compat: the "All" benchmark is also written at the top level so
	# existing app.py readers that index `_figures_cache[industry]` keep working.
	cache = {}
	total = len(BENCHMARKS) * len(INDUSTRY_BUCKETS) * len(METRICS)
	i = 0  # running 1-based combo counter, used only for the progress line
	for benchmark in BENCHMARKS:
	    per_industry = cache[benchmark] = {}
	    for industry in INDUSTRY_BUCKETS:
	        entry = per_industry[industry] = {
	            "metrics": {}, "forest": {}, "forest_kpi": {}, "winrate": {},
	            "n_datasets": 0, "table": [],
	        }
	        for metric in METRICS:
	            i += 1
	            bars_dict, forest_dict, forest_kpi_html, winrate_dict, agg, n = build_one(benchmark, industry, metric)
	            # The bar figure is always recorded (None for an empty subset);
	            # the optional artefacts are stored only when they exist.
	            entry["metrics"][metric] = bars_dict
	            optional_parts = (
	                ("forest", forest_dict),
	                ("forest_kpi", forest_kpi_html),
	                ("winrate", winrate_dict),
	            )
	            for part_name, part_value in optional_parts:
	                if part_value is not None:
	                    entry[part_name][metric] = part_value
	            entry["n_datasets"] = n
	            # Keep the first non-empty aggregate table seen for this slice.
	            if not entry["table"]:
	                entry["table"] = agg
	            print(f"[{i:3d}/{total}] bench={benchmark:10s} industry={industry!r:25s} metric={metric:20s} n_datasets={n}")

	# Backward-compat alias: top-level keys mirror the All-benchmark slice so
	# legacy `cache[industry]` reads still resolve. A separate dict snapshot is
	# taken first to avoid mutating-while-iterating issues.
	_all_slice = {industry: cache["All"][industry] for industry in INDUSTRY_BUCKETS}
	for industry, payload in _all_slice.items():
	    if industry in BENCHMARKS:
	        # Key collision: "All" names both a benchmark and an industry. A plain
	        # assignment would replace the benchmark-level mapping with the
	        # All-industry payload, so `cache["All"][<industry>]` would vanish.
	        # Merge instead: the result keeps every industry key AND exposes the
	        # payload keys ("metrics", "forest", ...), so both the nested and the
	        # legacy lookups resolve. Trade-off: the All-benchmark industry slices
	        # are then serialised both nested and at top level (larger file).
	        cache[industry] = {**cache[industry], **payload}
	    else:
	        cache[industry] = payload

	import numpy as np

	def _default(o):
	    """``json.dump`` ``default=`` hook: convert NumPy values to plain Python.

	    Arrays become (nested) lists; NumPy boolean, integer and floating scalars
	    become ``bool``, ``int`` and ``float``.

	    Raises:
	        TypeError: for any other type, so unexpected objects fail loudly
	            instead of being silently stringified.
	    """
	    if isinstance(o, np.ndarray):
	        return o.tolist()
	    # np.bool_ is not a subclass of np.integer, so it needs its own branch;
	    # without it a single NumPy boolean scalar aborts the whole dump.
	    if isinstance(o, np.bool_):
	        return bool(o)
	    if isinstance(o, np.integer):
	        return int(o)
	    if isinstance(o, np.floating):
	        return float(o)
	    raise TypeError(f"not serializable: {type(o).__name__}")

	# Serialise the cache next to app.py. No indent is used: it keeps the file
	# small. `_default` converts the NumPy values found in the figure dicts.
	out_path = os.path.join(HERE, "figures_cache.json")
	with open(out_path, "w") as cache_file:
	    json.dump(cache, cache_file, default=_default)
	size_kb = os.path.getsize(out_path) / 1024
	print(f"\nWrote {out_path} ({size_kb:.1f} KB)")

	# Summary: for every benchmark key, add up the number of "metrics" entries
	# held by each dict-valued child. Anything that is not a dict is skipped.
	_n_combos = 0
	for k in BENCHMARKS:
	    bench = cache.get(k, {})
	    if isinstance(bench, dict):
	        _n_combos += sum(
	            len(ind_v.get("metrics", {}))
	            for ind_v in bench.values()
	            if isinstance(ind_v, dict)
	        )
	print(f"Combos cached: {_n_combos}")