Spaces:

OpenEvals
/

leaderboard-watcher

Running

App Files Files Community

leaderboard-watcher / app.py

Linker1907

Prepare for HF Space deployment

4f8462a 8 days ago

raw

history blame contribute delete

32.7 kB

	#!/usr/bin/env python3
	"""
	HF Hub Benchmark Dashboard — Gradio app.
	Run: python app.py
	"""

	import html as _html
	import json
	import urllib.request
	import urllib.error
	import concurrent.futures
	from collections import defaultdict
	from datetime import datetime, timezone
	from pathlib import Path

	import gradio as gr

	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------

	# Directory that contains this script; the cache file lives next to it.
	ROOT = Path(__file__).resolve().parent
	# JSON file holding the most recently fetched dashboard data (see save_cache()).
	CACHE_PATH = ROOT / "dashboard_cache.json"
	# Maximum cache age, in seconds, that load_cached_data() still accepts.
	CACHE_TTL_SECONDS = 6 * 60 * 60 # 6 hours

	# Canonical display order of benchmark categories. Used to sort the result of
	# categorize_benchmark(), to order the sidebar choices, and to order the
	# "categories" summary built by fetch_all_data().
	CATEGORY_ORDER = [
	    "Knowledge",
	    "Math / Reasoning",
	    "Code / Engineering",
	    "Agents",
	    "Vision",
	    "Audio / Speech",
	    "Document / OCR",
	    "Retrieval / Embedding",
	    "NLP / Classification",
	    "Robotics",
	    "Other",
	]

	# Emoji prefix for each category, shown in the sidebar labels and in the
	# category header. Categories without an entry fall back to an empty string.
	CATEGORY_ICONS = {
	    "Knowledge": "🧠",
	    "Math / Reasoning": "🔢",
	    "Code / Engineering": "💻",
	    "Agents": "🤖",
	    "Vision": "👁️",
	    "Audio / Speech": "🔊",
	    "Document / OCR": "📄",
	    "Retrieval / Embedding": "🔎",
	    "NLP / Classification": "🏷️",
	    "Robotics": "🦾",
	    "Other": "📦",
	}

	# Human-friendly names keyed by Hub dataset id. fetch_all_data() falls back to
	# the part of the id after the last "/" for datasets that are not listed here.
	BENCHMARK_DISPLAY_NAMES = {
	    "openai/gsm8k": "GSM8K",
	    "Idavidrein/gpqa": "GPQA",
	    "allenai/olmOCR-bench": "olmOCR-Bench",
	    "llamaindex/ParseBench": "ParseBench",
	    "mercor/apex-agents": "APEX-Agents",
	    "harborframework/terminal-bench-2.0": "Terminal-Bench 2.0",
	    "SWE-bench/SWE-bench_Verified": "SWE-bench Verified",
	    "TIGER-Lab/MMLU-Pro": "MMLU-Pro",
	    "hf-audio/open-asr-leaderboard": "Open ASR Leaderboard",
	    "MathArena/aime_2026": "AIME 2026",
	    "claw-eval/Claw-Eval": "Claw-Eval",
	    "cais/hle": "HLE",
	    "likaixin/ScreenSpot-Pro": "ScreenSpot-Pro",
	    "nvidia/compute-eval": "ComputeEval",
	    "ScaleAI/SWE-bench_Pro": "SWE-bench Pro",
	    "FutureMa/EvasionBench": "EvasionBench",
	    "mteb/BRIGHT": "BRIGHT",
	    "Delores-Lin/MDPBench": "MDPBench",
	    "mteb/arguana": "ArguAna",
	    "MMMU/MMMU_Pro": "MMMU-Pro",
	    "LEXam-Benchmark/LEXam": "LEXam",
	    "mercor/ACE": "ACE",
	    "mercor/APEX-v1-extended": "APEX-v1",
	    "VLABench/vlabench_primitive_ft_lerobot_video": "VLABench",
	    "tiiuae/PBench": "PBench",
	    "MathArena/hmmt_feb_2026": "HMMT Feb 2026",
	    "collinear-ai/yc-bench": "YC-Bench",
	    "internlm/WildClawBench": "WildClawBench",
	    "MME-Benchmarks/Video-MME-v2": "Video-MME v2",
	    "open-agent-leaderboard/results": "Open Agent Leaderboard",
	}

	# Stylesheet handed to gr.Blocks(css=...) in build_app(). Each rule appears
	# exactly once: the "Layout columns" and "Filter checkbox" sections used to be
	# repeated verbatim further down, which had no effect on rendering but made it
	# easy to edit one copy and forget the other.
	CUSTOM_CSS = """
	/* ---- Topbar ---- */
	.topbar {
	  display: flex; align-items: center; justify-content: space-between;
	  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	  color: white; padding: 14px 24px; border-radius: 10px;
	  margin-bottom: 8px; flex-wrap: wrap; gap: 12px;
	}
	.topbar-title { font-size: 18px; font-weight: 700; margin-bottom: 2px; }
	.topbar-meta { font-size: 11px; opacity: 0.85; }
	.topbar-pills { display: flex; gap: 8px; flex-wrap: wrap; }
	.stat-pill {
	  background: rgba(255,255,255,0.2); border-radius: 20px;
	  padding: 4px 14px; font-size: 12px; white-space: nowrap;
	}
	.stat-pill b { font-size: 14px; }

	/* ---- Layout columns ---- */
	#sidebar-col {
	  background: white !important; padding: 0 !important;
	  border-right: 1px solid #e5e7eb !important;
	  border-radius: 10px 0 0 10px !important;
	}
	#main-col {
	  background: #f8fafc !important; padding: 18px 22px !important;
	  border-radius: 0 10px 10px 0 !important; min-width: 0 !important;
	}
	#sidebar-col > .form, #main-col > .form {
	  background: transparent !important; box-shadow: none !important;
	  border: none !important; padding: 0 !important;
	}

	/* ---- Sidebar Radio → nav buttons ---- */
	#cat_radio {
	  background: transparent !important; border: none !important;
	  box-shadow: none !important; padding: 0 !important;
	}
	#cat_radio > .wrap { flex-direction: column !important; gap: 0 !important; padding: 0 !important; }
	#cat_radio label {
	  display: flex !important; align-items: center !important;
	  padding: 8px 12px !important; margin: 0 !important;
	  border-left: 3px solid transparent !important; border-radius: 0 !important;
	  cursor: pointer !important; font-size: 12px !important;
	  color: #374151 !important; background: white !important;
	  width: 100% !important; box-sizing: border-box !important; gap: 0 !important;
	}
	#cat_radio label:hover { background: #f3f4f6 !important; }
	#cat_radio label:has(input:checked) {
	  background: #ede9fe !important; border-left-color: #7c3aed !important;
	  color: #5b21b6 !important; font-weight: 600 !important;
	}
	#cat_radio input[type="radio"] { display: none !important; }
	#cat_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; }

	/* ---- Bench cards Radio ---- */
	#bench_radio {
	  background: transparent !important; border: none !important;
	  box-shadow: none !important; padding: 0 !important;
	}
	#bench_radio > .wrap {
	  flex-direction: row !important; flex-wrap: wrap !important;
	  gap: 10px !important; padding: 4px 0 12px !important;
	}
	#bench_radio label {
	  display: flex !important; align-items: center !important;
	  padding: 10px 14px !important; border: 2px solid #e5e7eb !important;
	  border-radius: 10px !important; cursor: pointer !important;
	  font-size: 12px !important; background: white !important;
	  color: #374151 !important; min-width: 150px !important;
	  margin: 0 !important; gap: 0 !important; transition: border-color 0.15s !important;
	}
	#bench_radio label:hover { border-color: #a78bfa !important; }
	#bench_radio label:has(input:checked) {
	  border-color: #7c3aed !important; background: #faf5ff !important;
	  font-weight: 600 !important; color: #5b21b6 !important;
	}
	#bench_radio input[type="radio"] { display: none !important; }
	#bench_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; }

	/* ---- Filter checkbox ---- */
	#providers-filter { margin: 2px 0 10px; }
	#providers-filter > label { font-size: 12px !important; color: #6b7280 !important; }

	/* ---- Sidebar HTML ---- */
	.hf-sidebar { display: flex; flex-direction: column; padding: 10px 0; }
	.hf-sidebar-label {
	  font-size: 10px; font-weight: 700; color: #9ca3af;
	  text-transform: uppercase; letter-spacing: 0.8px; padding: 0 16px 8px;
	}
	.hf-cat-btn {
	  display: flex; align-items: center; gap: 9px; width: 100%;
	  padding: 9px 16px; border: none; background: none; cursor: pointer;
	  border-left: 3px solid transparent; font-size: 13px; color: #374151;
	  text-align: left; transition: background 0.1s;
	}
	.hf-cat-btn:hover { background: #f3f4f6; }
	.hf-cat-active {
	  background: #ede9fe !important; border-left-color: #7c3aed !important;
	  color: #5b21b6 !important; font-weight: 600;
	}
	.hf-cat-active .hf-cat-badge { background: #ddd6fe !important; color: #7c3aed !important; }
	.hf-cat-icon { font-size: 15px; min-width: 20px; }
	.hf-cat-name { flex: 1; }
	.hf-cat-badge {
	  background: #f3f4f6; border-radius: 12px;
	  padding: 1px 8px; font-size: 11px; color: #6b7280;
	}

	/* ---- Benchmark cards HTML ---- */
	.hf-section-head { display: flex; align-items: baseline; gap: 12px; margin-bottom: 12px; }
	.hf-section-title { font-size: 16px; font-weight: 700; color: #111827; }
	.hf-section-meta { font-size: 12px; color: #9ca3af; }
	.hf-cards { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 14px; }
	.hf-card {
	  border: 2px solid #e5e7eb; border-radius: 10px; padding: 10px 14px;
	  cursor: pointer; background: white; min-width: 150px;
	  transition: border-color 0.15s, box-shadow 0.15s;
	}
	.hf-card:hover { border-color: #a78bfa; box-shadow: 0 1px 6px rgba(124,58,237,0.1); }
	.hf-card-active { border-color: #7c3aed !important; background: #faf5ff !important; }
	.hf-card-name { font-size: 13px; font-weight: 600; color: #111827; }
	.hf-card-active .hf-card-name { color: #5b21b6; }
	.hf-card-count { font-size: 11px; color: #6b7280; margin-top: 3px; }
	.hf-card-owner { font-size: 10px; color: #9ca3af; margin-top: 2px; }

	/* ---- JS bridge textboxes (rendered but invisible) ---- */
	#cat_trigger, #bench_trigger {
	  display: none !important;
	  position: absolute !important;
	  pointer-events: none !important;
	}

	/* ---- Leaderboard HTML ---- */
	.hf-lb { border: 1px solid #e5e7eb; border-radius: 10px; overflow: hidden; background: white; }
	.hf-lb-head {
	  display: flex; align-items: center; justify-content: space-between;
	  padding: 10px 16px; border-bottom: 1px solid #f3f4f6; background: #f9fafb;
	}
	.hf-lb-title { font-size: 13px; font-weight: 600; color: #374151; }
	.hf-lb-meta { display: flex; align-items: center; gap: 12px; }
	.hf-lb-count { font-size: 11px; color: #9ca3af; }
	.hf-hub-link { font-size: 11px; color: #7c3aed; text-decoration: none; font-weight: 500; }
	.hf-hub-link:hover { text-decoration: underline; }
	.hf-lb-scroll { overflow-x: auto; }
	.hf-table { width: 100%; border-collapse: collapse; font-size: 12px; }
	.hf-table thead th {
	  padding: 7px 12px; text-align: left; font-size: 10px; font-weight: 700;
	  color: #6b7280; text-transform: uppercase; letter-spacing: 0.4px;
	  white-space: nowrap; background: white; border-bottom: 1px solid #f3f4f6;
	}
	.hf-table td { padding: 7px 12px; border-bottom: 1px solid #f3f4f6; vertical-align: middle; }
	.hf-table tbody tr:last-child td { border-bottom: none; }
	.hf-table tbody tr:nth-child(even) td { background: #fafafa; }
	.hf-table tbody tr:hover td { background: #faf5ff !important; }
	.hf-rank { width: 44px; text-align: center; font-size: 17px; }
	.hf-rank-num { color: #9ca3af; font-size: 12px; font-variant-numeric: tabular-nums; }
	.hf-model a { color: #2563eb; text-decoration: none; font-size: 11px; word-break: break-all; }
	.hf-model a:hover { text-decoration: underline; }
	.hf-score { font-variant-numeric: tabular-nums; font-weight: 600; color: #111827; white-space: nowrap; }
	.hf-price { font-variant-numeric: tabular-nums; color: #059669; white-space: nowrap; }
	.hf-ctx, .hf-params, .hf-ttft, .hf-tput { white-space: nowrap; }
	.hf-ttft { color: #7c3aed; }
	.hf-tput { color: #0369a1; }
	.hf-lic { color: #6b7280; font-size: 11px; }
	.hf-params { font-weight: 500; }
	.hf-provs { display: flex; flex-wrap: wrap; gap: 3px; }
	.hf-chip {
	  background: #dbeafe; color: #1e40af;
	  border-radius: 4px; padding: 1px 6px;
	  font-size: 10px; font-weight: 500; white-space: nowrap;
	}
	.hf-chip-more { background: #f3f4f6 !important; color: #6b7280 !important; }
	.hf-na { color: #d1d5db; }
	.hf-empty { padding: 48px 24px; text-align: center; color: #9ca3af; font-size: 14px; }
	"""

	# ---------------------------------------------------------------------------
	# HF API helpers
	# ---------------------------------------------------------------------------


	def _http_get_json(url: str, token: str | None = None, timeout: int = 30):
	    """GET *url* and return its body parsed as JSON.

	    An ``Accept: application/json`` header is always sent; when *token* is
	    truthy it is added as a bearer ``Authorization`` header. *timeout* is
	    passed to ``urllib.request.urlopen``. Errors raised by urllib or by the
	    JSON/UTF-8 decoding are not caught here.
	    """
	    headers = {"Accept": "application/json"}
	    if token:
	        headers["Authorization"] = f"Bearer {token}"
	    request = urllib.request.Request(url, headers=headers)
	    with urllib.request.urlopen(request, timeout=timeout) as response:
	        body = response.read()
	    return json.loads(body.decode("utf-8"))


	def _read_token() -> str \| None:
	import os
	p = Path(os.path.expanduser("~/.cache/huggingface/token"))
	if p.exists():
	tok = p.read_text().strip()
	if tok:
	return tok
	return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")


	def discover_benchmarks(token=None) -> list[dict]:
	    """List datasets tagged ``benchmark:official`` on the Hub.

	    Queries the datasets API (at most 1000 results) and returns one dict per
	    dataset with the keys ``id``, ``tags`` and ``description`` (truncated to
	    200 characters). Items that are not dicts or carry no ``id`` are skipped.
	    Errors from the HTTP request propagate to the caller.
	    """
	    listing = _http_get_json(
	        "https://huggingface.co/api/datasets?filter=benchmark:official&limit=1000",
	        token,
	        timeout=30,
	    )
	    return [
	        {
	            "id": item["id"],
	            "tags": item.get("tags", []),
	            "description": (item.get("description") or "")[:200],
	        }
	        for item in listing
	        if isinstance(item, dict) and "id" in item
	    ]


	def get_leaderboard(dataset_id: str, token=None) -> list[dict]:
	    """Return the leaderboard entries of *dataset_id*, or ``[]`` if unavailable.

	    The endpoint may answer with a bare list or with an object wrapping the
	    list under ``"entries"``; both shapes are accepted. Only dict entries are
	    returned.

	    This is a best-effort lookup for a single benchmark, so every failure that
	    is local to this request yields an empty list instead of raising: HTTP and
	    URL errors, socket/read timeouts (all ``OSError`` subclasses) and bodies
	    that are not valid UTF-8 JSON (``ValueError`` subclasses). Otherwise one
	    slow or malformed leaderboard would abort the whole dashboard refresh.
	    """
	    url = f"https://huggingface.co/api/datasets/{dataset_id}/leaderboard"
	    try:
	        data = _http_get_json(url, token, timeout=30)
	    except (OSError, ValueError):
	        # urllib.error.URLError/HTTPError and TimeoutError derive from OSError;
	        # json.JSONDecodeError and UnicodeDecodeError derive from ValueError.
	        return []
	    if isinstance(data, dict) and "entries" in data:
	        data = data["entries"]
	    if not isinstance(data, list):
	        return []
	    return [entry for entry in data if isinstance(entry, dict)]


	# ---------------------------------------------------------------------------
	# Categorisation
	# ---------------------------------------------------------------------------


	def categorize_benchmark(bench: dict) -> list[str]:
	    """Assign a benchmark to one or more dashboard categories.

	    *bench* needs an ``"id"`` key and may carry a ``"tags"`` collection. A
	    category is added when any of its Hub tags is present in ``tags`` or when
	    any of its keywords occurs as a substring of the lower-cased id. If
	    nothing matches, the benchmark falls into ``"Other"``. The result is
	    ordered by position in ``CATEGORY_ORDER``.
	    """
	    # (tags that trigger the category, category)
	    tag_rules = (
	        (("modality:audio", "modality:speech"), "Audio / Speech"),
	        (("modality:image", "modality:video"), "Vision"),
	        (("modality:document",), "Document / OCR"),
	        (("task_categories:robotics",), "Robotics"),
	        (("task_categories:text-retrieval",), "Retrieval / Embedding"),
	    )
	    # (substrings of the lower-cased dataset id that trigger the category, category)
	    keyword_rules = (
	        (("math", "aime", "hmmt", "gsm8k"), "Math / Reasoning"),
	        (("swe", "terminal", "compute-eval"), "Code / Engineering"),
	        (("agent", "claw", "apex-agent", "wildclaw", "yc-bench"), "Agents"),
	        (("mmlu", "gpqa", "hle"), "Knowledge"),
	        (("ocr", "parse", "mdp"), "Document / OCR"),
	        (("asr",), "Audio / Speech"),
	        (("screen", "mmmu", "video", "pbench"), "Vision"),
	        (("evasion", "lex"), "NLP / Classification"),
	        (("bright", "arguana"), "Retrieval / Embedding"),
	    )

	    tags = bench.get("tags", [])
	    needle = bench["id"].lower()

	    matched = set()
	    for wanted_tags, category in tag_rules:
	        if any(tag in tags for tag in wanted_tags):
	            matched.add(category)
	    for keywords, category in keyword_rules:
	        if any(word in needle for word in keywords):
	            matched.add(category)
	    if not matched:
	        matched.add("Other")

	    def _position(category: str) -> int:
	        # Unknown categories sort after every known one.
	        if category in CATEGORY_ORDER:
	            return CATEGORY_ORDER.index(category)
	        return 99

	    return sorted(matched, key=_position)


	# ---------------------------------------------------------------------------
	# Data fetching & aggregation
	# ---------------------------------------------------------------------------


	def fetch_all_data() -> dict:
	    """Download every official benchmark leaderboard and aggregate the results.

	    Returns a JSON-serialisable dict with the keys ``total_benchmarks``,
	    ``total_unique_models``, ``benchmarks_with_entries``, ``benchmarks_empty``,
	    ``timestamp`` (UTC, ISO-8601, second precision), ``all_models``,
	    ``benchmark_data`` (one record per benchmark) and ``categories`` (per
	    category benchmark/model counts, in ``CATEGORY_ORDER`` order).

	    Errors from discover_benchmarks() propagate; individual leaderboards that
	    cannot be fetched simply contribute no entries (see get_leaderboard()).
	    """
	    token = _read_token()
	    benchmarks = discover_benchmarks(token)
	    all_models: set[str] = set()
	    benchmark_data = []

	    def _rank_key(detail: dict) -> tuple:
	        # Numeric ranks sort ascending (0 included); missing or non-numeric
	        # ranks go last, keeping their original relative order. Comparing only
	        # numbers also avoids a TypeError on mixed int/str ranks.
	        rank = detail["rank"]
	        numeric = isinstance(rank, (int, float)) and not isinstance(rank, bool)
	        return (not numeric, rank if numeric else 0)

	    for bench in benchmarks:
	        bid = bench["id"]
	        models: set[str] = set()
	        model_details = []
	        for entry in get_leaderboard(bid, token):
	            if not isinstance(entry, dict):
	                continue  # malformed entry: nothing to read from it
	            # The model id has been seen under several key spellings.
	            mid = entry.get("modelId") or entry.get("model_id") or entry.get("model") or ""
	            if not mid:
	                continue
	            models.add(mid)
	            model_details.append({
	                "rank": entry.get("rank"),
	                "model_id": mid,
	                "value": entry.get("value"),
	                "verified": entry.get("verified", False),
	            })

	        model_details.sort(key=_rank_key)
	        all_models.update(models)

	        benchmark_data.append({
	            "id": bid,
	            "display_name": BENCHMARK_DISPLAY_NAMES.get(bid, bid.split("/")[-1]),
	            "categories": categorize_benchmark(bench),
	            "num_models": len(models),
	            "models": sorted(models),
	            "model_details": model_details,
	            "description": bench["description"],
	        })

	    # Per-category roll-up: which benchmarks and which distinct models it covers.
	    cat_benchmarks: dict[str, list] = defaultdict(list)
	    cat_models: dict[str, set] = defaultdict(set)
	    for bd in benchmark_data:
	        for cat in bd["categories"]:
	            cat_benchmarks[cat].append(bd)
	            cat_models[cat].update(bd["models"])

	    return {
	        "total_benchmarks": len(benchmarks),
	        "total_unique_models": len(all_models),
	        "benchmarks_with_entries": sum(1 for bd in benchmark_data if bd["num_models"] > 0),
	        "benchmarks_empty": sum(1 for bd in benchmark_data if bd["num_models"] == 0),
	        "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
	        "all_models": sorted(all_models),
	        "benchmark_data": benchmark_data,
	        "categories": {
	            cat: {
	                "benchmarks": len(cat_benchmarks[cat]),
	                "unique_models": len(cat_models[cat]),
	            }
	            for cat in CATEGORY_ORDER
	            if cat in cat_benchmarks
	        },
	    }


	def load_cached_data() -> dict | None:
	    """Return the cached dashboard data if it exists and is still fresh.

	    The cache is accepted only when its ``"timestamp"`` field is non-empty and
	    younger than ``CACHE_TTL_SECONDS``. A missing file, an unreadable or
	    malformed file, or a stale timestamp all result in ``None``.
	    """
	    if CACHE_PATH.exists():
	        try:
	            cached = json.loads(CACHE_PATH.read_text())
	            stamp = cached.get("timestamp", "")
	            if stamp:
	                written_at = datetime.fromisoformat(stamp)
	                elapsed = (datetime.now(timezone.utc) - written_at).total_seconds()
	                if elapsed < CACHE_TTL_SECONDS:
	                    return cached
	        except Exception:
	            # Any problem with the cache just means "no usable cache".
	            pass
	    return None


	def save_cache(data: dict) -> None:
	    """Persist *data* as indented JSON at ``CACHE_PATH``.

	    The JSON is written to a sibling ``*.tmp`` file first and then renamed
	    over the cache, so an interrupted write never leaves a truncated cache
	    file behind. ``OSError`` (e.g. read-only filesystem) propagates to the
	    caller.
	    """
	    tmp_path = CACHE_PATH.with_name(CACHE_PATH.name + ".tmp")
	    tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
	    tmp_path.replace(CACHE_PATH)


	# ---------------------------------------------------------------------------
	# UI helpers
	# ---------------------------------------------------------------------------

	# Dashboard data currently displayed; assigned in build_app() (from the cache
	# or fetch_all_data()) and replaced by the "Refresh Now" handler.
	_app_data: dict = {}
	# model_id → {providers, cheapest_input, cheapest_output, context_length,
	#             fastest_ttft_ms, fastest_throughput}; built by _load_router_data().
	_router_data: dict = {}
	# model_id → {license, params}; filled lazily by _load_model_metas().
	_model_meta_cache: dict = {}


	def _render_topbar(data: dict) -> str:
	ts = data.get("timestamp", "?")[:19]
	total_entries = sum(bd["num_models"] for bd in data.get("benchmark_data", []))
	return (
	f'<div class="topbar">'
	f'<div><div class="topbar-title">🏆 HF Hub Benchmark Dashboard</div>'
	f'<div class="topbar-meta">Last updated: {ts} UTC · auto-refreshes every 6h</div></div>'
	f'<div class="topbar-pills">'
	f'<div class="stat-pill"><b>{data["total_benchmarks"]}</b> benchmarks</div>'
	f'<div class="stat-pill"><b>{data["total_unique_models"]}</b> models</div>'
	f'<div class="stat-pill"><b>{total_entries:,}</b> entries</div>'
	f'<div class="stat-pill"><b>{data["benchmarks_with_entries"]}</b> active</div>'
	f'<div class="stat-pill"><b>{data["benchmarks_empty"]}</b> empty</div>'
	f'</div></div>'
	)


	def _load_router_data(token: str | None = None) -> dict:
	    """Fetch all inference-available models from the HF router (pricing + context).

	    Returns ``{model_id: info}`` for every model with at least one provider
	    whose status is ``"live"``. ``info`` holds ``providers`` (names),
	    ``cheapest_input`` / ``cheapest_output`` (lowest listed price or ``None``),
	    ``context_length`` (largest advertised, 0 if none), ``fastest_ttft_ms``
	    and ``fastest_throughput`` (``None`` when not reported).

	    Router data only decorates the leaderboard, so this is best-effort: a
	    failed request or an unexpected payload shape yields ``{}``, and providers
	    with missing/null pricing or without a name are tolerated rather than
	    raising.
	    """
	    try:
	        resp = _http_get_json("https://router.huggingface.co/v1/models", token, timeout=30)
	    except Exception:
	        return {}
	    if not isinstance(resp, dict):
	        return {}

	    def _lowest_price(providers: list, field: str):
	        # Lowest numeric price for `field` ("input"/"output"); None if unlisted.
	        prices = []
	        for p in providers:
	            pricing = p.get("pricing")
	            price = pricing.get(field) if isinstance(pricing, dict) else None
	            if isinstance(price, (int, float)):
	                prices.append(price)
	        return min(prices) if prices else None

	    out: dict = {}
	    for m in resp.get("data") or []:
	        if not isinstance(m, dict):
	            continue
	        mid = m.get("id", "")
	        if not mid:
	            continue
	        live = [
	            p for p in (m.get("providers") or [])
	            if isinstance(p, dict) and p.get("status") == "live" and p.get("provider")
	        ]
	        if not live:
	            continue
	        # Falsy readings (missing, null, 0) are treated as "not reported".
	        ttfts = [p["first_token_latency_ms"] for p in live if p.get("first_token_latency_ms")]
	        throughputs = [p["throughput"] for p in live if p.get("throughput")]
	        out[mid] = {
	            "providers": [p["provider"] for p in live],
	            "cheapest_input": _lowest_price(live, "input"),
	            "cheapest_output": _lowest_price(live, "output"),
	            "context_length": max((p.get("context_length") or 0) for p in live),
	            "fastest_ttft_ms": min(ttfts) if ttfts else None,
	            "fastest_throughput": max(throughputs) if throughputs else None,
	        }
	    return out


	def _load_model_metas(model_ids: list[str], token: str | None = None) -> None:
	    """Populate ``_model_meta_cache`` with license and size for uncached models.

	    Models already present in the cache are skipped. The remaining ones are
	    queried from the Hub model API on up to 20 worker threads. A model whose
	    request fails is cached as ``{}``; otherwise the cached value is
	    ``{"license": ..., "params": ...}`` where ``params`` is a rounded
	    ``"<n>B"`` / ``"<n>M"`` string, or ``""`` when no safetensors total exists.
	    """
	    pending = [mid for mid in model_ids if mid not in _model_meta_cache]
	    if not pending:
	        return

	    def _describe(mid: str) -> tuple[str, dict]:
	        endpoint = f"https://huggingface.co/api/models/{mid}?expand[]=safetensors&expand[]=cardData"
	        try:
	            info = _http_get_json(endpoint, token, timeout=10)
	        except Exception:
	            return mid, {}

	        # Prefer the model card's license; fall back to a "license:<id>" tag.
	        license_name = (info.get("cardData") or {}).get("license", "")
	        if not license_name:
	            license_name = next(
	                (tag[8:] for tag in info.get("tags", []) if tag.startswith("license:")),
	                license_name,
	            )

	        weight_count = (info.get("safetensors") or {}).get("total", 0)
	        if not weight_count:
	            size = ""
	        elif weight_count / 1e9 >= 1:
	            size = f"{round(weight_count / 1e9)}B"
	        else:
	            size = f"{round(weight_count / 1e6)}M"
	        return mid, {"license": license_name, "params": size}

	    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
	        for model_id, details in pool.map(_describe, pending):
	            _model_meta_cache[model_id] = details


	def _sidebar_choices(data: dict) -> list[tuple[str, str]]:
	    """Return ``(label, category)`` pairs for the category radio.

	    Only categories present in ``data["categories"]`` are listed, in
	    ``CATEGORY_ORDER`` order. Each label reads ``"<icon> <category> (<n>)"``
	    with ``n`` the category's benchmark count.
	    """
	    available = data.get("categories", {})
	    return [
	        (f"{CATEGORY_ICONS.get(name, '')} {name} ({available[name]['benchmarks']})", name)
	        for name in CATEGORY_ORDER
	        if name in available
	    ]


	def _card_choices(data: dict, category: str) -> list[tuple[str, str]]:
	bds = sorted(
	[bd for bd in data["benchmark_data"] if category in bd["categories"]],
	key=lambda x: x["num_models"], reverse=True,
	)
	choices = []
	for bd in bds:
	owner = bd["id"].split("/")[0] if "/" in bd["id"] else ""
	label = f"{bd['display_name']} · {bd['num_models']} models"
	if owner:
	label += f" [{owner}]"
	choices.append((label, bd["id"]))
	return choices


	def _cat_header(data: dict, cat: str) -> str:
	    """Return the Markdown heading for *cat*: icon, name and its two counters.

	    Counts come from ``data["categories"][cat]`` and default to 0 when the
	    category (or the whole summary) is absent.
	    """
	    stats = data.get("categories", {}).get(cat, {})
	    emoji = CATEGORY_ICONS.get(cat, "")
	    bench_count = stats.get("benchmarks", 0)
	    model_count = stats.get("unique_models", 0)
	    return f"### {emoji} {cat}  ·  {bench_count} benchmarks · {model_count} models"


	def _render_leaderboard(data: dict, bid: str, providers_only: bool = False) -> str:
	if not bid:
	return '<div class="hf-empty">Select a benchmark to view its leaderboard.</div>'

	lookup = {bd["id"]: bd for bd in data["benchmark_data"]}
	bd = lookup.get(bid, {})
	safe_bid = _html.escape(bid)
	hub_link = (
	f'<a class="hf-hub-link" href="https://huggingface.co/datasets/{safe_bid}" target="_blank">'
	f'↗ View on Hub</a>'
	)

	rows = _lb_rows(data, bid, providers_only)
	if not rows:
	return (
	f'<div class="hf-lb">'
	f'<div class="hf-lb-head">'
	f'<span class="hf-lb-title">{_html.escape(bd.get("display_name", bid))}</span>'
	f'{hub_link}</div>'
	f'<div class="hf-empty">No entries yet.</div></div>'
	)

	thead = "<tr>" + "".join(
	f"<th>{h}</th>"
	for h in ["", "Model", "Score", "In $/1M", "Out $/1M", "Context", "TTFT", "Throughput", "License", "Params", "Providers"]
	) + "</tr>"

	tbody = []
	for rank, model_id, score, price_in, price_out, ctx, ttft, tput, lic, params, provs in rows:
	if rank == 1: rank_html = "🥇"
	elif rank == 2: rank_html = "🥈"
	elif rank == 3: rank_html = "🥉"
	else: rank_html = f'<span class="hf-rank-num">{rank}</span>'

	safe_mid = _html.escape(model_id)
	model_html = f'<a href="https://huggingface.co/{safe_mid}" target="_blank">{safe_mid}</a>'

	if provs != "—":
	chips = []
	for p in provs.split(","):
	p = p.strip()
	cls = "hf-chip hf-chip-more" if p.startswith("+") else "hf-chip"
	chips.append(f'<span class="{cls}">{_html.escape(p)}</span>')
	prov_html = f'<div class="hf-provs">{"".join(chips)}</div>'
	else:
	prov_html = '<span class="hf-na">—</span>'

	tbody.append(
	f'<tr>'
	f'<td class="hf-rank">{rank_html}</td>'
	f'<td class="hf-model">{model_html}</td>'
	f'<td class="hf-score">{_html.escape(str(score))}</td>'
	f'<td class="hf-price">{_html.escape(str(price_in))}</td>'
	f'<td class="hf-price">{_html.escape(str(price_out))}</td>'
	f'<td class="hf-ctx">{_html.escape(str(ctx))}</td>'
	f'<td class="hf-ttft">{_html.escape(str(ttft))}</td>'
	f'<td class="hf-tput">{_html.escape(str(tput))}</td>'
	f'<td class="hf-lic">{_html.escape(str(lic))}</td>'
	f'<td class="hf-params">{_html.escape(str(params))}</td>'
	f'<td>{prov_html}</td>'
	f'</tr>'
	)

	return (
	f'<div class="hf-lb">'
	f'<div class="hf-lb-head">'
	f'<span class="hf-lb-title">{_html.escape(bd.get("display_name", bid))}</span>'
	f'<div class="hf-lb-meta">'
	f'<span class="hf-lb-count">{len(rows)} entries</span>'
	f'{hub_link}</div></div>'
	f'<div class="hf-lb-scroll">'
	f'<table class="hf-table">'
	f'<thead>{thead}</thead>'
	f'<tbody>{"".join(tbody)}</tbody>'
	f'</table></div></div>'
	)


	def _fmt_ctx(n: int) -> str:
	if n >= 1_000_000:
	v = n / 1_000_000
	return f"{v:.0f}M" if v == int(v) else f"{v:.1f}M"
	if n >= 1_000:
	v = n / 1_000
	return f"{v:.0f}K" if v == int(v) else f"{v:.1f}K"
	return str(n) if n else "—"


	def _lb_rows(data: dict, bid: str, providers_only: bool = False) -> list[list]:
	lookup = {bd["id"]: bd for bd in data["benchmark_data"]}
	details = lookup.get(bid, {}).get("model_details", [])[:50]
	if not details:
	return []

	model_ids = [m["model_id"] for m in details]
	_load_model_metas(model_ids, _read_token())

	if providers_only:
	details = [m for m in details if _router_data.get(m["model_id"], {}).get("providers")]

	rows = []
	for m in details:
	mid = m["model_id"]
	meta = _model_meta_cache.get(mid, {})
	router = _router_data.get(mid, {})

	providers = router.get("providers", [])
	if providers:
	prov_str = ", ".join(providers[:3])
	if len(providers) > 3:
	prov_str += f" +{len(providers) - 3}"
	else:
	prov_str = "—"

	in_price = router.get("cheapest_input")
	out_price = router.get("cheapest_output")
	price_in_str = f"${in_price:.2f}" if in_price is not None else "—"
	price_out_str = f"${out_price:.2f}" if out_price is not None else "—"

	ctx_str = _fmt_ctx(router.get("context_length") or 0)

	ttft = router.get("fastest_ttft_ms")
	ttft_str = f"{ttft:,.0f} ms" if ttft is not None else "—"

	tput = router.get("fastest_throughput")
	tput_str = f"{tput:.0f} t/s" if tput is not None else "—"

	rows.append([
	m["rank"] if m["rank"] is not None else "—",
	mid,
	str(m["value"]) if m["value"] is not None else "—",
	price_in_str,
	price_out_str,
	ctx_str,
	ttft_str,
	tput_str,
	meta.get("license") or "—",
	meta.get("params") or "—",
	prov_str,
	])
	return rows


	# ---------------------------------------------------------------------------
	# Gradio app
	# ---------------------------------------------------------------------------

	def build_app() -> gr.Blocks:
	    """Load the dashboard data and assemble the Gradio UI.

	    Data comes from the on-disk cache when it is fresh, otherwise it is
	    fetched from the Hub and written back to the cache. Writing the cache is
	    best-effort: on a read-only filesystem the app still starts (and still
	    refreshes), it just refetches on the next start.

	    Layout: a top banner, a sidebar column with the category radio, and a main
	    column holding the category header, the refresh button, the benchmark
	    radio, the provider filter and the leaderboard HTML.

	    Side effects: assigns the module globals ``_app_data`` and ``_router_data``.
	    """
	    global _app_data, _router_data
	    _app_data = load_cached_data()
	    if _app_data is None:
	        _app_data = fetch_all_data()
	        try:
	            save_cache(_app_data)
	        except OSError as exc:
	            # The cache is an optimisation; never let it prevent startup.
	            print(f"Warning: could not write cache {CACHE_PATH}: {exc}")
	    _router_data = _load_router_data(_read_token())

	    s_choices = _sidebar_choices(_app_data)
	    init_cat = s_choices[0][1] if s_choices else ""
	    c_choices = _card_choices(_app_data, init_cat)
	    init_bid = c_choices[0][1] if c_choices else ""

	    with gr.Blocks(
	        title="HF Hub Benchmark Dashboard",
	        css=CUSTOM_CSS,
	        theme=gr.themes.Soft(),
	    ) as demo:

	        topbar = gr.HTML(_render_topbar(_app_data))

	        # Tracks the currently-selected benchmark for the filter toggle
	        sel_bid = gr.State(init_bid)

	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1, min_width=170, elem_id="sidebar-col"):
	                cat_radio = gr.Radio(
	                    choices=s_choices, value=init_cat,
	                    label="Categories", elem_id="cat_radio",
	                )

	            with gr.Column(scale=5, elem_id="main-col"):
	                with gr.Row():
	                    cat_header = gr.Markdown(_cat_header(_app_data, init_cat))
	                    refresh_btn = gr.Button("🔄 Refresh Now", variant="primary", scale=0, min_width=150)

	                bench_radio = gr.Radio(
	                    choices=c_choices, value=init_bid,
	                    show_label=False, elem_id="bench_radio",
	                )

	                providers_filter = gr.Checkbox(
	                    label="Only show models with inference providers",
	                    value=False, elem_id="providers-filter",
	                )

	                leaderboard = gr.HTML(_render_leaderboard(_app_data, init_bid))

	        # ---- Event handlers ----

	        def on_cat(cat: str, prov_only: bool):
	            # New category: repopulate the benchmark radio and show its first benchmark.
	            new_c = _card_choices(_app_data, cat)
	            new_bid = new_c[0][1] if new_c else ""
	            return (
	                _cat_header(_app_data, cat),
	                gr.update(choices=new_c, value=new_bid),
	                _render_leaderboard(_app_data, new_bid, prov_only),
	                new_bid,
	            )

	        def on_bench(bid: str, prov_only: bool):
	            return _render_leaderboard(_app_data, bid, prov_only), bid

	        def on_filter(bid: str, prov_only: bool):
	            return _render_leaderboard(_app_data, bid, prov_only)

	        def on_refresh(prov_only: bool):
	            global _app_data, _router_data
	            try:
	                new_data = fetch_all_data()
	                new_router = _load_router_data(_read_token())
	            except Exception as e:
	                # Keep showing the previous data; the message is escaped because
	                # exception text may contain arbitrary characters.
	                err = f'<p style="color:#dc2626;padding:8px">⚠️ Refresh failed: {_html.escape(str(e))}</p>'
	                return _render_topbar(_app_data) + err, gr.update(), gr.update(), gr.update(), gr.update(), gr.update()

	            # Swap both globals only once everything was fetched successfully.
	            _app_data = new_data
	            _router_data = new_router
	            try:
	                save_cache(new_data)
	            except OSError as exc:
	                # Fresh data is already live; a failed cache write is not a failed refresh.
	                print(f"Warning: could not write cache {CACHE_PATH}: {exc}")

	            new_s = _sidebar_choices(_app_data)
	            new_cat = new_s[0][1] if new_s else ""
	            new_c = _card_choices(_app_data, new_cat)
	            new_bid = new_c[0][1] if new_c else ""
	            return (
	                _render_topbar(_app_data),
	                gr.update(choices=new_s, value=new_cat),
	                _cat_header(_app_data, new_cat),
	                gr.update(choices=new_c, value=new_bid),
	                _render_leaderboard(_app_data, new_bid, prov_only),
	                new_bid,
	            )

	        cat_radio.change(
	            fn=on_cat,
	            inputs=[cat_radio, providers_filter],
	            outputs=[cat_header, bench_radio, leaderboard, sel_bid],
	        )
	        bench_radio.change(
	            fn=on_bench,
	            inputs=[bench_radio, providers_filter],
	            outputs=[leaderboard, sel_bid],
	        )
	        providers_filter.change(
	            fn=on_filter,
	            inputs=[sel_bid, providers_filter],
	            outputs=[leaderboard],
	        )
	        refresh_btn.click(
	            fn=on_refresh,
	            inputs=[providers_filter],
	            outputs=[topbar, cat_radio, cat_header, bench_radio, leaderboard, sel_bid],
	            show_progress="full",
	        )

	    return demo


	# Script entry point: build the Blocks app and launch it.
	if __name__ == "__main__":
	    demo = build_app()
	    demo.launch()