Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

cadgenbench-leaderboard / leaderboard.py

Michael Rabinovich

leaderboard: serve renders from the public bucket, not the dataset proxy

d2161b1 about 13 hours ago

23.2 kB

	# Copyright 2026 Hugging Face
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Leaderboard read path.

	Loads `results.jsonl` from the submissions dataset on the Hub and
	shapes the rows into the dataframe shown on the Leaderboard tab. The
	live file is the single source of truth: there is no fallback to
	bundled/stale data, so any read failure raises
	:class:`LeaderboardDataError` rather than silently serving wrong rows.
	Module-level constants describe the env-var-driven repo identities
	that the submit path also consumes.
	"""
	from __future__ import annotations

	import html
	import json
	import logging
	import os
	import re
	import tempfile
	import time
	import uuid
	from pathlib import Path

	import pandas as pd
	import requests
	from huggingface_hub import get_token

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Repo identities. Each can be overridden through the environment
	# variable of the same name; the defaults are derived from HF_ORG.
	HF_ORG = os.getenv("HF_ORG", "michaelr27")
	HF_SUBMISSIONS_REPO = os.getenv(
	    "HF_SUBMISSIONS_REPO", f"{HF_ORG}/cadgenbench-submissions"
	)
	HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
	# Private ground-truth dataset. The gallery's GT render proxy reads
	# `<fixture>/renders/iso.png` from here (needs the Space HF_TOKEN's
	# read scope, same token the eval already uses for GT).
	HF_DATA_GT_REPO = os.getenv("HF_DATA_GT_REPO", f"{HF_ORG}/cadgenbench-data-gt")

	# Public HF Storage Bucket holding the per-submission gallery/report renders
	# (candidate turntables + edit-diff WebP). Public so the browser can fetch a
	# render straight from object storage with no token and no Space proxy; the
	# eval job is the only writer. Submission renders are public anyway, the GT
	# renders stay in the private GT dataset and are never published here.
	HF_RENDER_BUCKET = os.getenv("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
	# Base URL of the Hub used to build the public render URLs. The trailing
	# slash is stripped so URL pieces can be joined with "/" without doubling.
	HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
	# Permanent renders live under this prefix; transient shard staging lives under
	# its own prefix and is wiped after merge, so the two never collide.
	RENDER_BUCKET_PREFIX = "renders"

	# Name of the results file inside the submissions dataset.
	RESULTS_FILENAME = "results.jsonl"
	# Timeout (seconds) passed to the HTTP GET that fetches the results file.
	HUB_FETCH_TIMEOUT_SECONDS = 30


	def render_object_path(submission_id: str, fixture: str, filename: str) -> str:
	    """Return the bucket-relative key of a single render.

	    The key has the shape ``renders/<id>/<fixture>/<file>``, rooted at
	    ``RENDER_BUCKET_PREFIX``.
	    """
	    return "{}/{}/{}/{}".format(
	        RENDER_BUCKET_PREFIX, submission_id, fixture, filename
	    )


	def render_submission_prefix(submission_id: str) -> str:
	    """Return the bucket-relative prefix under which all renders of
	    ``submission_id`` are stored (``renders/<id>``)."""
	    return "{}/{}".format(RENDER_BUCKET_PREFIX, submission_id)


	def render_public_url(submission_id: str, fixture: str, filename: str) -> str:
	    """Return the stable, token-free URL of one render.

	    The URL points at the bucket's ``resolve`` route; the browser follows
	    the 302 from there to the CDN.
	    """
	    resolve_root = f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve"
	    object_path = render_object_path(submission_id, fixture, filename)
	    return f"{resolve_root}/{object_path}"


	def render_submission_base_url(submission_id: str) -> str:
	    """Return the public base URL of a submission's renders.

	    Shape: ``.../resolve/renders/<id>``. The report generator appends
	    ``/<fixture>/<file>`` to it; the value is handed to ``cadgenbench``'s
	    ``generate_html`` as the display-only ``render_base_url``.
	    """
	    resolve_root = f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve"
	    submission_prefix = render_submission_prefix(submission_id)
	    return f"{resolve_root}/{submission_prefix}"


	class LeaderboardDataError(RuntimeError):
	    """Signals that the live ``results.jsonl`` could not be read from the Hub.

	    There is intentionally no fallback data source. Serving stale or
	    bundled rows would let a broken Hub read (for example an under-scoped
	    Space ``HF_TOKEN``) pass for a current-but-wrong leaderboard, so every
	    read failure is raised as this error instead.
	    """

	# Display columns of the unvalidated leaderboard table, in left-to-right
	# order (these names are what the rendered table headers read).
	#
	# The visible set is intentionally compact. The detail-panel columns at
	# the end of the list are hidden in the widget but ride along in the
	# DataFrame, so the row-click handler (`Leaderboard.select(...)`) can
	# populate the detail panel without a separate state cache or re-fetch.
	#
	# `submission_name` is the primary link (wrapped at projection time to
	# point at the report when one exists), so there is no separate visible
	# `report` column. `submission_blob_url` lives in the detail panel only.
	LEADERBOARD_COLS = [
	    "status",
	    "submission_name",
	    "submitter_name",
	    "aggregate_score",
	    "validity_rate",
	    "submitted_at",
	    "cadgenbench_version",
	    "model details (optional)",
	    # Detail-panel-only (hidden via `hide_columns` on the widget):
	    "submission_id",
	    "notes",
	    "failure_reason",
	    "submission_blob_url",
	    "report_url",
	]

	# Display columns of the validated table. Same layout as
	# LEADERBOARD_COLS plus `validation_method` (placed after
	# `validity_rate`); on the unvalidated table that field is always null,
	# so the column is omitted there rather than rendered.
	# See cadgenbench-submissions/schema.md.
	VALIDATED_LEADERBOARD_COLS = [
	    "status",
	    "submission_name",
	    "submitter_name",
	    "aggregate_score",
	    "validity_rate",
	    "validation_method",
	    "submitted_at",
	    "cadgenbench_version",
	    "model details (optional)",
	    # Detail-panel-only (hidden on the widget):
	    "submission_id",
	    "notes",
	    "failure_reason",
	    "submission_blob_url",
	    "report_url",
	]

	# Columns hidden from rendering on both tables. They stay in the
	# DataFrame so the row-click detail panel can populate from them; only
	# the widget hides them from view. Must stay in sync with the trailing
	# detail-panel entries of the two column lists.
	LEADERBOARD_HIDE_COLUMNS = [
	    "submission_id",
	    "notes",
	    "failure_reason",
	    "submission_blob_url",
	    "report_url",
	]

	# Columns that get the gradio_leaderboard "markdown" datatype because
	# their cells are pre-formatted links: the `submission_name` cell links
	# to the report when one exists, and the `model details (optional)` cell
	# is the agent URL or _None_. Every other column is a plain string
	# (numeric cells get pending / failed status tags applied by
	# _fmt_pct / _fmt_score, so they are string-shaped by the time the
	# widget sees them).
	_LINK_COLUMNS = frozenset({"submission_name", "model details (optional)"})


	def _datatypes_for(columns: list[str]) -> list[str]:
	return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns]


	# Datatype lists parallel to the two column lists (same length and order).
	LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
	VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)

	# Tags shown in place of a number in the score cells of rows whose status
	# is "pending" / "failed".
	PENDING_CELL_TAG = "⏳ evaluating..."
	FAILED_CELL_TAG = "✗ failed"

	# Matches exactly ``YYYY-MM-DDTHH:MM:SSZ`` (whole seconds, literal "Z").
	# Group 1 captures the date, group 2 the ``HH:MM`` part; seconds are
	# matched but not captured.
	_ISO_TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z$")


	def _fmt_timestamp(ts) -> str:
	"""Render an ISO-8601 ``submitted_at`` as ``YYYY-MM-DD HH:MM UTC``.

	The schema writes timestamps as ``YYYY-MM-DDTHH:MM:SSZ``; the
	minute-level UTC form is plenty for the table + detail panel,
	drops the ``T``/``Z`` punctuation, and renders the timezone
	explicitly so a reader doesn't have to know that "Z" means UTC.
	"""
	if ts is None or (isinstance(ts, float) and pd.isna(ts)):
	return ""
	s = str(ts).strip()
	if not s:
	return ""
	m = _ISO_TS_RE.match(s)
	if m:
	return f"{m.group(1)} {m.group(2)} UTC"
	return s


	def _load_rows_from_hub() -> list[dict]:
	    """Pull results.jsonl from the submissions dataset via raw HTTPS.

	    Avoids :func:`huggingface_hub.hf_hub_download` because its layered
	    caching (local disk cache + revision pinning + the Hub's own
	    CDN-fronted resolve endpoint) can hand back stale bytes for a few
	    minutes even with ``force_download=True``, which makes pending
	    rows look like they never landed. A direct GET with a cache-bust
	    query param and ``Cache-Control: no-cache`` consistently sees the
	    latest commit on the dataset's ``main`` branch within seconds.

	    The request goes to ``HF_ENDPOINT`` (default
	    ``https://huggingface.co``), the same base the render URLs use.

	    The live ``results.jsonl`` is the single source of truth. Any
	    failure (network, auth, undecodable bytes, malformed JSON, a line
	    that is not a JSON object) raises :class:`LeaderboardDataError`:
	    there is deliberately no fallback to bundled/stale data, so a broken
	    read fails loudly instead of silently serving wrong rows. An empty
	    file is a valid result (an empty leaderboard), not a failure.

	    Returns:
	        One dict per non-blank line of the file, in file order.

	    Raises:
	        LeaderboardDataError: the file could not be fetched or parsed.
	    """
	    url = (
	        f"{HF_ENDPOINT}/datasets/{HF_SUBMISSIONS_REPO}"
	        f"/resolve/main/{RESULTS_FILENAME}"
	    )
	    headers = {"Cache-Control": "no-cache"}
	    token = get_token()
	    if token:
	        headers["Authorization"] = f"Bearer {token}"
	    try:
	        r = requests.get(
	            url,
	            headers=headers,
	            # Millisecond timestamp as a cache-buster for the CDN.
	            params={"_cb": str(int(time.time() * 1000))},
	            timeout=HUB_FETCH_TIMEOUT_SECONDS,
	        )
	        r.raise_for_status()
	    except Exception as e:
	        raise LeaderboardDataError(
	            f"Could not fetch {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
	            f"{type(e).__name__}: {e}. Verify the Space's HF_TOKEN has read "
	            f"access to the (private) submissions dataset. The leaderboard "
	            f"serves no fallback data."
	        ) from e
	    rows: list[dict] = []
	    try:
	        # Decode the bytes ourselves: ``r.text`` may fall back to
	        # ISO-8859-1 when a text/* response carries no charset, which
	        # garbles non-ASCII names. "utf-8-sig" also tolerates a BOM.
	        text = r.content.decode("utf-8-sig")
	        # Split on "\n" only. ``str.splitlines`` also breaks on U+2028,
	        # U+0085, etc., which may appear unescaped inside a JSON string
	        # and would cut a record in half. A trailing "\r" is harmless
	        # whitespace to ``json.loads``.
	        for lineno, line in enumerate(text.split("\n"), start=1):
	            if not line.strip():
	                continue
	            row = json.loads(line)
	            if not isinstance(row, dict):
	                raise ValueError(
	                    f"line {lineno}: expected a JSON object, "
	                    f"got {type(row).__name__}"
	                )
	            rows.append(row)
	    except ValueError as e:
	        # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
	        raise LeaderboardDataError(
	            f"Malformed {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
	            f"{type(e).__name__}: {e}."
	        ) from e
	    logger.info("Loaded %d rows from Hub", len(rows))
	    return rows


	def _fmt_pct(x: float \| None, status: str) -> str:
	"""Render a 0-1 fraction as 'NN%' (or 'NN.N%' for non-whole values).

	Status-aware: pending / failed rows render a tag in place of the
	number (the row's eventual score is not yet known or never will
	be). ``pd.isna`` covers both ``None`` and pandas-coerced ``NaN``.
	"""
	if status == "pending":
	return PENDING_CELL_TAG
	if status == "failed":
	return FAILED_CELL_TAG
	if pd.isna(x):
	return ""
	pct = float(x) * 100
	return f"{pct:.0f}%" if pct == int(pct) else f"{pct:.1f}%"


	def _fmt_score(x: float \| None, status: str) -> str:
	"""Render an aggregate CAD score, status-aware tag on pending / failed."""
	if status == "pending":
	return PENDING_CELL_TAG
	if status == "failed":
	return FAILED_CELL_TAG
	if pd.isna(x):
	return ""
	return f"{float(x):.4f}"


	def _is_empty(v) -> bool:
	"""True for None, NaN, or empty/whitespace-only strings."""
	if v is None:
	return True
	if isinstance(v, float) and pd.isna(v):
	return True
	if isinstance(v, str) and not v.strip():
	return True
	return False


	_AGENT_URL_MAX_LINK_TEXT = 40


	def _shorten_url_for_display(url: str) -> str:
	"""Strip scheme + trailing slash; truncate to keep the table cell tidy."""
	s = url.replace("https://", "").replace("http://", "").rstrip("/")
	if len(s) > _AGENT_URL_MAX_LINK_TEXT:
	s = s[: _AGENT_URL_MAX_LINK_TEXT - 1] + "…"
	return s


	def _agent_url_md(url) -> str:
	    """Render the `model details (optional)` cell as a markdown link.

	    Uses a shortened version of the URL itself as the link text:
	    `agent_url` is a free-form "URL pointing at the agent code or
	    paper" per the schema, so the URL itself carries the only honest
	    hint about what's behind the click. Missing cells render as
	    italic ``_None_`` so a reader sees the field is optional and
	    just wasn't filled, rather than a blank.

	    `agent_url` is submitter-supplied, so it is treated as untrusted:

	    * only ``http://`` / ``https://`` values become links; anything else
	      (``javascript:``, scheme-less text, ...) is shown as escaped plain
	      text rather than a clickable target;
	    * the link destination is percent-encoded so spaces, parentheses or
	      angle brackets cannot terminate the markdown link early;
	    * the link text is HTML-escaped and its brackets/backslashes are
	      backslash-escaped so it cannot inject markup or close the label.
	    """
	    if _is_empty(url):
	        return "_None_"
	    raw = str(url).strip()
	    if not raw.lower().startswith(("http://", "https://")):
	        return html.escape(raw)
	    # Local import: the module does not import urllib at file level.
	    from urllib.parse import quote

	    # Keep URL-structural characters and existing %XX escapes intact;
	    # everything else outside the unreserved set gets percent-encoded.
	    target = quote(raw, safe=":/?#@!$&'*+,;=%")
	    label = html.escape(_shorten_url_for_display(raw), quote=False)
	    label = re.sub(r"([\\\[\]])", r"\\\1", label)
	    return f"[{label}]({target})"


	def _report_relative_url(submission_id, status, submission_sha256) -> str:
	"""Relative URL to the Space's report-proxy route, or empty.

	The Space exposes ``/reports/<id>.html`` which fetches the file
	from the submissions dataset and re-serves it as ``text/html``
	(the dataset's ``/resolve/`` path returns it as ``text/plain``,
	which renders as source). Returning a relative URL means the
	same string works whether the Space is running locally on a
	random port or on huggingface.co.

	`reports/<id>.html` only exists for completed rows from the
	modern submit pipeline; legacy seed rows (pre-pipeline,
	``submission_sha256`` null) never had a report uploaded, so this
	function returns empty for them and the caller leaves the
	submission_name cell as plain text rather than a broken link.
	"""
	if status != "completed" or _is_empty(submission_id):
	return ""
	if _is_empty(submission_sha256):
	return ""
	return f"/reports/{submission_id}.html"


	def _submission_name_md(name, report_url) -> str:
	    """Build the `submission_name` cell, linked to the report when possible.

	    With the Space public, HF's edge serves the FastAPI
	    ``/reports/<id>.html`` route to in-browser users. (While the Space
	    was private it 404'd same-origin pathname navigations, which is why
	    an earlier iteration kept this cell as plain text and inlined the
	    report through an iframe-srcdoc viewer.) The cell is therefore a
	    deep-link that opens the self-contained per-submission report in a
	    new tab: the usual HF leaderboard pattern, and much lighter than
	    pushing the (tens-to-hundreds-of-MB) report through the page on
	    every click.

	    ``report_url`` is the relative ``/reports/<id>.html`` route, which
	    the reader computes only for completed modern-pipeline rows. Rows
	    without one (pending / failed / legacy) get plain text. The column
	    uses the ``markdown`` datatype, which renders inline HTML, so a raw
	    anchor with ``target="_blank"`` works. The name is HTML-escaped so
	    an odd submission name cannot break the cell.
	    """
	    if _is_empty(name):
	        return "(unnamed submission)"
	    safe_name = html.escape(str(name))
	    if not _is_empty(report_url):
	        target = html.escape(str(report_url), quote=True)
	        return f'<a href="{target}" target="_blank" rel="noopener">{safe_name}</a>'
	    return safe_name


	def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Read the live rows and return ``(validated_df, unvalidated_df)``.

	    Rows are partitioned on ``validation_status``. Each frame is produced
	    by :func:`_project_and_format`, so both tiers share the same score
	    ordering (``aggregate_score`` descending, nulls last) and the same
	    status-aware cell formatting; the validated frame additionally
	    carries the ``validation_method`` column.

	    With no rows at all, two empty frames with the right columns are
	    returned.
	    """
	    records = _load_rows_from_hub()
	    if not records:
	        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
	        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
	        return empty_validated, empty_unvalidated

	    # Legacy rows pre-date the relevant schema bumps, so fill their
	    # missing fields here instead of rewriting results.jsonl:
	    #   status            -> "completed" (legacy baseline rows all have
	    #                        populated score fields)
	    #   validation_status -> "unvalidated" (per the validation policy doc)
	    legacy_defaults = (
	        ("status", "completed"),
	        ("validation_status", "unvalidated"),
	    )
	    for record in records:
	        for field, fallback in legacy_defaults:
	            if record.get(field) is None:
	                record[field] = fallback

	    frame = pd.DataFrame(records)
	    # Defensive split: only the literal value "validated" goes to the
	    # validated tier; legacy, null, and future-unknown values all land
	    # in the unvalidated table.
	    is_validated = frame["validation_status"].eq("validated")
	    validated = _project_and_format(
	        frame[is_validated], VALIDATED_LEADERBOARD_COLS
	    )
	    unvalidated = _project_and_format(frame[~is_validated], LEADERBOARD_COLS)
	    return validated, unvalidated


	def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
	    """Shape one tier for display: project, sort by score, format cells.

	    Shared by both tiers of :func:`load_leaderboard_split` so they get
	    identical pending / failed cell tagging and link rendering.

	    Args:
	        df: Raw rows of one tier (schema field names).
	        columns: Display columns to keep, in output order.

	    Returns:
	        A new frame with exactly ``columns``, ordered by
	        ``aggregate_score`` descending (nulls last), with score,
	        percentage, link and timestamp cells rendered as strings.
	    """
	    if df.empty:
	        return pd.DataFrame(columns=columns)
	    work = df.copy()

	    # The report URL has to be derived before projection drops the
	    # source fields it depends on. It stays as a hidden column: the
	    # detail panel shows it and `submission_name` uses it as link target.
	    if "submission_id" in work.columns and "status" in work.columns:
	        if "submission_sha256" in work.columns:
	            digests = work["submission_sha256"]
	        else:
	            digests = [None] * len(work)
	        work["report_url"] = [
	            _report_relative_url(sub_id, state, digest)
	            for sub_id, state, digest in zip(
	                work["submission_id"], work["status"], digests
	            )
	        ]

	    # Only the agent URL gets a friendlier display header; the schema
	    # field in results.jsonl remains `agent_url`. `submission_blob_url`
	    # keeps its name (hidden, detail-panel-only).
	    work = work.rename(columns={"agent_url": "model details (optional)"})

	    # Legacy rows may lack optional fields. Add any declared column that
	    # is absent so the detail panel always sees a stable column set.
	    for name in columns:
	        if name not in work.columns:
	            work[name] = None

	    projected = work[columns]
	    ranked = projected.sort_values(
	        "aggregate_score", ascending=False, na_position="last"
	    )
	    out = ranked.reset_index(drop=True)

	    if "validity_rate" in out.columns:
	        out["validity_rate"] = [
	            _fmt_pct(rate, state)
	            for rate, state in zip(out["validity_rate"], out["status"])
	        ]
	    if "aggregate_score" in out.columns:
	        out["aggregate_score"] = [
	            _fmt_score(score, state)
	            for score, state in zip(out["aggregate_score"], out["status"])
	        ]
	    if "submission_name" in out.columns:
	        if "report_url" in out.columns:
	            links = out["report_url"]
	        else:
	            links = [None] * len(out)
	        out["submission_name"] = [
	            _submission_name_md(title, link)
	            for title, link in zip(out["submission_name"], links)
	        ]
	    if "model details (optional)" in out.columns:
	        out["model details (optional)"] = [
	            _agent_url_md(link) for link in out["model details (optional)"]
	        ]
	    if "submitted_at" in out.columns:
	        out["submitted_at"] = [
	            _fmt_timestamp(stamp) for stamp in out["submitted_at"]
	        ]
	    return out


	# CSV-export columns, in file order. Wider than the on-screen table:
	# the export carries the schema field names (e.g. `agent_url`, not the
	# display header) and raw values instead of display-formatted strings,
	# plus identity / artifact fields useful for offline analysis.
	# `validation_status` is the discriminator between the two on-screen
	# tables when readers grep the file. Order is roughly: identity ->
	# state -> headline scores -> provenance / artifact links -> long-form
	# fields.
	CSV_COLUMNS = [
	    "submission_id",
	    "status",
	    "validation_status",
	    "validation_method",
	    "submitter_name",
	    "submission_name",
	    "hf_username",
	    "aggregate_score",
	    "validity_rate",
	    "agent_url",
	    "submitted_at",
	    "cadgenbench_version",
	    "cadgenbench_data_revision",
	    "submission_blob_url",
	    "submission_sha256",
	    "notes",
	    "failure_reason",
	]


	def build_combined_csv() -> str:
	    """Write the full leaderboard (both tiers) to a temp CSV and return its path.

	    One file, both tables, ``validation_status`` discriminator
	    column. Used by ``gr.DownloadButton`` on the Leaderboard tab.

	    Each call writes a uniquely-named file under the OS tmp dir;
	    Gradio caches the file at serve time so we don't need to delete
	    it eagerly (the OS tmp cleaner reaps it eventually). Generating
	    fresh on every click keeps the export current with whatever the
	    next refresh of the table would show.

	    Sort order: validated rows first (highest score top), then
	    unvalidated, then any rows whose validation_status is some
	    unexpected value (defensive). Mirrors the on-screen layout so
	    readers diffing the CSV against the UI see the same ordering.

	    Returns:
	        Filesystem path of the written CSV, as a string.

	    Raises:
	        LeaderboardDataError: propagated from the Hub read.
	    """
	    rows = _load_rows_from_hub()
	    # Same legacy defaults the leaderboard reader applies.
	    for row in rows:
	        if row.get("status") is None:
	            row["status"] = "completed"
	        if row.get("validation_status") is None:
	            row["validation_status"] = "unvalidated"
	    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=CSV_COLUMNS)
	    for c in CSV_COLUMNS:
	        if c not in df.columns:
	            df[c] = None
	    df = df[CSV_COLUMNS]
	    if not df.empty:
	        # Rank the tiers explicitly. A plain descending sort on the
	        # status string would put any unexpected value that sorts above
	        # "validated" at the TOP of the file, contradicting the
	        # documented "unexpected values last" order.
	        tier_rank = {"validated": 0, "unvalidated": 1}
	        unknown_rank = len(tier_rank)
	        ranks = [
	            tier_rank.get(v, unknown_rank) if isinstance(v, str) else unknown_rank
	            for v in df["validation_status"]
	        ]
	        df = (
	            df.assign(_tier=ranks)
	            .sort_values(
	                ["_tier", "aggregate_score"],
	                ascending=[True, False],
	                na_position="last",
	            )
	            .drop(columns="_tier")
	        )
	    out_dir = Path(tempfile.gettempdir())
	    path = out_dir / f"cadgenbench-leaderboard-{uuid.uuid4().hex[:8]}.csv"
	    df.to_csv(path, index=False)
	    return str(path)


	# Admin-tab table. A single flat view of every row (both tiers) with a
	# leading editable ``select`` checkbox column; the rest is read-only
	# context the maintainer scans before acting. Values are the raw row
	# fields rather than the display-formatted leaderboard strings, with
	# one exception: `submitted_at` is passed through `_fmt_timestamp` by
	# the loader. `submission_id` rides last as the action key.
	ADMIN_SELECT_COL = "select"
	ADMIN_COLUMNS = [
	    ADMIN_SELECT_COL,
	    "validation_status",
	    "validation_method",
	    "submission_name",
	    "submitter_name",
	    "submitted_at",
	    "status",
	    "aggregate_score",
	    "submission_id",
	]


	def load_admin_table() -> pd.DataFrame:
	    """Build the Admin tab's editable table: one row per submission.

	    Both tiers in a single frame, validated first then by score, with a
	    fresh (all-unchecked) ``select`` column the maintainer ticks to
	    choose action targets. Legacy rows get the same ``status`` /
	    ``validation_status`` defaults the leaderboard reader applies, so
	    pre-schema-bump rows still show up and are actionable.

	    Tier order is explicit: ``validated``, then ``unvalidated``, then any
	    unexpected ``validation_status`` value. Within a tier, rows are
	    ordered by ``aggregate_score`` descending with nulls last.

	    Returns:
	        A frame with exactly ``ADMIN_COLUMNS`` and a fresh 0..n-1 index.

	    Raises:
	        LeaderboardDataError: propagated from the Hub read.
	    """
	    rows = _load_rows_from_hub()
	    for row in rows:
	        if row.get("status") is None:
	            row["status"] = "completed"
	        if row.get("validation_status") is None:
	            row["validation_status"] = "unvalidated"
	    if not rows:
	        return pd.DataFrame(columns=ADMIN_COLUMNS)
	    df = pd.DataFrame(rows)
	    for c in ADMIN_COLUMNS:
	        if c not in df.columns:
	            df[c] = None
	    # Every ADMIN_COLUMNS entry exists at this point, so no guard needed.
	    df["submitted_at"] = df["submitted_at"].apply(_fmt_timestamp)
	    # Rank the tiers explicitly: a descending sort on the status string
	    # would float any unexpected value that sorts above "validated" to
	    # the top, ahead of the validated tier.
	    tier_rank = {"validated": 0, "unvalidated": 1}
	    unknown_rank = len(tier_rank)
	    ranks = [
	        tier_rank.get(v, unknown_rank) if isinstance(v, str) else unknown_rank
	        for v in df["validation_status"]
	    ]
	    df = (
	        df[ADMIN_COLUMNS]
	        .assign(_tier=ranks)
	        .sort_values(
	            ["_tier", "aggregate_score"],
	            ascending=[True, False],
	            na_position="last",
	        )
	        .drop(columns="_tier")
	        .reset_index(drop=True)
	    )
	    # Set after projection so the column is a clean all-False boolean
	    # regardless of what (if anything) a stray source key held.
	    df[ADMIN_SELECT_COL] = False
	    return df