cadgenbench-leaderboard / leaderboard.py
Michael Rabinovich
leaderboard: serve renders from the public bucket, not the dataset proxy
d2161b1
# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Leaderboard read path.
Loads `results.jsonl` from the submissions dataset on the Hub and
shapes the rows into the dataframe shown on the Leaderboard tab. The
live file is the single source of truth: there is **no fallback** to
bundled/stale data, so any read failure raises
:class:`LeaderboardDataError` rather than silently serving wrong rows.
Module-level constants describe the env-var-driven repo identities
that the submit path also consumes.
"""
from __future__ import annotations
import html
import json
import logging
import os
import re
import tempfile
import time
import uuid
from pathlib import Path
import pandas as pd
import requests
from huggingface_hub import get_token
logger = logging.getLogger(__name__)
HF_ORG = os.getenv("HF_ORG", "michaelr27")
HF_SUBMISSIONS_REPO = os.getenv(
"HF_SUBMISSIONS_REPO", f"{HF_ORG}/cadgenbench-submissions"
)
HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
# Private ground-truth dataset. The gallery's GT render proxy reads
# `<fixture>/renders/iso.png` from here (needs the Space HF_TOKEN's
# read scope, same token the eval already uses for GT).
HF_DATA_GT_REPO = os.getenv("HF_DATA_GT_REPO", f"{HF_ORG}/cadgenbench-data-gt")
# Public HF Storage Bucket holding the per-submission gallery/report renders
# (candidate turntables + edit-diff WebP). Public so the browser can fetch a
# render straight from object storage with no token and no Space proxy; the
# eval job is the only writer. Submission renders are public anyway, the GT
# renders stay in the private GT dataset and are never published here.
HF_RENDER_BUCKET = os.getenv("HF_RENDER_BUCKET", f"{HF_ORG}/cadgenbench-eval-staging")
HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
# Permanent renders live under this prefix; transient shard staging lives under
# its own prefix and is wiped after merge, so the two never collide.
RENDER_BUCKET_PREFIX = "renders"
RESULTS_FILENAME = "results.jsonl"
HUB_FETCH_TIMEOUT_SECONDS = 30
def render_object_path(submission_id: str, fixture: str, filename: str) -> str:
"""Bucket-relative path of one render: ``renders/<id>/<fixture>/<file>``."""
return f"{RENDER_BUCKET_PREFIX}/{submission_id}/{fixture}/{filename}"
def render_submission_prefix(submission_id: str) -> str:
"""Bucket-relative prefix holding every render for *submission_id*."""
return f"{RENDER_BUCKET_PREFIX}/{submission_id}"
def render_public_url(submission_id: str, fixture: str, filename: str) -> str:
"""Stable anonymous URL for one render (browser follows the 302 to the CDN)."""
return (
f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve/"
f"{render_object_path(submission_id, fixture, filename)}"
)
def render_submission_base_url(submission_id: str) -> str:
"""Public base URL for *submission_id*'s renders, ``.../resolve/renders/<id>``.
The report generator appends ``/<fixture>/<file>`` to this; passed to
``cadgenbench``'s ``generate_html`` as the display-only ``render_base_url``.
"""
return (
f"{HF_ENDPOINT}/buckets/{HF_RENDER_BUCKET}/resolve/"
f"{render_submission_prefix(submission_id)}"
)
class LeaderboardDataError(RuntimeError):
"""Raised when the live ``results.jsonl`` cannot be read from the Hub.
The leaderboard has **no fallback**: rather than silently serving
stale or bundled data (which can make a broken Hub read, e.g. an
under-scoped Space ``HF_TOKEN``, look like an up-to-date but wrong
leaderboard), every read failure surfaces loudly here.
"""
# Columns visible in the rendered table, in left-to-right order, followed
# by hidden-but-data-present columns the row-click detail panel pulls from.
# Hidden columns ride along in the DataFrame so `Leaderboard.select(...)`
# can read them out without a separate state-cache or re-fetch.
# Display column names (these are what the rendered table headers
# read). The visible set is intentionally compact; the detail-panel
# columns at the end of the list ride along in the DataFrame so the
# row-click handler can populate from them without a separate state
# cache. `submission_name` is the primary link (markdown-wrapped at
# projection time to point at the report when one exists), so we
# don't need a separate `report` column. `submission_blob_url` lives
# in the detail panel only.
LEADERBOARD_COLS = [
"status",
"submission_name",
"submitter_name",
"aggregate_score",
"validity_rate",
"submitted_at",
"cadgenbench_version",
"model details (optional)",
# Detail-panel-only (hidden via `hide_columns` on the widget):
"submission_id",
"notes",
"failure_reason",
"submission_blob_url",
"report_url",
]
# Validated table additionally exposes `validation_method`; on the
# unvalidated table the field is always null so the column is omitted
# rather than rendered. See cadgenbench-submissions/schema.md.
VALIDATED_LEADERBOARD_COLS = [
"status",
"submission_name",
"submitter_name",
"aggregate_score",
"validity_rate",
"validation_method",
"submitted_at",
"cadgenbench_version",
"model details (optional)",
"submission_id",
"notes",
"failure_reason",
"submission_blob_url",
"report_url",
]
# Columns to hide from rendering on both tables. These ride in the
# DataFrame so the row-click detail panel can populate from them; the
# widget hides them from view.
LEADERBOARD_HIDE_COLUMNS = [
"submission_id",
"notes",
"failure_reason",
"submission_blob_url",
"report_url",
]
# Per-column gradio_leaderboard datatypes. `submission_name` and
# `model details (optional)` render their pre-formatted markdown
# (the submission_name cell links to the report when one exists; the
# model-details cell is the agent URL or _None_); everything else is
# plain string (numeric cells get pending / failed status tags
# applied by _fmt_pct / _fmt_score so they're string-shaped by the
# time the widget sees them).
_LINK_COLUMNS = frozenset({"submission_name", "model details (optional)"})
def _datatypes_for(columns: list[str]) -> list[str]:
return ["markdown" if c in _LINK_COLUMNS else "str" for c in columns]
LEADERBOARD_DATATYPES = _datatypes_for(LEADERBOARD_COLS)
VALIDATED_LEADERBOARD_DATATYPES = _datatypes_for(VALIDATED_LEADERBOARD_COLS)
PENDING_CELL_TAG = "⏳ evaluating..."
FAILED_CELL_TAG = "✗ failed"
_ISO_TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z$")
def _fmt_timestamp(ts) -> str:
"""Render an ISO-8601 ``submitted_at`` as ``YYYY-MM-DD HH:MM UTC``.
The schema writes timestamps as ``YYYY-MM-DDTHH:MM:SSZ``; the
minute-level UTC form is plenty for the table + detail panel,
drops the ``T``/``Z`` punctuation, and renders the timezone
explicitly so a reader doesn't have to know that "Z" means UTC.
"""
if ts is None or (isinstance(ts, float) and pd.isna(ts)):
return ""
s = str(ts).strip()
if not s:
return ""
m = _ISO_TS_RE.match(s)
if m:
return f"{m.group(1)} {m.group(2)} UTC"
return s
def _load_rows_from_hub() -> list[dict]:
"""Pull results.jsonl from the submissions dataset via raw HTTPS.
Avoids :func:`huggingface_hub.hf_hub_download` because its layered
caching (local disk cache + revision pinning + the Hub's own
CDN-fronted resolve endpoint) can hand back stale bytes for a few
minutes even with ``force_download=True``, which makes pending
rows look like they never landed. A direct GET with a cache-bust
query param and ``Cache-Control: no-cache`` consistently sees the
latest commit on the dataset's ``main`` branch within seconds.
The live ``results.jsonl`` is the single source of truth. Any
failure (network, auth, malformed JSON) raises
:class:`LeaderboardDataError`: there is deliberately **no fallback**
to bundled/stale data, so a broken read fails loudly instead of
silently serving wrong rows. An empty file is a valid result (an
empty leaderboard), not a failure.
"""
url = (
f"https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}"
f"/resolve/main/{RESULTS_FILENAME}"
)
headers = {"Cache-Control": "no-cache"}
token = get_token()
if token:
headers["Authorization"] = f"Bearer {token}"
try:
r = requests.get(
url,
headers=headers,
params={"_cb": str(int(time.time() * 1000))},
timeout=HUB_FETCH_TIMEOUT_SECONDS,
)
r.raise_for_status()
except Exception as e:
raise LeaderboardDataError(
f"Could not fetch {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
f"{type(e).__name__}: {e}. Verify the Space's HF_TOKEN has read "
f"access to the (private) submissions dataset. The leaderboard "
f"serves no fallback data."
) from e
try:
rows = [json.loads(line) for line in r.text.splitlines() if line.strip()]
except json.JSONDecodeError as e:
raise LeaderboardDataError(
f"Malformed {RESULTS_FILENAME} from {HF_SUBMISSIONS_REPO}: "
f"{type(e).__name__}: {e}."
) from e
logger.info("Loaded %d rows from Hub", len(rows))
return rows
def _fmt_pct(x: float | None, status: str) -> str:
"""Render a 0-1 fraction as 'NN%' (or 'NN.N%' for non-whole values).
Status-aware: pending / failed rows render a tag in place of the
number (the row's eventual score is not yet known or never will
be). ``pd.isna`` covers both ``None`` and pandas-coerced ``NaN``.
"""
if status == "pending":
return PENDING_CELL_TAG
if status == "failed":
return FAILED_CELL_TAG
if pd.isna(x):
return ""
pct = float(x) * 100
return f"{pct:.0f}%" if pct == int(pct) else f"{pct:.1f}%"
def _fmt_score(x: float | None, status: str) -> str:
"""Render an aggregate CAD score, status-aware tag on pending / failed."""
if status == "pending":
return PENDING_CELL_TAG
if status == "failed":
return FAILED_CELL_TAG
if pd.isna(x):
return ""
return f"{float(x):.4f}"
def _is_empty(v) -> bool:
"""True for None, NaN, or empty/whitespace-only strings."""
if v is None:
return True
if isinstance(v, float) and pd.isna(v):
return True
if isinstance(v, str) and not v.strip():
return True
return False
_AGENT_URL_MAX_LINK_TEXT = 40
def _shorten_url_for_display(url: str) -> str:
"""Strip scheme + trailing slash; truncate to keep the table cell tidy."""
s = url.replace("https://", "").replace("http://", "").rstrip("/")
if len(s) > _AGENT_URL_MAX_LINK_TEXT:
s = s[: _AGENT_URL_MAX_LINK_TEXT - 1] + "…"
return s
def _agent_url_md(url) -> str:
"""Render the `model details (optional)` cell as a markdown link.
Uses a shortened version of the URL itself as the link text:
`agent_url` is a free-form "URL pointing at the agent code or
paper" per the schema, so the URL itself carries the only honest
hint about what's behind the click. Missing cells render as
italic ``_None_`` so a reader sees the field is optional and
just wasn't filled, rather than a blank.
"""
if _is_empty(url):
return "_None_"
return f"[{_shorten_url_for_display(str(url))}]({url})"
def _report_relative_url(submission_id, status, submission_sha256) -> str:
"""Relative URL to the Space's report-proxy route, or empty.
The Space exposes ``/reports/<id>.html`` which fetches the file
from the submissions dataset and re-serves it as ``text/html``
(the dataset's ``/resolve/`` path returns it as ``text/plain``,
which renders as source). Returning a relative URL means the
same string works whether the Space is running locally on a
random port or on huggingface.co.
`reports/<id>.html` only exists for completed rows from the
modern submit pipeline; legacy seed rows (pre-pipeline,
``submission_sha256`` null) never had a report uploaded, so this
function returns empty for them and the caller leaves the
submission_name cell as plain text rather than a broken link.
"""
if status != "completed" or _is_empty(submission_id):
return ""
if _is_empty(submission_sha256):
return ""
return f"/reports/{submission_id}.html"
def _submission_name_md(name, report_url) -> str:
"""Render `submission_name`, linking to the report in a new tab.
Now that the Space is public, HF's edge serves the FastAPI
``/reports/<id>.html`` route to in-browser users (it 404'd
same-origin pathname navigations while the Space was private,
which is why an earlier iteration kept this cell plain text and
inlined the report via an iframe-srcdoc viewer instead). So the
name cell becomes a deep-link that opens the self-contained
per-submission report in a **new tab** — the typical HF
leaderboard pattern, and far lighter than shipping the
(tens-to-hundreds-of-MB) report through the page on every click.
``report_url`` is the relative ``/reports/<id>.html`` route the
reader computes only for completed modern-pipeline rows; rows
without one (pending / failed / legacy) render as plain text. The
name column is a ``markdown`` datatype, which renders inline HTML,
so a raw anchor with ``target="_blank"`` works; the name is
HTML-escaped so an odd submission name can't break the cell.
"""
if _is_empty(name):
return "(unnamed submission)"
label = html.escape(str(name))
if _is_empty(report_url):
return label
href = html.escape(str(report_url), quote=True)
return f'<a href="{href}" target="_blank" rel="noopener">{label}</a>'
def load_leaderboard_split() -> tuple[pd.DataFrame, pd.DataFrame]:
"""Two-tier reader: returns ``(validated_df, unvalidated_df)``.
Splits incoming rows on ``validation_status`` (defaulting to
``"unvalidated"`` for legacy rows that pre-date the schema bump).
Both DataFrames sort by ``aggregate_score`` descending with null
last; the validated DataFrame additionally exposes the
``validation_method`` column. Same status-aware cell formatting
on both tiers via :func:`_project_and_format`.
"""
rows = _load_rows_from_hub()
if not rows:
return (
pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
pd.DataFrame(columns=LEADERBOARD_COLS),
)
# Backfill defaults for legacy rows that pre-date the relevant
# schema bumps. `status` retrofits to "completed" (the legacy
# baseline rows all have populated score fields).
# `validation_status` retrofits to "unvalidated" per the validation
# policy doc; defaulting in the reader avoids a results.jsonl
# rewrite.
for row in rows:
if row.get("status") is None:
row["status"] = "completed"
if row.get("validation_status") is None:
row["validation_status"] = "unvalidated"
df = pd.DataFrame(rows)
# Defensive split: anything not literally "validated" lands in the
# unvalidated table (legacy rows, null, future-unknown values).
validated_mask = df["validation_status"] == "validated"
validated = _project_and_format(df[validated_mask], VALIDATED_LEADERBOARD_COLS)
unvalidated = _project_and_format(df[~validated_mask], LEADERBOARD_COLS)
return validated, unvalidated
def _project_and_format(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
"""Project to display columns, sort by score, apply status-aware formatting.
Pulled into a helper because :func:`load_leaderboard_split` runs
it twice (once per tier), and both tiers need identically-shaped
pending / failed cell tagging and link rendering.
"""
if df.empty:
return pd.DataFrame(columns=columns)
df = df.copy()
# Compute the relative report URL before projection drops the
# source columns it depends on. Lives as a hidden column so the
# detail panel can show "Report: link" and `submission_name` can
# use it as the link target.
if "submission_id" in df.columns and "status" in df.columns:
df["report_url"] = df.apply(
lambda r: _report_relative_url(
r["submission_id"],
r["status"],
r.get("submission_sha256"),
),
axis=1,
)
# Schema field name -> display column header. Keep the rename
# narrow: only the agent URL gets a friendlier header; the schema
# field stays `agent_url` in results.jsonl. `submission_blob_url`
# stays under that name (hidden, detail-panel-only).
df = df.rename(columns={"agent_url": "model details (optional)"})
# Make sure every declared column exists (legacy rows can be
# missing optional fields). Detail-panel reads expect the
# column-set to be stable regardless of which source rows had
# which keys.
for c in columns:
if c not in df.columns:
df[c] = None
out = (
df[columns]
.sort_values("aggregate_score", ascending=False, na_position="last")
.reset_index(drop=True)
)
if "validity_rate" in out.columns:
out["validity_rate"] = out.apply(
lambda r: _fmt_pct(r["validity_rate"], r["status"]), axis=1,
)
if "aggregate_score" in out.columns:
out["aggregate_score"] = out.apply(
lambda r: _fmt_score(r["aggregate_score"], r["status"]), axis=1,
)
if "submission_name" in out.columns:
out["submission_name"] = out.apply(
lambda r: _submission_name_md(
r["submission_name"], r.get("report_url"),
),
axis=1,
)
if "model details (optional)" in out.columns:
out["model details (optional)"] = out["model details (optional)"].apply(
_agent_url_md
)
if "submitted_at" in out.columns:
out["submitted_at"] = out["submitted_at"].apply(_fmt_timestamp)
return out
# CSV-export columns. Wider than the on-screen table (raw values
# instead of the display-formatted strings, plus identity / artifact
# fields useful for offline analysis). `validation_status` is the
# discriminator between the two on-screen tables when readers grep
# the file. Order matches roughly: identity -> state -> headline
# scores -> provenance / artifact links -> long-form fields.
CSV_COLUMNS = [
"submission_id",
"status",
"validation_status",
"validation_method",
"submitter_name",
"submission_name",
"hf_username",
"aggregate_score",
"validity_rate",
"agent_url",
"submitted_at",
"cadgenbench_version",
"cadgenbench_data_revision",
"submission_blob_url",
"submission_sha256",
"notes",
"failure_reason",
]
def build_combined_csv() -> str:
"""Write the full leaderboard (both tiers) to a temp CSV and return its path.
One file, both tables, ``validation_status`` discriminator
column. Used by ``gr.DownloadButton`` on the Leaderboard tab.
Each call writes a uniquely-named file under the OS tmp dir;
Gradio caches the file at serve time so we don't need to delete
it eagerly (the OS tmp cleaner reaps it eventually). Generating
fresh on every click keeps the export current with whatever the
next refresh of the table would show.
Sort order: validated rows first (highest score top), then
unvalidated, then any rows whose validation_status is some
unexpected value (defensive). Mirrors the on-screen layout so
readers diffing the CSV against the UI see the same ordering.
"""
rows = _load_rows_from_hub()
for row in rows:
if row.get("status") is None:
row["status"] = "completed"
if row.get("validation_status") is None:
row["validation_status"] = "unvalidated"
df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=CSV_COLUMNS)
for c in CSV_COLUMNS:
if c not in df.columns:
df[c] = None
df = df[CSV_COLUMNS]
if not df.empty:
# "validated" > "unvalidated" alphabetically (v > u), so
# descending puts the validated tier first.
df = df.sort_values(
["validation_status", "aggregate_score"],
ascending=[False, False],
na_position="last",
)
out_dir = Path(tempfile.gettempdir())
path = out_dir / f"cadgenbench-leaderboard-{uuid.uuid4().hex[:8]}.csv"
df.to_csv(path, index=False)
return str(path)
# Admin-tab table. A single flat view of every row (both tiers) with a
# leading editable ``select`` checkbox column; the rest is read-only
# context the maintainer scans before acting. Raw values (not the
# display-formatted leaderboard strings) so the maintainer sees exactly
# what is on the row. `submission_id` rides last as the action key.
ADMIN_SELECT_COL = "select"
ADMIN_COLUMNS = [
ADMIN_SELECT_COL,
"validation_status",
"validation_method",
"submission_name",
"submitter_name",
"submitted_at",
"status",
"aggregate_score",
"submission_id",
]
def load_admin_table() -> pd.DataFrame:
"""Build the Admin tab's editable table: one row per submission.
Both tiers in a single frame, validated first then by score, with a
fresh (all-unchecked) ``select`` column the maintainer ticks to
choose action targets. Legacy rows get the same ``status`` /
``validation_status`` defaults the leaderboard reader applies, so
pre-schema-bump rows still show up and are actionable.
"""
rows = _load_rows_from_hub()
for row in rows:
if row.get("status") is None:
row["status"] = "completed"
if row.get("validation_status") is None:
row["validation_status"] = "unvalidated"
if not rows:
return pd.DataFrame(columns=ADMIN_COLUMNS)
df = pd.DataFrame(rows)
for c in ADMIN_COLUMNS:
if c not in df.columns:
df[c] = None
if "submitted_at" in df.columns:
df["submitted_at"] = df["submitted_at"].apply(_fmt_timestamp)
df = (
df[ADMIN_COLUMNS]
.sort_values(
["validation_status", "aggregate_score"],
ascending=[False, False],
na_position="last",
)
.reset_index(drop=True)
)
# Set after projection so the column is a clean all-False boolean
# regardless of what (if anything) a stray source key held.
df[ADMIN_SELECT_COL] = False
return df