CADGenBench / app.py
Michael Rabinovich
Serve rotating WebP turntables + GT generator
c1cb5e4
raw
history blame
40.1 kB
# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.
Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
:mod:`submit`. Both are wired into the Gradio Blocks below. The
Gradio app is mounted under a FastAPI parent so the custom
``/reports/{submission_id}.html`` route can re-serve dataset HTML
with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
as ``text/plain`` by policy, which makes the browser show source
rather than render).
"""
from __future__ import annotations
import html
import logging
import os
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, Response
from gradio_leaderboard import Leaderboard
from huggingface_hub import hf_hub_download
from leaderboard import (
ADMIN_COLUMNS,
ADMIN_SELECT_COL,
HF_DATA_GT_REPO,
HF_DATA_REPO,
HF_SUBMISSIONS_REPO,
LEADERBOARD_COLS,
LEADERBOARD_DATATYPES,
LEADERBOARD_HIDE_COLUMNS,
VALIDATED_LEADERBOARD_COLS,
VALIDATED_LEADERBOARD_DATATYPES,
LeaderboardDataError,
_fmt_timestamp,
_load_rows_from_hub,
build_combined_csv,
load_admin_table,
load_leaderboard_split,
)
from gallery import render_gallery_page
from admin import (
VALID_METHODS,
delete_rows,
demote_rows,
is_admin,
promote_rows,
stop_and_delete_rows,
)
from submit import handle_submit
# Module-level logger for this file's own messages.
logger = logging.getLogger(__name__)
# Surface module-level logger.info / logger.warning / logger.exception
# calls from leaderboard.py + submit.py in the Space's runtime logs.
# Otherwise they go nowhere and any refresh / worker pathology is
# silent. Format keeps timestamps + module + level + message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
# Canonical policy doc lives in the code repo so contributors reading
# the GitHub repo see it without needing to visit the Space. Linked
# from both the Leaderboard tab's Validation Guidelines accordion and
# the About tab.
VALIDATION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
)
# Body of the About tab. An f-string, so the dataset repo ids and the
# policy URL are interpolated once, at import time.
ABOUT_MD = f"""## About
**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.
- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
[`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
submissions and computed results in
[`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""
# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
# Raw string so the ``\url`` backslash is kept literally.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
author = {Rabinovich, Michael and {Hugging Face}},
title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
year = {2026},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/cadgenbench-leaderboard}},
}"""
# Short two-tier summary rendered inside the Leaderboard tab's
# "Validation guidelines" accordion, with a link to the full policy.
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).
Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
# Initial text of the Submit tab's status panel, shown before any
# submission has been attempted.
SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. Progress and any "
    "errors appear here._"
)
def _data_error_banner_md(message: str | None) -> str:
    """Render the data-unavailable banner shown at the top of the tab.

    A falsy ``message`` (``None`` or ``""``) means there is nothing to
    report and yields an empty string; the caller additionally hides
    the component via ``visible=False``. Otherwise the result is a
    Markdown blockquote stating that the live results could not be read
    and that the empty tables are deliberate (no stale or bundled data
    is substituted), followed by the raw error text.

    Args:
        message: Error text from the failed read, or ``None``.

    Returns:
        The banner Markdown, or ``""`` when there is no error.
    """
    if message:
        headline = (
            "> ⚠️ **Leaderboard data unavailable.** The live results could not "
            "be read from the Hub, so the tables below are empty. No stale or "
            "cached data is ever shown in its place.\n>\n"
        )
        return headline + f"> Details: `{message}`"
    return ""
def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load both leaderboard tiers without ever raising a data error.

    :func:`load_leaderboard_split` raises :class:`LeaderboardDataError`
    on a failed read instead of falling back silently. The Space must
    stay up and report the failure, so this wrapper logs the exception
    and substitutes empty frames that still carry the expected columns,
    together with the error text for the banner / toast.

    Returns:
        ``(validated, unvalidated, error)``; ``error`` is ``None`` when
        the read succeeded.
    """
    try:
        validated, unvalidated = load_leaderboard_split()
    except LeaderboardDataError as exc:
        logger.exception("Leaderboard data load failed")
        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
        return empty_validated, empty_unvalidated, str(exc)
    return validated, unvalidated, None
def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Load the admin table without ever raising a data error.

    Mirrors :func:`_safe_load_split`: a :class:`LeaderboardDataError`
    is logged and converted into an empty frame with the admin columns
    plus the error text. The admin table is loaded while the UI is
    being constructed, so letting the exception propagate would stop
    the Space from booting.

    Returns:
        ``(admin_table, error)``; ``error`` is ``None`` on success.
    """
    try:
        table = load_admin_table()
    except LeaderboardDataError as exc:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(exc)
    return table, None
def _refresh_leaderboard_with_toast():
    """Handle the manual Refresh button: reload, toast, update banner.

    The outcome is always announced: ``gr.Info`` when the reload
    worked, ``gr.Warning`` when it did not.

    Returns:
        ``(validated, unvalidated, banner)`` where ``banner`` is a
        ``gr.Markdown`` update that shows the error banner on failure
        and hides it on success.
    """
    validated, unvalidated, error = _safe_load_split()
    if not error:
        gr.Info("Leaderboard refreshed.")
    else:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner
def _auto_refresh_leaderboard():
    """Handle a Timer tick: reload the tables and keep the banner in sync.

    Same outputs as :func:`_refresh_leaderboard_with_toast`, but a
    successful reload is silent because a toast on every tick would be
    noise. A failed read still raises a ``gr.Warning`` and shows the
    banner, so blank tables are never unexplained.

    Returns:
        ``(validated, unvalidated, banner)``.
    """
    validated, unvalidated, error = _safe_load_split()
    if error:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner
def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Make the Submit button clickable only for a logged-in visitor.

    Wired to ``blocks.load``. Gradio supplies ``profile`` from the
    ``gr.OAuthProfile`` annotation; it is ``None`` for a visitor who has
    not logged in. This is only the visible half of the gate: per the
    original notes, :func:`submit.handle_submit` re-checks server-side.

    Args:
        profile: The visitor's OAuth profile, or ``None``.

    Returns:
        A ``gr.Button`` update carrying the new ``interactive`` flag.
    """
    logged_in = profile is not None
    return gr.Button(interactive=logged_in)
def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Return the submission ids of the rows whose ``select`` box is ticked.

    The result drives the promote / demote / delete handlers, so a cell
    must only count as ticked when it holds a real truthy value. Missing
    cells (``None``, ``NaN``, ``pd.NA``) are treated as unticked:
    ``bool(float("nan"))`` is ``True`` and ``bool(pd.NA)`` raises, so a
    plain ``bool`` cast would either select the row or crash.

    Args:
        table_df: Current admin table value, or ``None``.

    Returns:
        Ids (as ``str``) of the ticked rows, in table order. Empty when
        the frame is ``None``, has no rows, or lacks either the select
        column or ``submission_id``. Rows with a missing or empty id are
        skipped.
    """
    if (
        table_df is None
        or len(table_df) == 0
        or ADMIN_SELECT_COL not in table_df.columns
        or "submission_id" not in table_df.columns
    ):
        return []

    def _is_ticked(value: object) -> bool:
        # A missing cell is "not ticked", never truthy.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return False
        return bool(value)

    # astype(bool) guarantees a boolean mask even if map() yields object dtype.
    mask = table_df[ADMIN_SELECT_COL].map(_is_ticked).astype(bool)
    # dropna() keeps a missing id from turning into the literal "nan".
    picked = table_df.loc[mask, "submission_id"].dropna()
    return [str(s) for s in picked.tolist() if s]
def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Build the selection-count line shown under the admin table.

    Args:
        table_df: Current admin table value, or ``None``.

    Returns:
        Markdown with the number of ticked rows, or a "nothing
        selected" hint when the count is zero.
    """
    count = len(_selected_ids(table_df))
    if count:
        return f"**{count}** row(s) selected."
    return "_No rows selected._"
def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
    gr.Button, str,
]:
    """Enable the admin controls only for a logged-in member of the admin set.

    Wired to ``blocks.load``. Besides gating, it reloads the admin table
    so the visitor sees current rows rather than the ones captured when
    the process started. Anyone who is not an admin gets a read-only
    table and disabled controls; every handler re-checks server-side.
    Both destructive buttons always come back disabled, and the confirm
    checkbox is reset, so deleting needs a fresh tick.

    Args:
        profile: The visitor's OAuth profile, or ``None``.

    Returns:
        Updates for, in order: the admin table, the method radio, the
        promote button, the demote button, the confirm checkbox, the
        delete button, the stop-and-delete button, and the status text.
    """
    table, load_error = _safe_load_admin()
    if load_error:
        gr.Warning(f"Admin table unavailable: {load_error}")
    allowed = is_admin(profile)
    if profile is None:
        status = "Log in with an admin account to enable the controls below."
    elif not allowed:
        status = (
            f"Signed in as `{profile.username}`, which is not in the admin "
            "set. Controls are disabled."
        )
    else:
        status = f"Signed in as `{profile.username}`. Admin controls enabled."
    table_update = gr.Dataframe(value=table, interactive=allowed)
    method_update = gr.Radio(interactive=allowed)
    promote_update = gr.Button(interactive=allowed)
    demote_update = gr.Button(interactive=allowed)
    confirm_update = gr.Checkbox(interactive=allowed, value=False)
    # Destructive buttons are armed only by the confirm checkbox.
    delete_update = gr.Button(interactive=False)
    stop_delete_update = gr.Button(interactive=False)
    return (
        table_update,
        method_update,
        promote_update,
        demote_update,
        confirm_update,
        delete_update,
        stop_delete_update,
        status,
    )
def _arm_delete(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Arm or disarm both destructive buttons from the confirm checkbox.

    Plain delete and stop-and-delete share one confirm checkbox. They
    become clickable only when it is ticked and the caller is an admin.

    Args:
        confirm: Current state of the confirm checkbox.
        profile: The visitor's OAuth profile, or ``None``.

    Returns:
        Updates for the delete and stop-and-delete buttons, in that order.
    """
    # The admin check is only consulted once the box is actually ticked.
    armed = is_admin(profile) if confirm else False
    delete_update = gr.Button(interactive=armed)
    stop_delete_update = gr.Button(interactive=armed)
    return delete_update, stop_delete_update
def _refresh_admin_table() -> pd.DataFrame:
    """Handle the Admin tab's Refresh button.

    Reloads through :func:`_safe_load_admin`, so a failed Hub read
    shows a ``gr.Warning`` toast and an empty table instead of raising.

    Returns:
        The freshly loaded admin table (empty on failure).
    """
    table, load_error = _safe_load_admin()
    if load_error:
        gr.Warning(f"Admin table unavailable: {load_error}")
    return table
def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry previously ticked rows over onto a freshly loaded admin frame.

    Rows are matched on ``submission_id`` compared as strings. Ids in
    ``selected`` that no longer exist in ``fresh`` simply match nothing.
    ``fresh`` is modified in place and also returned.

    Args:
        fresh: Newly loaded admin table.
        selected: Submission ids that were ticked before the reload.

    Returns:
        ``fresh``, with its select column rewritten when ``selected`` is
        non-empty and both required columns are present; otherwise
        untouched.
    """
    if not selected:
        return fresh
    columns = fresh.columns
    if ADMIN_SELECT_COL in columns and "submission_id" in columns:
        ids_as_text = fresh["submission_id"].astype(str)
        fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh
def _auto_refresh_admin_table(current_df: pd.DataFrame | None) -> pd.DataFrame:
    """Handle a Timer tick for the admin table, keeping ticked rows ticked.

    Keeps the admin table on the same cadence as the leaderboard tables
    so rows that arrive after the tab loaded appear without a manual
    Refresh. The handler raises no toast. When the reload fails it
    hands back the frame already on screen, so a transient error
    neither blanks the table nor drops the selection.

    Args:
        current_df: The table value currently displayed, or ``None``.

    Returns:
        The reloaded table with the prior selection re-applied. On a
        failed reload, ``current_df``, or the empty fallback frame when
        ``current_df`` is ``None``.
    """
    fresh, load_error = _safe_load_admin()
    if not load_error:
        previously_ticked = set(_selected_ids(current_df))
        return _reapply_selection(fresh, previously_ticked)
    if current_df is None:
        return fresh
    return current_df
def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote the ticked rows, then refresh admin, leaderboard, and gallery.

    :func:`admin.is_admin` is re-checked here, server-side, so a
    tampered client that re-enables the button still cannot write.

    If the reload after a successful promote fails, a ``gr.Warning`` is
    raised for each failed read, so tables that come back empty are
    explained rather than silently blank.

    Args:
        table_df: Current admin table value (carries the ticked rows).
        method: Evidence type to record on the promoted rows.
        profile: The caller's OAuth profile, or ``None``.

    Returns:
        ``(admin_table, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: Caller is not an admin, no row is ticked, no method is
            chosen, or ``promote_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(ids, method)
    except (LookupError, ValueError) as e:
        # Chain the cause so the original traceback survives in the logs.
        raise gr.Error(str(e)) from e
    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote the ticked rows, then refresh admin, leaderboard, and gallery.

    Admin membership is re-checked server-side before anything is
    written. If the reload after a successful demote fails, a
    ``gr.Warning`` is raised for each failed read, so tables that come
    back empty are explained rather than silently blank.

    Args:
        table_df: Current admin table value (carries the ticked rows).
        profile: The caller's OAuth profile, or ``None``.

    Returns:
        ``(admin_table, validated, unvalidated, gallery_html)``.

    Raises:
        gr.Error: Caller is not an admin, no row is ticked, or
            ``demote_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain the cause so the original traceback survives in the logs.
        raise gr.Error(str(e)) from e
    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Delete the ticked rows, refresh every view, and disarm the controls.

    On the way out the confirm checkbox is cleared and both destructive
    buttons are disabled again, so the next deletion needs a fresh,
    deliberate confirm. Admin membership and the confirm flag are both
    re-checked server-side.

    If the reload after a successful delete fails, a ``gr.Warning`` is
    raised for each failed read, so tables that come back empty are
    explained rather than silently blank.

    Args:
        table_df: Current admin table value (carries the ticked rows).
        confirm: State of the confirm checkbox.
        profile: The caller's OAuth profile, or ``None``.

    Returns:
        ``(admin_table, validated, unvalidated, gallery_html,
        confirm_checkbox, delete_button, stop_delete_button)``.

    Raises:
        gr.Error: Caller is not an admin, the confirm box is not ticked,
            no row is ticked, or ``delete_rows`` rejected the request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(ids)
    except ValueError as e:
        # Chain the cause so the original traceback survives in the logs.
        raise gr.Error(str(e)) from e
    gr.Info(f"Deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop the eval job(s) for the ticked rows, delete them, then disarm.

    Same gating and disarm contract as :func:`_admin_delete`. The only
    difference is the call to :func:`admin.stop_and_delete_rows`, which
    per the original notes best-effort cancels the submissions'
    in-flight HF Jobs before deleting. Intended for pending rows whose
    eval is still running.

    If the reload after a successful delete fails, a ``gr.Warning`` is
    raised for each failed read, so tables that come back empty are
    explained rather than silently blank.

    Args:
        table_df: Current admin table value (carries the ticked rows).
        confirm: State of the confirm checkbox.
        profile: The caller's OAuth profile, or ``None``.

    Returns:
        ``(admin_table, validated, unvalidated, gallery_html,
        confirm_checkbox, delete_button, stop_delete_button)``.

    Raises:
        gr.Error: Caller is not an admin, the confirm box is not ticked,
            no row is ticked, or ``stop_and_delete_rows`` rejected the
            request.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        # Chain the cause so the original traceback survives in the logs.
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    validated, unvalidated, split_error = _safe_load_split()
    if split_error:
        gr.Warning(f"Leaderboard data unavailable: {split_error}")
    admin_df, admin_error = _safe_load_admin()
    if admin_error:
        gr.Warning(f"Admin table unavailable: {admin_error}")
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
@lru_cache(maxsize=128)
def _download_report_html(submission_id: str) -> bytes:
    """Download ``reports/<id>.html`` and return its bytes, raising on failure.

    Only successful downloads are memoized: ``lru_cache`` does not
    store a call that raised, so a failed lookup is retried next time.
    """
    # NOTE(review): each cached value is a whole report body. If reports
    # are large, 128 entries can hold a lot of memory; confirm the bound.
    local_path = hf_hub_download(
        repo_id=HF_SUBMISSIONS_REPO,
        filename=f"reports/{submission_id}.html",
        repo_type="dataset",
    )
    return Path(local_path).read_bytes()


def _fetch_report_html(submission_id: str) -> bytes | None:
    """Pull ``reports/<id>.html`` off the submissions dataset.

    Successful fetches are cached in-process so repeat clicks on the
    same row don't hit the Hub. Failures are deliberately **not**
    cached: a report that is not uploaded yet, or a transient Hub
    error, must not pin a 404 for that id until the process restarts.

    Args:
        submission_id: Id of the submission whose report is wanted.

    Returns:
        The report bytes, or ``None`` on any failure so the caller can
        serve a clean 404 rather than leaking a stack trace.
    """
    try:
        return _download_report_html(submission_id)
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id, type(e).__name__, e,
        )
        return None


# Expose the cache hooks under the old name, in case anything calls them.
_fetch_report_html.cache_clear = _download_report_html.cache_clear
_fetch_report_html.cache_info = _download_report_html.cache_info
def serve_report(submission_id: str) -> Response:
    """Serve a per-submission HTML report with a renderable content type.

    Per the module notes, the Hub serves dataset HTML under
    ``/resolve/`` as ``text/plain``, so a direct dataset link shows
    source instead of rendering. This Space route returns the same
    bytes as ``text/html``.

    Args:
        submission_id: Id taken from the ``/reports/{submission_id}.html``
            path.

    Returns:
        The report as ``text/html; charset=utf-8``, or a small HTML 404
        page when the report could not be fetched.
    """
    body = _fetch_report_html(submission_id)
    if body is not None:
        return Response(content=body, media_type="text/html; charset=utf-8")
    return HTMLResponse(
        content="<h1>Report not found</h1>",
        status_code=404,
    )
def _fetch_render(submission_id: str, fixture: str) -> bytes | None:
    """Fetch a submission's gallery WebP for one fixture.

    The file is ``renders/<id>/<fixture>/rotating.webp`` in the
    submissions dataset. The result is intentionally **not** memoized:
    renders appear over time (a submission completes, or an older row
    is backfilled), so remembering an early miss would leave the tile
    dashed until the next restart. The original notes rely on
    ``hf_hub_download``'s own per-revision disk cache to keep re-fetches
    of an unchanged file cheap.

    Args:
        submission_id: Id of the submission.
        fixture: Fixture name.

    Returns:
        The WebP bytes, or ``None`` on any failure (the gallery then
        draws the dashed cell).
    """
    try:
        cached_file = hf_hub_download(
            repo_id=HF_SUBMISSIONS_REPO,
            filename=f"renders/{submission_id}/{fixture}/rotating.webp",
            repo_type="dataset",
        )
        image_bytes = Path(cached_file).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch render %s/%s (%s: %s)",
            submission_id, fixture, type(exc).__name__, exc,
        )
        image_bytes = None
    return image_bytes
def _fetch_gt_render(fixture: str) -> bytes | None:
    """Fetch a fixture's ground-truth WebP from the private GT dataset.

    The file is ``<fixture>/renders/rotating.webp`` in the GT repo. GT
    renders belong to the data revision rather than to any submission,
    so they are read from the GT repo directly instead of being copied
    per submission. Not memoized, for the same reason as
    :func:`_fetch_render`: GT renders can be added or updated on a data
    revision bump. Per the original notes, the Space ``HF_TOKEN`` needs
    read scope on the private repo.

    Args:
        fixture: Fixture name.

    Returns:
        The WebP bytes, or ``None`` on any failure.
    """
    try:
        cached_file = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=f"{fixture}/renders/rotating.webp",
            repo_type="dataset",
        )
        image_bytes = Path(cached_file).read_bytes()
    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture, type(exc).__name__, exc,
        )
        image_bytes = None
    return image_bytes
# Long-lived immutable caching: a (submission, fixture) render never
# changes (fixed camera + lighting; re-renders would be a new artifact),
# so the browser/CDN can keep it forever. This is what makes fixture
# swaps and repeat visits free: only the ~33 on-screen turntables are
# fetched on first paint, and everything after that is a cache hit.
# NOTE(review): `serve_gt_render` sends this same header, but its URL
# (`/gt-render/<fixture>.webp`) carries no data revision, and the
# `_fetch_gt_render` docstring says GT renders can be updated on a
# revision bump. A browser may then keep a stale GT render for the full
# max-age. Confirm this is acceptable.
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"
def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Build the proxy URL for a submission's per-fixture WebP.

    Only the route string is produced; no bytes are fetched here. The
    browser requests the image lazily, so only tiles that are actually
    shown get loaded. The path is absolute. Per the original notes it
    resolves against the Space origin even inside the iframe
    ``srcdoc``, a render that 404s falls back to the dashed cell via
    the gallery's ``<img onerror>`` hook, and the Space must be public
    for in-browser fetches to these routes to succeed.

    Args:
        submission_id: Id of the submission.
        fixture: Fixture name.

    Returns:
        ``/render/<submission_id>/<fixture>.webp``.
    """
    route = f"/render/{submission_id}/{fixture}.webp"
    return route
def _gt_proxy_url(fixture: str) -> str | None:
    """Build the proxy URL for a fixture's ground-truth WebP.

    Args:
        fixture: Fixture name.

    Returns:
        ``/gt-render/<fixture>.webp``; no bytes are fetched here.
    """
    route = f"/gt-render/{fixture}.webp"
    return route
def serve_render(submission_id: str, fixture: str) -> Response:
    """Serve a submission's per-fixture WebP with a long-lived cache header.

    Backs the ``/render/<id>/<fixture>.webp`` URLs the gallery points
    at. The bytes come from the submissions dataset and are sent with
    ``RENDER_CACHE_CONTROL`` so browsers and CDNs cache them hard.

    Args:
        submission_id: Id taken from the request path.
        fixture: Fixture name taken from the request path.

    Returns:
        An ``image/webp`` response, or an empty 404 when the render
        could not be fetched.
    """
    payload = _fetch_render(submission_id, fixture)
    if payload is not None:
        return Response(
            content=payload,
            media_type="image/webp",
            headers={"Cache-Control": RENDER_CACHE_CONTROL},
        )
    return Response(status_code=404)
def serve_gt_render(fixture: str) -> Response:
    """Serve a fixture's ground-truth WebP with a long-lived cache header.

    Args:
        fixture: Fixture name taken from the request path.

    Returns:
        An ``image/webp`` response carrying ``RENDER_CACHE_CONTROL``, or
        an empty 404 when the render could not be fetched.
    """
    payload = _fetch_gt_render(fixture)
    if payload is not None:
        return Response(
            content=payload,
            media_type="image/webp",
            headers={"Cache-Control": RENDER_CACHE_CONTROL},
        )
    return Response(status_code=404)
def _gallery_iframe_html() -> str:
    """Render the gallery as a self-contained ``srcdoc`` iframe.

    Loads the live rows and renders the gallery page; the two URL
    resolvers are passed in so turntables are referenced by proxy URL
    rather than embedded. The document is HTML-escaped and inlined as
    the iframe's ``srcdoc`` so it keeps its own style context, separate
    from Gradio's CSS. A failed row load is logged and produces an
    empty gallery instead of breaking the tab.

    Returns:
        The ``<iframe>`` markup.
    """
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")
        rows = []
    page = render_gallery_page(rows, _render_proxy_url, _gt_proxy_url)
    # quote=True also escapes quotes, which the attribute value requires.
    srcdoc = html.escape(page, quote=True)
    attributes = [
        f'srcdoc="{srcdoc}"',
        'style="width:100%; height:90vh; border:0; display:block;"',
        'title="CADGenBench gallery"',
    ]
    return "<iframe " + " ".join(attributes) + "></iframe>"
# UI definition. Five tabs (Gallery, Leaderboard, Submit, About, Admin)
# plus the Blocks-level timer and page-load handlers at the end.
# Handlers that declare a `gr.OAuthProfile` parameter are not given it
# in `inputs`; Gradio supplies it from the type annotation.
with gr.Blocks(title="CADGenBench Leaderboard", theme=gr.themes.Soft()) as blocks:
    gr.Markdown(
        "# CADGenBench Leaderboard\n"
        "_Benchmarking AI-driven CAD generation._"
    )
    with gr.Tab("Gallery"):
        # Visual-first leaderboard. The bespoke surface (sticky GT row,
        # fixture picker, turntable grid, compare modal) is a
        # self-contained HTML doc inlined into an iframe `srcdoc` so it
        # keeps its own style context. Thumbnails are lazy-loaded from
        # the cached `/render` / `/gt-render` proxy routes (requires the
        # Space to be public). Built at boot, rebuilt on page load, and
        # refreshed after admin actions.
        gallery_html = gr.HTML(value=_gallery_iframe_html())
        gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
        gallery_refresh_btn.click(
            fn=_gallery_iframe_html, outputs=gallery_html,
        )
    with gr.Tab("Leaderboard"):
        # Load both tiers once at boot. `_safe_load_split` keeps a Hub
        # read failure from crashing the Space: on failure the frames
        # come up empty and `initial_error` carries the message the
        # banner renders.
        initial_validated, initial_unvalidated, initial_error = _safe_load_split()
        # Loud, persistent banner shown only when the live results
        # can't be read from the Hub (e.g. an under-scoped Space
        # HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
        # leaderboard never falls back to stale/bundled data, so this
        # banner is the signal that empty tables are a read failure,
        # not a genuinely empty leaderboard.
        data_error_banner = gr.Markdown(
            value=_data_error_banner_md(initial_error),
            visible=initial_error is not None,
        )
        # Collapsed accordions above the tables. Validation guidelines
        # gives the short two-tier story + link to the full policy
        # doc; Citation carries the verbatim BibTeX entry. Both start
        # closed so the leaderboard itself stays above the fold.
        with gr.Accordion("Validation guidelines", open=False):
            gr.Markdown(VALIDATION_GUIDELINES_MD)
        with gr.Accordion("Citation", open=False):
            # language=None -> plain monospaced render (gr.Code doesn't
            # ship a BibTeX highlighter); show_line_numbers off because
            # the entry is meant to be copy-pasted, not annotated.
            gr.Code(
                value=CITATION_BIBTEX,
                language=None,
                show_line_numbers=False,
            )
        # Two stacked tables, split by `validation_status`. Validated
        # on top so the curated results are above the fold; unvalidated
        # below carries every other row (auto-published, awaiting
        # methodology review). See decisions/validation-policy.md.
        # Initial values come from the boot-time `_safe_load_split`
        # above (empty + banner on a Hub read failure).
        validated_view = Leaderboard(
            value=initial_validated,
            datatype=VALIDATED_LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Validated Leaderboard",
            interactive=False,
        )
        unvalidated_view = Leaderboard(
            value=initial_unvalidated,
            datatype=LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Unvalidated Leaderboard",
            interactive=False,
        )
        with gr.Row():
            refresh_btn = gr.Button("Refresh", size="sm")
            # One file, both tables, `validation_status` discriminator
            # column. Fresh CSV is generated on every click so the
            # download reflects the latest data, not a stale snapshot
            # captured at boot.
            download_btn = gr.DownloadButton(
                label="Download CSV", size="sm",
            )
        refresh_btn.click(
            fn=_refresh_leaderboard_with_toast,
            outputs=[validated_view, unvalidated_view, data_error_banner],
        )
        # The button is its own output: the handler's return value
        # becomes what the button downloads.
        download_btn.click(fn=build_combined_csv, outputs=download_btn)
        # No inline row-click detail panel: the submission_name cell is a
        # deep-link that opens the self-contained per-submission report in
        # a new tab (see `_submission_name_md` in leaderboard.py). Now that
        # the Space is public, HF's edge serves `/reports/<id>.html` to
        # browser users, so we link to it directly instead of inlining the
        # (tens-to-hundreds-of-MB) report through the Gradio event payload.
    with gr.Tab("Submit"):
        # The literal below is an f-string (only the dataset repo id is
        # interpolated), hence the doubled braces around the JSON sample.
        gr.Markdown(
            f"""
**Submission format.** A single zip with:
- one folder per fixture in `{HF_DATA_REPO}`; include `output.step` for
fixtures where your system produced a candidate. Missing `output.step`
scores zero for that fixture;
- a top-level `meta.json`:
```json
{{
"submitter_name": "your name or team",
"submission_name": "MyAgent v2.3 (or whatever describes your system)",
"agent_url": "https://github.com/... (optional)",
"notes": "free text, optional, max 500 chars, single line, plain text",
"agree_to_publish": true
}}
```
**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.
"""
        )
        # OAuth gate. The user must log in via the HF button before
        # the Submit button becomes interactive; the row gets the
        # canonical `hf_username` from `gr.OAuthProfile.username`
        # (not a free-text claim in meta.json). README front-matter
        # already carries `hf_oauth: true` so HF's OAuth integration
        # is wired up at the Space level.
        login_btn = gr.LoginButton()
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        # Starts disabled; the `blocks.load` handler below flips it
        # to interactive when an OAuthProfile is present.
        submit_btn = gr.Button("Submit", variant="primary", interactive=False)
        # Persistent status panel. handle_submit is a generator that
        # streams stage updates (validating -> uploading/queuing ->
        # queued) and any rejection reason here, so the outcome
        # survives instead of vanishing with a transient toast. The
        # handler also reads `gr.OAuthProfile` implicitly via its
        # parameter type annotation (Gradio's dependency-injection
        # convention).
        submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
        submit_btn.click(
            fn=handle_submit,
            inputs=[zip_in],
            outputs=[submit_status],
        )
    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)
    with gr.Tab("Admin"):
        # Maintainer-only controls. The tab is visible to everyone (a
        # hint the path exists); the table + buttons are gated to OAuth
        # users in the CADGENBENCH_ADMINS set via the `blocks.load`
        # handler below + a server-side re-check in every handler. See
        # decisions/validation-policy.md.
        gr.Markdown(
            "## Admin\n"
            "Tick rows in the **select** column, then promote them into the "
            "**Validated** tier (recording an evidence type), demote them back "
            "to **Unvalidated**, or delete them. Actions apply to every ticked "
            "row at once. Limited to maintainers in the admin set; everyone "
            "else sees the tab with the controls disabled."
        )
        admin_login_btn = gr.LoginButton()
        admin_status = gr.Markdown(
            "Log in with an admin account to enable the controls below."
        )
        # Only the leading `select` column is editable; the rest is
        # read-only context. Click-to-tick drives every action below.
        # `_safe_load_admin` keeps a Hub read failure from crashing the
        # Space at boot (the admin table loads at construction time).
        initial_admin_table, _ = _safe_load_admin()
        # NOTE(review): `datatype` hard-codes nine entries, while
        # `static_columns` is derived from len(ADMIN_COLUMNS); presumably
        # ADMIN_COLUMNS has nine columns -- confirm they stay in step.
        admin_table = gr.Dataframe(
            value=initial_admin_table,
            datatype=[
                "bool", "str", "str", "str", "str", "str", "str", "number",
                "str",
            ],
            static_columns=list(range(1, len(ADMIN_COLUMNS))),
            interactive=False,
            label="Submissions (tick select to choose rows)",
            wrap=True,
        )
        admin_selection_md = gr.Markdown("_No rows selected._")
        admin_method_radio = gr.Radio(
            choices=list(VALID_METHODS),
            value="manual",
            label="validation_method (applied to all rows on promote)",
            interactive=False,
        )
        with gr.Row():
            promote_btn = gr.Button(
                "Mark validated", variant="primary", interactive=False,
            )
            demote_btn = gr.Button("Mark unvalidated", interactive=False)
        with gr.Accordion("Danger zone: delete", open=False):
            gr.Markdown(
                "Permanently deletes the ticked rows **and** their uploaded "
                "zip + report files from the submissions dataset. This cannot "
                "be undone (only a manual revert of the dataset commit).\n\n"
                "**Stop & delete** additionally cancels any still-running "
                "evaluation job(s) for the ticked rows before deleting — use "
                "it for pending submissions whose GPU eval is in flight."
            )
            delete_confirm = gr.Checkbox(
                label=(
                    "I understand this permanently deletes the selected "
                    "submissions and their files."
                ),
                value=False,
                interactive=False,
            )
            with gr.Row():
                delete_btn = gr.Button(
                    "Delete selected", variant="stop", interactive=False,
                )
                stop_delete_btn = gr.Button(
                    "Stop & delete selected", variant="stop",
                    interactive=False,
                )
        admin_refresh_btn = gr.Button("Refresh", size="sm")
        # Live selection count under the table.
        admin_table.change(
            fn=_admin_selection_status,
            inputs=admin_table,
            outputs=admin_selection_md,
        )
        # Every admin action also refreshes both leaderboard tables and
        # the gallery, so all views reflect the change at once.
        promote_btn.click(
            fn=_admin_promote,
            inputs=[admin_table, admin_method_radio],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        demote_btn.click(
            fn=_admin_demote,
            inputs=[admin_table],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        # Ticking the confirm box arms both destructive buttons.
        delete_confirm.change(
            fn=_arm_delete,
            inputs=[delete_confirm],
            outputs=[delete_btn, stop_delete_btn],
        )
        # The delete handlers also reset the confirm box and both
        # destructive buttons (the last three outputs).
        delete_btn.click(
            fn=_admin_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        stop_delete_btn.click(
            fn=_admin_stop_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
        # Keep the admin table on the same 10s cadence as the leaderboard
        # so a row that lands (or a pending row that completes) after the
        # tab loaded shows up without a manual Refresh. Selection is
        # preserved across ticks so an in-progress set of checkboxes
        # survives the reload.
        admin_auto_refresh_timer = gr.Timer(10)
        admin_auto_refresh_timer.tick(
            fn=_auto_refresh_admin_table,
            inputs=admin_table,
            outputs=admin_table,
        )
    # gradio_leaderboard.Leaderboard handles its own update path
    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
    # Single tick runs `_auto_refresh_leaderboard` once and pushes the
    # two halves into the validated / unvalidated widgets plus the
    # data-unavailable banner. The handler swallows a Hub read failure
    # into empty frames + a loud warning toast so a degraded read never
    # crashes the tick loop or silently blanks the tables.
    auto_refresh_timer = gr.Timer(10)
    auto_refresh_timer.tick(
        fn=_auto_refresh_leaderboard,
        outputs=[validated_view, unvalidated_view, data_error_banner],
    )
    # On page load, read the visitor's OAuth profile (None if not
    # logged in) and flip the Submit button's interactivity. Runs once
    # per page load; LoginButton clicks also re-trigger this through
    # Gradio's auth-event plumbing.
    blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
    # Rebuild the gallery on every page load so it is not pinned to the
    # rows that existed when the process booted.
    blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
    # Same per-load OAuth read, gating the Admin tab's controls on
    # membership in the CADGENBENCH_ADMINS set. Logged-out / non-admin
    # visitors get the tab with everything disabled.
    blocks.load(
        fn=_gate_admin_controls,
        outputs=[
            admin_table,
            admin_method_radio,
            promote_btn,
            demote_btn,
            delete_confirm,
            delete_btn,
            stop_delete_btn,
            admin_status,
        ],
    )
# The Gradio UI is mounted under a FastAPI parent so the custom proxy
# routes share the UI's origin. Routes added directly to `app` are
# registered before the Gradio mount, so the catch-all sub-app at "/"
# never shadows them.
app = FastAPI()
# (path template, handler) pairs, registered in this order: the report
# proxy first, then the two render proxies used by the gallery's
# lazy-loaded turntables.
for _route_path, _route_handler in (
    ("/reports/{submission_id}.html", serve_report),
    ("/render/{submission_id}/{fixture}.webp", serve_render),
    ("/gt-render/{fixture}.webp", serve_gt_render),
):
    app.add_api_route(_route_path, _route_handler, methods=["GET"])
app = gr.mount_gradio_app(app, blocks, path="/")
if __name__ == "__main__":
    # Host and port come from the Gradio env vars, with the same
    # defaults the original hard-coded as fallbacks.
    uvicorn.run(
        app,
        host=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
    )