CADGenBench / app.py
Michael Rabinovich
Gallery: show GT "answer-key" edit-diff for editing fixtures
49e27be
# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.
Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
:mod:`submit`. Both are wired into the Gradio Blocks below. The
Gradio app is mounted under a FastAPI parent so the custom
``/reports/{submission_id}.html`` route can re-serve dataset HTML
with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
as ``text/plain`` by policy, which makes the browser show source
rather than render).
"""
from __future__ import annotations
import base64
import html
import logging
import mimetypes
import os
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, Response
from gradio_leaderboard import Leaderboard
from huggingface_hub import hf_hub_download, snapshot_download
from leaderboard import (
ADMIN_COLUMNS,
ADMIN_SELECT_COL,
HF_DATA_GT_REPO,
HF_DATA_REPO,
HF_SUBMISSIONS_REPO,
LEADERBOARD_COLS,
LEADERBOARD_DATATYPES,
LEADERBOARD_HIDE_COLUMNS,
VALIDATED_LEADERBOARD_COLS,
VALIDATED_LEADERBOARD_DATATYPES,
LeaderboardDataError,
_fmt_timestamp,
_load_rows_from_hub,
build_combined_csv,
load_admin_table,
load_leaderboard_split,
render_public_url,
)
from gallery import render_gallery_page
from metrics_page import build_metrics_page
from tasks import load_tasks_from_dir, render_tasks_page
from admin import (
VALID_METHODS,
delete_rows,
demote_rows,
is_admin,
promote_rows,
rescore_all,
rescore_rows,
stop_and_delete_rows,
)
from submit import handle_submit
logger = logging.getLogger(__name__)
# Surface module-level logger.info / logger.warning / logger.exception
# calls from leaderboard.py + submit.py in the Space's runtime logs.
# Otherwise they go nowhere and any refresh / worker pathology is
# silent. Format keeps timestamps + module + level + message.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
# Canonical policy doc lives in the code repo so contributors reading
# the GitHub repo see it without needing to visit the Space. Linked
# from both the Detailed View tab's Validation Guidelines accordion and
# the About tab.
VALIDATION_DOC_URL = (
"https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
)
# Canonical submission contract (output layout, validity gate, canonical
# pose, local self-check). Linked from the Submit tab so the tab itself
# stays a short "how to package + upload" note rather than re-documenting
# the full contract.
SUBMISSION_DOC_URL = (
"https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
)
ABOUT_MD = f"""## About
**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.
- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
[`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
submissions and computed results in
[`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""
# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
author = {Rabinovich, Michael and {Hugging Face}},
title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
year = {2026},
publisher = {Hugging Face},
howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/CADGenBench}},
}"""
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).
Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""
SUBMIT_STATUS_IDLE = (
"_Log in, attach a zip, and click **Submit**. Progress and any "
"errors appear here._"
)
def _data_error_banner_md(message: str | None) -> str:
"""Markdown for the top-of-tab data-unavailable banner.
Empty string when there's no error (the banner is also hidden via
``visible=False`` in that case). When the live ``results.jsonl``
can't be read, the banner is the loud, persistent signal that the
tables below are empty *by design* (we never fall back to stale or
bundled data) rather than because the leaderboard is genuinely
empty.
"""
if not message:
return ""
return (
"> ⚠️ **Leaderboard data unavailable.** The live results could not "
"be read from the Hub, so the tables below are empty. No stale or "
"cached data is ever shown in its place.\n>\n"
f"> Details: `{message}`"
)
def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
"""Load both tiers, turning a Hub failure into empty frames + a message.
The reader (:func:`load_leaderboard_split`) deliberately *raises*
on any read failure (no silent fallback). The Space, however, must
stay up and loudly surface the failure rather than crash, so this
wrapper converts :class:`LeaderboardDataError` into empty,
correctly-shaped DataFrames plus an error string the caller renders
in the banner / a toast. Returns ``(validated, unvalidated, error)``
with ``error`` ``None`` on success.
"""
try:
validated, unvalidated = load_leaderboard_split()
return validated, unvalidated, None
except LeaderboardDataError as e:
logger.exception("Leaderboard data load failed")
return (
pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
pd.DataFrame(columns=LEADERBOARD_COLS),
str(e),
)
def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
"""Admin-table counterpart to :func:`_safe_load_split`.
Same no-crash contract: a Hub read failure yields an empty,
correctly-shaped admin frame plus the error string instead of
propagating the exception (which would take the whole Space down at
boot, since the admin table loads at module-construction time).
"""
try:
return load_admin_table(), None
except LeaderboardDataError as e:
logger.exception("Admin table load failed")
return pd.DataFrame(columns=ADMIN_COLUMNS), str(e)
def _refresh_leaderboard_with_toast():
"""Manual Refresh button handler: toast + fresh DataFrames + banner.
Surfaces the outcome loudly either way: ``gr.Info`` on success,
``gr.Warning`` when the live read failed. The third output keeps
the data-unavailable banner in sync (shown with the error,
cleared on success).
"""
validated, unvalidated, error = _safe_load_split()
if error:
gr.Warning(f"Leaderboard data unavailable: {error}")
else:
gr.Info("Leaderboard refreshed.")
return (
validated,
unvalidated,
gr.Markdown(value=_data_error_banner_md(error), visible=error is not None),
)
def _auto_refresh_leaderboard():
"""Timer-tick handler: fresh DataFrames + banner, no success toast.
Mirrors :func:`_refresh_leaderboard_with_toast` but stays silent on
success (a toast every 10s would be noise). A read failure still
fires a loud ``gr.Warning`` and updates the banner so a degraded
Hub read can't quietly leave the tables blank.
"""
validated, unvalidated, error = _safe_load_split()
if error:
gr.Warning(f"Leaderboard data unavailable: {error}")
return (
validated,
unvalidated,
gr.Markdown(value=_data_error_banner_md(error), visible=error is not None),
)
def _enable_submit_when_logged_in(
profile: gr.OAuthProfile | None,
) -> gr.Button:
"""Flip the Submit button's interactivity based on login state.
Runs once per page load via ``blocks.load``. Gradio injects
``gr.OAuthProfile`` automatically (``None`` if the visitor isn't
logged in via the LoginButton). The visible-disable mirrors the
server-side gate in :func:`submit.handle_submit`; the handler
still raises ``gr.Error`` defensively if it ever gets called
without a profile.
"""
return gr.Button(interactive=profile is not None)
def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
"""Submission ids of the rows whose ``select`` checkbox is ticked."""
if (
table_df is None
or len(table_df) == 0
or ADMIN_SELECT_COL not in table_df.columns
or "submission_id" not in table_df.columns
):
return []
mask = table_df[ADMIN_SELECT_COL].apply(bool)
return [str(s) for s in table_df.loc[mask, "submission_id"].tolist() if s]
def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
"""Live count line under the admin table, updated as boxes are ticked."""
n = len(_selected_ids(table_df))
return f"**{n}** row(s) selected." if n else "_No rows selected._"
def _gate_admin_controls(
profile: gr.OAuthProfile | None,
) -> tuple[
gr.Column, gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox,
gr.Button, gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
]:
"""Reveal the admin panel only for a logged-in user in the admin set.
Runs on every page load and re-runs on LoginButton auth events. The
entire admin panel (table + every control) lives in a column that
stays hidden unless the visitor is logged in AND in the admin set, so
non-admins and logged-out visitors see only the login/logout button
and a status line -- no table, no buttons. For admins the panel is
shown, its controls enabled, and the table refreshed from live Hub
data. Data is only loaded into the table for admins, and a server-side
``is_admin`` re-check still guards every handler. The armed-by-
confirmation buttons (delete, stop-and-delete, rescore-selected,
rescore-all) always load disarmed: they only enable once their confirm
box is ticked / phrase typed.
"""
admin = is_admin(profile)
if admin:
admin_df, error = _safe_load_admin()
if error:
gr.Warning(f"Admin table unavailable: {error}")
else:
admin_df = _empty_admin_table()
if profile is None:
status = "Log in with an admin account to access the controls."
elif admin:
status = f"Signed in as `{profile.username}`. Admin controls enabled."
else:
status = (
f"Signed in as `{profile.username}`, which is not in the admin "
"set. You can log out with the button above."
)
return (
gr.Column(visible=admin),
gr.Dataframe(value=admin_df, interactive=admin),
gr.Radio(interactive=admin),
gr.Button(interactive=admin),
gr.Button(interactive=admin),
gr.Checkbox(interactive=admin, value=False),
gr.Button(interactive=False),
gr.Button(interactive=False),
gr.Checkbox(interactive=admin, value=False),
gr.Button(interactive=False),
gr.Textbox(interactive=admin, value=""),
gr.Button(interactive=False),
status,
)
def _arm_delete(
confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
"""Arm both destructive buttons once an admin ticks the confirm box.
The plain delete and the stop-and-delete share the single confirm
checkbox, so a deliberate tick is required before either fires.
"""
armed = bool(confirm) and is_admin(profile)
return gr.Button(interactive=armed), gr.Button(interactive=armed)
def _empty_admin_table() -> pd.DataFrame:
"""An admin frame with headers but no rows -- what non-admins get.
The admin panel is hidden from non-admins, but the table refreshers
still run server-side; returning an empty frame ensures no submission
data is ever streamed into a non-admin's (hidden) table.
"""
return pd.DataFrame(columns=list(ADMIN_COLUMNS))
def _refresh_admin_table(profile: gr.OAuthProfile | None) -> pd.DataFrame:
"""Admin Refresh button handler: reload the admin table, toast on failure.
Uses the no-crash :func:`_safe_load_admin` so a Hub read failure
surfaces as a loud ``gr.Warning`` plus an empty table rather than an
uncaught exception. Returns an empty frame to non-admins so a tampered
client can't pull the table out from behind the hidden panel.
"""
if not is_admin(profile):
return _empty_admin_table()
admin_df, error = _safe_load_admin()
if error:
gr.Warning(f"Admin table unavailable: {error}")
return admin_df
def _reapply_selection(
fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
"""Re-tick the ``select`` column on rows the maintainer had selected.
A freshly-loaded admin frame comes back all-unchecked; this carries
the prior ticks forward by ``submission_id`` so a background refresh
doesn't wipe an in-progress selection. Ids that vanished (e.g. a row
deleted out from under the table) simply drop out.
"""
if (
selected
and ADMIN_SELECT_COL in fresh.columns
and "submission_id" in fresh.columns
):
fresh[ADMIN_SELECT_COL] = (
fresh["submission_id"].astype(str).isin(selected)
)
return fresh
def _auto_refresh_admin_table(
current_df: pd.DataFrame | None,
profile: gr.OAuthProfile | None,
) -> pd.DataFrame:
"""Timer-tick handler: reload the admin table, preserving ticked rows.
The leaderboard tables auto-refresh every 10s but the admin table did
not, so a pending row submitted after the tab loaded stayed invisible
until a manual Refresh. This keeps it current on the same cadence.
Unlike the leaderboard handler it stays silent (no per-tick toast)
and, on a Hub read failure, returns the current frame unchanged so a
transient blip never blanks the table or drops the user's selection.
Non-admins get an empty frame so the (hidden) table is never fed data.
"""
if not is_admin(profile):
return _empty_admin_table()
admin_df, error = _safe_load_admin()
if error:
return current_df if current_df is not None else admin_df
return _reapply_selection(admin_df, set(_selected_ids(current_df)))
def _admin_promote(
table_df: pd.DataFrame | None,
method: str | None,
profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
"""Promote ticked rows, then refresh admin, leaderboard, and gallery.
Re-checks :func:`admin.is_admin` server-side so a tampered client
that re-enables the button still can't write.
"""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
ids = _selected_ids(table_df)
if not ids:
raise gr.Error("Tick at least one row first.")
if not method:
raise gr.Error("Pick a validation_method first.")
try:
promote_rows(ids, method)
except (LookupError, ValueError) as e:
raise gr.Error(str(e))
gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_demote(
table_df: pd.DataFrame | None,
profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
"""Demote ticked rows, then refresh admin, leaderboard, and gallery."""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
ids = _selected_ids(table_df)
if not ids:
raise gr.Error("Tick at least one row first.")
try:
demote_rows(ids)
except (LookupError, ValueError) as e:
raise gr.Error(str(e))
gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return admin_df, validated, unvalidated, _gallery_iframe_html()
def _admin_delete(
table_df: pd.DataFrame | None,
confirm: bool,
profile: gr.OAuthProfile | None,
) -> tuple[
pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
gr.Button,
]:
"""Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.
Resets the confirm checkbox and re-disables both destructive buttons
on the way out so the next deletion needs a fresh, deliberate confirm.
"""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
if not confirm:
raise gr.Error("Tick the confirmation box to enable delete.")
ids = _selected_ids(table_df)
if not ids:
raise gr.Error("Tick at least one row first.")
try:
delete_rows(ids)
except ValueError as e:
raise gr.Error(str(e))
gr.Info(f"Deleted {len(ids)} submission(s).")
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return (
admin_df,
validated,
unvalidated,
_gallery_iframe_html(),
gr.Checkbox(value=False),
gr.Button(interactive=False),
gr.Button(interactive=False),
)
def _admin_stop_delete(
table_df: pd.DataFrame | None,
confirm: bool,
profile: gr.OAuthProfile | None,
) -> tuple[
pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
gr.Button,
]:
"""Stop running eval job(s) for ticked rows, delete them, then disarm.
Same gating + disarm contract as :func:`_admin_delete`; the only
difference is it calls :func:`admin.stop_and_delete_rows`, which
best-effort cancels the submissions' in-flight HF Jobs before
deleting. Use this for pending rows whose GPU eval is still running.
"""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
if not confirm:
raise gr.Error("Tick the confirmation box to enable delete.")
ids = _selected_ids(table_df)
if not ids:
raise gr.Error("Tick at least one row first.")
try:
stop_and_delete_rows(ids)
except ValueError as e:
raise gr.Error(str(e))
gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return (
admin_df,
validated,
unvalidated,
_gallery_iframe_html(),
gr.Checkbox(value=False),
gr.Button(interactive=False),
gr.Button(interactive=False),
)
# Exact phrase an admin must type to arm the board-wide rescore. A
# free-text match (not a checkbox) is the deliberate "are you sure"
# friction: it can't be tripped by a stray click and forces the admin
# to consciously type the words before the heavy, score-invalidating
# action arms.
RESCORE_ALL_PHRASE = "RESCORE ALL"
def _arm_rescore_selected(
confirm: bool, profile: gr.OAuthProfile | None,
) -> gr.Button:
"""Arm the rescore-selected button once an admin ticks its confirm box."""
return gr.Button(interactive=bool(confirm) and is_admin(profile))
def _arm_rescore_all(
phrase: str | None, profile: gr.OAuthProfile | None,
) -> gr.Button:
"""Arm the rescore-all button only on an exact phrase match by an admin."""
matched = (phrase or "").strip() == RESCORE_ALL_PHRASE
return gr.Button(interactive=matched and is_admin(profile))
def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
"""Toast text summarising a rescore dispatch."""
msg = (
f"Rescoring {dispatched} submission(s): rows flipped to pending and "
f"re-evaluating in the background. The leaderboard repopulates as "
f"each finishes."
)
if skipped:
msg += (
f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
f"rows can't be rescored)."
)
return msg
def _admin_rescore_selected(
table_df: pd.DataFrame | None,
confirm: bool,
profile: gr.OAuthProfile | None,
) -> tuple[
pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
]:
"""Re-evaluate the ticked rows, refresh the views, then disarm.
Same gating contract as the destructive handlers: server-side
``is_admin`` re-check, an explicit confirm tick, and a non-empty
selection. Resets the confirm box + disarms the button on the way
out so the next rescore needs a fresh, deliberate confirm.
"""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
if not confirm:
raise gr.Error("Tick the confirmation box to enable rescore.")
ids = _selected_ids(table_df)
if not ids:
raise gr.Error("Tick at least one row first.")
try:
dispatched, skipped = rescore_rows(ids)
except (LookupError, ValueError) as e:
raise gr.Error(str(e))
gr.Info(_rescore_result_message(dispatched, skipped))
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return (
admin_df,
validated,
unvalidated,
_gallery_iframe_html(),
gr.Checkbox(value=False),
gr.Button(interactive=False),
)
def _admin_rescore_all(
phrase: str | None,
profile: gr.OAuthProfile | None,
) -> tuple[
pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
]:
"""Re-evaluate every rescoreable row, refresh the views, then disarm.
The heavy, board-wide action: re-checks ``is_admin`` and the exact
confirmation phrase server-side (so a tampered client that
re-enables the button still can't fire), clears the phrase box, and
disarms the button afterwards.
"""
if not is_admin(profile):
raise gr.Error("You are not in the admin set.")
if (phrase or "").strip() != RESCORE_ALL_PHRASE:
raise gr.Error(
f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
)
try:
dispatched, skipped = rescore_all()
except ValueError as e:
raise gr.Error(str(e))
gr.Info(_rescore_result_message(dispatched, skipped))
validated, unvalidated, _ = _safe_load_split()
admin_df, _ = _safe_load_admin()
return (
admin_df,
validated,
unvalidated,
_gallery_iframe_html(),
gr.Textbox(value=""),
gr.Button(interactive=False),
)
@lru_cache(maxsize=128)
def _fetch_report_html(submission_id: str) -> bytes | None:
"""Pull ``reports/<id>.html`` off the submissions dataset.
Cached in-process so repeat clicks on the same row don't hit
the Hub. Returns ``None`` on any failure so the caller can
serve a clean 404 rather than leaking a stack trace.
"""
try:
local_path = hf_hub_download(
repo_id=HF_SUBMISSIONS_REPO,
filename=f"reports/{submission_id}.html",
repo_type="dataset",
)
return Path(local_path).read_bytes()
except Exception as e: # noqa: BLE001 - any Hub failure -> 404
logger.warning(
"Failed to fetch report for %s (%s: %s)",
submission_id, type(e).__name__, e,
)
return None
def serve_report(submission_id: str) -> Response:
"""Proxy a per-submission HTML report through the Space.
HF Hub serves dataset HTML under ``/resolve/`` with
``Content-Type: text/plain`` (security: dataset files can't host
live HTML), so a direct dataset link shows source instead of
rendering. This route lives on the Space (which can legitimately
serve text/html) and re-streams the file's bytes with the right
content-type.
"""
content = _fetch_report_html(submission_id)
if content is None:
return HTMLResponse(
content="<h1>Report not found</h1>",
status_code=404,
)
return Response(content=content, media_type="text/html; charset=utf-8")
def serve_metrics_page() -> Response:
"""Serve the static metrics explainer at ``/metrics``.
Same-origin as the report proxy (``/reports/<id>.html``), so a
hosted report's headline pills can deep-link to ``/metrics#<anchor>``
and land on the matching section. The "Metrics" Gradio tab embeds
this same route in an iframe.
"""
return HTMLResponse(content=build_metrics_page())
# Illustration assets the metrics page embeds (e.g. the interface-match
# mating-group WebP). Vendored into the Space repo under `assets/metrics/`
# and served here so the page renders self-contained, with no dependency
# on the code repo's raw GitHub URLs staying reachable.
METRICS_ASSETS_DIR = Path(__file__).parent / "assets" / "metrics"
def serve_metrics_asset(name: str) -> Response:
"""Serve a bundled metrics illustration from ``assets/metrics/``.
Flat namespace (no nested paths), traversal-guarded. Cached hard:
these are static, versioned-with-the-repo assets.
"""
if "/" in name or ".." in name:
return Response(status_code=404)
path = METRICS_ASSETS_DIR / name
if not path.is_file():
return Response(status_code=404)
media_type = mimetypes.guess_type(name)[0] or "application/octet-stream"
return Response(
content=path.read_bytes(),
media_type=media_type,
headers={"Cache-Control": RENDER_CACHE_CONTROL},
)
def _fetch_gt_render(fixture: str) -> bytes | None:
"""Pull a fixture's ground-truth GIF from the private GT dataset.
Path inside the GT repo is ``<fixture>/renders/rotating.webp``. GT
renders are a property of the data revision, not of any submission,
so they're served straight from the GT repo rather than duplicated
per submission. Not memoized for the same reason as :func:`_fetch_render` (GT
renders can be added/updated on a data revision bump);
``hf_hub_download`` handles the per-revision disk cache. Needs the
Space ``HF_TOKEN``'s read scope on the private repo.
"""
try:
local_path = hf_hub_download(
repo_id=HF_DATA_GT_REPO,
filename=f"{fixture}/renders/rotating.webp",
repo_type="dataset",
)
return Path(local_path).read_bytes()
except Exception as e: # noqa: BLE001 - any Hub failure -> 404
logger.warning(
"Failed to fetch GT render for %s (%s: %s)",
fixture, type(e).__name__, e,
)
return None
# Long-lived immutable caching: a (submission, fixture) render never
# changes (fixed camera + lighting; re-renders would be a new artifact),
# so the browser/CDN can keep it forever. This is what makes fixture
# swaps and repeat visits free: only the ~33 on-screen turntables are
# fetched on first paint, and everything after that is a cache hit.
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"
def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
"""Resolver for a submission's plain turntable: a public render-bucket URL.
The eval job uploads ``renders/<id>/<fixture>/rotating.webp`` to the public
bucket, so the browser fetches it straight from object storage (anonymous,
no Space proxy hop). The gallery only calls this for ``valid`` fixtures; a
missing upload 404s and degrades to the dashed cell via ``<img onerror>``.
"""
return render_public_url(submission_id, fixture, "rotating.webp")
def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
"""Resolver for an editing fixture's edit-diff turntable (public bucket URL).
Used by the gallery grid for editing fixtures (see
``gallery.build_gallery_payload``). A miss (non-editing fixture, or an edit
that never rendered a diff) 404s and degrades to the dashed cell, no
fallback to the plain turntable.
"""
return render_public_url(submission_id, fixture, "edit_diff.webp")
def _gt_proxy_url(fixture: str) -> str | None:
"""Resolver returning the cached proxy URL for a fixture's GT WebP.
GT renders stay in the **private** GT dataset, so they cannot be public
bucket URLs; they are still re-streamed through the Space proxy (which
holds the read token).
"""
return f"/gt-render/{fixture}.webp"
def _gt_diff_proxy_url(fixture: str) -> str | None:
"""Resolver for an editing fixture's GT "answer key" edit-diff WebP.
The one-time GT generation (``tools/generate_gt_edit_diff.py``) writes
``<fixture>/renders/edit_diff_gt.webp`` into the private GT dataset, so it
rides the existing generic GT proxy (``serve_gt_file``) rather than needing
a route of its own. The gallery uses this for the ground-truth row on
editing fixtures; a missing file 404s and degrades to the dashed cell.
"""
return f"/gt/{fixture}/renders/edit_diff_gt.webp"
def serve_gt_render(fixture: str) -> Response:
"""Stream a fixture's ground-truth render WebP with long-lived caching."""
webp = _fetch_gt_render(fixture)
if webp is None:
return Response(status_code=404)
return Response(
content=webp,
media_type="image/webp",
headers={"Cache-Control": RENDER_CACHE_CONTROL},
)
def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
"""Pull an arbitrary GT asset (``<fixture>/<relpath>``) from the GT dataset.
Serves the hosted report's ground-truth column: the per-view PNGs
(``renders/<view>.png``) and the ``ground_truth.pdf``. The GT dataset is
**private**, so these are proxied through the Space (which holds the read
token) rather than linked directly. ``hf_hub_download`` does the
per-revision disk cache. Returns ``None`` on any failure (the report hides
the broken tile via the browser's normal missing-image handling).
"""
try:
local_path = hf_hub_download(
repo_id=HF_DATA_GT_REPO,
filename=f"{fixture}/{relpath}",
repo_type="dataset",
)
return Path(local_path).read_bytes()
except Exception as e: # noqa: BLE001 - any Hub failure -> 404
logger.warning(
"Failed to fetch GT file %s/%s (%s: %s)",
fixture, relpath, type(e).__name__, e,
)
return None
def serve_gt_file(fixture: str, relpath: str) -> Response:
"""Stream a GT asset (view PNG / PDF) with long-lived immutable caching.
Path-traversal-guarded (``..`` rejected). The hosted report references
``/gt/<fixture>/<relpath>`` and the browser fetches it lazily; the bytes
are a property of the data revision (not any submission), so the same
immutable ``Cache-Control`` as the render/input proxies applies.
"""
if ".." in fixture or ".." in relpath:
return Response(status_code=404)
data = _fetch_gt_file(fixture, relpath)
if data is None:
return Response(status_code=404)
media_type = mimetypes.guess_type(relpath)[0] or "application/octet-stream"
return Response(
content=data,
media_type=media_type,
headers={"Cache-Control": RENDER_CACHE_CONTROL},
)
def _gallery_iframe_html() -> str:
"""Build the gallery as a self-contained ``srcdoc`` iframe.
Reads the live rows and renders the page (turntables referenced as
cached ``/render`` / ``/gt-render`` proxy URLs, lazy-loaded by the
browser), then inlines the whole document into an iframe ``srcdoc``
so it gets its own style context (no Gradio CSS collision). A Hub
read failure degrades to an empty gallery rather than crashing the
tab.
"""
try:
rows = _load_rows_from_hub()
except LeaderboardDataError:
logger.exception("Gallery row load failed; rendering empty gallery")
rows = []
doc = render_gallery_page(
rows, _render_proxy_url, _gt_proxy_url, _render_diff_proxy_url,
_gt_diff_proxy_url,
)
escaped = html.escape(doc, quote=True)
# The gallery JS (`fitIframe`) sizes this iframe to be the single scroller:
# it shrinks to the content for few rows, otherwise fills down to the bottom
# of the viewport so only the iframe's own body scrolls (keeping the sticky
# header + ground-truth row locked) and the outer Gradio page does not also
# scroll. The inline `height` is just the pre-script fallback; JS overrides
# it, so no `max-height` here (it would clamp the measured fill height).
return (
f'<iframe srcdoc="{escaped}" '
'style="width:100%; height:80vh; border:0; display:block;" '
'title="CADGenBench gallery"></iframe>'
)
def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
"""Pull a fixture input asset (``<fixture>/<relpath>``) from the inputs repo.
Serves the Task-browser tab's drawings / starting-shape renders.
The inputs dataset is private, so these are proxied through the
Space (which holds the read token) rather than linked directly —
mirroring :func:`_fetch_render`. Not memoized for the same reason:
inputs can be added/updated on a data revision bump, and
``hf_hub_download`` already does per-revision disk caching. Returns
``None`` on any failure (the page hides the broken tile).
"""
try:
local_path = hf_hub_download(
repo_id=HF_DATA_REPO,
filename=f"{fixture}/{relpath}",
repo_type="dataset",
)
return Path(local_path).read_bytes()
except Exception as e: # noqa: BLE001 - any Hub failure -> 404
logger.warning(
"Failed to fetch task input %s/%s (%s: %s)",
fixture, relpath, type(e).__name__, e,
)
return None
def _task_input_url(fixture: str, relpath: str) -> str:
"""Resolver returning the Space proxy URL for a task input asset.
Returns the route string without fetching bytes (the browser
lazy-fetches only the on-screen task's images). An absolute path
resolves against the Space origin even inside the iframe ``srcdoc``.
"""
return f"/task-input/{fixture}/{relpath}"
def serve_task_input(fixture: str, relpath: str) -> Response:
"""Stream a fixture input asset with long-lived immutable caching.
Path-traversal-guarded (``..`` rejected). The task browser
references ``/task-input/<fixture>/<relpath>`` and the browser
fetches it lazily; re-streams the dataset bytes (the Space holds the
read token) with the same immutable ``Cache-Control`` as the render
proxies so the CDN/browser cache them hard.
"""
if ".." in fixture or ".." in relpath:
return Response(status_code=404)
data = _fetch_task_input(fixture, relpath)
if data is None:
return Response(status_code=404)
media_type = mimetypes.guess_type(relpath)[0] or "application/octet-stream"
return Response(
content=data,
media_type=media_type,
headers={"Cache-Control": RENDER_CACHE_CONTROL},
)
def _tasks_iframe_html() -> str:
"""Build the Task browser as a self-contained ``srcdoc`` iframe.
Snapshots just the ``<fixture>/description.yaml`` files from the
inputs dataset (lightweight: the drawings/renders themselves load
lazily via the ``/task-input`` proxy), shapes them into task cards,
and inlines the page into an iframe so it keeps its own style
context (no Gradio CSS collision). A Hub read failure degrades to an
empty browser rather than crashing the tab.
"""
try:
local = snapshot_download(
repo_id=HF_DATA_REPO,
repo_type="dataset",
allow_patterns=["*/description.yaml"],
)
tasks = load_tasks_from_dir(Path(local))
except Exception: # noqa: BLE001 - degrade to empty browser, never crash
logger.exception("Task load failed; rendering empty task browser")
tasks = []
doc = render_tasks_page(tasks, _task_input_url)
escaped = html.escape(doc, quote=True)
return (
f'<iframe srcdoc="{escaped}" '
'style="width:100%; height:90vh; border:0; display:block;" '
'title="CADGenBench tasks"></iframe>'
)
@lru_cache(maxsize=1)
def _logo_data_uri() -> str:
"""Return the header logo as a base64 ``data:`` URI.
Inlined rather than served as a static file so the ``<img>`` renders
with no dependency on Gradio/FastAPI static-path allowlisting — it
works identically when the Space runs locally on a random port and
on huggingface.co. The PNG itself lives in the repo at
``assets/logo.png`` (reviewable as a real binary) and is read
relative to this module so the Docker image's working dir doesn't
matter. Cached because the bytes never change within a process.
"""
logo_path = Path(__file__).parent / "assets" / "logo.png"
data = base64.b64encode(logo_path.read_bytes()).decode("ascii")
return f"data:image/png;base64,{data}"
# Reclaim vertical space so the gallery can show more rows in one viewport:
# hide the Gradio footer ("Built with Gradio - Settings") and tighten the
# page's outer padding / inter-block gap. Scoped to cosmetics only. The
# logo is height-constrained (width auto-scales) so it sits in a compact
# band near the old `### ` title's footprint. The wordmark PNG has a
# transparent background and black ink, so on a dark theme it would
# vanish: the `.dark` rule inverts it to white ink (Gradio toggles the
# `.dark` class on the container; the prefers-color-scheme query covers
# system-driven dark mode too).
_APP_CSS = (
"footer{display:none !important;}"
".gradio-container{padding-top:4px !important; padding-bottom:0 !important;}"
# Collapse the title block's own box and the flex gap Gradio puts
# between it and the tab bar so the wordmark sits right above the
# leaderboard instead of floating with a gap. The negative bottom
# margin pulls the tab nav up snug against the logo.
"#cgb-title{margin:0 !important;padding:0 !important;min-width:0 !important;}"
"#cgb-title .cgb-logo{height:46px;width:auto;display:block;margin:0;}"
".gradio-container .tabs{margin-top:-6px !important;}"
".dark #cgb-title .cgb-logo{filter:invert(1);}"
"@media (prefers-color-scheme: dark){"
"#cgb-title .cgb-logo{filter:invert(1);}}"
)
with gr.Blocks(
title="CADGenBench Leaderboard", theme=gr.themes.Soft(), css=_APP_CSS,
) as blocks:
# Single compact title line (keeps vertical space for the gallery rows).
# The wordmark logo replaces the old `### CADGenBench Leaderboard`
# markdown; alt text preserves the name for screen readers / when
# images are blocked.
gr.HTML(
f'<img class="cgb-logo" src="{_logo_data_uri()}" '
'alt="CADGenBench Leaderboard">',
elem_id="cgb-title",
)
with gr.Tab("Leaderboard"):
# Visual-first leaderboard. The bespoke surface (sticky GT row,
# fixture picker, turntable grid, compare modal) is a
# self-contained HTML doc inlined into an iframe `srcdoc` so it
# keeps its own style context. Thumbnails are lazy-loaded from
# the cached `/render` / `/gt-render` proxy routes (requires the
# Space to be public). Built at boot, rebuilt on page load, and
# refreshed after admin actions.
gallery_html = gr.HTML(value=_gallery_iframe_html())
gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
gallery_refresh_btn.click(
fn=_gallery_iframe_html, outputs=gallery_html,
)
with gr.Tab("Detailed View"):
# Load both tiers once at boot. `_safe_load_split` keeps a Hub
# read failure from crashing the Space: on failure the frames
# come up empty and `initial_error` carries the message the
# banner renders.
initial_validated, initial_unvalidated, initial_error = _safe_load_split()
# Loud, persistent banner shown only when the live results
# can't be read from the Hub (e.g. an under-scoped Space
# HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
# leaderboard never falls back to stale/bundled data, so this
# banner is the signal that empty tables are a read failure,
# not a genuinely empty leaderboard.
data_error_banner = gr.Markdown(
value=_data_error_banner_md(initial_error),
visible=initial_error is not None,
)
# Collapsed accordions above the tables. Validation guidelines
# gives the short two-tier story + link to the full policy
# doc; Citation carries the verbatim BibTeX entry. Both start
# closed so the leaderboard itself stays above the fold.
with gr.Accordion("Validation guidelines", open=False):
gr.Markdown(VALIDATION_GUIDELINES_MD)
with gr.Accordion("Citation", open=False):
# language=None -> plain monospaced render (gr.Code doesn't
# ship a BibTeX highlighter); show_line_numbers off because
# the entry is meant to be copy-pasted, not annotated.
gr.Code(
value=CITATION_BIBTEX,
language=None,
show_line_numbers=False,
)
# Two stacked tables, split by `validation_status`. Validated
# on top so the curated results are above the fold; unvalidated
# below carries every other row (auto-published, awaiting
# methodology review). See decisions/validation-policy.md.
# Initial values come from the boot-time `_safe_load_split`
# above (empty + banner on a Hub read failure).
validated_view = Leaderboard(
value=initial_validated,
datatype=VALIDATED_LEADERBOARD_DATATYPES,
search_columns=["submission_name", "submitter_name"],
hide_columns=LEADERBOARD_HIDE_COLUMNS,
label="Validated Leaderboard",
interactive=False,
)
unvalidated_view = Leaderboard(
value=initial_unvalidated,
datatype=LEADERBOARD_DATATYPES,
search_columns=["submission_name", "submitter_name"],
hide_columns=LEADERBOARD_HIDE_COLUMNS,
label="Unvalidated Leaderboard",
interactive=False,
)
with gr.Row():
refresh_btn = gr.Button("Refresh", size="sm")
# One file, both tables, `validation_status` discriminator
# column. Fresh CSV is generated on every click so the
# download reflects the latest data, not a stale snapshot
# captured at boot.
download_btn = gr.DownloadButton(
label="Download CSV", size="sm",
)
refresh_btn.click(
fn=_refresh_leaderboard_with_toast,
outputs=[validated_view, unvalidated_view, data_error_banner],
)
download_btn.click(fn=build_combined_csv, outputs=download_btn)
# No inline row-click detail panel: the submission_name cell is a
# deep-link that opens the self-contained per-submission report in
# a new tab (see `_submission_name_md` in leaderboard.py). Now that
# the Space is public, HF's edge serves `/reports/<id>.html` to
# browser users, so we link to it directly instead of inlining the
# (tens-to-hundreds-of-MB) report through the Gradio event payload.
with gr.Tab("Tasks"):
# Read-only task browser: mirrors the per-submission report's
# summary-table -> detail-card navigation (j/k, Esc) but shows
# only the prompt + input (drawing / starting shape), no scores
# or ground truth. Self-contained HTML inlined into an iframe
# `srcdoc` like the gallery; input images lazy-load from the
# `/task-input` proxy. Built at boot, rebuilt on page load.
tasks_html = gr.HTML(value=_tasks_iframe_html())
tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)
with gr.Tab("Metrics"):
# Static explainer for the (new) scoring metrics. Served as a
# standalone `/metrics` route too, so the per-submission report's
# headline pills can deep-link to `/metrics#<anchor>`; the tab just
# embeds that same page in an iframe (single source of truth).
gr.HTML(
'<iframe src="/metrics" '
'style="width:100%; height:85vh; border:0; display:block;" '
'title="CADGenBench metrics"></iframe>'
)
with gr.Tab("Submit"):
gr.Markdown(
f"""
**Submission format.** A single zip with:
- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
samples where your system produced a candidate. Missing `output.step`
scores zero for that sample;
- a top-level `meta.json`:
```json
{{
"submitter_name": "your name or team",
"submission_name": "MyAgent v2.3 (or whatever describes your system)",
"agent_url": "https://github.com/... (optional)",
"notes": "free text, optional, max 500 chars, single line, plain text",
"agree_to_publish": true
}}
```
**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.
**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.
**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.
For the full submission contract (output format, validity gate, canonical
pose, and a local self-check), see
[`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
"""
)
# OAuth gate. The user must log in via the HF button before
# the Submit button becomes interactive; the row gets the
# canonical `hf_username` from `gr.OAuthProfile.username`
# (not a free-text claim in meta.json). README front-matter
# already carries `hf_oauth: true` so HF's OAuth integration
# is wired up at the Space level.
login_btn = gr.LoginButton()
zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
# Starts disabled; the `blocks.load` handler below flips it
# to interactive when an OAuthProfile is present.
submit_btn = gr.Button("Submit", variant="primary", interactive=False)
# Persistent status panel. handle_submit is a generator that
# streams stage updates (validating -> uploading/queuing ->
# queued) and any rejection reason here, so the outcome
# survives instead of vanishing with a transient toast. The
# handler also reads `gr.OAuthProfile` implicitly via its
# parameter type annotation (Gradio's dependency-injection
# convention).
submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
submit_btn.click(
fn=handle_submit,
inputs=[zip_in],
outputs=[submit_status],
)
with gr.Tab("About"):
gr.Markdown(ABOUT_MD)
with gr.Tab("Admin"):
# Maintainer-only controls. The Admin *tab* is visible to everyone
# (a hint the path exists), but ALL admin UI -- the table, the
# actions, the danger zones -- lives in `admin_panel`, a column that
# stays hidden unless the logged-in user is in CADGENBENCH_ADMINS.
# The `blocks.load` handler below flips that column's visibility and
# only loads table data for admins; a server-side `is_admin` re-check
# still guards every handler. Non-admins (and logged-out visitors)
# see only the login/logout button + a status line, nothing else.
# See decisions/validation-policy.md.
admin_login_btn = gr.LoginButton()
admin_status = gr.Markdown(
"Log in with an admin account to access the controls."
)
# Everything below is admin-only: hidden by default, revealed by
# `_gate_admin_controls` only for a logged-in user in the admin set.
with gr.Column(visible=False) as admin_panel:
gr.Markdown(
"## Admin\n"
"Tick rows in the **select** column, then promote them into "
"the **Validated** tier (recording an evidence type), demote "
"them back to **Unvalidated**, delete them, or rescore them "
"against the current ground truth. Actions apply to every "
"ticked row at once."
)
# Only the leading `select` column is editable; the rest is
# read-only context. Click-to-tick drives every action below.
# Starts empty; `_gate_admin_controls` loads rows on page load
# for admins only, so non-admins never receive the data.
admin_table = gr.Dataframe(
value=_empty_admin_table(),
datatype=[
"bool", "str", "str", "str", "str", "str", "str",
"number", "str",
],
static_columns=list(range(1, len(ADMIN_COLUMNS))),
interactive=False,
label="Submissions (tick select to choose rows)",
wrap=True,
)
admin_selection_md = gr.Markdown("_No rows selected._")
admin_method_radio = gr.Radio(
choices=list(VALID_METHODS),
value="manual",
label="validation_method (applied to all rows on promote)",
interactive=False,
)
with gr.Row():
promote_btn = gr.Button(
"Mark validated", variant="primary", interactive=False,
)
demote_btn = gr.Button("Mark unvalidated", interactive=False)
with gr.Accordion("Danger zone: delete", open=False):
gr.Markdown(
"Permanently deletes the ticked rows **and** their "
"uploaded zip + report files from the submissions "
"dataset. This cannot be undone (only a manual revert of "
"the dataset commit).\n\n"
"**Stop & delete** additionally cancels any still-running "
"evaluation job(s) for the ticked rows before deleting — "
"use it for pending submissions whose GPU eval is in "
"flight."
)
delete_confirm = gr.Checkbox(
label=(
"I understand this permanently deletes the selected "
"submissions and their files."
),
value=False,
interactive=False,
)
with gr.Row():
delete_btn = gr.Button(
"Delete selected", variant="stop", interactive=False,
)
stop_delete_btn = gr.Button(
"Stop & delete selected", variant="stop",
interactive=False,
)
with gr.Accordion("Danger zone: rescore", open=False):
gr.Markdown(
"Re-evaluates submissions against the **current** "
"ground truth + data: each row flips back to pending, the "
"gallery renders and the per-submission report HTML are "
"regenerated, and the score is recomputed. Use after a "
"ground-truth swap or a metric change that invalidates "
"the existing scores.\n\n"
"Rescoring is **re-runnable**: if a row's eval fails, "
"mark it and rescore again (or rescore all) — each run is "
"independent and converges.\n\n"
"- **Rescore selected** re-evaluates the ticked rows.\n"
f"- **Rescore all** re-evaluates every submission that "
f"has a stored zip and isn't already pending — type "
f"`{RESCORE_ALL_PHRASE}` to arm it."
)
rescore_confirm = gr.Checkbox(
label=(
"I understand this flips the selected rows to pending "
"and recomputes their scores."
),
value=False,
interactive=False,
)
rescore_selected_btn = gr.Button(
"Rescore selected", variant="stop", interactive=False,
)
rescore_all_phrase = gr.Textbox(
label=(
f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
f"rescore"
),
placeholder=RESCORE_ALL_PHRASE,
interactive=False,
)
rescore_all_btn = gr.Button(
"Rescore ALL submissions", variant="stop",
interactive=False,
)
admin_refresh_btn = gr.Button("Refresh", size="sm")
admin_table.change(
fn=_admin_selection_status,
inputs=admin_table,
outputs=admin_selection_md,
)
promote_btn.click(
fn=_admin_promote,
inputs=[admin_table, admin_method_radio],
outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
)
demote_btn.click(
fn=_admin_demote,
inputs=[admin_table],
outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
)
delete_confirm.change(
fn=_arm_delete,
inputs=[delete_confirm],
outputs=[delete_btn, stop_delete_btn],
)
delete_btn.click(
fn=_admin_delete,
inputs=[admin_table, delete_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
delete_confirm, delete_btn, stop_delete_btn,
],
)
stop_delete_btn.click(
fn=_admin_stop_delete,
inputs=[admin_table, delete_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
delete_confirm, delete_btn, stop_delete_btn,
],
)
rescore_confirm.change(
fn=_arm_rescore_selected,
inputs=[rescore_confirm],
outputs=[rescore_selected_btn],
)
rescore_selected_btn.click(
fn=_admin_rescore_selected,
inputs=[admin_table, rescore_confirm],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
rescore_confirm, rescore_selected_btn,
],
)
rescore_all_phrase.change(
fn=_arm_rescore_all,
inputs=[rescore_all_phrase],
outputs=[rescore_all_btn],
)
rescore_all_btn.click(
fn=_admin_rescore_all,
inputs=[rescore_all_phrase],
outputs=[
admin_table, validated_view, unvalidated_view, gallery_html,
rescore_all_phrase, rescore_all_btn,
],
)
admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
# Keep the admin table on the same 10s cadence as the leaderboard
# so a row that lands (or a pending row that completes) after the
# tab loaded shows up without a manual Refresh. Selection is
# preserved across ticks so an in-progress set of checkboxes
# survives the reload.
admin_auto_refresh_timer = gr.Timer(10)
admin_auto_refresh_timer.tick(
fn=_auto_refresh_admin_table,
inputs=admin_table,
outputs=admin_table,
)
# gradio_leaderboard.Leaderboard handles its own update path
# cleanly; bind a Timer to push fresh dataframes every 10 seconds.
# Single tick runs `_auto_refresh_leaderboard` once and pushes the
# two halves into the validated / unvalidated widgets plus the
# data-unavailable banner. The handler swallows a Hub read failure
# into empty frames + a loud warning toast so a degraded read never
# crashes the tick loop or silently blanks the tables.
auto_refresh_timer = gr.Timer(10)
auto_refresh_timer.tick(
fn=_auto_refresh_leaderboard,
outputs=[validated_view, unvalidated_view, data_error_banner],
)
# On page load, read the visitor's OAuth profile (None if not
# logged in) and flip the Submit button's interactivity. Runs once
# per page load; LoginButton clicks also re-trigger this through
# Gradio's auth-event plumbing.
blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)
# Same per-load OAuth read, gating the Admin tab on membership in the
# CADGENBENCH_ADMINS set. Logged-out / non-admin visitors get the
# admin_panel hidden entirely (no table, no controls) -- just the
# login/logout button and a status line.
blocks.load(
fn=_gate_admin_controls,
outputs=[
admin_panel,
admin_table,
admin_method_radio,
promote_btn,
demote_btn,
delete_confirm,
delete_btn,
stop_delete_btn,
rescore_confirm,
rescore_selected_btn,
rescore_all_phrase,
rescore_all_btn,
admin_status,
],
)
# Mount Gradio under a FastAPI parent so the custom proxy route
# above lives at the same origin as the UI. Direct routes on `app`
# get checked before the Gradio sub-app, so `/reports/<sid>.html`
# never gets shadowed.
app = FastAPI()
app.add_api_route(
"/reports/{submission_id}.html",
serve_report,
methods=["GET"],
)
# Static metrics explainer. Same origin as the report proxy so report
# pills can deep-link to `/metrics#<anchor>`; also embedded in the
# Metrics tab. Registered before the Gradio mount so it isn't shadowed.
app.add_api_route(
"/metrics",
serve_metrics_page,
methods=["GET"],
)
# Illustration assets the metrics page embeds (vendored under assets/metrics/).
app.add_api_route(
"/metrics-assets/{name}",
serve_metrics_asset,
methods=["GET"],
)
# Cached render proxies the gallery's lazy-loaded turntables point at.
# Registered before the Gradio mount so they're not shadowed by the
# catch-all sub-app.
# Candidate renders are served directly from the public render bucket (URLs
# come from the gallery resolvers), so only the private GT render still needs a
# token-holding Space proxy route.
app.add_api_route(
"/gt-render/{fixture}.webp",
serve_gt_render,
methods=["GET"],
)
# Ground-truth assets the hosted report links lazily (per-view PNGs + PDF).
# GT is private, so this token-holding proxy streams them; the `:path`
# converter lets `relpath` carry a slash (e.g. renders/iso.png). Registered
# before the Gradio mount so it isn't shadowed by the catch-all sub-app.
app.add_api_route(
"/gt/{fixture}/{relpath:path}",
serve_gt_file,
methods=["GET"],
)
# Task-browser input assets (drawings + starting-shape renders). The
# `:path` converter lets `relpath` carry a slash (e.g. renders/iso.png).
# Registered before the Gradio mount so it's not shadowed.
app.add_api_route(
"/task-input/{fixture}/{relpath:path}",
serve_task_input,
methods=["GET"],
)
# Gradio picks REAL Hugging Face OAuth vs. a local "mock" login via
# ``gradio.utils.get_space()``, which is only truthy when ``SYSTEM ==
# "spaces"``. HF sets that on Gradio-SDK Spaces but NOT on ``sdk: docker``
# Spaces like this one. Without it, ``mount_gradio_app`` wires up the MOCK
# OAuth routes, which never contact hf.co and instead log every visitor in
# as the container token's owner (our ``HF_TOKEN`` account) -- leaking that
# identity into the LoginButton and, since that account is in
# ``CADGENBENCH_ADMINS``, handing every visitor admin. Force it on only when
# we're actually running on a Space (``SPACE_ID`` is HF-injected on all
# Spaces, Docker included) so the real ``hf_oauth: true`` flow runs; locally
# (no ``SPACE_ID``) it stays unset so Gradio's local mock login still works
# for dev. Must precede the mount, which is what triggers ``attach_oauth``.
if os.environ.get("SPACE_ID") and os.environ.get("SYSTEM") != "spaces":
os.environ["SYSTEM"] = "spaces"
app = gr.mount_gradio_app(app, blocks, path="/")
if __name__ == "__main__":
host = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
uvicorn.run(app, host=host, port=port)