Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

Michael Rabinovich

Gallery: show GT "answer-key" edit-diff for editing fixtures

49e27be 2 days ago

62.4 kB

	# Copyright 2026 Hugging Face
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.

	Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
	:mod:`submit`. Both are wired into the Gradio Blocks below. The
	Gradio app is mounted under a FastAPI parent so the custom
	``/reports/{submission_id}.html`` route can re-serve dataset HTML
	with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
	as ``text/plain`` by policy, which makes the browser show source
	rather than render).
	"""
	from __future__ import annotations

	import base64
	import html
	import logging
	import mimetypes
	import os
	from functools import lru_cache
	from pathlib import Path

	import gradio as gr
	import pandas as pd
	import uvicorn
	from fastapi import FastAPI
	from fastapi.responses import HTMLResponse, Response
	from gradio_leaderboard import Leaderboard
	from huggingface_hub import hf_hub_download, snapshot_download

	from leaderboard import (
	ADMIN_COLUMNS,
	ADMIN_SELECT_COL,
	HF_DATA_GT_REPO,
	HF_DATA_REPO,
	HF_SUBMISSIONS_REPO,
	LEADERBOARD_COLS,
	LEADERBOARD_DATATYPES,
	LEADERBOARD_HIDE_COLUMNS,
	VALIDATED_LEADERBOARD_COLS,
	VALIDATED_LEADERBOARD_DATATYPES,
	LeaderboardDataError,
	_fmt_timestamp,
	_load_rows_from_hub,
	build_combined_csv,
	load_admin_table,
	load_leaderboard_split,
	render_public_url,
	)
	from gallery import render_gallery_page
	from metrics_page import build_metrics_page
	from tasks import load_tasks_from_dir, render_tasks_page
	from admin import (
	VALID_METHODS,
	delete_rows,
	demote_rows,
	is_admin,
	promote_rows,
	rescore_all,
	rescore_rows,
	stop_and_delete_rows,
	)
	from submit import handle_submit

	# Module-level logger for this file; its records go through the root
	# handler configured by ``logging.basicConfig`` just below.
	logger = logging.getLogger(__name__)

	# Surface module-level logger.info / logger.warning / logger.exception
	# calls from leaderboard.py + submit.py in the Space's runtime logs.
	# Otherwise they go nowhere and any refresh / worker pathology is
	# silent. Format keeps timestamps + module + level + message.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
	)


	# Canonical policy doc lives in the code repo so contributors reading
	# the GitHub repo see it without needing to visit the Space. Linked
	# from both the Detailed View tab's Validation Guidelines accordion and
	# the About tab (interpolated into VALIDATION_GUIDELINES_MD and ABOUT_MD).
	VALIDATION_DOC_URL = (
	    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
	)
	# Canonical submission contract (output layout, validity gate, canonical
	# pose, local self-check). Linked from the Submit tab so the tab itself
	# stays a short "how to package + upload" note rather than re-documenting
	# the full contract.
	SUBMISSION_DOC_URL = (
	    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
	)

	# Markdown body headed "## About". It is an f-string, so the dataset repo
	# ids (HF_DATA_REPO, HF_SUBMISSIONS_REPO) and VALIDATION_DOC_URL are baked
	# in once at import time. Presumably rendered in the About tab -- the
	# consuming component is not visible in this part of the file.
	ABOUT_MD = f"""## About

	CADGenBench evaluates AI-driven CAD generation: how well a model can
	turn a description of a mechanical part into a valid, geometrically
	correct 3D model.

	- Reference baseline: an iterative AI agent that writes build123d Python.
	- Submission flow: upload a zip of per-fixture STEP files; the Space
	runs the eval and appends a row to the submissions dataset.
	- Datasets: fixture inputs in
	[`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
	submissions and computed results in
	[`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
	- Code: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
	- Validation policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
	- Data: CAD geometry from [Mecado](https://www.mecado.com).
	"""

	# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
	# (Locked decisions section). Shown in the Citation accordion as a
	# copy-paste handle for anyone citing this benchmark; the About tab
	# already links the source code via huggingface/cadgenbench so the
	# Space URL is the right deep-link target for the citation.
	# Raw string: the backslash in ``\url`` must reach the reader untouched.
	CITATION_BIBTEX = r"""@misc{cadgenbench2026,
	author = {Rabinovich, Michael and {Hugging Face}},
	title = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
	year = {2026},
	publisher = {Hugging Face},
	howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/CADGenBench}},
	}"""

	# Short policy summary that links out to the full validation doc
	# (VALIDATION_DOC_URL is interpolated at import time). Presumably the body
	# of the Validation Guidelines accordion -- confirm against the UI code.
	VALIDATION_GUIDELINES_MD = f"""Submissions appear on the Unvalidated table the moment evaluation completes. Maintainers promote rows to Validated after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).

	Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""

	# Italic Markdown placeholder describing the submit flow; presumably the
	# initial content of the Submit tab's status area (consumer not visible
	# in this part of the file).
	SUBMIT_STATUS_IDLE = (
	    "_Log in, attach a zip, and click Submit. Progress and any "
	    "errors appear here._"
	)


	def _data_error_banner_md(message: str | None) -> str:
	    """Build the Markdown for the top-of-tab data-unavailable banner.

	    The banner is the loud, persistent signal that the tables below are
	    empty because the live results could not be read (no stale or bundled
	    data is shown instead), not because the leaderboard is genuinely
	    empty.

	    Args:
	        message: Error text to display, or ``None`` / empty when there is
	            no error.

	    Returns:
	        ``""`` when there is nothing to report, otherwise a Markdown
	        blockquote whose last line carries the error in an inline code
	        span.
	    """
	    if not message:
	        return ""
	    # Error text can span several lines and may contain backticks. Either
	    # would break out of the single-line blockquote / inline code span
	    # below, so collapse all whitespace runs and neutralise backticks.
	    detail = " ".join(str(message).split()).replace("`", "'")
	    if not detail:
	        return ""
	    return (
	        "> ⚠️ Leaderboard data unavailable. The live results could not "
	        "be read from the Hub, so the tables below are empty. No stale or "
	        "cached data is ever shown in its place.\n>\n"
	        f"> Details: `{detail}`"
	    )


	def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
	    """Load the validated and unvalidated tiers without ever raising a data error.

	    :func:`load_leaderboard_split` raises :class:`LeaderboardDataError`
	    when the read fails. The Space has to stay up and show that failure,
	    so this wrapper logs it and hands back empty frames that still carry
	    the expected column headers, together with the error text for the
	    caller to render in a banner or toast.

	    Returns:
	        ``(validated, unvalidated, error)``; ``error`` is ``None`` when the
	        load succeeded.
	    """
	    try:
	        validated, unvalidated = load_leaderboard_split()
	    except LeaderboardDataError as exc:
	        logger.exception("Leaderboard data load failed")
	        empty_validated = pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS)
	        empty_unvalidated = pd.DataFrame(columns=LEADERBOARD_COLS)
	        return empty_validated, empty_unvalidated, str(exc)
	    return validated, unvalidated, None


	def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
	    """Load the admin table, converting a data error into an empty frame.

	    Mirrors :func:`_safe_load_split`: a :class:`LeaderboardDataError` is
	    logged and turned into an empty frame with the admin column headers
	    plus the error text, rather than propagating.

	    Returns:
	        ``(admin_df, error)``; ``error`` is ``None`` on success.
	    """
	    try:
	        table = load_admin_table()
	    except LeaderboardDataError as exc:
	        logger.exception("Admin table load failed")
	        return pd.DataFrame(columns=ADMIN_COLUMNS), str(exc)
	    return table, None


	def _refresh_leaderboard_with_toast():
	    """Handle the manual Refresh button: reload both tables and toast the outcome.

	    Emits ``gr.Info`` when the reload worked and ``gr.Warning`` when it
	    did not, and returns a banner update so the data-unavailable banner
	    follows the same state.

	    Returns:
	        ``(validated, unvalidated, banner)`` where ``banner`` is a
	        ``gr.Markdown`` carrying the banner text and its visibility.
	    """
	    validated, unvalidated, error = _safe_load_split()
	    if not error:
	        gr.Info("Leaderboard refreshed.")
	    else:
	        gr.Warning(f"Leaderboard data unavailable: {error}")
	    banner = gr.Markdown(
	        value=_data_error_banner_md(error),
	        visible=error is not None,
	    )
	    return validated, unvalidated, banner


	def _auto_refresh_leaderboard():
	    """Handle a timer tick: reload both tables, toasting only on failure.

	    Same outputs as :func:`_refresh_leaderboard_with_toast`, but a
	    successful reload is silent so periodic ticks do not spam toasts. A
	    failed read still raises a ``gr.Warning`` and shows the banner.

	    Returns:
	        ``(validated, unvalidated, banner)``.
	    """
	    validated, unvalidated, error = _safe_load_split()
	    if error:
	        gr.Warning(f"Leaderboard data unavailable: {error}")
	    banner = gr.Markdown(
	        value=_data_error_banner_md(error),
	        visible=error is not None,
	    )
	    return validated, unvalidated, banner


	def _enable_submit_when_logged_in(
	    profile: gr.OAuthProfile | None,
	) -> gr.Button:
	    """Return a Submit-button update that is interactive only when logged in.

	    ``profile`` is the OAuth profile Gradio injects (``None`` for a
	    visitor who is not logged in). This is only the visible half of the
	    gate; per the original notes, :func:`submit.handle_submit` enforces
	    the login requirement server-side as well.
	    """
	    logged_in = profile is not None
	    return gr.Button(interactive=logged_in)


	def _is_missing(value: object) -> bool:
	    """True for scalar missing markers (``None``, NaN, ``pd.NA``, ``NaT``)."""
	    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


	def _is_ticked(value: object) -> bool:
	    """Interpret one ``select`` cell as a checkbox state.

	    Plain ``bool(value)`` is wrong here: ``bool(float("nan"))`` and
	    ``bool("false")`` are both ``True``, which would silently select rows
	    in front of destructive actions.
	    """
	    if isinstance(value, str):
	        return value.strip().lower() not in {"", "false", "0", "no", "none", "nan"}
	    if _is_missing(value):
	        return False
	    return bool(value)


	def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
	    """Submission ids of the rows whose ``select`` checkbox is ticked.

	    Args:
	        table_df: The admin table as received from the UI; may be ``None``
	            or empty, or lack the select / ``submission_id`` columns.

	    Returns:
	        The ticked rows' ids as strings, in table order. Rows whose select
	        cell is missing or a false-like string, and rows whose id is
	        missing or falsy, are skipped. Empty list when nothing qualifies.
	    """
	    if (
	        table_df is None
	        or len(table_df) == 0
	        or ADMIN_SELECT_COL not in table_df.columns
	        or "submission_id" not in table_df.columns
	    ):
	        return []
	    mask = table_df[ADMIN_SELECT_COL].apply(_is_ticked)
	    return [
	        str(sid)
	        for sid in table_df.loc[mask, "submission_id"].tolist()
	        # Check for missing first: truth-testing pd.NA raises, and a NaN id
	        # would otherwise come out as the literal string "nan".
	        if not _is_missing(sid) and sid
	    ]


	def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
	    """Return the selection-count line shown under the admin table.

	    Counts the ticked rows via :func:`_selected_ids` and reports either
	    the count or an italic "nothing selected" note.
	    """
	    count = len(_selected_ids(table_df))
	    if not count:
	        return "_No rows selected._"
	    return f"{count} row(s) selected."


	def _gate_admin_controls(
	    profile: gr.OAuthProfile | None,
	) -> tuple[
	    gr.Column, gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox,
	    gr.Button, gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
	]:
	    """Show and enable the admin panel only for a logged-in admin.

	    For an admin the panel column is made visible, the table is loaded
	    from live data (a load failure raises a ``gr.Warning``), and the
	    plain controls become interactive. Everyone else gets a hidden
	    column, an empty table and non-interactive controls. The four
	    confirmation-armed buttons always come back disarmed, and both
	    confirm checkboxes and the phrase box are reset.

	    Returns:
	        Thirteen updates in output order: panel column, table, radio, two
	        buttons, confirm checkbox, two disarmed buttons, confirm
	        checkbox, disarmed button, phrase textbox, disarmed button, and
	        the status line.
	    """
	    admin = is_admin(profile)
	    if not admin:
	        admin_df = _empty_admin_table()
	    else:
	        admin_df, error = _safe_load_admin()
	        if error:
	            gr.Warning(f"Admin table unavailable: {error}")

	    # The logged-out message wins over the other two, whatever `admin` says.
	    if profile is None:
	        status = "Log in with an admin account to access the controls."
	    elif not admin:
	        status = (
	            f"Signed in as `{profile.username}`, which is not in the admin "
	            "set. You can log out with the button above."
	        )
	    else:
	        status = f"Signed in as `{profile.username}`. Admin controls enabled."

	    updates = (
	        gr.Column(visible=admin),
	        gr.Dataframe(value=admin_df, interactive=admin),
	        gr.Radio(interactive=admin),
	        gr.Button(interactive=admin),
	        gr.Button(interactive=admin),
	        gr.Checkbox(interactive=admin, value=False),
	        gr.Button(interactive=False),
	        gr.Button(interactive=False),
	        gr.Checkbox(interactive=admin, value=False),
	        gr.Button(interactive=False),
	        gr.Textbox(interactive=admin, value=""),
	        gr.Button(interactive=False),
	        status,
	    )
	    return updates


	def _arm_delete(
	    confirm: bool, profile: gr.OAuthProfile | None,
	) -> tuple[gr.Button, gr.Button]:
	    """Enable both destructive buttons when an admin has ticked the confirm box.

	    Delete and stop-and-delete share one confirmation checkbox, so both
	    buttons get the same interactivity. ``is_admin`` is consulted only
	    when the box is ticked.
	    """
	    armed = is_admin(profile) if confirm else False
	    delete_btn = gr.Button(interactive=armed)
	    stop_delete_btn = gr.Button(interactive=armed)
	    return delete_btn, stop_delete_btn


	def _empty_admin_table() -> pd.DataFrame:
	    """Return a row-less admin frame that still has the admin headers.

	    This is what the table handlers return to non-admins, so no
	    submission data reaches a client whose admin panel is merely hidden.
	    """
	    headers = list(ADMIN_COLUMNS)
	    return pd.DataFrame(columns=headers)


	def _refresh_admin_table(profile: gr.OAuthProfile | None) -> pd.DataFrame:
	    """Handle the admin Refresh button: reload the admin table.

	    Admins get the result of :func:`_safe_load_admin`; a load failure
	    raises a ``gr.Warning`` and the (empty) frame is still returned.
	    Non-admins always get an empty frame, so a tampered client cannot
	    pull data through this handler.
	    """
	    if is_admin(profile):
	        admin_df, error = _safe_load_admin()
	        if error:
	            gr.Warning(f"Admin table unavailable: {error}")
	        return admin_df
	    return _empty_admin_table()


	def _reapply_selection(
	    fresh: pd.DataFrame, selected: set[str],
	) -> pd.DataFrame:
	    """Carry previously ticked rows over onto a freshly loaded admin frame.

	    The select column of ``fresh`` is overwritten in place: a row is
	    ticked exactly when its ``submission_id`` (as a string) is in
	    ``selected``. Ids that no longer exist simply match nothing. When
	    ``selected`` is empty, or either column is absent, ``fresh`` is
	    returned untouched.

	    Returns:
	        The same ``fresh`` object.
	    """
	    if not selected:
	        return fresh
	    columns = fresh.columns
	    if ADMIN_SELECT_COL not in columns or "submission_id" not in columns:
	        return fresh
	    ids_as_text = fresh["submission_id"].astype(str)
	    fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
	    return fresh


	def _auto_refresh_admin_table(
	    current_df: pd.DataFrame | None,
	    profile: gr.OAuthProfile | None,
	) -> pd.DataFrame:
	    """Handle a timer tick: reload the admin table while keeping ticked rows.

	    Silent by design (no toast). On a successful load the ids ticked in
	    ``current_df`` are re-applied to the fresh frame. On a load failure
	    the current frame is returned unchanged so a transient error neither
	    blanks the table nor drops the selection; only when there is no
	    current frame does the empty fallback go out. Non-admins always
	    receive an empty frame.
	    """
	    if not is_admin(profile):
	        return _empty_admin_table()
	    fresh, error = _safe_load_admin()
	    if not error:
	        ticked = set(_selected_ids(current_df))
	        return _reapply_selection(fresh, ticked)
	    if current_df is None:
	        return fresh
	    return current_df


	def _admin_promote(
	    table_df: pd.DataFrame | None,
	    method: str | None,
	    profile: gr.OAuthProfile | None,
	) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
	    """Promote ticked rows, then refresh admin, leaderboard, and gallery.

	    Re-checks :func:`admin.is_admin` server-side so a tampered client
	    that re-enables the button still can't write.

	    Args:
	        table_df: Admin table as sent by the UI; ticked rows are promoted.
	        method: The chosen ``validation_method``.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html)``.

	    Raises:
	        gr.Error: Caller is not an admin, nothing is ticked, no method is
	            picked, or ``promote_rows`` raised ``LookupError`` /
	            ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    ids = _selected_ids(table_df)
	    if not ids:
	        raise gr.Error("Tick at least one row first.")
	    if not method:
	        raise gr.Error("Pick a validation_method first.")
	    try:
	        promote_rows(ids, method)
	    except (LookupError, ValueError) as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(f"Promoted {len(ids)} row(s) to validated ({method}).")
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The action went through but the follow-up read failed; without
	        # this the tables would come back empty with no explanation.
	        gr.Warning(
	            f"Promoted, but the refreshed tables could not be loaded: "
	            f"{refresh_error}"
	        )
	    return admin_df, validated, unvalidated, _gallery_iframe_html()


	def _admin_demote(
	    table_df: pd.DataFrame | None,
	    profile: gr.OAuthProfile | None,
	) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
	    """Demote ticked rows, then refresh admin, leaderboard, and gallery.

	    Args:
	        table_df: Admin table as sent by the UI; ticked rows are demoted.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html)``.

	    Raises:
	        gr.Error: Caller is not an admin, nothing is ticked, or
	            ``demote_rows`` raised ``LookupError`` / ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    ids = _selected_ids(table_df)
	    if not ids:
	        raise gr.Error("Tick at least one row first.")
	    try:
	        demote_rows(ids)
	    except (LookupError, ValueError) as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(f"Demoted {len(ids)} row(s) to unvalidated.")
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The action went through but the follow-up read failed; surface
	        # it rather than returning empty tables silently.
	        gr.Warning(
	            f"Demoted, but the refreshed tables could not be loaded: "
	            f"{refresh_error}"
	        )
	    return admin_df, validated, unvalidated, _gallery_iframe_html()


	def _admin_delete(
	    table_df: pd.DataFrame | None,
	    confirm: bool,
	    profile: gr.OAuthProfile | None,
	) -> tuple[
	    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
	    gr.Button,
	]:
	    """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.

	    Resets the confirm checkbox and re-disables both destructive buttons
	    on the way out so the next deletion needs a fresh, deliberate confirm.

	    Args:
	        table_df: Admin table as sent by the UI; ticked rows are deleted.
	        confirm: State of the confirmation checkbox.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
	        delete_button, stop_delete_button)``.

	    Raises:
	        gr.Error: Caller is not an admin, the box is not ticked, nothing
	            is selected, or ``delete_rows`` raised ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    if not confirm:
	        raise gr.Error("Tick the confirmation box to enable delete.")
	    ids = _selected_ids(table_df)
	    if not ids:
	        raise gr.Error("Tick at least one row first.")
	    try:
	        delete_rows(ids)
	    except ValueError as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(f"Deleted {len(ids)} submission(s).")
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The delete went through but the follow-up read failed; surface
	        # it rather than returning empty tables silently.
	        gr.Warning(
	            f"Deleted, but the refreshed tables could not be loaded: "
	            f"{refresh_error}"
	        )
	    return (
	        admin_df,
	        validated,
	        unvalidated,
	        _gallery_iframe_html(),
	        gr.Checkbox(value=False),
	        gr.Button(interactive=False),
	        gr.Button(interactive=False),
	    )


	def _admin_stop_delete(
	    table_df: pd.DataFrame | None,
	    confirm: bool,
	    profile: gr.OAuthProfile | None,
	) -> tuple[
	    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
	    gr.Button,
	]:
	    """Stop running eval job(s) for ticked rows, delete them, then disarm.

	    Same gating + disarm contract as :func:`_admin_delete`; the only
	    difference is it calls :func:`admin.stop_and_delete_rows`, which
	    best-effort cancels the submissions' in-flight HF Jobs before
	    deleting. Use this for pending rows whose GPU eval is still running.

	    Args:
	        table_df: Admin table as sent by the UI; ticked rows are affected.
	        confirm: State of the confirmation checkbox.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
	        delete_button, stop_delete_button)``.

	    Raises:
	        gr.Error: Caller is not an admin, the box is not ticked, nothing
	            is selected, or ``stop_and_delete_rows`` raised ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    if not confirm:
	        raise gr.Error("Tick the confirmation box to enable delete.")
	    ids = _selected_ids(table_df)
	    if not ids:
	        raise gr.Error("Tick at least one row first.")
	    try:
	        stop_and_delete_rows(ids)
	    except ValueError as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The action went through but the follow-up read failed; surface
	        # it rather than returning empty tables silently.
	        gr.Warning(
	            f"Stopped + deleted, but the refreshed tables could not be "
	            f"loaded: {refresh_error}"
	        )
	    return (
	        admin_df,
	        validated,
	        unvalidated,
	        _gallery_iframe_html(),
	        gr.Checkbox(value=False),
	        gr.Button(interactive=False),
	        gr.Button(interactive=False),
	    )


	# Exact phrase an admin must type to arm the board-wide rescore. A
	# free-text match (not a checkbox) is the deliberate "are you sure"
	# friction: it can't be tripped by a stray click and forces the admin
	# to consciously type the words before the heavy, score-invalidating
	# action arms. The comparison is case-sensitive and done after stripping
	# surrounding whitespace (see _arm_rescore_all / _admin_rescore_all).
	RESCORE_ALL_PHRASE = "RESCORE ALL"


	def _arm_rescore_selected(
	    confirm: bool, profile: gr.OAuthProfile | None,
	) -> gr.Button:
	    """Enable the rescore-selected button when an admin ticks its confirm box.

	    ``is_admin`` is consulted only when the box is ticked.
	    """
	    armed = is_admin(profile) if confirm else False
	    return gr.Button(interactive=armed)


	def _arm_rescore_all(
	    phrase: str | None, profile: gr.OAuthProfile | None,
	) -> gr.Button:
	    """Enable the rescore-all button only for an admin who typed the exact phrase.

	    The typed text is stripped of surrounding whitespace and compared
	    case-sensitively with ``RESCORE_ALL_PHRASE``; ``None`` counts as
	    empty. ``is_admin`` is consulted only on a match.
	    """
	    typed = "" if not phrase else phrase
	    if typed.strip() != RESCORE_ALL_PHRASE:
	        return gr.Button(interactive=False)
	    return gr.Button(interactive=is_admin(profile))


	def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
	    """Compose the toast text that summarises a rescore dispatch.

	    Args:
	        dispatched: Number of submissions sent for re-evaluation.
	        skipped: Rows that were skipped; only their count is reported.

	    Returns:
	        One sentence group about the dispatched rows, followed by a note
	        about skipped rows when there are any.
	    """
	    sentences = [
	        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
	        "re-evaluating in the background. The leaderboard repopulates as "
	        "each finishes."
	    ]
	    if skipped:
	        sentences.append(
	            f"Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
	            "rows can't be rescored)."
	        )
	    return " ".join(sentences)


	def _admin_rescore_selected(
	    table_df: pd.DataFrame | None,
	    confirm: bool,
	    profile: gr.OAuthProfile | None,
	) -> tuple[
	    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
	]:
	    """Re-evaluate the ticked rows, refresh the views, then disarm.

	    Same gating contract as the destructive handlers: server-side
	    ``is_admin`` re-check, an explicit confirm tick, and a non-empty
	    selection. Resets the confirm box + disarms the button on the way
	    out so the next rescore needs a fresh, deliberate confirm.

	    Args:
	        table_df: Admin table as sent by the UI; ticked rows are rescored.
	        confirm: State of the confirmation checkbox.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html, checkbox,
	        rescore_button)``.

	    Raises:
	        gr.Error: Caller is not an admin, the box is not ticked, nothing
	            is selected, or ``rescore_rows`` raised ``LookupError`` /
	            ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    if not confirm:
	        raise gr.Error("Tick the confirmation box to enable rescore.")
	    ids = _selected_ids(table_df)
	    if not ids:
	        raise gr.Error("Tick at least one row first.")
	    try:
	        dispatched, skipped = rescore_rows(ids)
	    except (LookupError, ValueError) as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(_rescore_result_message(dispatched, skipped))
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The dispatch went through but the follow-up read failed; surface
	        # it rather than returning empty tables silently.
	        gr.Warning(
	            f"Rescore dispatched, but the refreshed tables could not be "
	            f"loaded: {refresh_error}"
	        )
	    return (
	        admin_df,
	        validated,
	        unvalidated,
	        _gallery_iframe_html(),
	        gr.Checkbox(value=False),
	        gr.Button(interactive=False),
	    )


	def _admin_rescore_all(
	    phrase: str | None,
	    profile: gr.OAuthProfile | None,
	) -> tuple[
	    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
	]:
	    """Re-evaluate every rescoreable row, refresh the views, then disarm.

	    The heavy, board-wide action: re-checks ``is_admin`` and the exact
	    confirmation phrase server-side (so a tampered client that
	    re-enables the button still can't fire), clears the phrase box, and
	    disarms the button afterwards.

	    Args:
	        phrase: Text typed into the confirmation box.
	        profile: OAuth profile of the caller (``None`` when logged out).

	    Returns:
	        ``(admin_df, validated, unvalidated, gallery_html, phrase_box,
	        rescore_all_button)``.

	    Raises:
	        gr.Error: Caller is not an admin, the phrase does not match, or
	            ``rescore_all`` raised ``ValueError``.
	    """
	    if not is_admin(profile):
	        raise gr.Error("You are not in the admin set.")
	    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
	        raise gr.Error(
	            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
	        )
	    try:
	        dispatched, skipped = rescore_all()
	    except ValueError as e:
	        # Chain the cause so the original traceback is not lost.
	        raise gr.Error(str(e)) from e
	    gr.Info(_rescore_result_message(dispatched, skipped))
	    validated, unvalidated, split_error = _safe_load_split()
	    admin_df, admin_error = _safe_load_admin()
	    refresh_error = split_error or admin_error
	    if refresh_error:
	        # The dispatch went through but the follow-up read failed; surface
	        # it rather than returning empty tables silently.
	        gr.Warning(
	            f"Rescore dispatched, but the refreshed tables could not be "
	            f"loaded: {refresh_error}"
	        )
	    return (
	        admin_df,
	        validated,
	        unvalidated,
	        _gallery_iframe_html(),
	        gr.Textbox(value=""),
	        gr.Button(interactive=False),
	    )


	@lru_cache(maxsize=128)
	def _download_report_html(submission_id: str) -> bytes:
	    """Download ``reports/<id>.html`` and return its bytes, raising on failure.

	    Only successful results are memoised: ``lru_cache`` stores return
	    values, and a call that raises returns nothing to store.
	    """
	    local_path = hf_hub_download(
	        repo_id=HF_SUBMISSIONS_REPO,
	        filename=f"reports/{submission_id}.html",
	        repo_type="dataset",
	    )
	    return Path(local_path).read_bytes()


	def _fetch_report_html(submission_id: str) -> bytes | None:
	    """Pull ``reports/<id>.html`` off the submissions dataset.

	    Successful fetches are cached in-process so repeat clicks on the
	    same row don't hit the Hub. Failures are deliberately NOT cached:
	    previously the ``None`` from one transient Hub error (or a click
	    before the report was uploaded) was memoised, so the report kept
	    returning 404 until eviction or a restart.

	    Args:
	        submission_id: Id used to build the ``reports/<id>.html`` path.

	    Returns:
	        The report bytes, or ``None`` on any failure so the caller can
	        serve a clean 404 rather than leaking a stack trace.
	    """
	    # NOTE(review): a cached success is served for the life of the process;
	    # if a rescore rewrites the report under the same id, callers need
	    # ``_fetch_report_html.cache_clear()`` -- confirm whether that matters.
	    try:
	        return _download_report_html(submission_id)
	    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
	        logger.warning(
	            "Failed to fetch report for %s (%s: %s)",
	            submission_id, type(e).__name__, e,
	        )
	        return None


	# Keep the cache-management handles the lru_cache-decorated original exposed.
	_fetch_report_html.cache_clear = _download_report_html.cache_clear
	_fetch_report_html.cache_info = _download_report_html.cache_info


	def serve_report(submission_id: str) -> Response:
	    """Serve a submission's HTML report through the Space.

	    Per the module notes, the Hub serves dataset HTML as ``text/plain``,
	    so a direct dataset link shows source instead of rendering. This
	    route re-sends the bytes from :func:`_fetch_report_html` as
	    ``text/html``; when the fetch yields nothing it answers with a small
	    404 page.
	    """
	    body = _fetch_report_html(submission_id)
	    if body is not None:
	        return Response(content=body, media_type="text/html; charset=utf-8")
	    return HTMLResponse(
	        content="<h1>Report not found</h1>",
	        status_code=404,
	    )


	def serve_metrics_page() -> Response:
	    """Serve the metrics explainer page built by :func:`build_metrics_page`.

	    Per the original notes this lives at ``/metrics`` on the same origin
	    as the report proxy, so hosted reports can deep-link to
	    ``/metrics#<anchor>``, and the "Metrics" tab embeds the same route in
	    an iframe.
	    """
	    page = build_metrics_page()
	    return HTMLResponse(content=page)


	# Illustration assets the metrics page embeds (e.g. the interface-match
	# mating-group WebP). Vendored into the Space repo under `assets/metrics/`
	# and served here so the page renders self-contained, with no dependency
	# on the code repo's raw GitHub URLs staying reachable.
	# Resolved relative to this file, not the process working directory.
	METRICS_ASSETS_DIR = Path(__file__).parent / "assets" / "metrics"


	def serve_metrics_asset(name: str) -> Response:
	    """Serve one bundled illustration from ``assets/metrics/``.

	    The namespace is flat: any name containing ``/`` or ``..`` is
	    rejected with a 404, as is a name that is not an existing regular
	    file. The content type is guessed from the name (falling back to
	    ``application/octet-stream``) and the response carries the shared
	    long-lived ``Cache-Control`` header.
	    """
	    if ".." in name or "/" in name:
	        return Response(status_code=404)
	    asset = METRICS_ASSETS_DIR / name
	    if not asset.is_file():
	        return Response(status_code=404)
	    guessed_type, _encoding = mimetypes.guess_type(name)
	    return Response(
	        content=asset.read_bytes(),
	        media_type=guessed_type or "application/octet-stream",
	        headers={"Cache-Control": RENDER_CACHE_CONTROL},
	    )


	def _fetch_gt_render(fixture: str) -> bytes | None:
	    """Fetch a fixture's ground-truth turntable WebP from the GT dataset.

	    Downloads ``<fixture>/renders/rotating.webp`` from ``HF_DATA_GT_REPO``
	    and returns the file's bytes. Per the original notes the GT repo is
	    private (the Space token needs read scope on it), GT renders belong
	    to the data revision rather than to any submission, and the result is
	    intentionally not memoised in-process because ``hf_hub_download``
	    already keeps a per-revision disk cache.

	    Returns:
	        The WebP bytes, or ``None`` on any failure (logged as a warning)
	        so the caller can answer 404.
	    """
	    try:
	        cached_path = hf_hub_download(
	            repo_id=HF_DATA_GT_REPO,
	            filename=f"{fixture}/renders/rotating.webp",
	            repo_type="dataset",
	        )
	        return Path(cached_path).read_bytes()
	    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
	        logger.warning(
	            "Failed to fetch GT render for %s (%s: %s)",
	            fixture, type(exc).__name__, exc,
	        )
	        return None


	# Long-lived immutable caching: a (submission, fixture) render never
	# changes (fixed camera + lighting; re-renders would be a new artifact),
	# so the browser/CDN can keep it forever. This is what makes fixture
	# swaps and repeat visits free: only the ~33 on-screen turntables are
	# fetched on first paint, and everything after that is a cache hit.
	# The same header is also attached to the metrics-asset, GT and
	# task-input responses in this file.
	# NOTE(review): the GT / task-input fetchers say those files can change
	# on a data revision bump, yet their URLs carry no revision and are
	# cached as immutable for a year -- confirm that is intended.
	RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"


	def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
	    """Resolve the URL of a submission's plain turntable render.

	    Delegates to :func:`render_public_url` for the ``rotating.webp``
	    asset. Per the original notes this is a public render-bucket URL that
	    the browser fetches directly, and a missing upload degrades to the
	    dashed cell via ``<img onerror>``.
	    """
	    asset_name = "rotating.webp"
	    return render_public_url(submission_id, fixture, asset_name)


	def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
	    """Resolve the URL of an editing fixture's edit-diff turntable.

	    Delegates to :func:`render_public_url` for the ``edit_diff.webp``
	    asset. Per the original notes the gallery uses it for editing
	    fixtures, and a miss degrades to the dashed cell with no fallback to
	    the plain turntable.
	    """
	    asset_name = "edit_diff.webp"
	    return render_public_url(submission_id, fixture, asset_name)


	def _gt_proxy_url(fixture: str) -> str | None:
	    """Return the Space-relative proxy URL of a fixture's GT turntable WebP.

	    Per the original notes GT renders live in the private GT dataset, so
	    they are re-streamed through the Space instead of being linked as
	    public bucket URLs. No bytes are fetched here; only the path string
	    is built.
	    """
	    return "/gt-render/{}.webp".format(fixture)


	def _gt_diff_proxy_url(fixture: str) -> str | None:
	    """Return the proxy URL of an editing fixture's GT "answer key" edit-diff.

	    The path points at ``<fixture>/renders/edit_diff_gt.webp`` under the
	    generic ``/gt/`` route (``serve_gt_file``), so no dedicated route is
	    needed. Per the original notes the file is written once by
	    ``tools/generate_gt_edit_diff.py``, and a missing file degrades to
	    the dashed cell.
	    """
	    return "/gt/{}/renders/edit_diff_gt.webp".format(fixture)


	def serve_gt_render(fixture: str) -> Response:
	    """Stream a fixture's ground-truth render WebP with long-lived caching.

	    Path-traversal-guarded the same way as the sibling GT / task-input
	    proxies: a ``fixture`` containing ``..`` is answered with 404 before
	    anything is requested from the Hub.

	    Args:
	        fixture: Fixture name taken from the request path.

	    Returns:
	        The WebP with the shared immutable ``Cache-Control`` header, or a
	        bare 404 when the name is rejected or the fetch fails.
	    """
	    # `fixture` is interpolated into a repo file path by _fetch_gt_render.
	    if ".." in fixture:
	        return Response(status_code=404)
	    webp = _fetch_gt_render(fixture)
	    if webp is None:
	        return Response(status_code=404)
	    return Response(
	        content=webp,
	        media_type="image/webp",
	        headers={"Cache-Control": RENDER_CACHE_CONTROL},
	    )


	def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
	    """Fetch an arbitrary asset ``<fixture>/<relpath>`` from the GT dataset.

	    Per the original notes this backs the hosted report's ground-truth
	    column (per-view PNGs under ``renders/`` and ``ground_truth.pdf``);
	    the GT dataset is private, so the bytes are proxied through the
	    Space, and ``hf_hub_download`` supplies the per-revision disk cache.

	    Returns:
	        The file bytes, or ``None`` on any failure (logged as a warning).
	    """
	    try:
	        cached_path = hf_hub_download(
	            repo_id=HF_DATA_GT_REPO,
	            filename=f"{fixture}/{relpath}",
	            repo_type="dataset",
	        )
	        return Path(cached_path).read_bytes()
	    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
	        logger.warning(
	            "Failed to fetch GT file %s/%s (%s: %s)",
	            fixture, relpath, type(exc).__name__, exc,
	        )
	        return None


	def serve_gt_file(fixture: str, relpath: str) -> Response:
	    """Stream a GT asset (view PNG / PDF) with long-lived immutable caching.

	    Requests whose ``fixture`` or ``relpath`` contains ``..`` get a 404,
	    as do assets that cannot be fetched. Otherwise the bytes are returned
	    with a content type guessed from ``relpath`` (falling back to
	    ``application/octet-stream``) and the shared ``Cache-Control``
	    header.
	    """
	    # NOTE(review): apart from the `..` check, any path inside the private
	    # GT repo can be requested through this route -- confirm that nothing
	    # in that repo needs to stay hidden, or add an allow-list.
	    if any(".." in part for part in (fixture, relpath)):
	        return Response(status_code=404)
	    payload = _fetch_gt_file(fixture, relpath)
	    if payload is None:
	        return Response(status_code=404)
	    guessed_type, _encoding = mimetypes.guess_type(relpath)
	    return Response(
	        content=payload,
	        media_type=guessed_type or "application/octet-stream",
	        headers={"Cache-Control": RENDER_CACHE_CONTROL},
	    )


	def _gallery_iframe_html() -> str:
	    """Render the gallery page and wrap it in a ``srcdoc`` iframe.

	    Loads the live rows, renders the page with the four URL resolvers
	    defined in this module, HTML-escapes the whole document and inlines
	    it as the iframe's ``srcdoc`` so, per the original notes, it keeps
	    its own style context apart from Gradio's CSS. If the rows cannot be
	    loaded the failure is logged and an empty gallery is rendered
	    instead.
	    """
	    try:
	        rows = _load_rows_from_hub()
	    except LeaderboardDataError:
	        logger.exception("Gallery row load failed; rendering empty gallery")
	        rows = []
	    page = render_gallery_page(
	        rows,
	        _render_proxy_url,
	        _gt_proxy_url,
	        _render_diff_proxy_url,
	        _gt_diff_proxy_url,
	    )
	    srcdoc = html.escape(page, quote=True)
	    # Per the original notes, the gallery's own JS (`fitIframe`) resizes
	    # this iframe so that it is the only scroller; the inline height is
	    # merely the pre-script fallback, and there is deliberately no
	    # `max-height` because it would clamp the height the script measures.
	    fallback_style = "width:100%; height:80vh; border:0; display:block;"
	    return '<iframe srcdoc="{}" style="{}" title="CADGenBench gallery"></iframe>'.format(
	        srcdoc, fallback_style,
	    )


	def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
	    """Fetch a fixture input asset ``<fixture>/<relpath>`` from the inputs repo.

	    Per the original notes this backs the Task-browser tab's drawings and
	    starting-shape renders; the inputs dataset is private, so the bytes
	    are proxied through the Space. The result is not memoised in-process
	    because inputs can change on a data revision bump and
	    ``hf_hub_download`` already caches per revision on disk.

	    Returns:
	        The file bytes, or ``None`` on any failure (logged as a warning).
	    """
	    try:
	        cached_path = hf_hub_download(
	            repo_id=HF_DATA_REPO,
	            filename=f"{fixture}/{relpath}",
	            repo_type="dataset",
	        )
	        return Path(cached_path).read_bytes()
	    except Exception as exc:  # noqa: BLE001 - any Hub failure -> 404
	        logger.warning(
	            "Failed to fetch task input %s/%s (%s: %s)",
	            fixture, relpath, type(exc).__name__, exc,
	        )
	        return None


	def _task_input_url(fixture: str, relpath: str) -> str:
	    """Return the Space proxy URL for a task input asset.

	    Only builds the route string; no bytes are fetched (the browser
	    lazy-fetches just the on-screen task's images). The path is absolute so
	    it resolves against the Space origin even inside the iframe ``srcdoc``.

	    Both components are percent-encoded so that names containing spaces,
	    ``#``, ``?`` or ``%`` still form a valid URL path instead of being
	    truncated or misparsed by the browser. Plain names are returned
	    unchanged.

	    Args:
	        fixture: Fixture directory name. It is a single path segment, so
	            ``/`` is encoded too.
	        relpath: Asset path relative to the fixture. It may contain ``/``
	            separators, which are preserved.

	    Returns:
	        ``/task-input/<fixture>/<relpath>`` with unsafe characters escaped.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import quote

	    fixture_part = quote(fixture, safe="")
	    relpath_part = quote(relpath, safe="/")
	    return f"/task-input/{fixture_part}/{relpath_part}"


	def serve_task_input(fixture: str, relpath: str) -> Response:
	    """Serve ``/task-input/<fixture>/<relpath>`` from the private inputs repo.

	    The task browser references this route and the browser fetches it
	    lazily. The dataset bytes are re-streamed by the Space (which holds the
	    read token) with the same ``Cache-Control`` value as the render proxies
	    so the CDN/browser cache them hard.

	    Responds 404 when either path component contains ``..`` (path-traversal
	    guard) or when the asset cannot be fetched. The content type is guessed
	    from the file extension, falling back to ``application/octet-stream``.
	    """
	    # Traversal guard: refuse anything that contains a parent-dir marker.
	    if any(".." in part for part in (fixture, relpath)):
	        return Response(status_code=404)

	    payload = _fetch_task_input(fixture, relpath)
	    if payload is None:
	        return Response(status_code=404)

	    guessed_type, _encoding = mimetypes.guess_type(relpath)
	    return Response(
	        content=payload,
	        media_type=guessed_type or "application/octet-stream",
	        headers={"Cache-Control": RENDER_CACHE_CONTROL},
	    )


	def _tasks_iframe_html() -> str:
	    """Return the Task browser wrapped in an isolated ``srcdoc`` iframe.

	    Only the ``<fixture>/description.yaml`` files are snapshotted from the
	    inputs dataset, which keeps the download light: the drawings and renders
	    are referenced through the ``/task-input`` proxy and load lazily. The
	    descriptions are turned into task cards, rendered to a full HTML page,
	    and attribute-escaped into the iframe's ``srcdoc`` so the page keeps its
	    own style context (no Gradio CSS collision).

	    Any failure while downloading or parsing is logged and degrades to an
	    empty browser rather than crashing the tab.
	    """
	    try:
	        snapshot_root = snapshot_download(
	            repo_id=HF_DATA_REPO,
	            repo_type="dataset",
	            allow_patterns=["*/description.yaml"],
	        )
	        task_cards = load_tasks_from_dir(Path(snapshot_root))
	    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
	        logger.exception("Task load failed; rendering empty task browser")
	        task_cards = []

	    page = render_tasks_page(task_cards, _task_input_url)
	    srcdoc = html.escape(page, quote=True)
	    iframe_style = "width:100%; height:90vh; border:0; display:block;"
	    return (
	        f'<iframe srcdoc="{srcdoc}" '
	        f'style="{iframe_style}" '
	        'title="CADGenBench tasks"></iframe>'
	    )


	@lru_cache(maxsize=1)
	def _logo_data_uri() -> str:
	    """Return the header logo encoded as a base64 ``data:`` URI.

	    The logo is inlined instead of being served as a static file so the
	    ``<img>`` has no dependency on Gradio/FastAPI static-path allowlisting
	    and behaves the same locally on a random port as on huggingface.co.

	    The PNG is read from ``assets/logo.png`` next to this module, so the
	    process working directory (e.g. inside the Docker image) is irrelevant.
	    The result is cached: the bytes never change within a process.
	    """
	    png_bytes = Path(__file__).parent.joinpath("assets", "logo.png").read_bytes()
	    encoded = base64.b64encode(png_bytes).decode("ascii")
	    return "data:image/png;base64," + encoded


	# Cosmetic-only overrides whose purpose is to reclaim vertical space so the
	# gallery fits more rows into one viewport. Each entry below is one CSS rule;
	# they are concatenated without separators into a single stylesheet string.
	_APP_CSS = "".join(
	    (
	        # Hide the Gradio footer ("Built with Gradio - Settings").
	        "footer{display:none !important;}",
	        # Tighten the page's outer padding.
	        ".gradio-container{padding-top:4px !important; padding-bottom:0 !important;}",
	        # Collapse the title block's own box so the wordmark sits right
	        # above the leaderboard instead of floating with a gap.
	        "#cgb-title{margin:0 !important;padding:0 !important;min-width:0 !important;}",
	        # Height-constrain the logo (width auto-scales) so it occupies a
	        # compact band near the old `### ` title's footprint.
	        "#cgb-title .cgb-logo{height:46px;width:auto;display:block;margin:0;}",
	        # Negative top margin pulls the tab nav up snug against the logo,
	        # cancelling the flex gap Gradio puts between title and tab bar.
	        ".gradio-container .tabs{margin-top:-6px !important;}",
	        # The wordmark PNG is black ink on a transparent background and
	        # would vanish on a dark theme, so invert it to white ink. Gradio
	        # toggles the `.dark` class on the container ...
	        ".dark #cgb-title .cgb-logo{filter:invert(1);}",
	        # ... and the media query covers system-driven dark mode too.
	        "@media (prefers-color-scheme: dark){",
	        "#cgb-title .cgb-logo{filter:invert(1);}}",
	    )
	)

	# Root of the Gradio UI. `_APP_CSS` carries the cosmetic overrides and the
	# Soft theme supplies the base look; the resulting `blocks` object is what
	# gets mounted onto the FastAPI app near the bottom of this file.
	with gr.Blocks(
	title="CADGenBench Leaderboard", theme=gr.themes.Soft(), css=_APP_CSS,
	) as blocks:
	# Single compact title line (keeps vertical space for the gallery rows).
	# The wordmark logo replaces the old `### CADGenBench Leaderboard`
	# markdown; alt text preserves the name for screen readers / when
	# images are blocked.
	# `elem_id="cgb-title"` is the hook the `#cgb-title` selectors in
	# `_APP_CSS` target; the image itself is inlined as a data URI by
	# `_logo_data_uri()`.
	gr.HTML(
	f'<img class="cgb-logo" src="{_logo_data_uri()}" '
	'alt="CADGenBench Leaderboard">',
	elem_id="cgb-title",
	)

	with gr.Tab("Leaderboard"):
	# Visual-first leaderboard. The bespoke surface (sticky GT row,
	# fixture picker, turntable grid, compare modal) is a
	# self-contained HTML doc inlined into an iframe `srcdoc` so it
	# keeps its own style context. Thumbnails are lazy-loaded from
	# the cached `/render` / `/gt-render` proxy routes (requires the
	# Space to be public). Built at boot, rebuilt on page load, and
	# refreshed after admin actions.
	# NOTE(review): the route registrations at the bottom of this file add
	# only `/gt-render`, with a comment saying candidate renders come from a
	# public bucket -- confirm whether the `/render` mention above is stale.
	# The initial `value` is computed while the UI is being constructed, so
	# `_gallery_iframe_html()` runs once at import time.
	gallery_html = gr.HTML(value=_gallery_iframe_html())
	gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
	# Manual refresh re-runs the same builder and replaces the iframe markup.
	gallery_refresh_btn.click(
	fn=_gallery_iframe_html, outputs=gallery_html,
	)

	# Tabular view of the same results: two stacked tables (validated /
	# unvalidated), a read-failure banner, and refresh + CSV download buttons.
	with gr.Tab("Detailed View"):
	# Load both tiers once at boot. `_safe_load_split` keeps a Hub
	# read failure from crashing the Space: on failure the frames
	# come up empty and `initial_error` carries the message the
	# banner renders.
	initial_validated, initial_unvalidated, initial_error = _safe_load_split()

	# Loud, persistent banner shown only when the live results
	# can't be read from the Hub (e.g. an under-scoped Space
	# HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
	# leaderboard never falls back to stale/bundled data, so this
	# banner is the signal that empty tables are a read failure,
	# not a genuinely empty leaderboard.
	data_error_banner = gr.Markdown(
	value=_data_error_banner_md(initial_error),
	visible=initial_error is not None,
	)

	# Collapsed accordions above the tables. Validation guidelines
	# gives the short two-tier story + link to the full policy
	# doc; Citation carries the verbatim BibTeX entry. Both start
	# closed so the leaderboard itself stays above the fold.
	with gr.Accordion("Validation guidelines", open=False):
	gr.Markdown(VALIDATION_GUIDELINES_MD)
	with gr.Accordion("Citation", open=False):
	# language=None -> plain monospaced render (gr.Code doesn't
	# ship a BibTeX highlighter); show_line_numbers off because
	# the entry is meant to be copy-pasted, not annotated.
	gr.Code(
	value=CITATION_BIBTEX,
	language=None,
	show_line_numbers=False,
	)

	# Two stacked tables, split by `validation_status`. Validated
	# on top so the curated results are above the fold; unvalidated
	# below carries every other row (auto-published, awaiting
	# methodology review). See decisions/validation-policy.md.
	# Initial values come from the boot-time `_safe_load_split`
	# above (empty + banner on a Hub read failure).
	# The two widgets differ only in their `datatype` list and label; search
	# and hidden columns are shared.
	validated_view = Leaderboard(
	value=initial_validated,
	datatype=VALIDATED_LEADERBOARD_DATATYPES,
	search_columns=["submission_name", "submitter_name"],
	hide_columns=LEADERBOARD_HIDE_COLUMNS,
	label="Validated Leaderboard",
	interactive=False,
	)
	unvalidated_view = Leaderboard(
	value=initial_unvalidated,
	datatype=LEADERBOARD_DATATYPES,
	search_columns=["submission_name", "submitter_name"],
	hide_columns=LEADERBOARD_HIDE_COLUMNS,
	label="Unvalidated Leaderboard",
	interactive=False,
	)
	with gr.Row():
	refresh_btn = gr.Button("Refresh", size="sm")
	# One file, both tables, `validation_status` discriminator
	# column. Fresh CSV is generated on every click so the
	# download reflects the latest data, not a stale snapshot
	# captured at boot.
	download_btn = gr.DownloadButton(
	label="Download CSV", size="sm",
	)
	# Refresh updates both tables and the banner together, the same three
	# outputs the auto-refresh Timer drives further down.
	refresh_btn.click(
	fn=_refresh_leaderboard_with_toast,
	outputs=[validated_view, unvalidated_view, data_error_banner],
	)
	# The button is its own output: presumably `build_combined_csv` returns
	# the path of the freshly written CSV for the button to serve -- confirm
	# against leaderboard.py.
	download_btn.click(fn=build_combined_csv, outputs=download_btn)

	# No inline row-click detail panel: the submission_name cell is a
	# deep-link that opens the self-contained per-submission report in
	# a new tab (see `_submission_name_md` in leaderboard.py). Now that
	# the Space is public, HF's edge serves `/reports/<id>.html` to
	# browser users, so we link to it directly instead of inlining the
	# (tens-to-hundreds-of-MB) report through the Gradio event payload.

	with gr.Tab("Tasks"):
	# Read-only task browser: mirrors the per-submission report's
	# summary-table -> detail-card navigation (j/k, Esc) but shows
	# only the prompt + input (drawing / starting shape), no scores
	# or ground truth. Self-contained HTML inlined into an iframe
	# `srcdoc` like the gallery; input images lazy-load from the
	# `/task-input` proxy. Built at boot, rebuilt on page load.
	# As with the gallery, the initial `value` is computed during UI
	# construction, so `_tasks_iframe_html()` runs once at import time.
	tasks_html = gr.HTML(value=_tasks_iframe_html())
	tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
	# Manual refresh re-runs the same builder and replaces the iframe markup.
	tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)

	with gr.Tab("Metrics"):
	# Static explainer for the (new) scoring metrics. Served as a
	# standalone `/metrics` route too, so the per-submission report's
	# headline pills can deep-link to `/metrics#<anchor>`; the tab just
	# embeds that same page in an iframe (single source of truth).
	# Unlike the gallery / task browser this iframe uses `src`, not `srcdoc`:
	# the content is fetched from the `/metrics` route registered on the
	# FastAPI app below, so no event handler is needed to keep it current.
	gr.HTML(
	'<iframe src="/metrics" '
	'style="width:100%; height:85vh; border:0; display:block;" '
	'title="CADGenBench metrics"></iframe>'
	)

	# Submission tab: format instructions, HF login gate, zip upload, and a
	# persistent status panel driven by `handle_submit`.
	with gr.Tab("Submit"):
	# The instructions are an f-string: `{HF_DATA_REPO}` and
	# `{SUBMISSION_DOC_URL}` are interpolated, while the doubled braces
	# around the JSON example are literal `{` / `}` in the rendered text.
	gr.Markdown(
	f"""
	Submission format. A single zip with:

	- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
	samples where your system produced a candidate. Missing `output.step`
	scores zero for that sample;
	- a top-level `meta.json`:

	```json
	{{
	"submitter_name": "your name or team",
	"submission_name": "MyAgent v2.3 (or whatever describes your system)",
	"agent_url": "https://github.com/... (optional)",
	"notes": "free text, optional, max 500 chars, single line, plain text",
	"agree_to_publish": true
	}}
	```

	Submission name. Free text describing the system being benchmarked,
	however you choose to describe it. The benchmark is system-agnostic: your
	submission may use no LLM, one, or many. If you want to disclose your
	stack, put it here or in `notes`.

	Notes field. Plain text only (no markdown / HTML). Capped at 500 chars
	and stripped to a single line. Shown in the per-submission detail view,
	not in the main leaderboard table.

	Consent. `"agree_to_publish": true` in `meta.json` is your consent
	to publish the resulting row on the public leaderboard.

	For the full submission contract (output format, validity gate, canonical
	pose, and a local self-check), see
	[`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
	"""
	)
	# OAuth gate. The user must log in via the HF button before
	# the Submit button becomes interactive; the row gets the
	# canonical `hf_username` from `gr.OAuthProfile.username`
	# (not a free-text claim in meta.json). README front-matter
	# already carries `hf_oauth: true` so HF's OAuth integration
	# is wired up at the Space level.
	login_btn = gr.LoginButton()
	# Upload widget restricted to `.zip` files; its value is the only
	# explicit input passed to `handle_submit`.
	zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
	# Starts disabled; the `blocks.load` handler below flips it
	# to interactive when an OAuthProfile is present.
	submit_btn = gr.Button("Submit", variant="primary", interactive=False)
	# Persistent status panel. handle_submit is a generator that
	# streams stage updates (validating -> uploading/queuing ->
	# queued) and any rejection reason here, so the outcome
	# survives instead of vanishing with a transient toast. The
	# handler also reads `gr.OAuthProfile` implicitly via its
	# parameter type annotation (Gradio's dependency-injection
	# convention).
	submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
	submit_btn.click(
	fn=handle_submit,
	inputs=[zip_in],
	outputs=[submit_status],
	)

	# Static project description; no handlers attached.
	with gr.Tab("About"):
	gr.Markdown(ABOUT_MD)

	with gr.Tab("Admin"):
	# Maintainer-only controls. The Admin tab is visible to everyone
	# (a hint the path exists), but ALL admin UI -- the table, the
	# actions, the danger zones -- lives in `admin_panel`, a column that
	# stays hidden unless the logged-in user is in CADGENBENCH_ADMINS.
	# The `blocks.load` handler below flips that column's visibility and
	# only loads table data for admins; a server-side `is_admin` re-check
	# still guards every handler. Non-admins (and logged-out visitors)
	# see only the login/logout button + a status line, nothing else.
	# See decisions/validation-policy.md.
	admin_login_btn = gr.LoginButton()
	admin_status = gr.Markdown(
	"Log in with an admin account to access the controls."
	)
	# Everything below is admin-only: hidden by default, revealed by
	# `_gate_admin_controls` only for a logged-in user in the admin set.
	# Every control in the panel is also created with `interactive=False`, so
	# nothing is usable until a handler explicitly enables it.
	with gr.Column(visible=False) as admin_panel:
	gr.Markdown(
	"## Admin\n"
	"Tick rows in the select column, then promote them into "
	"the Validated tier (recording an evidence type), demote "
	"them back to Unvalidated, delete them, or rescore them "
	"against the current ground truth. Actions apply to every "
	"ticked row at once."
	)
	# Only the leading `select` column is editable; the rest is
	# read-only context. Click-to-tick drives every action below.
	# Starts empty; `_gate_admin_controls` loads rows on page load
	# for admins only, so non-admins never receive the data.
	# `static_columns` covers indices 1..len(ADMIN_COLUMNS)-1, which leaves
	# only column 0 editable. The `datatype` list has nine entries --
	# presumably one per ADMIN_COLUMNS entry; keep the two in sync.
	admin_table = gr.Dataframe(
	value=_empty_admin_table(),
	datatype=[
	"bool", "str", "str", "str", "str", "str", "str",
	"number", "str",
	],
	static_columns=list(range(1, len(ADMIN_COLUMNS))),
	interactive=False,
	label="Submissions (tick select to choose rows)",
	wrap=True,
	)
	# Live summary of the current selection, updated by the
	# `admin_table.change` handler.
	admin_selection_md = gr.Markdown("_No rows selected._")
	admin_method_radio = gr.Radio(
	choices=list(VALID_METHODS),
	value="manual",
	label="validation_method (applied to all rows on promote)",
	interactive=False,
	)
	with gr.Row():
	promote_btn = gr.Button(
	"Mark validated", variant="primary", interactive=False,
	)
	demote_btn = gr.Button("Mark unvalidated", interactive=False)
	# Destructive actions are tucked into collapsed accordions and each needs
	# an explicit confirmation control before its button is armed.
	with gr.Accordion("Danger zone: delete", open=False):
	gr.Markdown(
	"Permanently deletes the ticked rows and their "
	"uploaded zip + report files from the submissions "
	"dataset. This cannot be undone (only a manual revert of "
	"the dataset commit).\n\n"
	"Stop & delete additionally cancels any still-running "
	"evaluation job(s) for the ticked rows before deleting — "
	"use it for pending submissions whose GPU eval is in "
	"flight."
	)
	# Ticking this checkbox is what arms both delete buttons (see the
	# `delete_confirm.change` wiring).
	delete_confirm = gr.Checkbox(
	label=(
	"I understand this permanently deletes the selected "
	"submissions and their files."
	),
	value=False,
	interactive=False,
	)
	with gr.Row():
	delete_btn = gr.Button(
	"Delete selected", variant="stop", interactive=False,
	)
	stop_delete_btn = gr.Button(
	"Stop & delete selected", variant="stop",
	interactive=False,
	)
	with gr.Accordion("Danger zone: rescore", open=False):
	gr.Markdown(
	"Re-evaluates submissions against the current "
	"ground truth + data: each row flips back to pending, the "
	"gallery renders and the per-submission report HTML are "
	"regenerated, and the score is recomputed. Use after a "
	"ground-truth swap or a metric change that invalidates "
	"the existing scores.\n\n"
	"Rescoring is re-runnable: if a row's eval fails, "
	"mark it and rescore again (or rescore all) — each run is "
	"independent and converges.\n\n"
	"- Rescore selected re-evaluates the ticked rows.\n"
	f"- Rescore all re-evaluates every submission that "
	f"has a stored zip and isn't already pending — type "
	f"`{RESCORE_ALL_PHRASE}` to arm it."
	)
	# Two independent arming paths: the checkbox arms "Rescore selected",
	# while the typed phrase arms the board-wide "Rescore ALL".
	rescore_confirm = gr.Checkbox(
	label=(
	"I understand this flips the selected rows to pending "
	"and recomputes their scores."
	),
	value=False,
	interactive=False,
	)
	rescore_selected_btn = gr.Button(
	"Rescore selected", variant="stop", interactive=False,
	)
	rescore_all_phrase = gr.Textbox(
	label=(
	f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
	f"rescore"
	),
	placeholder=RESCORE_ALL_PHRASE,
	interactive=False,
	)
	rescore_all_btn = gr.Button(
	"Rescore ALL submissions", variant="stop",
	interactive=False,
	)
	admin_refresh_btn = gr.Button("Refresh", size="sm")

	# Event wiring for the admin controls. Every mutating action writes back to
	# `admin_table` and also to `validated_view`, `unvalidated_view` and
	# `gallery_html`, so the public tabs reflect an admin change without a
	# separate refresh.
	# Ticking/unticking a row updates the selection summary line.
	admin_table.change(
	fn=_admin_selection_status,
	inputs=admin_table,
	outputs=admin_selection_md,
	)
	# Promote additionally takes the chosen validation method as input.
	promote_btn.click(
	fn=_admin_promote,
	inputs=[admin_table, admin_method_radio],
	outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
	)
	demote_btn.click(
	fn=_admin_demote,
	inputs=[admin_table],
	outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
	)
	# The confirmation checkbox drives the state of both delete buttons.
	delete_confirm.change(
	fn=_arm_delete,
	inputs=[delete_confirm],
	outputs=[delete_btn, stop_delete_btn],
	)
	# Both delete handlers receive the checkbox value as an input and list the
	# checkbox + both buttons as outputs -- presumably so the handler can
	# re-check consent and then disarm the controls; confirm in the handlers.
	delete_btn.click(
	fn=_admin_delete,
	inputs=[admin_table, delete_confirm],
	outputs=[
	admin_table, validated_view, unvalidated_view, gallery_html,
	delete_confirm, delete_btn, stop_delete_btn,
	],
	)
	stop_delete_btn.click(
	fn=_admin_stop_delete,
	inputs=[admin_table, delete_confirm],
	outputs=[
	admin_table, validated_view, unvalidated_view, gallery_html,
	delete_confirm, delete_btn, stop_delete_btn,
	],
	)
	# Same arm-then-act pattern for rescoring the ticked rows.
	rescore_confirm.change(
	fn=_arm_rescore_selected,
	inputs=[rescore_confirm],
	outputs=[rescore_selected_btn],
	)
	rescore_selected_btn.click(
	fn=_admin_rescore_selected,
	inputs=[admin_table, rescore_confirm],
	outputs=[
	admin_table, validated_view, unvalidated_view, gallery_html,
	rescore_confirm, rescore_selected_btn,
	],
	)
	# The board-wide rescore is armed by the typed phrase rather than a
	# checkbox, and its handler takes only the phrase (no table selection).
	rescore_all_phrase.change(
	fn=_arm_rescore_all,
	inputs=[rescore_all_phrase],
	outputs=[rescore_all_btn],
	)
	rescore_all_btn.click(
	fn=_admin_rescore_all,
	inputs=[rescore_all_phrase],
	outputs=[
	admin_table, validated_view, unvalidated_view, gallery_html,
	rescore_all_phrase, rescore_all_btn,
	],
	)
	# Manual refresh reloads only the admin table.
	admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)

	# Keep the admin table on the same 10s cadence as the leaderboard
	# so a row that lands (or a pending row that completes) after the
	# tab loaded shows up without a manual Refresh. Selection is
	# preserved across ticks so an in-progress set of checkboxes
	# survives the reload.
	# The current table is passed in as an input, which is what lets the
	# handler carry the ticked state over to the refreshed rows.
	admin_auto_refresh_timer = gr.Timer(10)
	admin_auto_refresh_timer.tick(
	fn=_auto_refresh_admin_table,
	inputs=admin_table,
	outputs=admin_table,
	)

	# gradio_leaderboard.Leaderboard handles its own update path
	# cleanly; bind a Timer to push fresh dataframes every 10 seconds.
	# Single tick runs `_auto_refresh_leaderboard` once and pushes the
	# two halves into the validated / unvalidated widgets plus the
	# data-unavailable banner. The handler swallows a Hub read failure
	# into empty frames + a loud warning toast so a degraded read never
	# crashes the tick loop or silently blanks the tables.
	# This timer is separate from `admin_auto_refresh_timer`; both use a
	# 10-second interval.
	auto_refresh_timer = gr.Timer(10)
	auto_refresh_timer.tick(
	fn=_auto_refresh_leaderboard,
	outputs=[validated_view, unvalidated_view, data_error_banner],
	)

	# On page load, read the visitor's OAuth profile (None if not
	# logged in) and flip the Submit button's interactivity. Runs once
	# per page load; LoginButton clicks also re-trigger this through
	# Gradio's auth-event plumbing.
	blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
	# Rebuild the gallery and the task browser on every page load so a visitor
	# gets current content rather than the markup baked in as the components'
	# initial `value` when the UI was constructed.
	blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
	blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)

	# Same per-load OAuth read, gating the Admin tab on membership in the
	# CADGENBENCH_ADMINS set. Logged-out / non-admin visitors get the
	# admin_panel hidden entirely (no table, no controls) -- just the
	# login/logout button and a status line.
	# The outputs are positional: `_gate_admin_controls` is expected to return
	# thirteen updates in exactly this order, so adding an admin control means
	# extending both this list and the handler's return value.
	blocks.load(
	fn=_gate_admin_controls,
	outputs=[
	admin_panel,
	admin_table,
	admin_method_radio,
	promote_btn,
	demote_btn,
	delete_confirm,
	delete_btn,
	stop_delete_btn,
	rescore_confirm,
	rescore_selected_btn,
	rescore_all_phrase,
	rescore_all_btn,
	admin_status,
	],
	)


	# The Gradio UI is mounted under a FastAPI parent so the custom proxy
	# routes share the UI's origin. Routes added directly on `app` are matched
	# before the Gradio sub-app mounted at "/", so none of the paths below can
	# be shadowed by it -- which is why every route is registered BEFORE the
	# mount at the end of this section.
	app = FastAPI()

	# Per-submission report proxy (`/reports/<sid>.html`).
	app.add_api_route(
	    path="/reports/{submission_id}.html", endpoint=serve_report, methods=["GET"],
	)

	# Static metrics explainer. Lives on the same origin as the report proxy so
	# report pills can deep-link to `/metrics#<anchor>`; the Metrics tab embeds
	# the same page.
	app.add_api_route(path="/metrics", endpoint=serve_metrics_page, methods=["GET"])

	# Illustration assets the metrics page embeds (vendored under
	# assets/metrics/).
	app.add_api_route(
	    path="/metrics-assets/{name}", endpoint=serve_metrics_asset, methods=["GET"],
	)

	# Cached render proxy for the gallery's lazy-loaded turntables. Candidate
	# renders are served straight from the public render bucket (their URLs
	# come from the gallery resolvers), so only the private ground-truth render
	# still needs a token-holding Space proxy route.
	app.add_api_route(
	    path="/gt-render/{fixture}.webp", endpoint=serve_gt_render, methods=["GET"],
	)

	# Ground-truth assets the hosted report links lazily (per-view PNGs + PDF).
	# GT is private, so this token-holding proxy streams them. The `:path`
	# converter lets `relpath` contain slashes (e.g. renders/iso.png).
	app.add_api_route(
	    path="/gt/{fixture}/{relpath:path}", endpoint=serve_gt_file, methods=["GET"],
	)

	# Task-browser input assets (drawings + starting-shape renders); `:path`
	# again allows slashes inside `relpath`.
	app.add_api_route(
	    path="/task-input/{fixture}/{relpath:path}",
	    endpoint=serve_task_input,
	    methods=["GET"],
	)

	# OAuth mode selection. Gradio chooses REAL Hugging Face OAuth vs. a local
	# "mock" login through ``gradio.utils.get_space()``, which is only truthy
	# when ``SYSTEM == "spaces"``. HF sets that on Gradio-SDK Spaces but NOT on
	# ``sdk: docker`` Spaces such as this one. Without it ``mount_gradio_app``
	# wires up the MOCK OAuth routes, which never contact hf.co and log every
	# visitor in as the container token's owner (our ``HF_TOKEN`` account) --
	# leaking that identity into the LoginButton and, because that account is
	# in ``CADGENBENCH_ADMINS``, handing every visitor admin rights.
	#
	# So force ``SYSTEM`` only when actually running on a Space (``SPACE_ID`` is
	# injected by HF on all Spaces, Docker included) so the real
	# ``hf_oauth: true`` flow runs. Locally (no ``SPACE_ID``) it is left alone
	# and Gradio's mock login keeps working for development. This must happen
	# before the mount below, because the mount is what triggers
	# ``attach_oauth``.
	if os.environ.get("SPACE_ID"):
	    if os.environ.get("SYSTEM") != "spaces":
	        os.environ["SYSTEM"] = "spaces"

	app = gr.mount_gradio_app(app, blocks, path="/")


	if __name__ == "__main__":
	    # Direct-execution entry point: serve the combined FastAPI + Gradio app
	    # with uvicorn. Bind address and port come from the standard Gradio
	    # environment variables, defaulting to all interfaces on port 7860.
	    uvicorn.run(
	        app,
	        host=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
	        port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
	    )