# Copyright 2026 Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""CADGenBench Leaderboard Space - Gradio UI + report-proxy mount.

Read path lives in :mod:`leaderboard`. Submit-tab validation lives in
:mod:`submit`. Both are wired into the Gradio Blocks below. The
Gradio app is mounted under a FastAPI parent so the custom
``/reports/{submission_id}.html`` route can re-serve dataset HTML
with ``Content-Type: text/html`` (HF Hub's ``/resolve/`` serves it
as ``text/plain`` by policy, which makes the browser show source
rather than render).
"""
from __future__ import annotations

import base64
import html
import logging
import mimetypes
import os
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, Response
from gradio_leaderboard import Leaderboard
from huggingface_hub import hf_hub_download, snapshot_download

from leaderboard import (
    ADMIN_COLUMNS,
    ADMIN_SELECT_COL,
    HF_DATA_GT_REPO,
    HF_DATA_REPO,
    HF_SUBMISSIONS_REPO,
    LEADERBOARD_COLS,
    LEADERBOARD_DATATYPES,
    LEADERBOARD_HIDE_COLUMNS,
    VALIDATED_LEADERBOARD_COLS,
    VALIDATED_LEADERBOARD_DATATYPES,
    LeaderboardDataError,
    _fmt_timestamp,
    _load_rows_from_hub,
    build_combined_csv,
    load_admin_table,
    load_leaderboard_split,
    render_public_url,
)
from gallery import render_gallery_page
from metrics_page import build_metrics_page
from tasks import load_tasks_from_dir, render_tasks_page
from admin import (
    VALID_METHODS,
    delete_rows,
    demote_rows,
    is_admin,
    promote_rows,
    rescore_all,
    rescore_rows,
    stop_and_delete_rows,
)
from submit import handle_submit

# Module-level logger, named after this module so its records are
# attributable in the shared log stream.
logger = logging.getLogger(__name__)

# Surface module-level logger.info / logger.warning / logger.exception
# calls from leaderboard.py + submit.py in the Space's runtime logs.
# Otherwise they go nowhere and any refresh / worker pathology is
# silent. Format keeps timestamps + module + level + message.
# NOTE: this configures the root logger as an import-time side effect;
# ``logging.basicConfig`` is a no-op when the root logger already has
# handlers attached.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)


# Canonical policy doc lives in the code repo so contributors reading
# the GitHub repo see it without needing to visit the Space. Linked
# from both the Detailed View tab's Validation Guidelines accordion and
# the About tab. The URL points at the ``main`` branch, so it always
# shows the latest revision of the policy.
VALIDATION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/validation.md"
)
# Canonical submission contract (output layout, validity gate, canonical
# pose, local self-check). Linked from the Submit tab so the tab itself
# stays a short "how to package + upload" note rather than re-documenting
# the full contract. Also pinned to the ``main`` branch.
SUBMISSION_DOC_URL = (
    "https://github.com/huggingface/cadgenbench/blob/main/docs/benchmark/submission.md"
)

# Markdown body for the About section. It is an f-string, so the dataset
# repo ids (HF_DATA_REPO, HF_SUBMISSIONS_REPO) and VALIDATION_DOC_URL are
# interpolated once, when this module is imported.
ABOUT_MD = f"""## About

**CADGenBench** evaluates AI-driven CAD generation: how well a model can
turn a description of a mechanical part into a valid, geometrically
correct 3D model.

- **Reference baseline**: an iterative AI agent that writes build123d Python.
- **Submission flow**: upload a zip of per-fixture STEP files; the Space
  runs the eval and appends a row to the submissions dataset.
- **Datasets**: fixture inputs in
  [`{HF_DATA_REPO}`](https://huggingface.co/datasets/{HF_DATA_REPO});
  submissions and computed results in
  [`{HF_SUBMISSIONS_REPO}`](https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}).
- **Code**: [`huggingface/cadgenbench`](https://github.com/huggingface/cadgenbench).
- **Validation policy**: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL}).
- **Data**: CAD geometry from [Mecado](https://www.mecado.com).
"""

# Verbatim BibTeX entry locked in space-setup/bundles/1-2-space-ux.md
# (Locked decisions section). Shown in the Citation accordion as a
# copy-paste handle for anyone citing this benchmark; the About tab
# already links the source code via huggingface/cadgenbench so the
# Space URL is the right deep-link target for the citation.
# Raw string (``r"""``) so the ``\url`` backslash reaches the reader
# literally instead of being parsed as a Python escape sequence.
CITATION_BIBTEX = r"""@misc{cadgenbench2026,
  author       = {Rabinovich, Michael and {Hugging Face}},
  title        = {{CADGenBench}: a benchmark for {AI}-driven {CAD} generation},
  year         = {2026},
  publisher    = {Hugging Face},
  howpublished = {\url{https://huggingface.co/spaces/HuggingAI4Engineering/CADGenBench}},
}"""

# Short Markdown summary of the validation policy, ending with a link to
# the canonical doc (VALIDATION_DOC_URL is interpolated at import time).
# NOTE(review): the four evidence types are spelled out by hand here;
# presumably they mirror ``admin.VALID_METHODS`` -- confirm they stay in
# sync when that set changes.
VALIDATION_GUIDELINES_MD = f"""Submissions appear on the **Unvalidated** table the moment evaluation completes. Maintainers promote rows to **Validated** after methodology review, accepting one of four evidence types (`code`, `traces`, `api`, `manual`).

Full policy: [`docs/benchmark/validation.md`]({VALIDATION_DOC_URL})."""

# Placeholder Markdown for the submit status area before any submission
# has been attempted (italic, via the surrounding underscores).
SUBMIT_STATUS_IDLE = (
    "_Log in, attach a zip, and click **Submit**. Progress and any "
    "errors appear here._"
)


def _data_error_banner_md(message: str | None) -> str:
    """Markdown for the top-of-tab data-unavailable banner.

    Empty string when there's no error (the banner is also hidden via
    ``visible=False`` in that case). When the live ``results.jsonl``
    can't be read, the banner is the loud, persistent signal that the
    tables below are empty *by design* (we never fall back to stale or
    bundled data) rather than because the leaderboard is genuinely
    empty.

    The error text is arbitrary exception output, so it is made safe for
    the blockquote + inline-code context it is embedded in:

    * every run of whitespace (including newlines) collapses to a single
      space, so a multi-line message cannot terminate the blockquote;
    * the code span uses a backtick fence one longer than the longest
      backtick run in the message, so backticks in the message cannot
      close the span early.

    A single-line message without backticks renders exactly as
    ``> Details: `message```.

    Args:
        message: Error text to show, or ``None`` / ``""`` for no error.

    Returns:
        The banner Markdown, or ``""`` when ``message`` is falsy. A
        whitespace-only message yields the banner without a details line.
    """
    if not message:
        return ""
    header = (
        "> ⚠️ **Leaderboard data unavailable.** The live results could not "
        "be read from the Hub, so the tables below are empty. No stale or "
        "cached data is ever shown in its place."
    )
    detail = " ".join(message.split())
    if not detail:
        return header

    # Longest consecutive run of backticks inside the message.
    longest_run = current_run = 0
    for char in detail:
        current_run = current_run + 1 if char == "`" else 0
        longest_run = max(longest_run, current_run)
    fence = "`" * (longest_run + 1)
    # A code span whose content starts or ends with a backtick needs one
    # space of padding on both sides to keep it apart from the fence.
    touches_fence = detail.startswith("`") or detail.endswith("`")
    pad = " " if touches_fence else ""
    return f"{header}\n>\n> Details: {fence}{pad}{detail}{pad}{fence}"


def _safe_load_split() -> tuple[pd.DataFrame, pd.DataFrame, str | None]:
    """Load both tiers, turning a Hub failure into empty frames + a message.

    The reader (:func:`load_leaderboard_split`) deliberately *raises*
    on any read failure (no silent fallback). The Space, however, must
    stay up and loudly surface the failure rather than crash, so this
    wrapper converts :class:`LeaderboardDataError` into empty,
    correctly-shaped DataFrames plus an error string the caller renders
    in the banner / a toast.

    Returns:
        ``(validated, unvalidated, error)``. ``error`` is ``None`` on
        success and a **non-empty** string on failure: callers test it
        both with ``if error`` and ``error is not None``, so an exception
        with an empty message falls back to its class name rather than
        producing ``""`` (which those two checks would disagree on).
    """
    try:
        validated, unvalidated = load_leaderboard_split()
    except LeaderboardDataError as e:
        logger.exception("Leaderboard data load failed")
        return (
            pd.DataFrame(columns=VALIDATED_LEADERBOARD_COLS),
            pd.DataFrame(columns=LEADERBOARD_COLS),
            str(e) or type(e).__name__,
        )
    return validated, unvalidated, None


def _safe_load_admin() -> tuple[pd.DataFrame, str | None]:
    """Admin-table counterpart to :func:`_safe_load_split`.

    Same no-crash contract: a Hub read failure yields an empty,
    correctly-shaped admin frame plus the error string instead of
    propagating the exception (which would take the whole Space down at
    boot, since the admin table loads at module-construction time).

    Returns:
        ``(admin_df, error)``. ``error`` is ``None`` on success and a
        **non-empty** string on failure; an exception with an empty
        message falls back to its class name so callers' ``if error``
        checks cannot mistake a failure for success.
    """
    try:
        admin_df = load_admin_table()
    except LeaderboardDataError as e:
        logger.exception("Admin table load failed")
        return pd.DataFrame(columns=ADMIN_COLUMNS), str(e) or type(e).__name__
    return admin_df, None


def _refresh_leaderboard_with_toast():
    """Handle the manual Refresh button: reload, toast, and sync the banner.

    Always announces the outcome: a ``gr.Info`` toast when the reload
    worked, a ``gr.Warning`` toast carrying the error text when it did
    not.

    Returns:
        ``(validated, unvalidated, banner)`` where ``banner`` is a
        ``gr.Markdown`` update that shows the data-unavailable banner on
        failure and hides it on success.
    """
    validated, unvalidated, error = _safe_load_split()
    if not error:
        gr.Info("Leaderboard refreshed.")
    else:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner


def _auto_refresh_leaderboard():
    """Handle a timer tick: reload the tables and sync the banner.

    Same outputs as :func:`_refresh_leaderboard_with_toast`, but with no
    toast on success (a toast every 10s would be noise). A failed read
    still raises a ``gr.Warning`` toast and shows the banner, so a
    degraded Hub read cannot silently leave the tables blank.

    Returns:
        ``(validated, unvalidated, banner)`` where ``banner`` is a
        ``gr.Markdown`` update (visible only when the read failed).
    """
    validated, unvalidated, error = _safe_load_split()
    if error:
        gr.Warning(f"Leaderboard data unavailable: {error}")
    banner = gr.Markdown(
        value=_data_error_banner_md(error),
        visible=error is not None,
    )
    return validated, unvalidated, banner


def _enable_submit_when_logged_in(
    profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Enable the Submit button only for a logged-in visitor.

    Runs once per page load via ``blocks.load``. Gradio injects
    ``gr.OAuthProfile`` automatically; it is ``None`` when the visitor
    has not logged in through the LoginButton. This is the visible
    counterpart of the server-side gate in :func:`submit.handle_submit`,
    which still raises ``gr.Error`` defensively if it is ever called
    without a profile.

    Returns:
        A ``gr.Button`` update whose ``interactive`` flag reflects the
        login state.
    """
    logged_in = profile is not None
    return gr.Button(interactive=logged_in)


def _selected_ids(table_df: pd.DataFrame | None) -> list[str]:
    """Submission ids of the rows whose ``select`` checkbox is ticked.

    The result drives destructive admin actions (delete, rescore), so the
    tick test is deliberately strict about missing data: a ``select``
    cell that is ``None`` / ``NaN`` / ``pd.NA`` counts as *not* ticked.
    (Plain ``bool()`` would treat ``NaN`` as ticked and raise on
    ``pd.NA``.) Rows whose ``submission_id`` is missing or empty are
    dropped rather than surfacing as a bogus ``"nan"`` id.

    Args:
        table_df: The admin table as round-tripped through the UI, or
            ``None``.

    Returns:
        The ticked rows' submission ids as strings, in table order.
        Empty when the frame is ``None``, has no rows, or lacks either
        the select column or the ``submission_id`` column.
    """
    if (
        table_df is None
        or len(table_df) == 0
        or ADMIN_SELECT_COL not in table_df.columns
        or "submission_id" not in table_df.columns
    ):
        return []
    ticked = (
        table_df[ADMIN_SELECT_COL]
        .map(lambda cell: bool(cell) if pd.notna(cell) else False)
        .astype(bool)
    )
    candidate_ids = table_df.loc[ticked, "submission_id"].tolist()
    return [str(s) for s in candidate_ids if pd.notna(s) and s]


def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
    """Markdown count line shown under the admin table.

    Recomputed as boxes are ticked: reports how many rows are currently
    selected, or an italic placeholder when none are.
    """
    count = len(_selected_ids(table_df))
    if not count:
        return "_No rows selected._"
    return f"**{count}** row(s) selected."


def _gate_admin_controls(
    profile: gr.OAuthProfile | None,
) -> tuple[
    gr.Column, gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox,
    gr.Button, gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
]:
    """Show the admin panel only to a logged-in member of the admin set.

    Runs on every page load and again on LoginButton auth events. The
    whole panel (table plus every control) lives in one column that stays
    hidden unless ``is_admin(profile)`` holds, so logged-out visitors and
    non-admins only ever see the login/logout button and the status
    line. For admins the column is shown, the controls are enabled, and
    the table is reloaded from live Hub data (a failed load raises a
    ``gr.Warning`` toast). Non-admins are handed an empty table, and
    every handler still re-checks ``is_admin`` server-side.

    The confirmation-armed buttons (delete, stop-and-delete,
    rescore-selected, rescore-all) always come back disarmed, and their
    confirm checkbox / phrase box is reset, so they only enable after a
    fresh confirmation.

    Returns:
        Thirteen positional updates matching the annotated return type;
        the order must stay in lockstep with the ``outputs=`` list where
        this handler is wired (outside this function). The last element
        is the status line text.
    """
    admin = is_admin(profile)
    if not admin:
        admin_df = _empty_admin_table()
    else:
        admin_df, load_error = _safe_load_admin()
        if load_error:
            gr.Warning(f"Admin table unavailable: {load_error}")

    if profile is None:
        status = "Log in with an admin account to access the controls."
    elif not admin:
        status = (
            f"Signed in as `{profile.username}`, which is not in the admin "
            "set. You can log out with the button above."
        )
    else:
        status = f"Signed in as `{profile.username}`. Admin controls enabled."

    return (
        gr.Column(visible=admin),
        gr.Dataframe(value=admin_df, interactive=admin),
        gr.Radio(interactive=admin),
        gr.Button(interactive=admin),
        gr.Button(interactive=admin),
        # Confirm checkbox reset; the two buttons after it load disarmed.
        gr.Checkbox(interactive=admin, value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
        # Second confirm checkbox reset; its button loads disarmed.
        gr.Checkbox(interactive=admin, value=False),
        gr.Button(interactive=False),
        # Phrase box cleared; its button loads disarmed.
        gr.Textbox(interactive=admin, value=""),
        gr.Button(interactive=False),
        status,
    )


def _arm_delete(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> tuple[gr.Button, gr.Button]:
    """Arm both destructive buttons when an admin ticks the confirm box.

    Plain delete and stop-and-delete share one confirm checkbox, so a
    deliberate tick is needed before either can fire. The admin check is
    only evaluated once the box is ticked.

    Returns:
        Two ``gr.Button`` updates carrying the same ``interactive`` flag.
    """
    armed = is_admin(profile) if confirm else False
    delete_button = gr.Button(interactive=armed)
    stop_delete_button = gr.Button(interactive=armed)
    return delete_button, stop_delete_button


def _empty_admin_table() -> pd.DataFrame:
    """Build a header-only admin frame -- what non-admins receive.

    The admin panel is hidden from non-admins, but the table refreshers
    still run server-side. Returning a frame with the admin columns and
    zero rows guarantees no submission data is ever streamed into a
    non-admin's (hidden) table.
    """
    headers = list(ADMIN_COLUMNS)
    return pd.DataFrame(columns=headers)


def _refresh_admin_table(profile: gr.OAuthProfile | None) -> pd.DataFrame:
    """Handle the admin Refresh button: reload the table, toast on failure.

    Goes through the no-crash :func:`_safe_load_admin`, so a Hub read
    failure shows up as a ``gr.Warning`` toast plus an empty table
    instead of an uncaught exception. Non-admins always get an empty
    frame, so a tampered client cannot pull the table out from behind
    the hidden panel.
    """
    if is_admin(profile):
        admin_df, error = _safe_load_admin()
        if error:
            gr.Warning(f"Admin table unavailable: {error}")
        return admin_df
    return _empty_admin_table()


def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry the maintainer's ticks over onto a freshly loaded admin frame.

    A reloaded frame comes back all-unchecked. This rewrites its
    ``select`` column so that exactly the rows whose ``submission_id``
    (compared as a string) is in ``selected`` are ticked, which keeps a
    background refresh from wiping an in-progress selection. Ids that no
    longer exist (e.g. a row deleted out from under the table) simply
    drop out.

    The frame is modified in place and also returned. It is returned
    untouched when ``selected`` is empty or either column is missing.
    """
    if not selected:
        return fresh
    columns = fresh.columns
    if ADMIN_SELECT_COL in columns and "submission_id" in columns:
        ids_as_text = fresh["submission_id"].astype(str)
        fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh


def _auto_refresh_admin_table(
    current_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> pd.DataFrame:
    """Handle a timer tick: reload the admin table, keeping ticked rows.

    Keeps the admin table on the same refresh cadence as the leaderboard
    tables, so a pending row submitted after the tab loaded appears
    without a manual Refresh. Differences from the leaderboard handler:

    * it never toasts, not even on failure;
    * on a Hub read failure it returns ``current_df`` unchanged (or the
      empty frame from the failed load when ``current_df`` is ``None``),
      so a transient blip never blanks the table or drops the selection;
    * on success the rows ticked in ``current_df`` are re-ticked on the
      fresh frame.

    Non-admins get an empty frame so the (hidden) table is never fed data.
    """
    if not is_admin(profile):
        return _empty_admin_table()
    fresh, error = _safe_load_admin()
    if not error:
        return _reapply_selection(fresh, set(_selected_ids(current_df)))
    if current_df is None:
        return fresh
    return current_df


def _admin_promote(
    table_df: pd.DataFrame | None,
    method: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Promote the ticked rows, then refresh admin, leaderboard, and gallery.

    ``is_admin`` is re-checked server-side so a tampered client that
    re-enables the button still cannot write.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``, all
        reloaded after the write. Errors from those reloads are not
        surfaced here (the frames are simply empty).

    Raises:
        gr.Error: caller is not an admin, no row is ticked, no
            ``method`` is chosen, or ``promote_rows`` raised
            ``LookupError`` / ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    chosen = _selected_ids(table_df)
    if not chosen:
        raise gr.Error("Tick at least one row first.")
    if not method:
        raise gr.Error("Pick a validation_method first.")
    try:
        promote_rows(chosen, method)
    except (LookupError, ValueError) as exc:
        raise gr.Error(str(exc))
    gr.Info(f"Promoted {len(chosen)} row(s) to validated ({method}).")
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    gallery_html = _gallery_iframe_html()
    return refreshed_admin, validated, unvalidated, gallery_html


def _admin_demote(
    table_df: pd.DataFrame | None,
    profile: gr.OAuthProfile | None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Demote the ticked rows, then refresh admin, leaderboard, and gallery.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html)``, all
        reloaded after the write. Errors from those reloads are not
        surfaced here.

    Raises:
        gr.Error: caller is not an admin, no row is ticked, or
            ``demote_rows`` raised ``LookupError`` / ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    chosen = _selected_ids(table_df)
    if not chosen:
        raise gr.Error("Tick at least one row first.")
    try:
        demote_rows(chosen)
    except (LookupError, ValueError) as exc:
        raise gr.Error(str(exc))
    gr.Info(f"Demoted {len(chosen)} row(s) to unvalidated.")
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    gallery_html = _gallery_iframe_html()
    return refreshed_admin, validated, unvalidated, gallery_html


def _admin_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Delete the ticked rows, refresh every view, then disarm.

    On the way out the confirm checkbox is cleared and both destructive
    buttons are disabled again, so the next deletion needs a fresh,
    deliberate confirm.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_box,
        button, button)``; the last three are the disarm updates.

    Raises:
        gr.Error: caller is not an admin, the confirm box is not ticked,
            no row is ticked, or ``delete_rows`` raised ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    chosen = _selected_ids(table_df)
    if not chosen:
        raise gr.Error("Tick at least one row first.")
    try:
        delete_rows(chosen)
    except ValueError as exc:
        raise gr.Error(str(exc))
    gr.Info(f"Deleted {len(chosen)} submission(s).")
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    return (
        refreshed_admin,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop the ticked rows' running eval job(s), delete them, then disarm.

    Gating and disarm behaviour match :func:`_admin_delete`. The one
    difference is the call to :func:`admin.stop_and_delete_rows`, which
    best-effort cancels the submissions' in-flight HF Jobs before
    deleting. Intended for pending rows whose GPU eval is still running.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_box,
        button, button)``; the last three are the disarm updates.

    Raises:
        gr.Error: caller is not an admin, the confirm box is not ticked,
            no row is ticked, or ``stop_and_delete_rows`` raised
            ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    chosen = _selected_ids(table_df)
    if not chosen:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(chosen)
    except ValueError as exc:
        raise gr.Error(str(exc))
    gr.Info(f"Stopped + deleted {len(chosen)} submission(s).")
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    return (
        refreshed_admin,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )


# Exact phrase an admin must type to arm the board-wide rescore. A
# free-text match (not a checkbox) is the deliberate "are you sure"
# friction: it can't be tripped by a stray click and forces the admin
# to consciously type the words before the heavy, score-invalidating
# action arms. The comparison is case-sensitive; the handlers strip
# surrounding whitespace from the typed text before comparing.
RESCORE_ALL_PHRASE = "RESCORE ALL"


def _arm_rescore_selected(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-selected button when an admin ticks its confirm box.

    The admin check is only evaluated once the box is ticked.
    """
    armed = is_admin(profile) if confirm else False
    return gr.Button(interactive=armed)


def _arm_rescore_all(
    phrase: str | None, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Arm the rescore-all button only for an admin who typed the phrase.

    The typed text must equal ``RESCORE_ALL_PHRASE`` after surrounding
    whitespace is stripped; ``None`` counts as empty text. The admin
    check is only evaluated when the phrase matches.
    """
    typed = phrase.strip() if phrase else ""
    if typed != RESCORE_ALL_PHRASE:
        return gr.Button(interactive=False)
    return gr.Button(interactive=is_admin(profile))


def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
    """Compose the toast text that summarises a rescore dispatch.

    Args:
        dispatched: Number of submissions sent for re-evaluation.
        skipped: Rows that could not be rescored; only the count is
            reported.

    Returns:
        One sentence about the dispatched rows, followed by a second
        sentence about skipped rows when ``skipped`` is non-empty.
    """
    sentences = [
        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
        "re-evaluating in the background. The leaderboard repopulates as "
        "each finishes."
    ]
    if skipped:
        sentences.append(
            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
            "rows can't be rescored)."
        )
    return "".join(sentences)


def _admin_rescore_selected(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
]:
    """Re-evaluate the ticked rows, refresh the views, then disarm.

    Uses the same gating as the destructive handlers: a server-side
    ``is_admin`` re-check, an explicit confirm tick, and a non-empty
    selection. The confirm box is cleared and the button disabled on the
    way out, so the next rescore needs a fresh, deliberate confirm.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, confirm_box,
        button)``; the last two are the disarm updates.

    Raises:
        gr.Error: caller is not an admin, the confirm box is not ticked,
            no row is ticked, or ``rescore_rows`` raised
            ``LookupError`` / ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable rescore.")
    chosen = _selected_ids(table_df)
    if not chosen:
        raise gr.Error("Tick at least one row first.")
    try:
        dispatched, skipped = rescore_rows(chosen)
    except (LookupError, ValueError) as exc:
        raise gr.Error(str(exc))
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    return (
        refreshed_admin,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
    )


def _admin_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
]:
    """Re-evaluate every rescoreable row, refresh the views, then disarm.

    This is the heavy, board-wide action. Both ``is_admin`` and the exact
    confirmation phrase are re-checked server-side, so a tampered client
    that re-enables the button still cannot fire it. Afterwards the
    phrase box is cleared and the button disabled.

    Returns:
        ``(admin_df, validated, unvalidated, gallery_html, phrase_box,
        button)``; the last two are the disarm updates.

    Raises:
        gr.Error: caller is not an admin, the stripped phrase does not
            equal ``RESCORE_ALL_PHRASE``, or ``rescore_all`` raised
            ``ValueError``.
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    typed = (phrase or "").strip()
    if typed != RESCORE_ALL_PHRASE:
        raise gr.Error(
            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
        )
    try:
        dispatched, skipped = rescore_all()
    except ValueError as exc:
        raise gr.Error(str(exc))
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated = _safe_load_split()[:2]
    refreshed_admin = _safe_load_admin()[0]
    return (
        refreshed_admin,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Textbox(value=""),
        gr.Button(interactive=False),
    )


@lru_cache(maxsize=128)
def _fetch_report_html_or_raise(submission_id: str) -> bytes:
    """Download ``reports/<id>.html`` and return its bytes, raising on failure.

    Kept separate from :func:`_fetch_report_html` so that ``lru_cache``
    only ever stores successful downloads: a call that raises is not
    cached, so the next request retries the Hub.
    """
    local_path = hf_hub_download(
        repo_id=HF_SUBMISSIONS_REPO,
        filename=f"reports/{submission_id}.html",
        repo_type="dataset",
    )
    return Path(local_path).read_bytes()


def _fetch_report_html(submission_id: str) -> bytes | None:
    """Pull ``reports/<id>.html`` off the submissions dataset.

    Successful downloads are cached in-process (up to 128 reports) so
    repeat clicks on the same row don't hit the Hub. Failures are **not**
    cached: a transient Hub error, or a report requested before it has
    been uploaded, must not pin that submission to a 404 for the life of
    the process.

    NOTE(review): a cached report is served until it is evicted or the
    cache is cleared; if a report file can be rewritten under the same
    id, the cached copy goes stale -- confirm whether that matters.

    Args:
        submission_id: Id used to build the ``reports/<id>.html`` path.

    Returns:
        The report bytes, or ``None`` on any failure so the caller can
        serve a clean 404 rather than leaking a stack trace.
    """
    try:
        return _fetch_report_html_or_raise(submission_id)
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch report for %s (%s: %s)",
            submission_id, type(e).__name__, e,
        )
        return None


# Keep the lru_cache management hooks reachable under the original name
# for any caller that clears or inspects the report cache.
_fetch_report_html.cache_clear = _fetch_report_html_or_raise.cache_clear
_fetch_report_html.cache_info = _fetch_report_html_or_raise.cache_info


def serve_report(submission_id: str) -> Response:
    """Proxy a per-submission HTML report through the Space.

    HF Hub serves dataset HTML under ``/resolve/`` with
    ``Content-Type: text/plain`` (security: dataset files can't host
    live HTML), so a direct dataset link shows source instead of
    rendering. This route lives on the Space, which can legitimately
    serve ``text/html``, and re-sends the file's bytes with the right
    content type.

    Returns:
        The report as ``text/html; charset=utf-8``, or a small HTML 404
        page when the report could not be fetched.
    """
    body = _fetch_report_html(submission_id)
    if body is not None:
        return Response(content=body, media_type="text/html; charset=utf-8")
    return HTMLResponse(
        content="<h1>Report not found</h1>",
        status_code=404,
    )


def serve_metrics_page() -> Response:
    """Serve the static metrics explainer at ``/metrics``.

    The route shares an origin with the report proxy
    (``/reports/<id>.html``), so a hosted report's headline pills can
    deep-link to ``/metrics#<anchor>`` and land on the matching section.
    The "Metrics" Gradio tab embeds this same route in an iframe.
    """
    page = build_metrics_page()
    return HTMLResponse(content=page)


# Illustration assets the metrics page embeds (e.g. the interface-match
# mating-group WebP). Vendored into the Space repo under `assets/metrics/`
# and served here so the page renders self-contained, with no dependency
# on the code repo's raw GitHub URLs staying reachable.
# Resolved relative to this file, so it does not depend on the process's
# working directory.
METRICS_ASSETS_DIR = Path(__file__).parent / "assets" / "metrics"


def serve_metrics_asset(name: str) -> Response:
    """Serve a bundled metrics illustration from ``assets/metrics/``.

    The namespace is flat: any ``name`` containing ``/`` or ``..`` is
    rejected with a 404 as a traversal guard, as is a name that does not
    resolve to a regular file in ``METRICS_ASSETS_DIR``. The content type
    is guessed from the file name, falling back to
    ``application/octet-stream``. Responses carry the long-lived
    ``RENDER_CACHE_CONTROL`` header, since these assets are static and
    versioned with the repo.
    """
    looks_like_traversal = "/" in name or ".." in name
    if looks_like_traversal:
        return Response(status_code=404)
    asset = METRICS_ASSETS_DIR / name
    if not asset.is_file():
        return Response(status_code=404)
    guessed_type, _encoding = mimetypes.guess_type(name)
    return Response(
        content=asset.read_bytes(),
        media_type=guessed_type or "application/octet-stream",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _fetch_gt_render(fixture: str) -> bytes | None:
    """Pull a fixture's ground-truth turntable WebP from the private GT dataset.

    The path inside the GT repo is ``<fixture>/renders/rotating.webp``.
    GT renders are a property of the data revision, not of any
    submission, so they are served straight from the GT repo rather than
    duplicated per submission. The result is deliberately not memoized
    in-process, because GT renders can be added or updated on a data
    revision bump; ``hf_hub_download`` handles the per-revision disk
    cache. Reading needs the Space ``HF_TOKEN``'s read scope on the
    private repo.

    Returns:
        The WebP bytes, or ``None`` on any failure (logged as a warning)
        so the caller can answer with a 404.
    """
    remote_name = f"{fixture}/renders/rotating.webp"
    try:
        cached_path = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=remote_name,
            repo_type="dataset",
        )
        payload = Path(cached_path).read_bytes()
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT render for %s (%s: %s)",
            fixture, type(e).__name__, e,
        )
        return None
    return payload


# Long-lived immutable caching: a (submission, fixture) render never
# changes (fixed camera + lighting; re-renders would be a new artifact),
# so the browser/CDN can keep it forever. This is what makes fixture
# swaps and repeat visits free: only the ~33 on-screen turntables are
# fetched on first paint, and everything after that is a cache hit.
# ``max-age=31536000`` is one year, in seconds.
# NOTE(review): within this file the header is also attached to the GT
# asset routes and the bundled metrics illustrations, whose URLs carry
# no revision/version component. With ``immutable`` a browser will not
# revalidate those for a year -- confirm that is acceptable if a GT
# render or an illustration is ever replaced under the same name.
RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"


def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve a submission's plain turntable to a public render-bucket URL.

    The eval job uploads ``renders/<id>/<fixture>/rotating.webp`` to the
    public bucket, so the browser fetches it straight from object storage
    (anonymous, no Space proxy hop). The gallery only asks for ``valid``
    fixtures; a missing upload 404s and degrades to the dashed cell via
    ``<img onerror>``.
    """
    turntable_name = "rotating.webp"
    return render_public_url(submission_id, fixture, turntable_name)


def _render_diff_proxy_url(submission_id: str, fixture: str) -> str | None:
    """Resolve an editing fixture's edit-diff turntable (public bucket URL).

    Used by the gallery grid for editing fixtures (see
    ``gallery.build_gallery_payload``). A miss -- a non-editing fixture,
    or an edit that never rendered a diff -- 404s and degrades to the
    dashed cell; there is no fallback to the plain turntable.
    """
    diff_name = "edit_diff.webp"
    return render_public_url(submission_id, fixture, diff_name)


def _gt_proxy_url(fixture: str) -> str | None:
    """Resolve a fixture's ground-truth WebP to its Space proxy URL.

    GT renders stay in the **private** GT dataset, so they cannot be
    public bucket URLs; they are re-sent through the Space proxy, which
    holds the read token. The fixture name is inserted as-is (no URL
    encoding).

    Returns:
        The site-relative path ``/gt-render/<fixture>.webp``.
    """
    return "/gt-render/{}.webp".format(fixture)


def _gt_diff_proxy_url(fixture: str) -> str | None:
    """Resolve an editing fixture's GT "answer key" edit-diff WebP.

    The one-time GT generation (``tools/generate_gt_edit_diff.py``)
    writes ``<fixture>/renders/edit_diff_gt.webp`` into the private GT
    dataset, so the file rides the existing generic GT proxy
    (``serve_gt_file``) instead of needing a route of its own. The
    gallery uses it for the ground-truth row on editing fixtures; a
    missing file 404s and degrades to the dashed cell.

    Returns:
        The site-relative path ``/gt/<fixture>/renders/edit_diff_gt.webp``.
    """
    return "/gt/{}/renders/edit_diff_gt.webp".format(fixture)


def serve_gt_render(fixture: str) -> Response:
    """Stream a fixture's ground-truth render WebP with long-lived caching.

    Path-traversal-guarded in the same way as the generic GT file route:
    a ``fixture`` containing ``..`` is rejected with a 404 before any
    request is made against the private GT dataset, so the Space's read
    token is never used for a path outside ``<fixture>/renders/``.

    Returns:
        The WebP as ``image/webp`` with the ``RENDER_CACHE_CONTROL``
        header, or an empty 404 when the fixture name is rejected or the
        render could not be fetched.
    """
    if ".." in fixture:
        return Response(status_code=404)
    webp = _fetch_gt_render(fixture)
    if webp is None:
        return Response(status_code=404)
    return Response(
        content=webp,
        media_type="image/webp",
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _fetch_gt_file(fixture: str, relpath: str) -> bytes | None:
    """Pull an arbitrary GT asset (``<fixture>/<relpath>``) from the GT dataset.

    Backs the hosted report's ground-truth column: the per-view PNGs
    (``renders/<view>.png``) and the ``ground_truth.pdf``. The GT dataset
    is **private**, so these are proxied through the Space (which holds
    the read token) rather than linked directly. ``hf_hub_download``
    provides the per-revision disk cache.

    Returns:
        The file's bytes, or ``None`` on any failure (logged as a
        warning); the report then hides the broken tile via the
        browser's normal missing-image handling.
    """
    remote_name = f"{fixture}/{relpath}"
    try:
        cached_path = hf_hub_download(
            repo_id=HF_DATA_GT_REPO,
            filename=remote_name,
            repo_type="dataset",
        )
        payload = Path(cached_path).read_bytes()
    except Exception as e:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch GT file %s/%s (%s: %s)",
            fixture, relpath, type(e).__name__, e,
        )
        return None
    return payload


def serve_gt_file(fixture: str, relpath: str) -> Response:
    """Serve a GT asset (view PNG / PDF) with long-lived immutable caching.

    The hosted report references ``/gt/<fixture>/<relpath>`` and the browser
    fetches it lazily. The bytes belong to the data revision, not to any
    submission, so the response reuses the immutable ``Cache-Control`` of
    the render/input proxies.

    Any ``..`` in either path component is treated as a traversal attempt
    and answered with 404, as is a failed fetch.
    """
    # Traversal guard: refuse before touching the Hub at all.
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)

    payload = _fetch_gt_file(fixture, relpath)
    if payload is None:
        return Response(status_code=404)

    # Content type comes from the file extension; unknown -> generic binary.
    guessed, _encoding = mimetypes.guess_type(relpath)
    content_type = guessed if guessed else "application/octet-stream"
    return Response(
        content=payload,
        media_type=content_type,
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _gallery_iframe_html() -> str:
    """Render the gallery and wrap it in a self-contained ``srcdoc`` iframe.

    The live rows are loaded and handed to ``render_gallery_page`` together
    with the URL resolvers (turntables are referenced as cached ``/render`` /
    ``/gt-render`` proxy URLs that the browser lazy-loads). The resulting
    document is HTML-escaped and inlined as the iframe's ``srcdoc`` so it
    keeps its own style context (no Gradio CSS collision). If the row load
    raises ``LeaderboardDataError`` the gallery is rendered empty instead of
    crashing the tab.
    """
    rows = []
    try:
        rows = _load_rows_from_hub()
    except LeaderboardDataError:
        logger.exception("Gallery row load failed; rendering empty gallery")

    # Resolver order is positional: candidate render, GT render, candidate
    # edit-diff, GT edit-diff.
    resolvers = (
        _render_proxy_url,
        _gt_proxy_url,
        _render_diff_proxy_url,
        _gt_diff_proxy_url,
    )
    page = render_gallery_page(rows, *resolvers)
    srcdoc = html.escape(page, quote=True)

    # The gallery's own JS (`fitIframe`) makes this iframe the single
    # scroller: it shrinks to the content when there are few rows and
    # otherwise fills down to the bottom of the viewport, so only the
    # iframe body scrolls (sticky header + ground-truth row stay locked) and
    # the outer Gradio page does not scroll as well. The inline `height` is
    # only the pre-script fallback that JS overrides, hence no `max-height`
    # (it would clamp the measured fill height).
    attributes = " ".join(
        (
            f'srcdoc="{srcdoc}"',
            'style="width:100%; height:80vh; border:0; display:block;"',
            'title="CADGenBench gallery"',
        )
    )
    return f"<iframe {attributes}></iframe>"


def _fetch_task_input(fixture: str, relpath: str) -> bytes | None:
    """Read ``<fixture>/<relpath>`` from the fixture inputs dataset.

    Backs the Task-browser tab's drawings and starting-shape renders. The
    inputs dataset is private, so the Space (which holds the read token)
    proxies the files instead of linking to them, in the same way as
    :func:`_fetch_render`. Deliberately not memoized for the same reason:
    inputs may be added or updated when the data revision is bumped, and
    ``hf_hub_download`` already caches on disk per revision.

    Returns the file's bytes, or ``None`` on any failure (the page then
    hides the broken tile).
    """
    try:
        cached_path = hf_hub_download(
            repo_id=HF_DATA_REPO,
            filename=f"{fixture}/{relpath}",
            repo_type="dataset",
        )
        blob = Path(cached_path).read_bytes()
    except Exception as err:  # noqa: BLE001 - any Hub failure -> 404
        logger.warning(
            "Failed to fetch task input %s/%s (%s: %s)",
            fixture, relpath, type(err).__name__, err,
        )
        return None
    return blob


def _task_input_url(fixture: str, relpath: str) -> str:
    """Build the Space proxy URL for one task input asset.

    Only the route string is produced, with no bytes fetched, so the browser
    can lazy-fetch just the images of the task currently on screen. The
    path is absolute, so it resolves against the Space origin even from
    inside the iframe ``srcdoc``.
    """
    route_prefix = "/task-input"
    return f"{route_prefix}/{fixture}/{relpath}"


def serve_task_input(fixture: str, relpath: str) -> Response:
    """Serve a fixture input asset with long-lived immutable caching.

    The task browser references ``/task-input/<fixture>/<relpath>`` and the
    browser fetches it lazily. The dataset bytes are re-streamed by the
    Space (which holds the read token) under the same immutable
    ``Cache-Control`` as the render proxies, so the CDN and browser cache
    them hard.

    Any ``..`` in either path component is treated as a traversal attempt
    and answered with 404, as is a failed fetch.
    """
    # Traversal guard: refuse before touching the Hub at all.
    if any(".." in part for part in (fixture, relpath)):
        return Response(status_code=404)

    payload = _fetch_task_input(fixture, relpath)
    if payload is None:
        return Response(status_code=404)

    # Content type comes from the file extension; unknown -> generic binary.
    guessed, _encoding = mimetypes.guess_type(relpath)
    content_type = guessed if guessed else "application/octet-stream"
    return Response(
        content=payload,
        media_type=content_type,
        headers={"Cache-Control": RENDER_CACHE_CONTROL},
    )


def _tasks_iframe_html() -> str:
    """Render the Task browser and wrap it in a self-contained ``srcdoc`` iframe.

    Only the ``<fixture>/description.yaml`` files are snapshotted from the
    inputs dataset (cheap: the drawings/renders load lazily through the
    ``/task-input`` proxy). They are turned into task cards, and the rendered
    page is inlined into an iframe so it keeps its own style context (no
    Gradio CSS collision). Any failure while snapshotting or parsing is
    logged and degrades to an empty browser rather than crashing the tab.
    """
    tasks = []
    try:
        snapshot_dir = snapshot_download(
            repo_id=HF_DATA_REPO,
            repo_type="dataset",
            allow_patterns=["*/description.yaml"],
        )
        tasks = load_tasks_from_dir(Path(snapshot_dir))
    except Exception:  # noqa: BLE001 - degrade to empty browser, never crash
        logger.exception("Task load failed; rendering empty task browser")

    page = render_tasks_page(tasks, _task_input_url)
    srcdoc = html.escape(page, quote=True)
    attributes = " ".join(
        (
            f'srcdoc="{srcdoc}"',
            'style="width:100%; height:90vh; border:0; display:block;"',
            'title="CADGenBench tasks"',
        )
    )
    return f"<iframe {attributes}></iframe>"


@lru_cache(maxsize=1)
def _logo_data_uri() -> str:
    """Encode the header logo as a base64 ``data:`` URI.

    The image is inlined instead of being served as a static file, so the
    ``<img>`` needs no Gradio/FastAPI static-path allowlisting and behaves
    the same on a local random port as on huggingface.co. The PNG is kept in
    the repo at ``assets/logo.png`` (reviewable as a real binary) and is
    located relative to this module, which makes the Docker image's working
    directory irrelevant. The result is cached because the bytes never
    change within a process.
    """
    module_dir = Path(__file__).parent
    png_bytes = module_dir.joinpath("assets", "logo.png").read_bytes()
    encoded = base64.b64encode(png_bytes).decode("ascii")
    return "data:image/png;base64," + encoded


# Cosmetic-only CSS overrides whose purpose is to win back vertical space so
# more gallery rows fit in one viewport. The rules are concatenated without
# separators into a single stylesheet string passed to `gr.Blocks(css=...)`.
_APP_CSS = "".join(
    [
        # Hide the Gradio footer ("Built with Gradio - Settings").
        "footer{display:none !important;}",
        # Tighten the page's outer padding.
        ".gradio-container{padding-top:4px !important; padding-bottom:0 !important;}",
        # Collapse the title block's own box so the wordmark sits right
        # above the leaderboard instead of floating with a gap.
        "#cgb-title{margin:0 !important;padding:0 !important;min-width:0 !important;}",
        # Height-constrained logo (width auto-scales) so it occupies a
        # compact band near the old `### ` title's footprint.
        "#cgb-title .cgb-logo{height:46px;width:auto;display:block;margin:0;}",
        # Negative top margin pulls the tab nav up snug against the logo.
        ".gradio-container .tabs{margin-top:-6px !important;}",
        # The wordmark PNG is black ink on a transparent background and
        # would vanish on a dark theme, so invert it to white ink. Gradio
        # toggles the `.dark` class on the container ...
        ".dark #cgb-title .cgb-logo{filter:invert(1);}",
        # ... and the media query covers system-driven dark mode too.
        "@media (prefers-color-scheme: dark){",
        "#cgb-title .cgb-logo{filter:invert(1);}}",
    ]
)

# Root Gradio UI. Everything declared inside this context becomes part of
# `blocks`, which is later mounted onto the FastAPI app. `_APP_CSS` carries
# the cosmetic overrides; its selectors target the `cgb-title` element id and
# the `cgb-logo` class set on the title image below.
with gr.Blocks(
    title="CADGenBench Leaderboard", theme=gr.themes.Soft(), css=_APP_CSS,
) as blocks:
    # Single compact title line (keeps vertical space for the gallery rows).
    # The wordmark logo replaces the old `### CADGenBench Leaderboard`
    # markdown; alt text preserves the name for screen readers / when
    # images are blocked.
    # The image source is the inlined base64 data URI from `_logo_data_uri()`,
    # evaluated once here while the UI is being built.
    gr.HTML(
        f'<img class="cgb-logo" src="{_logo_data_uri()}" '
        'alt="CADGenBench Leaderboard">',
        elem_id="cgb-title",
    )

    with gr.Tab("Leaderboard"):
        # Visual-first leaderboard. The bespoke surface (sticky GT row,
        # fixture picker, turntable grid, compare modal) is a
        # self-contained HTML doc inlined into an iframe `srcdoc` so it
        # keeps its own style context. Thumbnails are lazy-loaded from
        # the cached `/render` / `/gt-render` proxy routes (requires the
        # Space to be public). Built at boot, rebuilt on page load, and
        # refreshed after admin actions.
        # `gallery_html` is also listed as an output of the admin handlers
        # and of a `blocks.load` hook elsewhere in this `with` block, so the
        # name must stay stable.
        gallery_html = gr.HTML(value=_gallery_iframe_html())
        # Manual rebuild: re-runs the same builder (no inputs) and swaps the
        # returned iframe markup into `gallery_html`.
        gallery_refresh_btn = gr.Button("Refresh gallery", size="sm")
        gallery_refresh_btn.click(
            fn=_gallery_iframe_html, outputs=gallery_html,
        )

    with gr.Tab("Detailed View"):
        # Load both tiers once at boot. `_safe_load_split` keeps a Hub
        # read failure from crashing the Space: on failure the frames
        # come up empty and `initial_error` carries the message the
        # banner renders.
        initial_validated, initial_unvalidated, initial_error = _safe_load_split()

        # Loud, persistent banner shown only when the live results
        # can't be read from the Hub (e.g. an under-scoped Space
        # HF_TOKEN). Kept in sync by the refresh / Timer handlers. The
        # leaderboard never falls back to stale/bundled data, so this
        # banner is the signal that empty tables are a read failure,
        # not a genuinely empty leaderboard.
        # Initially visible only if the boot-time load reported an error.
        data_error_banner = gr.Markdown(
            value=_data_error_banner_md(initial_error),
            visible=initial_error is not None,
        )

        # Collapsed accordions above the tables. Validation guidelines
        # gives the short two-tier story + link to the full policy
        # doc; Citation carries the verbatim BibTeX entry. Both start
        # closed so the leaderboard itself stays above the fold.
        with gr.Accordion("Validation guidelines", open=False):
            gr.Markdown(VALIDATION_GUIDELINES_MD)
        with gr.Accordion("Citation", open=False):
            # language=None -> plain monospaced render (gr.Code doesn't
            # ship a BibTeX highlighter); show_line_numbers off because
            # the entry is meant to be copy-pasted, not annotated.
            gr.Code(
                value=CITATION_BIBTEX,
                language=None,
                show_line_numbers=False,
            )

        # Two stacked tables, split by `validation_status`. Validated
        # on top so the curated results are above the fold; unvalidated
        # below carries every other row (auto-published, awaiting
        # methodology review). See decisions/validation-policy.md.
        # Initial values come from the boot-time `_safe_load_split`
        # above (empty + banner on a Hub read failure).
        # The two widgets share search and hidden-column settings and differ
        # only in their value, datatype list and label.
        validated_view = Leaderboard(
            value=initial_validated,
            datatype=VALIDATED_LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Validated Leaderboard",
            interactive=False,
        )
        unvalidated_view = Leaderboard(
            value=initial_unvalidated,
            datatype=LEADERBOARD_DATATYPES,
            search_columns=["submission_name", "submitter_name"],
            hide_columns=LEADERBOARD_HIDE_COLUMNS,
            label="Unvalidated Leaderboard",
            interactive=False,
        )
        with gr.Row():
            refresh_btn = gr.Button("Refresh", size="sm")
            # One file, both tables, `validation_status` discriminator
            # column. Fresh CSV is generated on every click so the
            # download reflects the latest data, not a stale snapshot
            # captured at boot.
            download_btn = gr.DownloadButton(
                label="Download CSV", size="sm",
            )
        # Manual refresh takes no inputs and writes both tables plus the
        # error banner in one go.
        refresh_btn.click(
            fn=_refresh_leaderboard_with_toast,
            outputs=[validated_view, unvalidated_view, data_error_banner],
        )
        # The download button is its own output: whatever
        # `build_combined_csv` returns is written back onto the button.
        download_btn.click(fn=build_combined_csv, outputs=download_btn)

        # No inline row-click detail panel: the submission_name cell is a
        # deep-link that opens the self-contained per-submission report in
        # a new tab (see `_submission_name_md` in leaderboard.py). Now that
        # the Space is public, HF's edge serves `/reports/<id>.html` to
        # browser users, so we link to it directly instead of inlining the
        # (tens-to-hundreds-of-MB) report through the Gradio event payload.

    with gr.Tab("Tasks"):
        # Read-only task browser: mirrors the per-submission report's
        # summary-table -> detail-card navigation (j/k, Esc) but shows
        # only the prompt + input (drawing / starting shape), no scores
        # or ground truth. Self-contained HTML inlined into an iframe
        # `srcdoc` like the gallery; input images lazy-load from the
        # `/task-input` proxy. Built at boot, rebuilt on page load.
        tasks_html = gr.HTML(value=_tasks_iframe_html())
        # Manual rebuild: re-runs the same builder (no inputs) and swaps the
        # returned iframe markup into `tasks_html`.
        tasks_refresh_btn = gr.Button("Refresh tasks", size="sm")
        tasks_refresh_btn.click(fn=_tasks_iframe_html, outputs=tasks_html)

    with gr.Tab("Metrics"):
        # Static explainer for the (new) scoring metrics. Served as a
        # standalone `/metrics` route too, so the per-submission report's
        # headline pills can deep-link to `/metrics#<anchor>`; the tab just
        # embeds that same page in an iframe (single source of truth).
        # Unlike the gallery and task browser, this iframe uses `src` with a
        # root-relative URL rather than an inlined `srcdoc`, and has no
        # refresh handler attached.
        gr.HTML(
            '<iframe src="/metrics" '
            'style="width:100%; height:85vh; border:0; display:block;" '
            'title="CADGenBench metrics"></iframe>'
        )

    with gr.Tab("Submit"):
        # Submission instructions. The text is an f-string: `{HF_DATA_REPO}`
        # and `{SUBMISSION_DOC_URL}` are interpolated, while the doubled
        # braces around the JSON sample are escapes that render as literal
        # `{` / `}`. The body is deliberately flush-left so the Markdown is
        # not parsed as an indented code block.
        gr.Markdown(
            f"""
**Submission format.** A single zip with:

- one folder per sample in `{HF_DATA_REPO}`; include `output.step` for
  samples where your system produced a candidate. Missing `output.step`
  scores zero for that sample;
- a top-level `meta.json`:

```json
{{
  "submitter_name": "your name or team",
  "submission_name": "MyAgent v2.3 (or whatever describes your system)",
  "agent_url": "https://github.com/...   (optional)",
  "notes": "free text, optional, max 500 chars, single line, plain text",
  "agree_to_publish": true
}}
```

**Submission name.** Free text describing the system being benchmarked,
however you choose to describe it. The benchmark is system-agnostic: your
submission may use no LLM, one, or many. If you want to disclose your
stack, put it here or in `notes`.

**Notes field.** Plain text only (no markdown / HTML). Capped at 500 chars
and stripped to a single line. Shown in the per-submission detail view,
not in the main leaderboard table.

**Consent.** `"agree_to_publish": true` in `meta.json` is your consent
to publish the resulting row on the public leaderboard.

For the full submission contract (output format, validity gate, canonical
pose, and a local self-check), see
[`docs/benchmark/submission.md`]({SUBMISSION_DOC_URL}).
"""
        )
        # OAuth gate. The user must log in via the HF button before
        # the Submit button becomes interactive; the row gets the
        # canonical `hf_username` from `gr.OAuthProfile.username`
        # (not a free-text claim in meta.json). README front-matter
        # already carries `hf_oauth: true` so HF's OAuth integration
        # is wired up at the Space level.
        login_btn = gr.LoginButton()
        # Upload widget restricted to `.zip` files.
        zip_in = gr.File(label="Submission ZIP", file_types=[".zip"])
        # Starts disabled; the `blocks.load` handler below flips it
        # to interactive when an OAuthProfile is present.
        submit_btn = gr.Button("Submit", variant="primary", interactive=False)
        # Persistent status panel. handle_submit is a generator that
        # streams stage updates (validating -> uploading/queuing ->
        # queued) and any rejection reason here, so the outcome
        # survives instead of vanishing with a transient toast. The
        # handler also reads `gr.OAuthProfile` implicitly via its
        # parameter type annotation (Gradio's dependency-injection
        # convention).
        submit_status = gr.Markdown(value=SUBMIT_STATUS_IDLE)
        # The uploaded file is the only declared input and the status panel
        # the only output.
        submit_btn.click(
            fn=handle_submit,
            inputs=[zip_in],
            outputs=[submit_status],
        )

    with gr.Tab("About"):
        # Static Markdown only; no event handlers are wired in this tab.
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Admin"):
        # Maintainer-only controls. The Admin *tab* is visible to everyone
        # (a hint the path exists), but ALL admin UI -- the table, the
        # actions, the danger zones -- lives in `admin_panel`, a column that
        # stays hidden unless the logged-in user is in CADGENBENCH_ADMINS.
        # The `blocks.load` handler below flips that column's visibility and
        # only loads table data for admins; a server-side `is_admin` re-check
        # still guards every handler. Non-admins (and logged-out visitors)
        # see only the login/logout button + a status line, nothing else.
        # See decisions/validation-policy.md.
        # This is a second LoginButton (the Submit tab declares its own), so
        # login is reachable from this tab as well.
        admin_login_btn = gr.LoginButton()
        admin_status = gr.Markdown(
            "Log in with an admin account to access the controls."
        )
        # Everything below is admin-only: hidden by default, revealed by
        # `_gate_admin_controls` only for a logged-in user in the admin set.
        # Every control inside also starts with `interactive=False`.
        with gr.Column(visible=False) as admin_panel:
            gr.Markdown(
                "## Admin\n"
                "Tick rows in the **select** column, then promote them into "
                "the **Validated** tier (recording an evidence type), demote "
                "them back to **Unvalidated**, delete them, or rescore them "
                "against the current ground truth. Actions apply to every "
                "ticked row at once."
            )
            # Only the leading `select` column is editable; the rest is
            # read-only context. Click-to-tick drives every action below.
            # Starts empty; `_gate_admin_controls` loads rows on page load
            # for admins only, so non-admins never receive the data.
            # `static_columns` covers indices 1..len(ADMIN_COLUMNS)-1, leaving
            # index 0 (the bool tick-box) as the only non-static column.
            # NOTE(review): `datatype` has 9 entries -- presumably one per
            # entry of ADMIN_COLUMNS; confirm they stay in lockstep.
            admin_table = gr.Dataframe(
                value=_empty_admin_table(),
                datatype=[
                    "bool", "str", "str", "str", "str", "str", "str",
                    "number", "str",
                ],
                static_columns=list(range(1, len(ADMIN_COLUMNS))),
                interactive=False,
                label="Submissions (tick select to choose rows)",
                wrap=True,
            )
            # Selection summary, rewritten by the `admin_table.change`
            # handler wired below.
            admin_selection_md = gr.Markdown("_No rows selected._")
            admin_method_radio = gr.Radio(
                choices=list(VALID_METHODS),
                value="manual",
                label="validation_method (applied to all rows on promote)",
                interactive=False,
            )
            with gr.Row():
                promote_btn = gr.Button(
                    "Mark validated", variant="primary", interactive=False,
                )
                demote_btn = gr.Button("Mark unvalidated", interactive=False)
            with gr.Accordion("Danger zone: delete", open=False):
                gr.Markdown(
                    "Permanently deletes the ticked rows **and** their "
                    "uploaded zip + report files from the submissions "
                    "dataset. This cannot be undone (only a manual revert of "
                    "the dataset commit).\n\n"
                    "**Stop & delete** additionally cancels any still-running "
                    "evaluation job(s) for the ticked rows before deleting — "
                    "use it for pending submissions whose GPU eval is in "
                    "flight."
                )
                # Confirmation checkbox; its `change` event feeds
                # `_arm_delete`, which outputs to both delete buttons.
                delete_confirm = gr.Checkbox(
                    label=(
                        "I understand this permanently deletes the selected "
                        "submissions and their files."
                    ),
                    value=False,
                    interactive=False,
                )
                with gr.Row():
                    delete_btn = gr.Button(
                        "Delete selected", variant="stop", interactive=False,
                    )
                    stop_delete_btn = gr.Button(
                        "Stop & delete selected", variant="stop",
                        interactive=False,
                    )
            with gr.Accordion("Danger zone: rescore", open=False):
                gr.Markdown(
                    "Re-evaluates submissions against the **current** "
                    "ground truth + data: each row flips back to pending, the "
                    "gallery renders and the per-submission report HTML are "
                    "regenerated, and the score is recomputed. Use after a "
                    "ground-truth swap or a metric change that invalidates "
                    "the existing scores.\n\n"
                    "Rescoring is **re-runnable**: if a row's eval fails, "
                    "mark it and rescore again (or rescore all) — each run is "
                    "independent and converges.\n\n"
                    "- **Rescore selected** re-evaluates the ticked rows.\n"
                    f"- **Rescore all** re-evaluates every submission that "
                    f"has a stored zip and isn't already pending — type "
                    f"`{RESCORE_ALL_PHRASE}` to arm it."
                )
                # Two independent arming mechanisms: a checkbox for the
                # selected-rows rescore, and a typed phrase for the
                # board-wide rescore.
                rescore_confirm = gr.Checkbox(
                    label=(
                        "I understand this flips the selected rows to pending "
                        "and recomputes their scores."
                    ),
                    value=False,
                    interactive=False,
                )
                rescore_selected_btn = gr.Button(
                    "Rescore selected", variant="stop", interactive=False,
                )
                rescore_all_phrase = gr.Textbox(
                    label=(
                        f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
                        f"rescore"
                    ),
                    placeholder=RESCORE_ALL_PHRASE,
                    interactive=False,
                )
                rescore_all_btn = gr.Button(
                    "Rescore ALL submissions", variant="stop",
                    interactive=False,
                )
            admin_refresh_btn = gr.Button("Refresh", size="sm")

        # ---- Event wiring -------------------------------------------------
        # Every mutating action below outputs the admin table together with
        # both leaderboard tables and the gallery, so all four views are
        # updated by the same handler call.

        # Any edit to the table (i.e. ticking a row) updates the selection
        # summary line.
        admin_table.change(
            fn=_admin_selection_status,
            inputs=admin_table,
            outputs=admin_selection_md,
        )
        # Promote receives the chosen validation method alongside the table.
        promote_btn.click(
            fn=_admin_promote,
            inputs=[admin_table, admin_method_radio],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        demote_btn.click(
            fn=_admin_demote,
            inputs=[admin_table],
            outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
        )
        # Toggling the confirmation checkbox drives both delete buttons.
        delete_confirm.change(
            fn=_arm_delete,
            inputs=[delete_confirm],
            outputs=[delete_btn, stop_delete_btn],
        )
        # The delete handlers receive the checkbox state as an input and also
        # output the checkbox and both buttons (presumably to disarm them
        # after the action -- confirm in the handlers).
        delete_btn.click(
            fn=_admin_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        stop_delete_btn.click(
            fn=_admin_stop_delete,
            inputs=[admin_table, delete_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                delete_confirm, delete_btn, stop_delete_btn,
            ],
        )
        # Same arm -> act pattern for the selected-rows rescore.
        rescore_confirm.change(
            fn=_arm_rescore_selected,
            inputs=[rescore_confirm],
            outputs=[rescore_selected_btn],
        )
        rescore_selected_btn.click(
            fn=_admin_rescore_selected,
            inputs=[admin_table, rescore_confirm],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                rescore_confirm, rescore_selected_btn,
            ],
        )
        # Board-wide rescore: armed by the typed phrase; the handler takes
        # only the phrase as input (no table selection).
        rescore_all_phrase.change(
            fn=_arm_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[rescore_all_btn],
        )
        rescore_all_btn.click(
            fn=_admin_rescore_all,
            inputs=[rescore_all_phrase],
            outputs=[
                admin_table, validated_view, unvalidated_view, gallery_html,
                rescore_all_phrase, rescore_all_btn,
            ],
        )
        # Manual reload of the admin table only (no inputs).
        admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)

        # Keep the admin table on the same 10s cadence as the leaderboard
        # so a row that lands (or a pending row that completes) after the
        # tab loaded shows up without a manual Refresh. Selection is
        # preserved across ticks so an in-progress set of checkboxes
        # survives the reload.
        # The current table is passed in as the handler's input and the
        # result is written back to the same component.
        admin_auto_refresh_timer = gr.Timer(10)
        admin_auto_refresh_timer.tick(
            fn=_auto_refresh_admin_table,
            inputs=admin_table,
            outputs=admin_table,
        )

    # gradio_leaderboard.Leaderboard handles its own update path
    # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
    # Single tick runs `_auto_refresh_leaderboard` once and pushes the
    # two halves into the validated / unvalidated widgets plus the
    # data-unavailable banner. The handler swallows a Hub read failure
    # into empty frames + a loud warning toast so a degraded read never
    # crashes the tick loop or silently blanks the tables.
    auto_refresh_timer = gr.Timer(10)
    auto_refresh_timer.tick(
        fn=_auto_refresh_leaderboard,
        outputs=[validated_view, unvalidated_view, data_error_banner],
    )

    # On page load, read the visitor's OAuth profile (None if not
    # logged in) and flip the Submit button's interactivity. Runs once
    # per page load; LoginButton clicks also re-trigger this through
    # Gradio's auth-event plumbing.
    blocks.load(fn=_enable_submit_when_logged_in, outputs=submit_btn)
    # Also rebuild the gallery and task-browser iframes on every page load,
    # replacing the HTML that was rendered when the UI was first built.
    blocks.load(fn=_gallery_iframe_html, outputs=gallery_html)
    blocks.load(fn=_tasks_iframe_html, outputs=tasks_html)

    # Same per-load OAuth read, gating the Admin tab on membership in the
    # CADGENBENCH_ADMINS set. Logged-out / non-admin visitors get the
    # admin_panel hidden entirely (no table, no controls) -- just the
    # login/logout button and a status line.
    # The handler takes no declared inputs and must return one value per
    # component listed below, in this order.
    blocks.load(
        fn=_gate_admin_controls,
        outputs=[
            admin_panel,
            admin_table,
            admin_method_radio,
            promote_btn,
            demote_btn,
            delete_confirm,
            delete_btn,
            stop_delete_btn,
            rescore_confirm,
            rescore_selected_btn,
            rescore_all_phrase,
            rescore_all_btn,
            admin_status,
        ],
    )


# FastAPI parent application. The Gradio UI is mounted onto it at the very
# end so the custom proxy routes share the UI's origin. Routes added directly
# on `app` are checked before the Gradio sub-app, so every route below is
# registered ahead of the mount and cannot be shadowed by its catch-all.
app = FastAPI()

# Per-submission report proxy (`/reports/<sid>.html`).
app.add_api_route("/reports/{submission_id}.html", serve_report, methods=["GET"])

# Static metrics explainer: same origin as the report proxy so report pills
# can deep-link to `/metrics#<anchor>`; the Metrics tab embeds it as well.
app.add_api_route("/metrics", serve_metrics_page, methods=["GET"])

# Illustration assets embedded by the metrics page (vendored under
# assets/metrics/).
app.add_api_route("/metrics-assets/{name}", serve_metrics_asset, methods=["GET"])

# Cached render proxy for the gallery's lazy-loaded turntables. Candidate
# renders come straight from the public render bucket (their URLs are
# produced by the gallery resolvers), so the private GT render is the only
# one that still needs a token-holding Space route.
app.add_api_route("/gt-render/{fixture}.webp", serve_gt_render, methods=["GET"])

# Ground-truth assets the hosted report links lazily (per-view PNGs + PDF).
# GT is private, hence the token-holding proxy; the `:path` converter lets
# `relpath` contain a slash (e.g. renders/iso.png).
app.add_api_route("/gt/{fixture}/{relpath:path}", serve_gt_file, methods=["GET"])

# Task-browser input assets (drawings + starting-shape renders); `:path`
# again lets `relpath` contain a slash (e.g. renders/iso.png).
app.add_api_route(
    "/task-input/{fixture}/{relpath:path}", serve_task_input, methods=["GET"],
)

# Real Hugging Face OAuth vs. Gradio's local "mock" login is selected by
# ``gradio.utils.get_space()``, which is only truthy when ``SYSTEM ==
# "spaces"``. HF sets that variable on Gradio-SDK Spaces but NOT on
# ``sdk: docker`` Spaces such as this one. Left unset, ``mount_gradio_app``
# would install the MOCK OAuth routes: they never contact hf.co and log every
# visitor in as the owner of the container token (our ``HF_TOKEN`` account),
# leaking that identity into the LoginButton and -- because that account is
# in ``CADGENBENCH_ADMINS`` -- granting every visitor admin. So force the
# variable, but only when actually running on a Space (``SPACE_ID`` is
# injected by HF on all Spaces, Docker included) so the real
# ``hf_oauth: true`` flow is used; locally (no ``SPACE_ID``) it stays
# untouched and the mock login keeps working for development. This has to
# happen before the mount, which is what triggers ``attach_oauth``.
if os.environ.get("SPACE_ID"):
    if os.environ.get("SYSTEM") != "spaces":
        os.environ["SYSTEM"] = "spaces"

app = gr.mount_gradio_app(app, blocks, path="/")


if __name__ == "__main__":
    # Direct-run entry point: serve the combined FastAPI + Gradio app with
    # uvicorn. Bind address and port come from the GRADIO_SERVER_NAME /
    # GRADIO_SERVER_PORT environment variables, falling back to all
    # interfaces on port 7860.
    uvicorn.run(
        app,
        host=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
    )