Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich Cursor committed about 4 hours ago

Commit

2893b22

1 Parent(s): c7f83a5

leaderboard: admin rescore (selected + all) reusing the eval pipeline

Add a maintainer-only rescore that re-evaluates submissions against the
current ground truth/data, reusing the submit worker end to end: each
target row is reset to the pending regime (scores cleared, submitted_at
preserved) and _spawn_worker is re-dispatched, which re-renders the
gallery, regenerates reports/<id>.{html,json}, and recomputes the score.

- rescore_rows(ids): selected rows; LookupError on unknown id before any
write. rescore_all(): every row with a stored zip that isn't pending
(skips legacy zip-less seed rows + in-flight evals).
- Bulk dispatch is staggered on a background thread so a board-wide
rescore doesn't fire N run_job calls at once; fully re-runnable, so an
interrupted rescore converges on a retry.
- UI in the Admin tab: "Rescore selected" behind a confirm checkbox and
"Rescore ALL" behind a type-to-confirm phrase, both admin-gated with a
server-side re-check and disarmed after firing / on load.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (3) hide show

admin.py +207 -0
app.py +194 -6
tests/test_admin.py +158 -0

admin.py CHANGED Viewed

@@ -26,8 +26,11 @@ locking story.
 """
 from __future__ import annotations
 import logging
 import os
 from typing import Any, Iterable
 import gradio as gr
@@ -41,7 +44,9 @@ from submit import (
     REPORTS_DIR,
     SUBMISSIONS_DIR,
     _HF_API,
     _hub_rmw_results,
 )
 logger = logging.getLogger(__name__)
@@ -61,6 +66,30 @@ _JOB_TERMINAL_STAGES: frozenset[str] = frozenset(
 # validation policy doc.
 VALID_METHODS: tuple[str, ...] = ("code", "traces", "api", "manual")
 def admin_usernames() -> set[str]:
     """Parse ``CADGENBENCH_ADMINS`` into a set of HF usernames.
@@ -160,6 +189,184 @@ def demote_rows(submission_ids: Iterable[str]) -> None:
     )
 def delete_rows(submission_ids: Iterable[str]) -> None:
     """Permanently delete every listed submission: artifacts then row.

 """
 from __future__ import annotations
+import json
 import logging
 import os
+import threading
+import time
 from typing import Any, Iterable
 import gradio as gr
     REPORTS_DIR,
     SUBMISSIONS_DIR,
     _HF_API,
+    _download_results_jsonl,
     _hub_rmw_results,
+    _spawn_worker,
 )
 logger = logging.getLogger(__name__)
 # validation policy doc.
 VALID_METHODS: tuple[str, ...] = ("code", "traces", "api", "manual")
+# Score-shaped fields cleared when a row is flipped back to ``pending``
+# for a rescore. Mirrors the pending regime in
+# cadgenbench-submissions/schema.md: every aggregate is ``null`` until
+# the fresh eval flips the row back to ``completed``. ``submitted_at``
+# is intentionally *not* touched -- the schema defines it as the
+# immutable timestamp the row was first written, so a rescore preserves
+# the original submit provenance.
+_RESCORE_CLEARED_SCORE_FIELDS: tuple[str, ...] = (
+    "aggregate_score",
+    "validity_rate",
+    "score_by_task_type",
+    "per_task_scores",
+    "per_fixture_scores",
+    "per_fixture_breakdown",
+)
+# Gap between successive worker dispatches in a bulk rescore. Each
+# worker dispatches its own HF Job and then polls; staggering the
+# starts keeps a rescore-all from firing N ``run_job`` control-plane
+# calls in one burst (which can rate-limit) while HF's own queue
+# absorbs anything past the account's concurrent-slot cap. Small enough
+# to be invisible for a one-or-two-row rescore.
+RESCORE_DISPATCH_STAGGER_SECONDS = 2.0
 def admin_usernames() -> set[str]:
     """Parse ``CADGENBENCH_ADMINS`` into a set of HF usernames.
     )
+def _current_fixture_names() -> list[str]:
+    """List the fixture directories of the live data inputs dir, by name.
+    Every immediate sub-directory of ``data_inputs_dir()`` counts as one
+    fixture; plain files are ignored. The listing is read fresh on each
+    call, so a rescore is dispatched against the fixture set the data
+    repo exposes now rather than the set a submission was first scored
+    against.
+    """
+    # Function-local import: resolved when the function is called, not
+    # when this module is imported.
+    from cadgenbench.common.paths import data_inputs_dir
+    names = [
+        entry.name for entry in data_inputs_dir().iterdir() if entry.is_dir()
+    ]
+    names.sort()
+    return names
+def _dispatch_rescore_workers(
+    targets: dict[str, str], fixture_names: list[str],
+) -> None:
+    """Kick off one eval worker per target from a background daemon thread.
+    *targets* maps ``submission_id -> submission_blob_url``. The mapping
+    is snapshotted before the thread starts and the function returns as
+    soon as the thread is started, so the caller never waits on the
+    dispatch loop. Consecutive ``_spawn_worker`` calls are spaced
+    ``RESCORE_DISPATCH_STAGGER_SECONDS`` apart (no delay before the
+    first one). A failure to spawn one worker is logged and the loop
+    carries on with the next target.
+    """
+    pending = tuple(targets.items())
+    def _dispatch_all() -> None:
+        first = True
+        for submission_id, blob_url in pending:
+            # Stagger every dispatch except the very first.
+            if not first:
+                time.sleep(RESCORE_DISPATCH_STAGGER_SECONDS)
+            first = False
+            try:
+                _spawn_worker(submission_id, blob_url, fixture_names)
+            except Exception as e:  # noqa: BLE001 - one bad dispatch must not stall the rest
+                logger.exception(
+                    "rescore: failed to spawn worker for %s (%s: %s)",
+                    submission_id, type(e).__name__, e,
+                )
+    dispatcher = threading.Thread(
+        target=_dispatch_all, name="cgb-rescore-dispatch", daemon=True,
+    )
+    dispatcher.start()
+def _rescore(ids: set[str], *, require_found: bool) -> tuple[int, list[str]]:
+    """Flip *ids* back to pending, then dispatch a fresh eval for each.
+    One ``_hub_rmw_results`` write resets every target row to the
+    pending regime (status ``pending``, ``failure_reason`` cleared, all
+    ``_RESCORE_CLEARED_SCORE_FIELDS`` nulled) and records its stored
+    ``submission_blob_url``. A target row without a blob url has no zip
+    to re-evaluate, so it is left untouched and reported as *skipped*.
+    Once the write returns, workers for the reset rows are handed to
+    ``_dispatch_rescore_workers``. ``submitted_at`` is never written.
+    Args:
+        ids: submission ids to rescore.
+        require_found: when True (selected-rows path) every id must be
+            present in the rows or ``_raise_for_missing`` is expected to
+            raise :class:`LookupError` -- this is checked *before* any
+            row is modified; when False (rescore-all path) ids that are
+            no longer present are silently ignored.
+    Returns:
+        ``(dispatched_count, skipped_ids)`` -- how many workers were
+        queued and the sorted ids skipped for lacking a stored zip.
+    """
+    captured: dict[str, str] = {}
+    skipped: set[str] = set()
+    def mutate(rows: list[dict[str, Any]]) -> None:
+        # Start from a clean slate on every invocation: should
+        # ``_hub_rmw_results`` ever re-run this callback (e.g. after a
+        # write conflict -- not visible from this file), results of an
+        # earlier attempt must not leak into the final dispatch set.
+        captured.clear()
+        skipped.clear()
+        targets = [row for row in rows if row.get("submission_id") in ids]
+        if require_found:
+            # Validate first so a missing id leaves ``rows`` exactly as
+            # it was read, independent of how the caller handles the
+            # exception.
+            _raise_for_missing(
+                ids, {row.get("submission_id") for row in targets},
+            )
+        for row in targets:
+            sid = row.get("submission_id")
+            blob_url = row.get("submission_blob_url")
+            if not blob_url:
+                skipped.add(sid)
+                continue
+            row["status"] = "pending"
+            row["failure_reason"] = None
+            for field in _RESCORE_CLEARED_SCORE_FIELDS:
+                row[field] = None
+            captured[sid] = blob_url
+    _hub_rmw_results(
+        mutate,
+        commit_message=f"rescore: reset {len(ids)} row(s) to pending",
+    )
+    if captured:
+        _dispatch_rescore_workers(captured, _current_fixture_names())
+    return len(captured), sorted(skipped)
+def rescore_rows(submission_ids: Iterable[str]) -> tuple[int, list[str]]:
+    """Rescore the listed submissions against the current data.
+    The ids are normalised with ``_clean_id_set`` and passed to
+    ``_rescore`` with ``require_found=True``, which resets each row to
+    pending and re-dispatches its eval. Use after a ground-truth or
+    metric change that invalidates existing scores.
+    Raises:
+        ValueError: no ids were given.
+        LookupError: one or more ids are absent from ``results.jsonl``
+            (no worker is dispatched).
+    Returns:
+        ``(dispatched_count, skipped_ids)``; *skipped_ids* are rows with
+        no stored zip to re-evaluate.
+    """
+    return _rescore(_clean_id_set(submission_ids), require_found=True)
+def _rescoreable_ids_from_hub() -> set[str]:
+    """Collect every submission_id that is eligible for a bulk rescore.
+    Parses the body returned by ``_download_results_jsonl`` line by
+    line. A row qualifies when it has a truthy ``submission_blob_url``
+    (a zip to re-evaluate), a truthy ``submission_id``, and a ``status``
+    other than ``pending`` (already-pending rows are left alone so they
+    are not dispatched a second time). Blank lines, lines that are not
+    valid JSON, and JSON values that are not objects are skipped.
+    """
+    body = _download_results_jsonl()
+    ids: set[str] = set()
+    for line in body.splitlines():
+        if not line.strip():
+            continue
+        try:
+            row = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        # ``json.loads`` also accepts arrays, strings and numbers; only
+        # an object supports the ``.get`` lookups below, so anything
+        # else is treated like a malformed line instead of crashing.
+        if not isinstance(row, dict):
+            continue
+        if not row.get("submission_blob_url"):
+            continue
+        if row.get("status") == "pending":
+            continue
+        sid = row.get("submission_id")
+        if sid:
+            ids.add(sid)
+    return ids
+def rescore_all() -> tuple[int, list[str]]:
+    """Rescore every eligible submission on the board.
+    Eligibility is decided by ``_rescoreable_ids_from_hub``; the ids it
+    returns are passed to ``_rescore`` with ``require_found=False``.
+    Raises:
+        ValueError: no row is eligible (empty board, or every row is
+            pending / lacks a stored zip).
+    Returns:
+        ``(dispatched_count, skipped_ids)``.
+    """
+    eligible = _rescoreable_ids_from_hub()
+    if eligible:
+        return _rescore(eligible, require_found=False)
+    raise ValueError(
+        "No rescoreable submissions (every row is pending or has no stored "
+        "zip)."
+    )
 def delete_rows(submission_ids: Iterable[str]) -> None:
     """Permanently delete every listed submission: artifacts then row.

app.py CHANGED Viewed

@@ -66,6 +66,8 @@ from admin import (
     demote_rows,
     is_admin,
     promote_rows,
     stop_and_delete_rows,
 )
 from submit import handle_submit
@@ -265,7 +267,7 @@ def _gate_admin_controls(
     profile: gr.OAuthProfile | None,
 ) -> tuple[
     gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
-    gr.Button, str,
 ]:
     """Enable the admin controls only for a logged-in user in the admin set.
@@ -274,8 +276,9 @@ def _gate_admin_controls(
     staying pinned to whatever rows existed when the Space process
     booted. Non-admins and logged-out visitors get the tab with the
     table read-only and every control disabled, mirroring the server-side
-    re-check in each handler. The delete + stop-and-delete buttons always
-    load disarmed: they only enable once the confirm checkbox is ticked.
     """
     admin_df, error = _safe_load_admin()
     if error:
@@ -298,6 +301,10 @@ def _gate_admin_controls(
         gr.Checkbox(interactive=admin, value=False),
         gr.Button(interactive=False),
         gr.Button(interactive=False),
         status,
     )
@@ -489,6 +496,118 @@ def _admin_stop_delete(
     )
 @lru_cache(maxsize=128)
 def _fetch_report_html(submission_id: str) -> bytes | None:
     """Pull ``reports/<id>.html`` off the submissions dataset.
@@ -948,9 +1067,10 @@ to publish the resulting row on the public leaderboard.
             "## Admin\n"
             "Tick rows in the **select** column, then promote them into the "
             "**Validated** tier (recording an evidence type), demote them back "
-            "to **Unvalidated**, or delete them. Actions apply to every ticked "
-            "row at once. Limited to maintainers in the admin set; everyone "
-            "else sees the tab with the controls disabled."
         )
         admin_login_btn = gr.LoginButton()
         admin_status = gr.Markdown(
@@ -1009,6 +1129,44 @@ to publish the resulting row on the public leaderboard.
                     "Stop & delete selected", variant="stop",
                     interactive=False,
                 )
         admin_refresh_btn = gr.Button("Refresh", size="sm")
         admin_table.change(
@@ -1047,6 +1205,32 @@ to publish the resulting row on the public leaderboard.
                 delete_confirm, delete_btn, stop_delete_btn,
             ],
         )
         admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
         # Keep the admin table on the same 10s cadence as the leaderboard
@@ -1095,6 +1279,10 @@ to publish the resulting row on the public leaderboard.
             delete_confirm,
             delete_btn,
             stop_delete_btn,
             admin_status,
         ],
     )

     demote_rows,
     is_admin,
     promote_rows,
+    rescore_all,
+    rescore_rows,
     stop_and_delete_rows,
 )
 from submit import handle_submit
     profile: gr.OAuthProfile | None,
 ) -> tuple[
     gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
+    gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
 ]:
     """Enable the admin controls only for a logged-in user in the admin set.
     staying pinned to whatever rows existed when the Space process
     booted. Non-admins and logged-out visitors get the tab with the
     table read-only and every control disabled, mirroring the server-side
+    re-check in each handler. The armed-by-confirmation buttons (delete,
+    stop-and-delete, rescore-selected, rescore-all) always load disarmed:
+    they only enable once their confirm box is ticked / phrase typed.
     """
     admin_df, error = _safe_load_admin()
     if error:
         gr.Checkbox(interactive=admin, value=False),
         gr.Button(interactive=False),
         gr.Button(interactive=False),
+        gr.Checkbox(interactive=admin, value=False),
+        gr.Button(interactive=False),
+        gr.Textbox(interactive=admin, value=""),
+        gr.Button(interactive=False),
         status,
     )
     )
+# Exact phrase an admin must type to arm the board-wide rescore. A
+# free-text match (not a checkbox) is the deliberate "are you sure"
+# friction: it can't be tripped by a stray click and forces the admin
+# to consciously type the words before the heavy, score-invalidating
+# action arms.
+RESCORE_ALL_PHRASE = "RESCORE ALL"
+def _arm_rescore_selected(
+    confirm: bool, profile: gr.OAuthProfile | None,
+) -> gr.Button:
+    """Return the rescore-selected button, enabled only for a confirmed admin.
+    The button stays disabled while the confirm box is unticked; once it
+    is ticked, the result of ``is_admin(profile)`` decides.
+    """
+    if not confirm:
+        return gr.Button(interactive=False)
+    return gr.Button(interactive=is_admin(profile))
+def _arm_rescore_all(
+    phrase: str | None, profile: gr.OAuthProfile | None,
+) -> gr.Button:
+    """Return the rescore-all button, enabled only on an exact phrase by an admin.
+    ``None`` is treated as an empty string and surrounding whitespace is
+    stripped before comparing against ``RESCORE_ALL_PHRASE``.
+    """
+    typed = "" if not phrase else phrase.strip()
+    if typed != RESCORE_ALL_PHRASE:
+        return gr.Button(interactive=False)
+    return gr.Button(interactive=is_admin(profile))
+def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
+    """Build the toast text summarising a rescore dispatch.
+    Always reports how many submissions were queued; when *skipped* is
+    non-empty a second sentence with the number of skipped rows is
+    appended.
+    """
+    parts = [
+        f"Rescoring {dispatched} submission(s): rows flipped to pending and "
+        "re-evaluating in the background. The leaderboard repopulates as "
+        "each finishes."
+    ]
+    if skipped:
+        parts.append(
+            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
+            "rows can't be rescored)."
+        )
+    return "".join(parts)
+def _admin_rescore_selected(
+    table_df: pd.DataFrame | None,
+    confirm: bool,
+    profile: gr.OAuthProfile | None,
+) -> tuple[
+    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
+]:
+    """Re-evaluate the ticked rows, refresh the views, then disarm.
+    Three gates are checked server-side, in order: the caller is in the
+    admin set, the confirm box is ticked, and at least one row is
+    selected. On success the confirm box is reset and the button is
+    disabled again, so the next rescore needs a fresh confirm.
+    Raises:
+        gr.Error: a gate failed, or ``rescore_rows`` raised
+            ``LookupError`` / ``ValueError`` (kept as the ``__cause__``).
+    Returns:
+        ``(admin table, validated view, unvalidated view, gallery html,
+        unticked confirm checkbox, disabled button)``.
+    """
+    if not is_admin(profile):
+        raise gr.Error("You are not in the admin set.")
+    if not confirm:
+        raise gr.Error("Tick the confirmation box to enable rescore.")
+    ids = _selected_ids(table_df)
+    if not ids:
+        raise gr.Error("Tick at least one row first.")
+    try:
+        dispatched, skipped = rescore_rows(ids)
+    except (LookupError, ValueError) as e:
+        # Chain explicitly so the original failure stays attached to the
+        # user-facing error in server logs.
+        raise gr.Error(str(e)) from e
+    gr.Info(_rescore_result_message(dispatched, skipped))
+    validated, unvalidated, _ = _safe_load_split()
+    admin_df, _ = _safe_load_admin()
+    return (
+        admin_df,
+        validated,
+        unvalidated,
+        _gallery_iframe_html(),
+        gr.Checkbox(value=False),
+        gr.Button(interactive=False),
+    )
+def _admin_rescore_all(
+    phrase: str | None,
+    profile: gr.OAuthProfile | None,
+) -> tuple[
+    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
+]:
+    """Re-evaluate every rescoreable row, refresh the views, then disarm.
+    Re-checks ``is_admin`` and the exact confirmation phrase
+    server-side (whitespace-stripped, ``None`` treated as empty), so a
+    client that re-enables the button still cannot fire the action. On
+    success the phrase box is cleared and the button disabled again.
+    Raises:
+        gr.Error: caller is not an admin, the phrase does not match, or
+            ``rescore_all`` raised ``ValueError`` (kept as the
+            ``__cause__``).
+    Returns:
+        ``(admin table, validated view, unvalidated view, gallery html,
+        emptied phrase textbox, disabled button)``.
+    """
+    if not is_admin(profile):
+        raise gr.Error("You are not in the admin set.")
+    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
+        raise gr.Error(
+            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
+        )
+    try:
+        dispatched, skipped = rescore_all()
+    except ValueError as e:
+        # Chain explicitly so the original failure stays attached to the
+        # user-facing error in server logs.
+        raise gr.Error(str(e)) from e
+    gr.Info(_rescore_result_message(dispatched, skipped))
+    validated, unvalidated, _ = _safe_load_split()
+    admin_df, _ = _safe_load_admin()
+    return (
+        admin_df,
+        validated,
+        unvalidated,
+        _gallery_iframe_html(),
+        gr.Textbox(value=""),
+        gr.Button(interactive=False),
+    )
 @lru_cache(maxsize=128)
 def _fetch_report_html(submission_id: str) -> bytes | None:
     """Pull ``reports/<id>.html`` off the submissions dataset.
             "## Admin\n"
             "Tick rows in the **select** column, then promote them into the "
             "**Validated** tier (recording an evidence type), demote them back "
+            "to **Unvalidated**, delete them, or rescore them against the "
+            "current ground truth. Actions apply to every ticked row at once. "
+            "Limited to maintainers in the admin set; everyone else sees the "
+            "tab with the controls disabled."
         )
         admin_login_btn = gr.LoginButton()
         admin_status = gr.Markdown(
                     "Stop & delete selected", variant="stop",
                     interactive=False,
                 )
+        with gr.Accordion("Danger zone: rescore", open=False):
+            gr.Markdown(
+                "Re-evaluates submissions against the **current** "
+                "ground truth + data: each row flips back to pending, the "
+                "gallery renders and the per-submission report HTML are "
+                "regenerated, and the score is recomputed. Use after a "
+                "ground-truth swap or a metric change that invalidates the "
+                "existing scores.\n\n"
+                "Rescoring is **re-runnable**: if a row's eval fails, mark it "
+                "and rescore again (or rescore all) — each run is "
+                "independent and converges.\n\n"
+                "- **Rescore selected** re-evaluates the ticked rows.\n"
+                f"- **Rescore all** re-evaluates every submission that has a "
+                f"stored zip and isn't already pending — type "
+                f"`{RESCORE_ALL_PHRASE}` to arm it."
+            )
+            rescore_confirm = gr.Checkbox(
+                label=(
+                    "I understand this flips the selected rows to pending and "
+                    "recomputes their scores."
+                ),
+                value=False,
+                interactive=False,
+            )
+            rescore_selected_btn = gr.Button(
+                "Rescore selected", variant="stop", interactive=False,
+            )
+            rescore_all_phrase = gr.Textbox(
+                label=(
+                    f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
+                    f"rescore"
+                ),
+                placeholder=RESCORE_ALL_PHRASE,
+                interactive=False,
+            )
+            rescore_all_btn = gr.Button(
+                "Rescore ALL submissions", variant="stop", interactive=False,
+            )
         admin_refresh_btn = gr.Button("Refresh", size="sm")
         admin_table.change(
                 delete_confirm, delete_btn, stop_delete_btn,
             ],
         )
+        rescore_confirm.change(
+            fn=_arm_rescore_selected,
+            inputs=[rescore_confirm],
+            outputs=[rescore_selected_btn],
+        )
+        rescore_selected_btn.click(
+            fn=_admin_rescore_selected,
+            inputs=[admin_table, rescore_confirm],
+            outputs=[
+                admin_table, validated_view, unvalidated_view, gallery_html,
+                rescore_confirm, rescore_selected_btn,
+            ],
+        )
+        rescore_all_phrase.change(
+            fn=_arm_rescore_all,
+            inputs=[rescore_all_phrase],
+            outputs=[rescore_all_btn],
+        )
+        rescore_all_btn.click(
+            fn=_admin_rescore_all,
+            inputs=[rescore_all_phrase],
+            outputs=[
+                admin_table, validated_view, unvalidated_view, gallery_html,
+                rescore_all_phrase, rescore_all_btn,
+            ],
+        )
         admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
         # Keep the admin table on the same 10s cadence as the leaderboard
             delete_confirm,
             delete_btn,
             stop_delete_btn,
+            rescore_confirm,
+            rescore_selected_btn,
+            rescore_all_phrase,
+            rescore_all_btn,
             admin_status,
         ],
     )

tests/test_admin.py CHANGED Viewed

@@ -88,6 +88,10 @@ def hub(monkeypatch):
         state["bucket_deleted_paths"].extend(delete or [])
     monkeypatch.setattr(submit, "_download_results_jsonl", fake_download)
     monkeypatch.setattr(submit._HF_API, "upload_file", fake_upload)
     monkeypatch.setattr(submit._HF_API, "delete_file", fake_delete_file)
     monkeypatch.setattr(submit._HF_API, "list_bucket_tree", fake_list_bucket_tree)
@@ -290,3 +294,157 @@ def test_stop_and_delete_empty_selection_raises(hub, jobs):
         admin.stop_and_delete_rows([])
     assert jobs["cancelled"] == []
     assert hub["uploads"] == 0

         state["bucket_deleted_paths"].extend(delete or [])
     monkeypatch.setattr(submit, "_download_results_jsonl", fake_download)
+    # admin.py imported `_download_results_jsonl` by name (used directly by
+    # `rescore_all`), so patch that binding too; the RMW path reaches the
+    # submit-module reference patched above.
+    monkeypatch.setattr(admin, "_download_results_jsonl", fake_download)
     monkeypatch.setattr(submit._HF_API, "upload_file", fake_upload)
     monkeypatch.setattr(submit._HF_API, "delete_file", fake_delete_file)
     monkeypatch.setattr(submit._HF_API, "list_bucket_tree", fake_list_bucket_tree)
         admin.stop_and_delete_rows([])
     assert jobs["cancelled"] == []
     assert hub["uploads"] == 0
+# --- Rescore -------------------------------------------------------------
+# Rows shaped for the rescore path: a completed row with scores + a stored
+# zip, a failed row with a zip, a legacy row with no zip, and a pending row
+# (mid-eval). ``submitted_at`` is set so a test can assert it's preserved.
+RESCORE_ROWS = [
+    {
+        "submission_id": "done",
+        "status": "completed",
+        "failure_reason": None,
+        "submission_blob_url": "https://blob/done.zip",
+        "submitted_at": "2026-01-01T00:00:00Z",
+        "aggregate_score": 0.7,
+        "validity_rate": 1.0,
+        "score_by_task_type": {"generation": 0.7},
+        "per_task_scores": {"generation": {"score": 0.7}},
+        "per_fixture_scores": {"f1": {"cad_score": 0.7}},
+        "per_fixture_breakdown": {"f1": {"validity": 1.0}},
+    },
+    {
+        "submission_id": "broke",
+        "status": "failed",
+        "failure_reason": "boom",
+        "submission_blob_url": "https://blob/broke.zip",
+        "submitted_at": "2026-01-02T00:00:00Z",
+        "aggregate_score": None,
+    },
+    {
+        "submission_id": "legacy",
+        "status": "completed",
+        "submission_blob_url": None,
+        "submitted_at": "2025-01-01T00:00:00Z",
+        "aggregate_score": 0.4,
+    },
+    {
+        "submission_id": "inflight",
+        "status": "pending",
+        "submission_blob_url": "https://blob/inflight.zip",
+        "submitted_at": "2026-02-01T00:00:00Z",
+        "aggregate_score": None,
+    },
+]
+@pytest.fixture
+def dispatch(monkeypatch):
+    """Record rescore dispatches instead of spawning workers or threads.
+    Swaps ``admin._dispatch_rescore_workers`` for a synchronous recorder
+    and pins ``admin._current_fixture_names`` to ``["f1", "f2"]`` so the
+    suite never touches the data repo. The returned dict exposes
+    ``"targets"`` (the ``{id: blob_url}`` map of the latest call),
+    ``"fixtures"`` (the fixture list of the latest call) and ``"calls"``
+    (how many times dispatch was invoked).
+    """
+    recorded: dict = {"targets": None, "fixtures": None, "calls": 0}
+    def record_dispatch(targets, fixture_names):
+        recorded.update(
+            targets=dict(targets),
+            fixtures=list(fixture_names),
+            calls=recorded["calls"] + 1,
+        )
+    monkeypatch.setattr(admin, "_current_fixture_names", lambda: ["f1", "f2"])
+    monkeypatch.setattr(admin, "_dispatch_rescore_workers", record_dispatch)
+    return recorded
+def test_rescore_rows_flips_to_pending_and_dispatches(hub, dispatch):
+    """Rescoring one row resets it to pending and queues exactly that row."""
+    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]
+    dispatched, skipped = admin.rescore_rows(["done"])
+    assert (dispatched, skipped) == (1, [])
+    done = _row(hub["rows"], "done")
+    assert done["status"] == "pending"
+    assert done["failure_reason"] is None
+    # No score-shaped field may keep its old value.
+    assert all(
+        done[field] is None for field in admin._RESCORE_CLEARED_SCORE_FIELDS
+    )
+    # The original submit timestamp must come through unchanged.
+    assert done["submitted_at"] == "2026-01-01T00:00:00Z"
+    # Dispatch got the stored zip url and the stubbed fixture list.
+    assert dispatch["targets"] == {"done": "https://blob/done.zip"}
+    assert dispatch["fixtures"] == ["f1", "f2"]
+    assert hub["uploads"] == 1
+def test_rescore_rows_skips_rows_without_zip(hub, dispatch):
+    """A zip-less row is reported as skipped: no dispatch and no error."""
+    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]
+    dispatched, skipped = admin.rescore_rows(["legacy"])
+    assert dispatched == 0
+    assert skipped == ["legacy"]
+    # The row keeps its completed status and its score.
+    legacy = _row(hub["rows"], "legacy")
+    assert legacy["status"] == "completed"
+    assert legacy["aggregate_score"] == 0.4
+    # No worker was queued ...
+    assert dispatch["calls"] == 0
+    # ... but the single RMW write still went out, flipping nothing.
+    assert hub["uploads"] == 1
+def test_rescore_rows_missing_id_raises_without_dispatch(hub, dispatch):
+    """One unknown id fails the whole batch: nothing flipped, nothing queued."""
+    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]
+    with pytest.raises(LookupError):
+        admin.rescore_rows(["done", "ghost"])
+    # The known row in the same batch is still completed.
+    done = _row(hub["rows"], "done")
+    assert done["status"] == "completed"
+    assert dispatch["calls"] == 0
+def test_rescore_rows_empty_selection_raises(hub, dispatch):
+    """An empty selection is a caller error: no dispatch and no write."""
+    with pytest.raises(ValueError):
+        admin.rescore_rows([])
+    assert dispatch["calls"] == 0
+    # Same guarantee the other empty-selection tests pin: the rejected
+    # call must not have written ``results.jsonl``.
+    assert hub["uploads"] == 0
+def test_rescore_all_targets_completed_and_failed_only(hub, dispatch):
+    """Rescore-all hits rows with a zip, skipping pending + zip-less rows."""
+    hub["rows"] = [dict(r) for r in RESCORE_ROWS]
+    dispatched, skipped = admin.rescore_all()
+    # done + broke have zips and aren't pending; legacy has no zip;
+    # inflight is pending (mid-eval) -> neither dispatched.
+    assert dispatched == 2
+    # Zip-less rows are filtered out before the reset, so rescore-all
+    # never reports them as skipped.
+    assert skipped == []
+    # One bulk dispatch call covering exactly the two targets.
+    assert dispatch["calls"] == 1
+    assert set(dispatch["targets"]) == {"done", "broke"}
+    # Both targeted rows are now pending.
+    assert _row(hub["rows"], "done")["status"] == "pending"
+    assert _row(hub["rows"], "broke")["status"] == "pending"
+    # The pending in-flight row is left strictly alone.
+    assert _row(hub["rows"], "inflight")["status"] == "pending"
+    # The legacy row keeps its old completed score.
+    assert _row(hub["rows"], "legacy")["status"] == "completed"
+def test_rescore_all_empty_board_raises(hub, dispatch):
+    """With no eligible row, rescore-all raises and neither writes nor dispatches."""
+    pending_row = {
+        "submission_id": "inflight",
+        "status": "pending",
+        "submission_blob_url": "https://blob/x.zip",
+    }
+    zipless_row = {
+        "submission_id": "legacy",
+        "status": "completed",
+        "submission_blob_url": None,
+    }
+    hub["rows"] = [pending_row, zipless_row]
+    with pytest.raises(ValueError):
+        admin.rescore_all()
+    assert dispatch["calls"] == 0
+    assert hub["uploads"] == 0