Michael Rabinovich Cursor committed on
Commit
2893b22
·
1 Parent(s): c7f83a5

leaderboard: admin rescore (selected + all) reusing the eval pipeline

Browse files

Add a maintainer-only rescore that re-evaluates submissions against the
current ground truth/data, reusing the submit worker end to end: each
target row is reset to the pending regime (scores cleared, submitted_at
preserved) and _spawn_worker is re-dispatched, which re-renders the
gallery, regenerates reports/<id>.{html,json}, and recomputes the score.

- rescore_rows(ids): selected rows; LookupError on unknown id before any
write. rescore_all(): every row with a stored zip that isn't pending
(skips legacy zip-less seed rows + in-flight evals).
- Bulk dispatch is staggered on a background thread so a board-wide
rescore doesn't fire N run_job calls at once; fully re-runnable, so an
interrupted rescore converges on a retry.
- UI in the Admin tab: "Rescore selected" behind a confirm checkbox and
"Rescore ALL" behind a type-to-confirm phrase, both admin-gated with a
server-side re-check and disarmed after firing / on load.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (3) hide show
  1. admin.py +207 -0
  2. app.py +194 -6
  3. tests/test_admin.py +158 -0
admin.py CHANGED
@@ -26,8 +26,11 @@ locking story.
26
  """
27
  from __future__ import annotations
28
 
 
29
  import logging
30
  import os
 
 
31
  from typing import Any, Iterable
32
 
33
  import gradio as gr
@@ -41,7 +44,9 @@ from submit import (
41
  REPORTS_DIR,
42
  SUBMISSIONS_DIR,
43
  _HF_API,
 
44
  _hub_rmw_results,
 
45
  )
46
 
47
  logger = logging.getLogger(__name__)
@@ -61,6 +66,30 @@ _JOB_TERMINAL_STAGES: frozenset[str] = frozenset(
61
  # validation policy doc.
62
  VALID_METHODS: tuple[str, ...] = ("code", "traces", "api", "manual")
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  def admin_usernames() -> set[str]:
66
  """Parse ``CADGENBENCH_ADMINS`` into a set of HF usernames.
@@ -160,6 +189,184 @@ def demote_rows(submission_ids: Iterable[str]) -> None:
160
  )
161
 
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def delete_rows(submission_ids: Iterable[str]) -> None:
164
  """Permanently delete every listed submission: artifacts then row.
165
 
 
26
  """
27
  from __future__ import annotations
28
 
29
+ import json
30
  import logging
31
  import os
32
+ import threading
33
+ import time
34
  from typing import Any, Iterable
35
 
36
  import gradio as gr
 
44
  REPORTS_DIR,
45
  SUBMISSIONS_DIR,
46
  _HF_API,
47
+ _download_results_jsonl,
48
  _hub_rmw_results,
49
+ _spawn_worker,
50
  )
51
 
52
  logger = logging.getLogger(__name__)
 
66
  # validation policy doc.
67
  VALID_METHODS: tuple[str, ...] = ("code", "traces", "api", "manual")
68
 
69
+ # Score-shaped fields cleared when a row is flipped back to ``pending``
70
+ # for a rescore. Mirrors the pending regime in
71
+ # cadgenbench-submissions/schema.md: every aggregate is ``null`` until
72
+ # the fresh eval flips the row back to ``completed``. ``submitted_at``
73
+ # is intentionally *not* touched -- the schema defines it as the
74
+ # immutable timestamp the row was first written, so a rescore preserves
75
+ # the original submit provenance.
76
+ _RESCORE_CLEARED_SCORE_FIELDS: tuple[str, ...] = (
77
+ "aggregate_score",
78
+ "validity_rate",
79
+ "score_by_task_type",
80
+ "per_task_scores",
81
+ "per_fixture_scores",
82
+ "per_fixture_breakdown",
83
+ )
84
+
85
+ # Gap between successive worker dispatches in a bulk rescore. Each
86
+ # worker dispatches its own HF Job and then polls; staggering the
87
+ # starts keeps a rescore-all from firing N ``run_job`` control-plane
88
+ # calls in one burst (which can rate-limit) while HF's own queue
89
+ # absorbs anything past the account's concurrent-slot cap. Small enough
90
+ # to be invisible for a one-or-two-row rescore.
91
+ RESCORE_DISPATCH_STAGGER_SECONDS = 2.0
92
+
93
 
94
  def admin_usernames() -> set[str]:
95
  """Parse ``CADGENBENCH_ADMINS`` into a set of HF usernames.
 
189
  )
190
 
191
 
192
def _current_fixture_names() -> list[str]:
    """Return the fixture names exposed by the live data inputs directory.

    Lists the immediate sub-directories of
    ``cadgenbench.common.paths.data_inputs_dir()`` and returns their names
    in sorted order. A rescore reads the *current* directory rather than
    whatever fixture set a submission was first scored against, so a
    stored zip is re-evaluated against the data as it stands now.
    """
    # Function-local import, exactly as in the original.
    from cadgenbench.common.paths import data_inputs_dir

    names = [
        entry.name
        for entry in data_inputs_dir().iterdir()
        if entry.is_dir()
    ]
    names.sort()
    return names
206
+
207
+
208
def _dispatch_rescore_workers(
    targets: dict[str, str], fixture_names: list[str],
) -> None:
    """Spawn one eval worker per target from a staggered background thread.

    *targets* maps ``submission_id -> submission_blob_url``. The dispatch
    loop runs on its own daemon thread, so this function returns as soon
    as the thread is started instead of blocking while every worker is
    kicked off. Successive ``_spawn_worker`` calls are separated by
    ``RESCORE_DISPATCH_STAGGER_SECONDS``; the first one fires immediately.

    An empty *targets* map is a no-op: no thread is started.

    Args:
        targets: submissions to re-evaluate, keyed by id.
        fixture_names: fixture set handed to every worker.
    """
    # Snapshot both inputs up front: the loop runs later on another
    # thread and must not observe mutations the caller makes afterwards.
    items = list(targets.items())
    if not items:
        return
    fixtures = list(fixture_names)

    def _run() -> None:
        for index, (submission_id, blob_url) in enumerate(items):
            if index:
                # Stagger every dispatch after the first.
                time.sleep(RESCORE_DISPATCH_STAGGER_SECONDS)
            try:
                _spawn_worker(submission_id, blob_url, fixtures)
            except Exception as e:  # noqa: BLE001 - one bad dispatch must not stall the rest
                logger.exception(
                    "rescore: failed to spawn worker for %s (%s: %s)",
                    submission_id, type(e).__name__, e,
                )

    threading.Thread(
        target=_run, name="cgb-rescore-dispatch", daemon=True,
    ).start()
238
+
239
+
240
def _rescore(ids: set[str], *, require_found: bool) -> tuple[int, list[str]]:
    """Reset *ids* to pending in one write, then dispatch a fresh eval each.

    The fixture set is resolved first, so a failing lookup aborts before
    any row is modified instead of leaving rows ``pending`` with no worker
    driving them. A single ``_hub_rmw_results`` call then resets every
    target row (``status`` set to ``"pending"``; ``failure_reason`` and
    every field in ``_RESCORE_CLEARED_SCORE_FIELDS`` set to ``None``) and
    records its ``submission_blob_url``. Rows without a stored zip are
    left untouched and reported as skipped. ``submitted_at`` is never
    written. After the write returns, the captured rows are handed to
    :func:`_dispatch_rescore_workers`.

    Args:
        ids: submission ids to rescore.
        require_found: when True (selected-rows path) the mutation calls
            ``_raise_for_missing(ids, seen)`` so an unknown id aborts the
            batch before anything is dispatched; when False
            (rescore-all path) ids that are no longer in the file are
            silently ignored.

    Returns:
        ``(dispatched_count, skipped_ids)`` -- how many workers were
        queued and the sorted ids skipped for lacking a stored zip.
    """
    # Resolve fixtures *before* the write: if this raises, no row has
    # been flipped to pending yet.
    fixture_names = _current_fixture_names()

    captured: dict[str, str] = {}
    skipped: set[str] = set()

    def mutate(rows: list[dict[str, Any]]) -> None:
        # Start from a clean slate on every invocation. If the RMW helper
        # re-runs the mutation (e.g. after a conflicting commit -- not
        # visible from here), results from an earlier attempt must not
        # leak into the final dispatch set. Harmless when called once.
        captured.clear()
        skipped.clear()
        seen: set[str] = set()
        for row in rows:
            sid = row.get("submission_id")
            if sid not in ids:
                continue
            seen.add(sid)
            blob_url = row.get("submission_blob_url")
            if not blob_url:
                # No stored zip: nothing to re-evaluate, leave row as-is.
                skipped.add(sid)
                continue
            row["status"] = "pending"
            row["failure_reason"] = None
            for field in _RESCORE_CLEARED_SCORE_FIELDS:
                row[field] = None
            captured[sid] = blob_url
        if require_found:
            _raise_for_missing(ids, seen)

    _hub_rmw_results(
        mutate,
        commit_message=f"rescore: reset {len(ids)} row(s) to pending",
    )

    if captured:
        _dispatch_rescore_workers(captured, fixture_names)
    return len(captured), sorted(skipped)
297
+
298
+
299
def rescore_rows(submission_ids: Iterable[str]) -> tuple[int, list[str]]:
    """Rescore the listed submissions against the current data.

    Normalises *submission_ids* with ``_clean_id_set`` and forwards the
    result to :func:`_rescore` with ``require_found=True``, so every id
    must be present in ``results.jsonl``. Intended for use after a
    ground-truth or metric change that invalidates existing scores.

    Raises:
        ValueError: no ids were given.
        LookupError: one or more ids are absent from ``results.jsonl``;
            no worker is dispatched in that case.

    Returns:
        ``(dispatched_count, skipped_ids)``, where *skipped_ids* are the
        rows that have no stored zip to re-evaluate.
    """
    return _rescore(_clean_id_set(submission_ids), require_found=True)
318
+
319
+
320
def _rescoreable_ids_from_hub() -> set[str]:
    """Collect the ids eligible for a bulk rescore from ``results.jsonl``.

    Downloads the file via ``_download_results_jsonl`` and scans it line
    by line. A row qualifies when it has a truthy ``submission_blob_url``
    (there is a zip to re-evaluate), its ``status`` is not ``"pending"``
    (so an in-flight eval is not dispatched twice), and it carries a
    truthy ``submission_id``. Completed and failed rows both qualify.

    Blank lines, lines that are not valid JSON, and lines whose JSON
    value is not an object are skipped rather than aborting the scan.
    """
    ids: set[str] = set()
    for line in _download_results_jsonl().splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # ``json.loads`` can return any JSON value (list, string, number,
        # null); only objects have the row fields read below.
        if not isinstance(row, dict):
            continue
        if not row.get("submission_blob_url"):
            continue
        if row.get("status") == "pending":
            continue
        sid = row.get("submission_id")
        if sid:
            ids.add(sid)
    return ids
346
+
347
+
348
def rescore_all() -> tuple[int, list[str]]:
    """Rescore every eligible submission on the board.

    Eligibility is decided by :func:`_rescoreable_ids_from_hub`: a row
    needs a stored zip and must not already be pending. The resulting id
    set is passed to :func:`_rescore` with ``require_found=False``.

    Raises:
        ValueError: no row is eligible (empty board, or every row is
            pending / lacks a stored zip).

    Returns:
        ``(dispatched_count, skipped_ids)``.
    """
    eligible = _rescoreable_ids_from_hub()
    if eligible:
        return _rescore(eligible, require_found=False)
    raise ValueError(
        "No rescoreable submissions (every row is pending or has no "
        "stored zip)."
    )
368
+
369
+
370
  def delete_rows(submission_ids: Iterable[str]) -> None:
371
  """Permanently delete every listed submission: artifacts then row.
372
 
app.py CHANGED
@@ -66,6 +66,8 @@ from admin import (
66
  demote_rows,
67
  is_admin,
68
  promote_rows,
 
 
69
  stop_and_delete_rows,
70
  )
71
  from submit import handle_submit
@@ -265,7 +267,7 @@ def _gate_admin_controls(
265
  profile: gr.OAuthProfile | None,
266
  ) -> tuple[
267
  gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
268
- gr.Button, str,
269
  ]:
270
  """Enable the admin controls only for a logged-in user in the admin set.
271
 
@@ -274,8 +276,9 @@ def _gate_admin_controls(
274
  staying pinned to whatever rows existed when the Space process
275
  booted. Non-admins and logged-out visitors get the tab with the
276
  table read-only and every control disabled, mirroring the server-side
277
- re-check in each handler. The delete + stop-and-delete buttons always
278
- load disarmed: they only enable once the confirm checkbox is ticked.
 
279
  """
280
  admin_df, error = _safe_load_admin()
281
  if error:
@@ -298,6 +301,10 @@ def _gate_admin_controls(
298
  gr.Checkbox(interactive=admin, value=False),
299
  gr.Button(interactive=False),
300
  gr.Button(interactive=False),
 
 
 
 
301
  status,
302
  )
303
 
@@ -489,6 +496,118 @@ def _admin_stop_delete(
489
  )
490
 
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  @lru_cache(maxsize=128)
493
  def _fetch_report_html(submission_id: str) -> bytes | None:
494
  """Pull ``reports/<id>.html`` off the submissions dataset.
@@ -948,9 +1067,10 @@ to publish the resulting row on the public leaderboard.
948
  "## Admin\n"
949
  "Tick rows in the **select** column, then promote them into the "
950
  "**Validated** tier (recording an evidence type), demote them back "
951
- "to **Unvalidated**, or delete them. Actions apply to every ticked "
952
- "row at once. Limited to maintainers in the admin set; everyone "
953
- "else sees the tab with the controls disabled."
 
954
  )
955
  admin_login_btn = gr.LoginButton()
956
  admin_status = gr.Markdown(
@@ -1009,6 +1129,44 @@ to publish the resulting row on the public leaderboard.
1009
  "Stop & delete selected", variant="stop",
1010
  interactive=False,
1011
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1012
  admin_refresh_btn = gr.Button("Refresh", size="sm")
1013
 
1014
  admin_table.change(
@@ -1047,6 +1205,32 @@ to publish the resulting row on the public leaderboard.
1047
  delete_confirm, delete_btn, stop_delete_btn,
1048
  ],
1049
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1050
  admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
1051
 
1052
  # Keep the admin table on the same 10s cadence as the leaderboard
@@ -1095,6 +1279,10 @@ to publish the resulting row on the public leaderboard.
1095
  delete_confirm,
1096
  delete_btn,
1097
  stop_delete_btn,
 
 
 
 
1098
  admin_status,
1099
  ],
1100
  )
 
66
  demote_rows,
67
  is_admin,
68
  promote_rows,
69
+ rescore_all,
70
+ rescore_rows,
71
  stop_and_delete_rows,
72
  )
73
  from submit import handle_submit
 
267
  profile: gr.OAuthProfile | None,
268
  ) -> tuple[
269
  gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
270
+ gr.Button, gr.Checkbox, gr.Button, gr.Textbox, gr.Button, str,
271
  ]:
272
  """Enable the admin controls only for a logged-in user in the admin set.
273
 
 
276
  staying pinned to whatever rows existed when the Space process
277
  booted. Non-admins and logged-out visitors get the tab with the
278
  table read-only and every control disabled, mirroring the server-side
279
+ re-check in each handler. The armed-by-confirmation buttons (delete,
280
+ stop-and-delete, rescore-selected, rescore-all) always load disarmed:
281
+ they only enable once their confirm box is ticked / phrase typed.
282
  """
283
  admin_df, error = _safe_load_admin()
284
  if error:
 
301
  gr.Checkbox(interactive=admin, value=False),
302
  gr.Button(interactive=False),
303
  gr.Button(interactive=False),
304
+ gr.Checkbox(interactive=admin, value=False),
305
+ gr.Button(interactive=False),
306
+ gr.Textbox(interactive=admin, value=""),
307
+ gr.Button(interactive=False),
308
  status,
309
  )
310
 
 
496
  )
497
 
498
 
499
+ # Exact phrase an admin must type to arm the board-wide rescore. A
500
+ # free-text match (not a checkbox) is the deliberate "are you sure"
501
+ # friction: it can't be tripped by a stray click and forces the admin
502
+ # to consciously type the words before the heavy, score-invalidating
503
+ # action arms.
504
+ RESCORE_ALL_PHRASE = "RESCORE ALL"
505
+
506
+
507
def _arm_rescore_selected(
    confirm: bool, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Enable the rescore-selected button for an admin who ticked confirm.

    The admin check only runs once the box is ticked; an unticked box
    always yields a disabled button.
    """
    if not confirm:
        return gr.Button(interactive=False)
    return gr.Button(interactive=is_admin(profile))
512
+
513
+
514
def _arm_rescore_all(
    phrase: str | None, profile: gr.OAuthProfile | None,
) -> gr.Button:
    """Enable the rescore-all button on an exact phrase match by an admin.

    Surrounding whitespace in the typed text is ignored; anything other
    than ``RESCORE_ALL_PHRASE`` leaves the button disabled without
    consulting the admin set.
    """
    typed = phrase.strip() if phrase else ""
    if typed != RESCORE_ALL_PHRASE:
        return gr.Button(interactive=False)
    return gr.Button(interactive=is_admin(profile))
520
+
521
+
522
def _rescore_result_message(dispatched: int, skipped: list[str]) -> str:
    """Build the toast text summarising a rescore dispatch.

    Args:
        dispatched: number of workers that were queued.
        skipped: ids that were skipped because they have no stored zip.

    Returns:
        A one-paragraph message. When nothing was dispatched the text
        says so, rather than claiming rows were flipped to pending and
        are re-evaluating. A note about skipped rows is appended
        whenever *skipped* is non-empty.
    """
    if dispatched > 0:
        msg = (
            f"Rescoring {dispatched} submission(s): rows flipped to pending and "
            f"re-evaluating in the background. The leaderboard repopulates as "
            f"each finishes."
        )
    else:
        # Every selected row was skipped: nothing changed on the board.
        msg = "No submissions were rescored: nothing was dispatched."
    if skipped:
        msg += (
            f" Skipped {len(skipped)} row(s) with no stored zip (legacy seed "
            f"rows can't be rescored)."
        )
    return msg
535
+
536
+
537
def _admin_rescore_selected(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
]:
    """Re-evaluate the ticked rows, refresh the views, then disarm.

    Guards, in order: a server-side ``is_admin`` re-check, an explicit
    confirm tick, and a non-empty selection. On success a toast
    summarises the dispatch and the admin table, both leaderboard views
    and the gallery HTML are reloaded. The confirm box is reset and the
    button disabled on the way out, so the next rescore needs a fresh
    confirm.

    Raises:
        gr.Error: the caller is not an admin, the confirm box is
            unticked, no row is ticked, or ``rescore_rows`` rejected the
            selection (chained from the underlying ``LookupError`` /
            ``ValueError``).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable rescore.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        dispatched, skipped = rescore_rows(ids)
    except (LookupError, ValueError) as e:
        # Chain the cause so the original traceback survives in logs.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
    )
573
+
574
+
575
def _admin_rescore_all(
    phrase: str | None,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Textbox, gr.Button,
]:
    """Re-evaluate every rescoreable row, refresh the views, then disarm.

    Re-checks ``is_admin`` and the exact confirmation phrase server-side,
    so a client that re-enables the button still cannot fire the action.
    On success a toast summarises the dispatch and the admin table, both
    leaderboard views and the gallery HTML are reloaded. The phrase box
    is cleared and the button disabled on the way out.

    Raises:
        gr.Error: the caller is not an admin, the typed phrase does not
            equal ``RESCORE_ALL_PHRASE`` after stripping whitespace, or
            ``rescore_all`` found nothing to rescore (chained from the
            underlying ``ValueError``).
    """
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if (phrase or "").strip() != RESCORE_ALL_PHRASE:
        raise gr.Error(
            f"Type '{RESCORE_ALL_PHRASE}' exactly to confirm a full rescore."
        )
    try:
        dispatched, skipped = rescore_all()
    except ValueError as e:
        # Chain the cause so the original traceback survives in logs.
        raise gr.Error(str(e)) from e
    gr.Info(_rescore_result_message(dispatched, skipped))
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        gr.Textbox(value=""),
        gr.Button(interactive=False),
    )
609
+
610
+
611
  @lru_cache(maxsize=128)
612
  def _fetch_report_html(submission_id: str) -> bytes | None:
613
  """Pull ``reports/<id>.html`` off the submissions dataset.
 
1067
  "## Admin\n"
1068
  "Tick rows in the **select** column, then promote them into the "
1069
  "**Validated** tier (recording an evidence type), demote them back "
1070
+ "to **Unvalidated**, delete them, or rescore them against the "
1071
+ "current ground truth. Actions apply to every ticked row at once. "
1072
+ "Limited to maintainers in the admin set; everyone else sees the "
1073
+ "tab with the controls disabled."
1074
  )
1075
  admin_login_btn = gr.LoginButton()
1076
  admin_status = gr.Markdown(
 
1129
  "Stop & delete selected", variant="stop",
1130
  interactive=False,
1131
  )
1132
+ with gr.Accordion("Danger zone: rescore", open=False):
1133
+ gr.Markdown(
1134
+ "Re-evaluates submissions against the **current** "
1135
+ "ground truth + data: each row flips back to pending, the "
1136
+ "gallery renders and the per-submission report HTML are "
1137
+ "regenerated, and the score is recomputed. Use after a "
1138
+ "ground-truth swap or a metric change that invalidates the "
1139
+ "existing scores.\n\n"
1140
+ "Rescoring is **re-runnable**: if a row's eval fails, mark it "
1141
+ "and rescore again (or rescore all) — each run is "
1142
+ "independent and converges.\n\n"
1143
+ "- **Rescore selected** re-evaluates the ticked rows.\n"
1144
+ f"- **Rescore all** re-evaluates every submission that has a "
1145
+ f"stored zip and isn't already pending — type "
1146
+ f"`{RESCORE_ALL_PHRASE}` to arm it."
1147
+ )
1148
+ rescore_confirm = gr.Checkbox(
1149
+ label=(
1150
+ "I understand this flips the selected rows to pending and "
1151
+ "recomputes their scores."
1152
+ ),
1153
+ value=False,
1154
+ interactive=False,
1155
+ )
1156
+ rescore_selected_btn = gr.Button(
1157
+ "Rescore selected", variant="stop", interactive=False,
1158
+ )
1159
+ rescore_all_phrase = gr.Textbox(
1160
+ label=(
1161
+ f"Type '{RESCORE_ALL_PHRASE}' to arm the board-wide "
1162
+ f"rescore"
1163
+ ),
1164
+ placeholder=RESCORE_ALL_PHRASE,
1165
+ interactive=False,
1166
+ )
1167
+ rescore_all_btn = gr.Button(
1168
+ "Rescore ALL submissions", variant="stop", interactive=False,
1169
+ )
1170
  admin_refresh_btn = gr.Button("Refresh", size="sm")
1171
 
1172
  admin_table.change(
 
1205
  delete_confirm, delete_btn, stop_delete_btn,
1206
  ],
1207
  )
1208
+ rescore_confirm.change(
1209
+ fn=_arm_rescore_selected,
1210
+ inputs=[rescore_confirm],
1211
+ outputs=[rescore_selected_btn],
1212
+ )
1213
+ rescore_selected_btn.click(
1214
+ fn=_admin_rescore_selected,
1215
+ inputs=[admin_table, rescore_confirm],
1216
+ outputs=[
1217
+ admin_table, validated_view, unvalidated_view, gallery_html,
1218
+ rescore_confirm, rescore_selected_btn,
1219
+ ],
1220
+ )
1221
+ rescore_all_phrase.change(
1222
+ fn=_arm_rescore_all,
1223
+ inputs=[rescore_all_phrase],
1224
+ outputs=[rescore_all_btn],
1225
+ )
1226
+ rescore_all_btn.click(
1227
+ fn=_admin_rescore_all,
1228
+ inputs=[rescore_all_phrase],
1229
+ outputs=[
1230
+ admin_table, validated_view, unvalidated_view, gallery_html,
1231
+ rescore_all_phrase, rescore_all_btn,
1232
+ ],
1233
+ )
1234
  admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
1235
 
1236
  # Keep the admin table on the same 10s cadence as the leaderboard
 
1279
  delete_confirm,
1280
  delete_btn,
1281
  stop_delete_btn,
1282
+ rescore_confirm,
1283
+ rescore_selected_btn,
1284
+ rescore_all_phrase,
1285
+ rescore_all_btn,
1286
  admin_status,
1287
  ],
1288
  )
tests/test_admin.py CHANGED
@@ -88,6 +88,10 @@ def hub(monkeypatch):
88
  state["bucket_deleted_paths"].extend(delete or [])
89
 
90
  monkeypatch.setattr(submit, "_download_results_jsonl", fake_download)
 
 
 
 
91
  monkeypatch.setattr(submit._HF_API, "upload_file", fake_upload)
92
  monkeypatch.setattr(submit._HF_API, "delete_file", fake_delete_file)
93
  monkeypatch.setattr(submit._HF_API, "list_bucket_tree", fake_list_bucket_tree)
@@ -290,3 +294,157 @@ def test_stop_and_delete_empty_selection_raises(hub, jobs):
290
  admin.stop_and_delete_rows([])
291
  assert jobs["cancelled"] == []
292
  assert hub["uploads"] == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  state["bucket_deleted_paths"].extend(delete or [])
89
 
90
  monkeypatch.setattr(submit, "_download_results_jsonl", fake_download)
91
+ # admin.py imported `_download_results_jsonl` by name (used directly by
92
+ # `rescore_all`), so patch that binding too; the RMW path reaches the
93
+ # submit-module reference patched above.
94
+ monkeypatch.setattr(admin, "_download_results_jsonl", fake_download)
95
  monkeypatch.setattr(submit._HF_API, "upload_file", fake_upload)
96
  monkeypatch.setattr(submit._HF_API, "delete_file", fake_delete_file)
97
  monkeypatch.setattr(submit._HF_API, "list_bucket_tree", fake_list_bucket_tree)
 
294
  admin.stop_and_delete_rows([])
295
  assert jobs["cancelled"] == []
296
  assert hub["uploads"] == 0
297
+
298
+
299
+ # --- Rescore -------------------------------------------------------------
300
+
301
+ # Rows shaped for the rescore path: a completed row with scores + a stored
302
+ # zip, a failed row with a zip, a legacy row with no zip, and a pending row
303
+ # (mid-eval). ``submitted_at`` is set so a test can assert it's preserved.
304
+ RESCORE_ROWS = [
305
+ {
306
+ "submission_id": "done",
307
+ "status": "completed",
308
+ "failure_reason": None,
309
+ "submission_blob_url": "https://blob/done.zip",
310
+ "submitted_at": "2026-01-01T00:00:00Z",
311
+ "aggregate_score": 0.7,
312
+ "validity_rate": 1.0,
313
+ "score_by_task_type": {"generation": 0.7},
314
+ "per_task_scores": {"generation": {"score": 0.7}},
315
+ "per_fixture_scores": {"f1": {"cad_score": 0.7}},
316
+ "per_fixture_breakdown": {"f1": {"validity": 1.0}},
317
+ },
318
+ {
319
+ "submission_id": "broke",
320
+ "status": "failed",
321
+ "failure_reason": "boom",
322
+ "submission_blob_url": "https://blob/broke.zip",
323
+ "submitted_at": "2026-01-02T00:00:00Z",
324
+ "aggregate_score": None,
325
+ },
326
+ {
327
+ "submission_id": "legacy",
328
+ "status": "completed",
329
+ "submission_blob_url": None,
330
+ "submitted_at": "2025-01-01T00:00:00Z",
331
+ "aggregate_score": 0.4,
332
+ },
333
+ {
334
+ "submission_id": "inflight",
335
+ "status": "pending",
336
+ "submission_blob_url": "https://blob/inflight.zip",
337
+ "submitted_at": "2026-02-01T00:00:00Z",
338
+ "aggregate_score": None,
339
+ },
340
+ ]
341
+
342
+
343
@pytest.fixture
def dispatch(monkeypatch):
    """Record rescore dispatches instead of spawning workers or threads.

    Swaps ``admin._dispatch_rescore_workers`` for a synchronous recorder
    and stubs ``admin._current_fixture_names`` to return ``["f1", "f2"]``,
    so the suite never touches the data repo. The returned dict exposes
    ``targets`` (the ``{id: blob_url}`` map from the latest call),
    ``fixtures`` (the fixture list from the latest call) and ``calls``
    (how many times dispatch was invoked).
    """
    recorded: dict = {"targets": None, "fixtures": None, "calls": 0}

    def _record(targets, fixture_names):
        recorded["calls"] += 1
        recorded["fixtures"] = list(fixture_names)
        recorded["targets"] = dict(targets)

    monkeypatch.setattr(admin, "_current_fixture_names", lambda: ["f1", "f2"])
    monkeypatch.setattr(admin, "_dispatch_rescore_workers", _record)
    return recorded
363
+
364
+
365
def test_rescore_rows_flips_to_pending_and_dispatches(hub, dispatch):
    """A rescored row enters the pending regime and a worker is queued."""
    hub["rows"] = [dict(r) for r in RESCORE_ROWS]

    n_dispatched, skipped_ids = admin.rescore_rows(["done"])

    assert n_dispatched == 1
    assert skipped_ids == []

    rescored = _row(hub["rows"], "done")
    assert rescored["status"] == "pending"
    assert rescored["failure_reason"] is None
    # Each score-shaped field has been nulled out.
    for name in admin._RESCORE_CLEARED_SCORE_FIELDS:
        assert rescored[name] is None
    # The original submit timestamp is provenance and stays untouched.
    assert rescored["submitted_at"] == "2026-01-01T00:00:00Z"

    # Dispatch received the stored zip url and the current fixture set.
    assert dispatch["targets"] == {"done": "https://blob/done.zip"}
    assert dispatch["fixtures"] == ["f1", "f2"]
    assert hub["uploads"] == 1
384
+
385
+
386
def test_rescore_rows_skips_rows_without_zip(hub, dispatch):
    """A row with no stored zip is reported as skipped, not dispatched."""
    hub["rows"] = [dict(r) for r in RESCORE_ROWS]

    n_dispatched, skipped_ids = admin.rescore_rows(["legacy"])

    assert n_dispatched == 0
    assert skipped_ids == ["legacy"]

    # The legacy row keeps its completed status and its score.
    untouched = _row(hub["rows"], "legacy")
    assert untouched["status"] == "completed"
    assert untouched["aggregate_score"] == 0.4

    # No worker was queued ...
    assert dispatch["calls"] == 0
    # ... but the single read-modify-write still ran, flipping nothing.
    assert hub["uploads"] == 1
401
+
402
+
403
def test_rescore_rows_missing_id_raises_without_dispatch(hub, dispatch):
    """An unknown id aborts the batch before any write or dispatch."""
    hub["rows"] = [dict(r) for r in RESCORE_ROWS]
    with pytest.raises(LookupError):
        admin.rescore_rows(["done", "ghost"])
    # The known row in the batch was not flipped ...
    assert _row(hub["rows"], "done")["status"] == "completed"
    # ... nothing was written back to the hub ...
    assert hub["uploads"] == 0
    # ... and no worker was queued.
    assert dispatch["calls"] == 0
412
+
413
+
414
def test_rescore_rows_empty_selection_raises(hub, dispatch):
    """An empty selection is a caller error: no write, no dispatch."""
    with pytest.raises(ValueError):
        admin.rescore_rows([])
    assert dispatch["calls"] == 0
    # Same no-write guarantee the sibling empty-selection tests pin.
    assert hub["uploads"] == 0
419
+
420
+
421
def test_rescore_all_targets_completed_and_failed_only(hub, dispatch):
    """Rescore-all hits rows with a zip, skipping pending + zip-less rows."""
    hub["rows"] = [dict(r) for r in RESCORE_ROWS]
    dispatched, skipped = admin.rescore_all()

    # done + broke have zips and aren't pending; legacy has no zip;
    # inflight is pending (mid-eval) -> neither dispatched.
    assert dispatched == 2
    # Zip-less rows are filtered out before the reset, so none of them
    # reaches the skipped list on this path.
    assert skipped == []
    # Each target is dispatched with its own stored zip url.
    assert dispatch["targets"] == {
        "done": "https://blob/done.zip",
        "broke": "https://blob/broke.zip",
    }
    # Both targeted rows are now pending.
    assert _row(hub["rows"], "done")["status"] == "pending"
    assert _row(hub["rows"], "broke")["status"] == "pending"
    # The pending in-flight row is left strictly alone.
    assert _row(hub["rows"], "inflight")["status"] == "pending"
    # The legacy row keeps its old completed score.
    assert _row(hub["rows"], "legacy")["status"] == "completed"
437
+
438
+
439
def test_rescore_all_empty_board_raises(hub, dispatch):
    """With nothing rescoreable, rescore-all raises and writes nothing."""
    pending_row = {
        "submission_id": "inflight",
        "status": "pending",
        "submission_blob_url": "https://blob/x.zip",
    }
    zipless_row = {
        "submission_id": "legacy",
        "status": "completed",
        "submission_blob_url": None,
    }
    hub["rows"] = [pending_row, zipless_row]

    with pytest.raises(ValueError):
        admin.rescore_all()

    assert dispatch["calls"] == 0
    assert hub["uploads"] == 0