Michael Rabinovich committed on
Commit
c4b5d70
·
1 Parent(s): b0f4559

admin: stop pending jobs before delete

Browse files

Add an admin action that cancels matching in-flight HF Jobs before deleting submissions, and keep the admin table refreshed so pending rows stay visible.

Files changed (3) hide show
  1. admin.py +82 -0
  2. app.py +237 -18
  3. tests/test_admin.py +95 -0
admin.py CHANGED
@@ -31,9 +31,11 @@ import os
31
  from typing import Any, Iterable
32
 
33
  import gradio as gr
 
34
  from huggingface_hub.errors import EntryNotFoundError
35
 
36
  from submit import (
 
37
  HF_SUBMISSIONS_REPO,
38
  REPORTS_DIR,
39
  SUBMISSIONS_DIR,
@@ -45,6 +47,14 @@ logger = logging.getLogger(__name__)
45
 
46
  ADMINS_ENV = "CADGENBENCH_ADMINS"
47
 
 
 
 
 
 
 
 
 
48
  # The evidence types accepted on promotion. Mirrors the
49
  # `validation_method` enum in cadgenbench-submissions/schema.md and the
50
  # validation policy doc.
@@ -193,6 +203,78 @@ def delete_rows(submission_ids: Iterable[str]) -> None:
193
  )
194
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  def _raise_for_missing(requested: set[str], seen: set[str]) -> None:
197
  """Raise ``LookupError`` if any requested id was not found in the rows."""
198
  missing = requested - seen
 
31
  from typing import Any, Iterable
32
 
33
  import gradio as gr
34
+ from huggingface_hub import cancel_job, list_jobs
35
  from huggingface_hub.errors import EntryNotFoundError
36
 
37
  from submit import (
38
+ EVAL_JOB_NAMESPACE,
39
  HF_SUBMISSIONS_REPO,
40
  REPORTS_DIR,
41
  SUBMISSIONS_DIR,
 
47
 
48
  ADMINS_ENV = "CADGENBENCH_ADMINS"
49
 
50
+ # HF Job stages that are already finished: cancelling one is a no-op (and
51
+ # usually an error), so the stop step skips them. Mirrors
52
+ # huggingface_hub.JobStage; kept as plain strings so a new terminal
53
+ # stage name added upstream doesn't import-break this module.
54
+ _JOB_TERMINAL_STAGES: frozenset[str] = frozenset(
55
+ {"COMPLETED", "ERROR", "CANCELED", "DELETED"}
56
+ )
57
+
58
  # The evidence types accepted on promotion. Mirrors the
59
  # `validation_method` enum in cadgenbench-submissions/schema.md and the
60
  # validation policy doc.
 
203
  )
204
 
205
 
206
def _cancel_jobs_for_submissions(ids: set[str]) -> int:
    """Best-effort cancel every non-terminal eval Job for one of *ids*.

    Each eval Job is dispatched with its ``submission_id`` baked into the
    command argv (see :func:`submit._dispatch_eval_command`), so there's
    no need to persist a ``job_id`` on the row: we list the eval
    account's jobs and cancel any still-running one whose argv contains a
    target id as an exact element. This also catches a submission's shard
    jobs, since each shard carries the same id in its command.

    Never raises. A job that already finished, a listing failure, a
    malformed job entry, or a cancel race must not block the row delete
    that follows -- the GPU job carries its own ``--timeout`` and
    self-reaps if a cancel is missed.

    Args:
        ids: Submission ids whose in-flight jobs should be cancelled.

    Returns:
        The count of cancel calls that succeeded (for logging only).
    """
    wanted = frozenset(ids)
    if not wanted:
        # Nothing can match, so don't spend a Jobs-API round trip.
        return 0

    token = os.environ.get("HF_TOKEN")
    try:
        jobs = list_jobs(namespace=EVAL_JOB_NAMESPACE, token=token)
    except Exception as e:  # noqa: BLE001 - listing is best-effort
        logger.warning(
            "list_jobs(%s) failed (%s: %s); skipping job cancel, deleting rows",
            EVAL_JOB_NAMESPACE, type(e).__name__, e,
        )
        return 0

    cancelled = 0
    for job in jobs or ():
        # Read the id defensively so the except-branch log can never raise.
        job_id = getattr(job, "id", None)
        try:
            stage = getattr(getattr(job, "status", None), "stage", None)
            # Accept either a plain string or an enum member carrying one.
            stage = getattr(stage, "value", stage)
            if stage in _JOB_TERMINAL_STAGES:
                continue
            argv = [
                *(getattr(job, "command", None) or ()),
                *(getattr(job, "arguments", None) or ()),
            ]
            # Exact-element match: "alpha" must not hit "alpha2".
            if wanted.isdisjoint(argv):
                continue
            cancel_job(
                job_id=job_id, namespace=EVAL_JOB_NAMESPACE, token=token,
            )
        except Exception as e:  # noqa: BLE001 - cancel is best-effort
            logger.warning(
                "cancel_job(%s) failed (%s: %s); deleting row anyway",
                job_id, type(e).__name__, e,
            )
        else:
            cancelled += 1
            logger.info(
                "Cancelled eval job %s (stage %s) before delete", job_id, stage,
            )
    return cancelled
255
+
256
+
257
def stop_and_delete_rows(submission_ids: Iterable[str]) -> None:
    """Stop any in-flight eval Job(s) for the given rows, then delete the rows.

    Two steps, in order: :func:`_cancel_jobs_for_submissions` (best-effort,
    documented as never raising) followed by the existing
    :func:`delete_rows`. Intended for a stuck or pending submission whose
    GPU job is still running.

    Raises:
        ValueError: no ids were given.
    """
    targets = _clean_id_set(submission_ids)
    n_cancelled = _cancel_jobs_for_submissions(targets)
    logger.info(
        "stop_and_delete: cancelled %d job(s) for %d submission(s)",
        n_cancelled,
        len(targets),
    )
    delete_rows(targets)
276
+
277
+
278
  def _raise_for_missing(requested: set[str], seen: set[str]) -> None:
279
  """Raise ``LookupError`` if any requested id was not found in the rows."""
280
  missing = requested - seen
app.py CHANGED
@@ -64,6 +64,7 @@ from admin import (
64
  demote_rows,
65
  is_admin,
66
  promote_rows,
 
67
  )
68
  from submit import handle_submit
69
 
@@ -297,7 +298,10 @@ def _admin_selection_status(table_df: pd.DataFrame | None) -> str:
297
 
298
  def _gate_admin_controls(
299
  profile: gr.OAuthProfile | None,
300
- ) -> tuple[gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button, str]:
 
 
 
301
  """Enable the admin controls only for a logged-in user in the admin set.
302
 
303
  Runs on every page load and re-runs on LoginButton auth events, so
@@ -305,8 +309,8 @@ def _gate_admin_controls(
305
  staying pinned to whatever rows existed when the Space process
306
  booted. Non-admins and logged-out visitors get the tab with the
307
  table read-only and every control disabled, mirroring the server-side
308
- re-check in each handler. The delete button always loads disarmed:
309
- it only enables once the confirm checkbox is ticked.
310
  """
311
  admin_df, error = _safe_load_admin()
312
  if error:
@@ -328,15 +332,21 @@ def _gate_admin_controls(
328
  gr.Button(interactive=admin),
329
  gr.Checkbox(interactive=admin, value=False),
330
  gr.Button(interactive=False),
 
331
  status,
332
  )
333
 
334
 
335
  def _arm_delete(
336
  confirm: bool, profile: gr.OAuthProfile | None,
337
- ) -> gr.Button:
338
- """Enable the delete button only when an admin has ticked the confirm box."""
339
- return gr.Button(interactive=bool(confirm) and is_admin(profile))
 
 
 
 
 
340
 
341
 
342
  def _refresh_admin_table() -> pd.DataFrame:
@@ -352,6 +362,43 @@ def _refresh_admin_table() -> pd.DataFrame:
352
  return admin_df
353
 
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  def _admin_promote(
356
  table_df: pd.DataFrame | None,
357
  method: str | None,
@@ -403,11 +450,14 @@ def _admin_delete(
403
  table_df: pd.DataFrame | None,
404
  confirm: bool,
405
  profile: gr.OAuthProfile | None,
406
- ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button]:
 
 
 
407
  """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.
408
 
409
- Resets the confirm checkbox and re-disables the delete button on
410
- the way out so the next deletion needs a fresh, deliberate confirm.
411
  """
412
  if not is_admin(profile):
413
  raise gr.Error("You are not in the admin set.")
@@ -430,6 +480,47 @@ def _admin_delete(
430
  _gallery_iframe_html(),
431
  gr.Checkbox(value=False),
432
  gr.Button(interactive=False),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  )
434
 
435
 
@@ -601,16 +692,99 @@ def _data_uri(png_bytes: bytes | None) -> str | None:
601
  return "data:image/png;base64," + base64.b64encode(png_bytes).decode("ascii")
602
 
603
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  def _render_data_uri(submission_id: str, fixture: str) -> str | None:
605
- """Resolver for a submission's per-fixture gallery thumbnail."""
 
 
 
 
606
  return _data_uri(_fetch_render(submission_id, fixture))
607
 
608
 
609
  def _gt_data_uri(fixture: str) -> str | None:
610
- """Resolver for a fixture's ground-truth gallery thumbnail."""
611
  return _data_uri(_fetch_gt_render(fixture))
612
 
613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
  def _gallery_iframe_html() -> str:
615
  """Build the gallery as a self-contained ``srcdoc`` iframe.
616
 
@@ -625,7 +799,8 @@ def _gallery_iframe_html() -> str:
625
  except LeaderboardDataError:
626
  logger.exception("Gallery row load failed; rendering empty gallery")
627
  rows = []
628
- doc = render_gallery_page(rows, _render_data_uri, _gt_data_uri)
 
629
  escaped = html.escape(doc, quote=True)
630
  return (
631
  f'<iframe srcdoc="{escaped}" '
@@ -855,7 +1030,10 @@ to publish the resulting row on the public leaderboard.
855
  gr.Markdown(
856
  "Permanently deletes the ticked rows **and** their uploaded "
857
  "zip + report files from the submissions dataset. This cannot "
858
- "be undone (only a manual revert of the dataset commit)."
 
 
 
859
  )
860
  delete_confirm = gr.Checkbox(
861
  label=(
@@ -865,9 +1043,14 @@ to publish the resulting row on the public leaderboard.
865
  value=False,
866
  interactive=False,
867
  )
868
- delete_btn = gr.Button(
869
- "Delete selected", variant="stop", interactive=False,
870
- )
 
 
 
 
 
871
  admin_refresh_btn = gr.Button("Refresh", size="sm")
872
 
873
  admin_table.change(
@@ -886,18 +1069,40 @@ to publish the resulting row on the public leaderboard.
886
  outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
887
  )
888
  delete_confirm.change(
889
- fn=_arm_delete, inputs=[delete_confirm], outputs=delete_btn,
 
 
890
  )
891
  delete_btn.click(
892
  fn=_admin_delete,
893
  inputs=[admin_table, delete_confirm],
894
  outputs=[
895
  admin_table, validated_view, unvalidated_view, gallery_html,
896
- delete_confirm, delete_btn,
 
 
 
 
 
 
 
 
897
  ],
898
  )
899
  admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
900
 
 
 
 
 
 
 
 
 
 
 
 
 
901
  # gradio_leaderboard.Leaderboard handles its own update path
902
  # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
903
  # Single tick runs `_auto_refresh_leaderboard` once and pushes the
@@ -930,6 +1135,7 @@ to publish the resulting row on the public leaderboard.
930
  demote_btn,
931
  delete_confirm,
932
  delete_btn,
 
933
  admin_status,
934
  ],
935
  )
@@ -945,6 +1151,19 @@ app.add_api_route(
945
  serve_report,
946
  methods=["GET"],
947
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
948
  app = gr.mount_gradio_app(app, blocks, path="/")
949
 
950
 
 
64
  demote_rows,
65
  is_admin,
66
  promote_rows,
67
+ stop_and_delete_rows,
68
  )
69
  from submit import handle_submit
70
 
 
298
 
299
  def _gate_admin_controls(
300
  profile: gr.OAuthProfile | None,
301
+ ) -> tuple[
302
+ gr.Dataframe, gr.Radio, gr.Button, gr.Button, gr.Checkbox, gr.Button,
303
+ gr.Button, str,
304
+ ]:
305
  """Enable the admin controls only for a logged-in user in the admin set.
306
 
307
  Runs on every page load and re-runs on LoginButton auth events, so
 
309
  staying pinned to whatever rows existed when the Space process
310
  booted. Non-admins and logged-out visitors get the tab with the
311
  table read-only and every control disabled, mirroring the server-side
312
+ re-check in each handler. The delete + stop-and-delete buttons always
313
+ load disarmed: they only enable once the confirm checkbox is ticked.
314
  """
315
  admin_df, error = _safe_load_admin()
316
  if error:
 
332
  gr.Button(interactive=admin),
333
  gr.Checkbox(interactive=admin, value=False),
334
  gr.Button(interactive=False),
335
+ gr.Button(interactive=False),
336
  status,
337
  )
338
 
339
 
340
  def _arm_delete(
341
  confirm: bool, profile: gr.OAuthProfile | None,
342
+ ) -> tuple[gr.Button, gr.Button]:
343
+ """Arm both destructive buttons once an admin ticks the confirm box.
344
+
345
+ The plain delete and the stop-and-delete share the single confirm
346
+ checkbox, so a deliberate tick is required before either fires.
347
+ """
348
+ armed = bool(confirm) and is_admin(profile)
349
+ return gr.Button(interactive=armed), gr.Button(interactive=armed)
350
 
351
 
352
  def _refresh_admin_table() -> pd.DataFrame:
 
362
  return admin_df
363
 
364
 
365
def _reapply_selection(
    fresh: pd.DataFrame, selected: set[str],
) -> pd.DataFrame:
    """Carry previously ticked rows over onto a freshly loaded admin frame.

    A reloaded frame arrives all-unchecked. When there is a prior
    selection and the frame has both the select column and a
    ``submission_id`` column, the select column is overwritten in place
    with "is this row's id (as text) in *selected*". Ids no longer present
    simply match nothing. In every other case the frame is returned
    untouched.
    """
    if not selected:
        return fresh
    columns = fresh.columns
    if ADMIN_SELECT_COL not in columns or "submission_id" not in columns:
        return fresh
    ids_as_text = fresh["submission_id"].astype(str)
    fresh[ADMIN_SELECT_COL] = ids_as_text.isin(selected)
    return fresh
384
+
385
+
386
def _auto_refresh_admin_table(current_df: pd.DataFrame | None) -> pd.DataFrame:
    """Timer-tick handler: reload the admin table while keeping ticked rows.

    Puts the admin table on the same refresh cadence as the leaderboard so
    a row submitted after the tab loaded appears without a manual Refresh.
    It raises no toast. If the load reports an error, the frame currently
    shown is returned unchanged (or the loader's frame when nothing is
    shown yet), so a transient failure neither blanks the table nor drops
    the selection.
    """
    reloaded, load_error = _safe_load_admin()
    if not load_error:
        ticked = set(_selected_ids(current_df))
        return _reapply_selection(reloaded, ticked)
    if current_df is None:
        return reloaded
    return current_df
400
+
401
+
402
  def _admin_promote(
403
  table_df: pd.DataFrame | None,
404
  method: str | None,
 
450
  table_df: pd.DataFrame | None,
451
  confirm: bool,
452
  profile: gr.OAuthProfile | None,
453
+ ) -> tuple[
454
+ pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
455
+ gr.Button,
456
+ ]:
457
  """Delete ticked rows, then refresh admin, leaderboard, gallery, and disarm.
458
 
459
+ Resets the confirm checkbox and re-disables both destructive buttons
460
+ on the way out so the next deletion needs a fresh, deliberate confirm.
461
  """
462
  if not is_admin(profile):
463
  raise gr.Error("You are not in the admin set.")
 
480
  _gallery_iframe_html(),
481
  gr.Checkbox(value=False),
482
  gr.Button(interactive=False),
483
+ gr.Button(interactive=False),
484
+ )
485
+
486
+
487
def _admin_stop_delete(
    table_df: pd.DataFrame | None,
    confirm: bool,
    profile: gr.OAuthProfile | None,
) -> tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, str, gr.Checkbox, gr.Button,
    gr.Button,
]:
    """Stop running eval job(s) for ticked rows, delete them, then disarm.

    Same gating + disarm contract as :func:`_admin_delete`; the only
    difference is it calls :func:`admin.stop_and_delete_rows`, which
    best-effort cancels the submissions' in-flight HF Jobs before
    deleting. Use this for pending rows whose GPU eval is still running.

    Returns:
        Refreshed admin table, validated and unvalidated leaderboard
        frames, gallery iframe HTML, the confirm checkbox reset to
        unticked, and both destructive buttons disabled.

    Raises:
        gr.Error: the caller is not an admin, the confirm box is unticked,
            no row is ticked, or the delete step raised ``ValueError``.
    """
    # Server-side re-check: never trust the client-side button state.
    if not is_admin(profile):
        raise gr.Error("You are not in the admin set.")
    if not confirm:
        raise gr.Error("Tick the confirmation box to enable delete.")
    ids = _selected_ids(table_df)
    if not ids:
        raise gr.Error("Tick at least one row first.")
    try:
        stop_and_delete_rows(ids)
    except ValueError as e:
        # Chain the cause so the original traceback survives in the logs.
        raise gr.Error(str(e)) from e
    gr.Info(f"Stopped + deleted {len(ids)} submission(s).")
    validated, unvalidated, _ = _safe_load_split()
    admin_df, _ = _safe_load_admin()
    return (
        admin_df,
        validated,
        unvalidated,
        _gallery_iframe_html(),
        # Disarm: the next destructive action needs a fresh confirm.
        gr.Checkbox(value=False),
        gr.Button(interactive=False),
        gr.Button(interactive=False),
    )
525
 
526
 
 
692
  return "data:image/png;base64," + base64.b64encode(png_bytes).decode("ascii")
693
 
694
 
695
+ # When the Space is **public**, switch the gallery from inlining base64
696
+ # thumbnails to referencing cached proxy URLs (``/render/...`` /
697
+ # ``/gt-render/...``). That lets the browser lazy-fetch only the ~33
698
+ # on-screen tiles (instead of shipping every fixture x row up front) and
699
+ # lets the CDN/browser cache them hard, so fixture-swaps and repeat
700
+ # visits are essentially free. While **private** this must stay off: HF's
701
+ # edge 404s in-browser fetches to our custom routes, so the only thing
702
+ # that renders in the browser is an inlined data URI. Flip
703
+ # ``GALLERY_PUBLIC=1`` in the Space env once the Space is made public.
704
+ GALLERY_PUBLIC = os.getenv("GALLERY_PUBLIC", "").strip().lower() in {
705
+ "1", "true", "yes", "on",
706
+ }
707
+
708
+ # Long-lived immutable caching: a (submission, fixture) render never
709
+ # changes (fixed camera + lighting; re-renders would be a new artifact),
710
+ # so the browser/CDN can keep it forever. This is what makes fixture
711
+ # swaps and repeat visits free once the Space is public.
712
+ RENDER_CACHE_CONTROL = "public, max-age=31536000, immutable"
713
+
714
+
715
  def _render_data_uri(submission_id: str, fixture: str) -> str | None:
716
+ """Resolver for a submission's per-fixture gallery thumbnail (private mode).
717
+
718
+ Inlines the render as a base64 ``data:`` URI: the only mode that
719
+ works in-browser while the Space is private.
720
+ """
721
  return _data_uri(_fetch_render(submission_id, fixture))
722
 
723
 
724
  def _gt_data_uri(fixture: str) -> str | None:
725
+ """Resolver for a fixture's ground-truth gallery thumbnail (private mode)."""
726
  return _data_uri(_fetch_gt_render(fixture))
727
 
728
 
729
+ def _render_proxy_url(submission_id: str, fixture: str) -> str | None:
730
+ """Resolver returning the cached proxy URL for a submission render.
731
+
732
+ Public mode only. Returns the route string **without** fetching the
733
+ bytes (that's the whole point: the browser lazy-fetches on demand).
734
+ The gallery only calls this for fixtures whose per-fixture status is
735
+ ``valid``; an absolute path resolves against the Space origin even
736
+ inside the iframe ``srcdoc``. A render that 404s (valid status but a
737
+ missing upload) degrades to the dashed cell client-side via the
738
+ ``<img onerror>`` hook.
739
+ """
740
+ return f"/render/{submission_id}/{fixture}.png"
741
+
742
+
743
+ def _gt_proxy_url(fixture: str) -> str | None:
744
+ """Resolver returning the cached proxy URL for a fixture's GT render."""
745
+ return f"/gt-render/{fixture}.png"
746
+
747
+
748
def _gallery_resolvers():
    """Return the ``(render, gt)`` resolver pair matching the Space mode.

    With ``GALLERY_PUBLIC`` set the gallery gets the proxy-URL resolvers;
    otherwise it gets the resolvers that inline base64 data URIs.
    """
    private_pair = (_render_data_uri, _gt_data_uri)
    public_pair = (_render_proxy_url, _gt_proxy_url)
    return public_pair if GALLERY_PUBLIC else private_pair
756
+
757
+
758
def serve_render(submission_id: str, fixture: str) -> Response:
    """Serve a submission's per-fixture render PNG with long-lived caching.

    Backs the ``/render/<id>/<fixture>.png`` route the gallery references
    in public mode (see :data:`GALLERY_PUBLIC`). Responds 404 when the
    fetch yields nothing; otherwise returns the bytes as ``image/png``
    with the ``RENDER_CACHE_CONTROL`` header attached.
    """
    payload = _fetch_render(submission_id, fixture)
    if payload is not None:
        cache_headers = {"Cache-Control": RENDER_CACHE_CONTROL}
        return Response(
            content=payload,
            media_type="image/png",
            headers=cache_headers,
        )
    return Response(status_code=404)
774
+
775
+
776
def serve_gt_render(fixture: str) -> Response:
    """Serve a fixture's ground-truth render PNG with long-lived caching.

    Responds 404 when the fetch yields nothing; otherwise returns the
    bytes as ``image/png`` with the ``RENDER_CACHE_CONTROL`` header.
    """
    payload = _fetch_gt_render(fixture)
    if payload is not None:
        cache_headers = {"Cache-Control": RENDER_CACHE_CONTROL}
        return Response(
            content=payload,
            media_type="image/png",
            headers=cache_headers,
        )
    return Response(status_code=404)
786
+
787
+
788
  def _gallery_iframe_html() -> str:
789
  """Build the gallery as a self-contained ``srcdoc`` iframe.
790
 
 
799
  except LeaderboardDataError:
800
  logger.exception("Gallery row load failed; rendering empty gallery")
801
  rows = []
802
+ render_resolver, gt_resolver = _gallery_resolvers()
803
+ doc = render_gallery_page(rows, render_resolver, gt_resolver)
804
  escaped = html.escape(doc, quote=True)
805
  return (
806
  f'<iframe srcdoc="{escaped}" '
 
1030
  gr.Markdown(
1031
  "Permanently deletes the ticked rows **and** their uploaded "
1032
  "zip + report files from the submissions dataset. This cannot "
1033
+ "be undone (only a manual revert of the dataset commit).\n\n"
1034
+ "**Stop & delete** additionally cancels any still-running "
1035
+ "evaluation job(s) for the ticked rows before deleting — use "
1036
+ "it for pending submissions whose GPU eval is in flight."
1037
  )
1038
  delete_confirm = gr.Checkbox(
1039
  label=(
 
1043
  value=False,
1044
  interactive=False,
1045
  )
1046
+ with gr.Row():
1047
+ delete_btn = gr.Button(
1048
+ "Delete selected", variant="stop", interactive=False,
1049
+ )
1050
+ stop_delete_btn = gr.Button(
1051
+ "Stop & delete selected", variant="stop",
1052
+ interactive=False,
1053
+ )
1054
  admin_refresh_btn = gr.Button("Refresh", size="sm")
1055
 
1056
  admin_table.change(
 
1069
  outputs=[admin_table, validated_view, unvalidated_view, gallery_html],
1070
  )
1071
  delete_confirm.change(
1072
+ fn=_arm_delete,
1073
+ inputs=[delete_confirm],
1074
+ outputs=[delete_btn, stop_delete_btn],
1075
  )
1076
  delete_btn.click(
1077
  fn=_admin_delete,
1078
  inputs=[admin_table, delete_confirm],
1079
  outputs=[
1080
  admin_table, validated_view, unvalidated_view, gallery_html,
1081
+ delete_confirm, delete_btn, stop_delete_btn,
1082
+ ],
1083
+ )
1084
+ stop_delete_btn.click(
1085
+ fn=_admin_stop_delete,
1086
+ inputs=[admin_table, delete_confirm],
1087
+ outputs=[
1088
+ admin_table, validated_view, unvalidated_view, gallery_html,
1089
+ delete_confirm, delete_btn, stop_delete_btn,
1090
  ],
1091
  )
1092
  admin_refresh_btn.click(fn=_refresh_admin_table, outputs=admin_table)
1093
 
1094
+ # Keep the admin table on the same 10s cadence as the leaderboard
1095
+ # so a row that lands (or a pending row that completes) after the
1096
+ # tab loaded shows up without a manual Refresh. Selection is
1097
+ # preserved across ticks so an in-progress set of checkboxes
1098
+ # survives the reload.
1099
+ admin_auto_refresh_timer = gr.Timer(10)
1100
+ admin_auto_refresh_timer.tick(
1101
+ fn=_auto_refresh_admin_table,
1102
+ inputs=admin_table,
1103
+ outputs=admin_table,
1104
+ )
1105
+
1106
  # gradio_leaderboard.Leaderboard handles its own update path
1107
  # cleanly; bind a Timer to push fresh dataframes every 10 seconds.
1108
  # Single tick runs `_auto_refresh_leaderboard` once and pushes the
 
1135
  demote_btn,
1136
  delete_confirm,
1137
  delete_btn,
1138
+ stop_delete_btn,
1139
  admin_status,
1140
  ],
1141
  )
 
1151
  serve_report,
1152
  methods=["GET"],
1153
  )
1154
+ # Cached render proxies the gallery references in public mode (no-ops while
1155
+ # private, where the gallery inlines base64 instead). Registered before the
1156
+ # Gradio mount so they're not shadowed by the catch-all sub-app.
1157
+ app.add_api_route(
1158
+ "/render/{submission_id}/{fixture}.png",
1159
+ serve_render,
1160
+ methods=["GET"],
1161
+ )
1162
+ app.add_api_route(
1163
+ "/gt-render/{fixture}.png",
1164
+ serve_gt_render,
1165
+ methods=["GET"],
1166
+ )
1167
  app = gr.mount_gradio_app(app, blocks, path="/")
1168
 
1169
 
tests/test_admin.py CHANGED
@@ -181,3 +181,98 @@ def test_delete_rows_removes_rows_and_artifacts(hub):
181
  "reports/alpha.json",
182
  ]
183
  assert hub["uploads"] == 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  "reports/alpha.json",
182
  ]
183
  assert hub["uploads"] == 1
184
+
185
+
186
+ def _job(job_id: str, stage: str, *args: str) -> SimpleNamespace:
187
+ """A minimal JobInfo stand-in: id, status.stage, and a command argv."""
188
+ return SimpleNamespace(
189
+ id=job_id,
190
+ status=SimpleNamespace(stage=stage, message=None),
191
+ command=["python", "/opt/eval_job.py", *args],
192
+ arguments=None,
193
+ )
194
+
195
+
196
@pytest.fixture
def jobs(monkeypatch):
    """Replace the Jobs API names (``list_jobs`` / ``cancel_job``) on ``admin``.

    The returned dict is the shared state: ``"jobs"`` is what the fake
    ``list_jobs`` hands back, and ``"cancelled"`` collects the ``job_id``
    of every fake ``cancel_job`` call so a test can assert exactly which
    jobs were stopped.
    """
    recorded: dict = {"jobs": [], "cancelled": []}

    def fake_list_jobs(*, namespace=None, token=None):
        return recorded["jobs"]

    def fake_cancel_job(*, job_id, namespace=None, token=None):
        recorded["cancelled"].append(job_id)

    stubs = (("list_jobs", fake_list_jobs), ("cancel_job", fake_cancel_job))
    for attr_name, stub in stubs:
        monkeypatch.setattr(admin, attr_name, stub)
    return recorded
215
+
216
+
217
def test_stop_and_delete_cancels_running_then_deletes(hub, jobs):
    """Only the running job naming the target id is cancelled; its row goes."""
    alpha_job = _job("job-alpha", "RUNNING", "alpha", "https://blob/alpha.zip")
    beta_job = _job("job-beta", "RUNNING", "beta", "https://blob/beta.zip")
    jobs["jobs"] = [alpha_job, beta_job]

    admin.stop_and_delete_rows(["alpha"])

    # Beta's job must be left alone.
    assert jobs["cancelled"] == ["job-alpha"]
    # Alpha's row and artifacts are gone; beta's row survives.
    remaining = {row["submission_id"] for row in hub["rows"]}
    assert remaining == {"beta"}
    assert "submissions/alpha.zip" in hub["deleted_paths"]
229
+
230
+
231
def test_stop_and_delete_catches_all_shard_jobs(hub, jobs):
    """Each shard job carrying the submission id in its argv is cancelled."""
    jobs["jobs"] = [
        _job(
            f"job-a{shard}", "RUNNING",
            "alpha", "url", "--shard-id", f"shard_{shard:03d}",
        )
        for shard in range(2)
    ]

    admin.stop_and_delete_rows(["alpha"])

    assert sorted(jobs["cancelled"]) == ["job-a0", "job-a1"]
239
+
240
+
241
def test_stop_and_delete_skips_terminal_jobs(hub, jobs):
    """An already-finished job is left alone, yet the row is still deleted."""
    finished = _job("job-alpha", "COMPLETED", "alpha", "url")
    jobs["jobs"] = [finished]

    admin.stop_and_delete_rows(["alpha"])

    assert jobs["cancelled"] == []
    remaining = {row["submission_id"] for row in hub["rows"]}
    assert remaining == {"beta"}
249
+
250
+
251
def test_stop_and_delete_tolerates_list_jobs_failure(hub, monkeypatch):
    """The row delete proceeds even when listing jobs blows up."""
    def failing_list_jobs(*, namespace=None, token=None):
        raise RuntimeError("jobs API down")

    monkeypatch.setattr(admin, "list_jobs", failing_list_jobs)

    admin.stop_and_delete_rows(["alpha"])

    remaining = {row["submission_id"] for row in hub["rows"]}
    assert remaining == {"beta"}
259
+
260
+
261
def test_stop_and_delete_tolerates_cancel_failure(hub, jobs, monkeypatch):
    """A failing cancel call is swallowed and the row is deleted regardless."""
    jobs["jobs"] = [_job("job-alpha", "RUNNING", "alpha", "url")]

    def failing_cancel_job(*, job_id, namespace=None, token=None):
        raise RuntimeError("cancel rejected")

    # Override the fixture's recording stub with one that always errors.
    monkeypatch.setattr(admin, "cancel_job", failing_cancel_job)

    admin.stop_and_delete_rows(["alpha"])

    remaining = {row["submission_id"] for row in hub["rows"]}
    assert remaining == {"beta"}
271
+
272
+
273
def test_stop_and_delete_empty_selection_raises(hub, jobs):
    """No ids is a caller error raised before any cancel or upload happens."""
    with pytest.raises(ValueError):
        admin.stop_and_delete_rows([])

    # Nothing was cancelled and nothing was written back.
    assert not jobs["cancelled"] and jobs["cancelled"] == []
    assert hub["uploads"] == 0
+ assert hub["uploads"] == 0