Michael Rabinovich committed on
Commit
0a2a2c6
·
1 Parent(s): 3237736

submit: stuck-pending sweep on boot (chunk 6)

Browse files

Step 6 (E) chunk 6. A pending row whose worker died (Space
restart for a deploy, OOM, crash) has no one to flip it; without
this sweep it stays "pending" in the leaderboard forever (we
hit this twice during chunk-3..5 churn already).

On submit.py module import, spawn a daemon thread that:
1. Downloads results.jsonl (no lock; read-only).
2. Iterates rows; for each pending row whose `submitted_at`
is older than 30 min, calls _flip_row_to_failed(sid,
"evaluation interrupted by Space restart").
3. Logs the list of stuck ids so a deployer can see what was
swept.

Threshold is 30 min: well above the real eval ceiling on
cpu-upgrade (~5 min) and also above the configured timeouts
(EVAL_TIMEOUT_SECONDS = 15 min plus REPORT_TIMEOUT_SECONDS = 2 min),
so a genuinely-still-running submission is safe. Hub-fetch failure
at boot is non-fatal (a warning is logged and the sweep is skipped;
the next boot retries). Per-row flip failures are caught, logged,
and skipped, so the rest of the sweep continues.

Opt-out via CADGENBENCH_DISABLE_BOOT_SWEEP=1 for test imports
that don't want the Hub round-trip.

Two test fixtures left over from chunk-3 race-with-rebuild
should flip on the next boot:
- dedup-verify-dan_..._20260528-071315 (~47 min old)
- dedup-verify-dan_..._20260528-071356 (~47 min old)

Closes the last bug-fix gap before chunk 7 (end-to-end smoke).

Files changed (1) hide show
  1. submit.py +109 -9
submit.py CHANGED
@@ -1,13 +1,17 @@
1
  """Submit-tab handler for the CADGenBench leaderboard Space.
2
 
3
- Step 6 (E) chunks 2 + 3 + 4: cheap-sync validation pipeline + pending-row
4
- write + zip upload + background-thread eval. The handler validates
5
- the upload, uploads the zip to ``submissions/<id>.zip``, appends a
6
- ``status: pending`` row to ``results.jsonl`` (under a process-wide
7
- lock), spawns a daemon thread to run ``cadgenbench evaluate`` +
8
- ``cadgenbench report single``, and returns immediately. The worker
9
- uploads ``reports/<id>.{html,json}`` and flips the row
10
- ``pending -> completed`` (or ``failed`` with a ``failure_reason``).
 
 
 
 
11
 
12
  Validation gates, in order:
13
 
@@ -68,7 +72,7 @@ import sys
68
  import tempfile
69
  import threading
70
  import zipfile
71
- from datetime import datetime, timezone
72
  from pathlib import Path
73
  from typing import Any
74
 
@@ -100,6 +104,10 @@ EVAL_TIMEOUT_SECONDS = 15 * 60
100
  REPORT_TIMEOUT_SECONDS = 2 * 60
101
  EVAL_WORKER_COUNT = "8"
102
  SHA256_BLOCK_SIZE = 64 * 1024
 
 
 
 
103
 
104
  # One HfApi client per process. HF_TOKEN is picked up from the env at
105
  # construction time and reused for every call.
@@ -726,3 +734,95 @@ def _flip_row_to_failed(submission_id: str, reason: str) -> None:
726
  submission_id,
727
  {"status": "failed", "failure_reason": reason},
728
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Submit-tab handler for the CADGenBench leaderboard Space.
2
 
3
+ Step 6 (E) chunks 2 + 3 + 4 + 6: cheap-sync validation + pending-row
4
+ write + zip upload + background-thread eval + boot-time stuck-pending
5
+ sweep. The handler validates the upload, uploads the zip to
6
+ ``submissions/<id>.zip``, appends a ``status: pending`` row to
7
+ ``results.jsonl`` (under a process-wide lock), spawns a daemon thread
8
+ to run ``cadgenbench evaluate`` + ``cadgenbench report single``, and
9
+ returns immediately. The worker uploads ``reports/<id>.{html,json}``
10
+ and flips the row ``pending -> completed`` (or ``failed`` with a
11
+ ``failure_reason``). At module import a one-shot daemon sweep flips
12
+ any ``pending`` row whose ``submitted_at`` is older than 30 min to
13
+ ``failed`` with a "Space restart" reason, so rows stranded by a deploy
14
+ / OOM / crash don't sit pending forever.
15
 
16
  Validation gates, in order:
17
 
 
72
  import tempfile
73
  import threading
74
  import zipfile
75
+ from datetime import datetime, timedelta, timezone
76
  from pathlib import Path
77
  from typing import Any
78
 
 
104
  REPORT_TIMEOUT_SECONDS = 2 * 60
105
  EVAL_WORKER_COUNT = "8"
106
  SHA256_BLOCK_SIZE = 64 * 1024
107
+ STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
108
+ SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
109
+ STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
110
+ BOOT_SWEEP_ENV = "CADGENBENCH_DISABLE_BOOT_SWEEP"
111
 
112
  # One HfApi client per process. HF_TOKEN is picked up from the env at
113
  # construction time and reused for every call.
 
734
  submission_id,
735
  {"status": "failed", "failure_reason": reason},
736
  )
737
+
738
+
739
+ # ---------------------------------------------------------------------------
740
+ # Boot-time stuck-pending sweep
741
+ # ---------------------------------------------------------------------------
742
+
743
+
744
def _sweep_stuck_pending() -> None:
    """Flip pending rows older than the threshold to failed.

    A ``pending`` row whose worker died (Space restart, OOM, crash)
    has no one to flip it; without this sweep it stays pending in
    the leaderboard forever. The check is "submitted_at older than
    30 min" - well above the real eval ceiling (~5 min on
    cpu-upgrade), so any genuinely-still-running submission is safe.
    Runs once per process at module-import time inside a daemon
    thread so app boot doesn't block on the Hub read.

    The sweep is best-effort and never raises:

    * a failed ``results.jsonl`` download logs a warning and skips
      the whole sweep;
    * blank lines, lines that are not valid JSON, and lines that
      decode to something other than a JSON object are ignored;
    * pending rows with a missing ``submission_id`` / ``submitted_at``
      are ignored, and rows whose ``submitted_at`` cannot be parsed
      with ``SUBMITTED_AT_FORMAT`` are logged and skipped;
    * a failure while flipping one row is logged and the remaining
      rows are still processed.
    """
    try:
        body = _download_results_jsonl()
    except Exception as e:  # noqa: BLE001 - Hub API surface is broad
        logger.warning(
            "Stuck-pending sweep skipped, Hub fetch failed (%s: %s)",
            type(e).__name__, e,
        )
        return

    # Anything submitted before this instant counts as stranded.
    cutoff = datetime.now(timezone.utc) - timedelta(
        seconds=STUCK_PENDING_THRESHOLD_SECONDS
    )
    stuck_ids: list[str] = []
    for line in body.splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (null, a list, a number...)
        # has no .get(); skip it instead of letting AttributeError kill
        # the sweep thread before any row is flipped.
        if not isinstance(row, dict):
            continue
        if row.get("status") != "pending":
            continue
        sid = row.get("submission_id")
        ts_str = row.get("submitted_at")
        if not sid or not ts_str:
            continue
        try:
            # The stored format ends in a literal "Z", so the parsed
            # value is naive; tag it as UTC to compare against cutoff.
            ts = datetime.strptime(ts_str, SUBMITTED_AT_FORMAT).replace(
                tzinfo=timezone.utc
            )
        except (TypeError, ValueError):
            # ValueError: string in the wrong format.
            # TypeError: submitted_at is not a string at all.
            logger.warning(
                "Skipping unparseable submitted_at %r on row %s",
                ts_str, sid,
            )
            continue
        if ts < cutoff:
            stuck_ids.append(sid)

    if not stuck_ids:
        logger.info("Stuck-pending sweep: nothing stale")
        return

    # Log the full list up front so a deployer can see what was swept
    # even if some of the individual flips fail below.
    logger.warning(
        "Stuck-pending sweep: flipping %d row(s) to failed: %s",
        len(stuck_ids), stuck_ids,
    )
    for sid in stuck_ids:
        try:
            _flip_row_to_failed(sid, STUCK_PENDING_REASON)
        except Exception as e:  # noqa: BLE001 - log + carry on per-row
            logger.exception(
                "Stuck-pending flip failed for %s (%s: %s)",
                sid, type(e).__name__, e,
            )
810
+
811
+
812
def _start_boot_sweep() -> None:
    """Launch the stuck-pending sweep in the background.

    The sweep runs on a daemon thread named ``cgb-boot-sweep`` so the
    caller returns immediately. When the environment variable named
    by ``BOOT_SWEEP_ENV`` is set to exactly ``"1"`` no thread is
    started and the opt-out is logged instead (handy for unit-test
    imports that should not touch the Hub).
    """
    sweep_disabled = os.environ.get(BOOT_SWEEP_ENV) == "1"
    if sweep_disabled:
        logger.info("Stuck-pending sweep disabled via %s", BOOT_SWEEP_ENV)
        return

    sweeper = threading.Thread(
        name="cgb-boot-sweep",
        target=_sweep_stuck_pending,
        daemon=True,
    )
    sweeper.start()
826
+
827
+
828
# Import-time side effect: start the one-shot stuck-pending sweep
# (skipped when CADGENBENCH_DISABLE_BOOT_SWEEP=1).
+ _start_boot_sweep()