submit: stuck-pending sweep on boot (chunk 6)
Step 6 (E) chunk 6. A pending row whose worker died (Space
restart for a deploy, OOM, crash) has no one to flip it; without
this sweep it stays "pending" in the leaderboard forever (we
hit this twice during chunk-3..5 churn already).
On submit.py module import, spawn a daemon thread that:
1. Downloads results.jsonl (no lock; read-only).
2. Iterates rows; for each pending row whose `submitted_at`
is older than 30 min, calls _flip_row_to_failed(sid,
"evaluation interrupted by Space restart").
3. Logs the list of stuck ids so a deployer can see what was
swept.
Threshold is 30 min: well above the real eval ceiling on
cpu-upgrade (~5 min), so a genuinely-still-running submission is
safe. Hub-fetch failure at boot is non-fatal (logged warning,
sweep skipped; next boot retries). Per-row flip failures are
caught + logged + skipped (the rest of the sweep continues).
Opt-out via CADGENBENCH_DISABLE_BOOT_SWEEP=1 for test imports
that don't want the Hub round-trip.
Two test fixtures left over from chunk-3 race-with-rebuild
should flip on the next boot:
- dedup-verify-dan_..._20260528-071315 (~47 min old)
- dedup-verify-dan_..._20260528-071356 (~47 min old)
Closes the last bug-fix gap before chunk 7 (end-to-end smoke).
|
@@ -1,13 +1,17 @@
|
|
| 1 |
"""Submit-tab handler for the CADGenBench leaderboard Space.
|
| 2 |
|
| 3 |
-
Step 6 (E) chunks 2 + 3 + 4: cheap-sync validation
|
| 4 |
-
write + zip upload + background-thread eval
|
| 5 |
-
the upload, uploads the zip to
|
| 6 |
-
``
|
| 7 |
-
lock), spawns a daemon thread
|
| 8 |
-
``cadgenbench report single``, and
|
| 9 |
-
uploads ``reports/<id>.{html,json}``
|
| 10 |
-
``pending -> completed`` (or ``failed`` with a
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
Validation gates, in order:
|
| 13 |
|
|
@@ -68,7 +72,7 @@ import sys
|
|
| 68 |
import tempfile
|
| 69 |
import threading
|
| 70 |
import zipfile
|
| 71 |
-
from datetime import datetime, timezone
|
| 72 |
from pathlib import Path
|
| 73 |
from typing import Any
|
| 74 |
|
|
@@ -100,6 +104,10 @@ EVAL_TIMEOUT_SECONDS = 15 * 60
|
|
| 100 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 101 |
EVAL_WORKER_COUNT = "8"
|
| 102 |
SHA256_BLOCK_SIZE = 64 * 1024
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# One HfApi client per process. HF_TOKEN is picked up from the env at
|
| 105 |
# construction time and reused for every call.
|
|
@@ -726,3 +734,95 @@ def _flip_row_to_failed(submission_id: str, reason: str) -> None:
|
|
| 726 |
submission_id,
|
| 727 |
{"status": "failed", "failure_reason": reason},
|
| 728 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Submit-tab handler for the CADGenBench leaderboard Space.
|
| 2 |
|
| 3 |
+
Step 6 (E) chunks 2 + 3 + 4 + 6: cheap-sync validation + pending-row
|
| 4 |
+
write + zip upload + background-thread eval + boot-time stuck-pending
|
| 5 |
+
sweep. The handler validates the upload, uploads the zip to
|
| 6 |
+
``submissions/<id>.zip``, appends a ``status: pending`` row to
|
| 7 |
+
``results.jsonl`` (under a process-wide lock), spawns a daemon thread
|
| 8 |
+
to run ``cadgenbench evaluate`` + ``cadgenbench report single``, and
|
| 9 |
+
returns immediately. The worker uploads ``reports/<id>.{html,json}``
|
| 10 |
+
and flips the row ``pending -> completed`` (or ``failed`` with a
|
| 11 |
+
``failure_reason``). At module import a one-shot daemon sweep flips
|
| 12 |
+
any ``pending`` row whose ``submitted_at`` is older than 30 min to
|
| 13 |
+
``failed`` with a "Space restart" reason, so rows stranded by a deploy
|
| 14 |
+
/ OOM / crash don't sit pending forever.
|
| 15 |
|
| 16 |
Validation gates, in order:
|
| 17 |
|
|
|
|
| 72 |
import tempfile
|
| 73 |
import threading
|
| 74 |
import zipfile
|
| 75 |
+
from datetime import datetime, timedelta, timezone
|
| 76 |
from pathlib import Path
|
| 77 |
from typing import Any
|
| 78 |
|
|
|
|
| 104 |
REPORT_TIMEOUT_SECONDS = 2 * 60
|
| 105 |
EVAL_WORKER_COUNT = "8"
|
| 106 |
SHA256_BLOCK_SIZE = 64 * 1024
|
| 107 |
+
# Age, in seconds, beyond which a row still marked ``pending`` is treated
# as stranded by the boot-time sweep (30 minutes).
STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
# ``strptime`` format the sweep uses to parse a row's ``submitted_at``;
# the parsed value is then tagged as UTC.
SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
# ``failure_reason`` passed to ``_flip_row_to_failed`` for swept rows.
STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
# Name of the environment variable that disables the boot sweep when it
# is set to exactly "1".
BOOT_SWEEP_ENV = "CADGENBENCH_DISABLE_BOOT_SWEEP"
|
| 111 |
|
| 112 |
# One HfApi client per process. HF_TOKEN is picked up from the env at
|
| 113 |
# construction time and reused for every call.
|
|
|
|
| 734 |
submission_id,
|
| 735 |
{"status": "failed", "failure_reason": reason},
|
| 736 |
)
|
| 737 |
+
|
| 738 |
+
|
| 739 |
+
# ---------------------------------------------------------------------------
|
| 740 |
+
# Boot-time stuck-pending sweep
|
| 741 |
+
# ---------------------------------------------------------------------------
|
| 742 |
+
|
| 743 |
+
|
| 744 |
+
def _sweep_stuck_pending() -> None:
    """Flip pending rows older than the threshold to failed.

    A ``pending`` row whose worker died (Space restart, OOM, crash)
    has no one to flip it; without this sweep it stays pending in
    the leaderboard forever. The check is "submitted_at older than
    ``STUCK_PENDING_THRESHOLD_SECONDS``" (30 min) - well above the
    real eval ceiling (~5 min on cpu-upgrade), so any
    genuinely-still-running submission is safe.

    The sweep is best-effort and never raises:

    * If fetching ``results.jsonl`` fails, a warning is logged and the
      sweep is skipped entirely.
    * Blank lines, lines that are not valid JSON, and lines that decode
      to something other than a JSON object are ignored.
    * Rows that are not ``pending``, or that lack a ``submission_id`` or
      ``submitted_at``, are ignored.
    * Rows whose ``submitted_at`` cannot be parsed (wrong format or not
      a string) are logged and skipped.
    * A failure while flipping one row is logged and does not stop the
      remaining rows from being processed.
    """
    try:
        body = _download_results_jsonl()
    except Exception as e:  # noqa: BLE001 - Hub API surface is broad
        logger.warning(
            "Stuck-pending sweep skipped, Hub fetch failed (%s: %s)",
            type(e).__name__, e,
        )
        return

    cutoff = datetime.now(timezone.utc) - timedelta(
        seconds=STUCK_PENDING_THRESHOLD_SECONDS
    )
    stuck_ids: list[str] = []
    for line in body.splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A syntactically valid line can still decode to a list, string
        # or number; ``.get`` on those would raise and abort the sweep.
        if not isinstance(row, dict):
            continue
        if row.get("status") != "pending":
            continue
        sid = row.get("submission_id")
        ts_str = row.get("submitted_at")
        if not sid or not ts_str:
            continue
        try:
            # The parsed value is naive; tag it as UTC so it can be
            # compared with the timezone-aware cutoff.
            ts = datetime.strptime(ts_str, SUBMITTED_AT_FORMAT).replace(
                tzinfo=timezone.utc
            )
        except (TypeError, ValueError):
            # ValueError: wrong format. TypeError: value is not a string
            # (e.g. a number was stored in ``submitted_at``).
            logger.warning(
                "Skipping unparseable submitted_at %r on row %s",
                ts_str, sid,
            )
            continue
        if ts < cutoff:
            stuck_ids.append(sid)

    if not stuck_ids:
        logger.info("Stuck-pending sweep: nothing stale")
        return

    logger.warning(
        "Stuck-pending sweep: flipping %d row(s) to failed: %s",
        len(stuck_ids), stuck_ids,
    )
    for sid in stuck_ids:
        try:
            _flip_row_to_failed(sid, STUCK_PENDING_REASON)
        except Exception as e:  # noqa: BLE001 - log + carry on per-row
            logger.exception(
                "Stuck-pending flip failed for %s (%s: %s)",
                sid, type(e).__name__, e,
            )
|
| 810 |
+
|
| 811 |
+
|
| 812 |
+
def _start_boot_sweep() -> None:
    """Launch the stuck-pending sweep in the background.

    The sweep runs on a daemon thread named ``cgb-boot-sweep`` so the
    caller returns immediately. When the environment variable named by
    ``BOOT_SWEEP_ENV`` is set to exactly ``"1"`` the sweep is not
    started and an info line is logged instead (useful for unit-test
    imports that don't want the Hub round-trip).
    """
    sweep_disabled = os.getenv(BOOT_SWEEP_ENV) == "1"
    if sweep_disabled:
        logger.info("Stuck-pending sweep disabled via %s", BOOT_SWEEP_ENV)
        return

    sweeper = threading.Thread(
        target=_sweep_stuck_pending,
        name="cgb-boot-sweep",
        daemon=True,
    )
    sweeper.start()
|
| 826 |
+
|
| 827 |
+
|
| 828 |
+
_start_boot_sweep()
|