submit: write pending row + upload zip on successful validation
Step 6 (E) chunk 3. Cheap-sync validation (chunk 2) now also commits
to the Hub: on success the handler uploads submissions/<id>.zip and
appends a status=pending row to results.jsonl under a process-wide
threading.Lock. Eval is still stubbed, so the row stays pending forever
until chunk 4 lands the background worker.
Hub-write ordering (after validation passes):
1. Upload submissions/<id>.zip. Unique path per submission, no
lock needed (HfApi handles its own commit serialization).
2. Build the pending row: metadata fields + status="pending" +
failure_reason=null + submission_blob_url + null score fields,
matching the "Pending" regime in cadgenbench-submissions/schema.md.
3. Acquire _HUB_LOCK, download current results.jsonl (treat
EntryNotFoundError as empty), append one JSON line, re-upload.
Lock held only for the RMW cycle (~1-2s), not for eval time;
concurrent submitters serialise here, not in the worker.
If step 1 fails the submitter sees a clean "Hub upload failed"
rejection. If step 3 fails the zip is left orphaned in submissions/
and the submitter sees a clean "Hub write failed" rejection; an
orphan-zip sweep is a future-chunk concern (covered by the eventual
stuck-pending recovery in chunk 6 plus a manual sweep if needed).
leaderboard.py:
- Add `status` as the first leaderboard column so pending rows are
visually distinguishable from completed ones.
- Backfill `status` to "completed" in-memory for legacy rows
written before the schema bump (the three baseline seeds);
populated rows are completed by definition.
cadgenbench_version and cadgenbench_data_revision get pinned on the
pending row at submit time so the row is meaningful even if the
worker never runs and chunk 6 ends up flipping it to failed.
data_revision is the short sha from HfApi().dataset_info; cached
per-process and falls back to "unknown" on Hub flake (it's a
metadata field, not worth failing a submit over).
HF_DATA_REPO and HF_SUBMISSIONS_REPO are imported from leaderboard.py
for now; pulling them into a shared config module is a worthwhile
refactor once submit.py grows again in chunks 4+.
- leaderboard.py +8 -0
- submit.py +200 -10
|
@@ -23,6 +23,7 @@ HF_DATA_REPO = os.getenv("HF_DATA_REPO", f"{HF_ORG}/cadgenbench-data")
|
|
| 23 |
LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
|
| 24 |
|
| 25 |
LEADERBOARD_COLS = [
|
|
|
|
| 26 |
"submission_name",
|
| 27 |
"submitter_name",
|
| 28 |
"aggregate_score",
|
|
@@ -80,6 +81,13 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 80 |
if not rows:
|
| 81 |
return pd.DataFrame(columns=LEADERBOARD_COLS)
|
| 82 |
df = pd.DataFrame(rows)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
cols = [c for c in LEADERBOARD_COLS if c in df.columns]
|
| 84 |
df = (
|
| 85 |
df[cols]
|
|
|
|
| 23 |
LOCAL_RESULTS_PATH = Path(__file__).parent / "results.jsonl"
|
| 24 |
|
| 25 |
LEADERBOARD_COLS = [
|
| 26 |
+
"status",
|
| 27 |
"submission_name",
|
| 28 |
"submitter_name",
|
| 29 |
"aggregate_score",
|
|
|
|
| 81 |
if not rows:
|
| 82 |
return pd.DataFrame(columns=LEADERBOARD_COLS)
|
| 83 |
df = pd.DataFrame(rows)
|
| 84 |
+
# Backfill `status` for legacy rows written before the schema bump
|
| 85 |
+
# (the three baseline seed rows). They all have populated score
|
| 86 |
+
# fields, so "completed" is the correct retrofit.
|
| 87 |
+
if "status" not in df.columns:
|
| 88 |
+
df["status"] = "completed"
|
| 89 |
+
else:
|
| 90 |
+
df["status"] = df["status"].fillna("completed")
|
| 91 |
cols = [c for c in LEADERBOARD_COLS if c in df.columns]
|
| 92 |
df = (
|
| 93 |
df[cols]
|
|
@@ -1,10 +1,11 @@
|
|
| 1 |
"""Submit-tab handler for the CADGenBench leaderboard Space.
|
| 2 |
|
| 3 |
-
Step 6 (E)
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
| 8 |
|
| 9 |
Validation gates, in order:
|
| 10 |
|
|
@@ -23,19 +24,40 @@ Validation gates, in order:
|
|
| 23 |
geometry. Per-fixture validity (watertight, manifold, etc) is
|
| 24 |
*not* checked here, that's the evaluator's job and contributes to
|
| 25 |
the per-fixture score; this gate only rejects "not actually STEP".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"""
|
| 27 |
from __future__ import annotations
|
| 28 |
|
| 29 |
import json
|
|
|
|
| 30 |
import re
|
| 31 |
import tempfile
|
|
|
|
| 32 |
import zipfile
|
| 33 |
from datetime import datetime, timezone
|
| 34 |
from pathlib import Path
|
| 35 |
from typing import Any
|
| 36 |
|
|
|
|
| 37 |
from cadgenbench.common.paths import data_inputs_dir
|
| 38 |
from cadgenbench.common.validity import parse_step
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
NOTES_MAX_CHARS = 500
|
| 41 |
REQUIRED_META_KEYS: tuple[str, ...] = (
|
|
@@ -46,12 +68,31 @@ REQUIRED_META_KEYS: tuple[str, ...] = (
|
|
| 46 |
"agree_to_publish",
|
| 47 |
)
|
| 48 |
SUBMISSION_ID_SLUG_MAX = 40
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
class _ValidationError(Exception):
|
| 52 |
"""Internal sentinel that maps to a user-facing rejection message."""
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def handle_submit(
|
| 56 |
zip_file,
|
| 57 |
submission_name: str, # noqa: ARG001 - kept for UI compat; meta.json wins
|
|
@@ -93,12 +134,19 @@ def handle_submit(
|
|
| 93 |
submission_id = _mint_submission_id(
|
| 94 |
meta["submitter_name"], meta["submission_name"]
|
| 95 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
return (
|
| 97 |
-
f"**
|
| 98 |
-
f"
|
| 99 |
-
f"`{meta['
|
| 100 |
-
f"
|
| 101 |
-
f"
|
| 102 |
)
|
| 103 |
|
| 104 |
|
|
@@ -259,3 +307,145 @@ def _slug(s: str) -> str:
|
|
| 259 |
"""Filesystem-safe slug. Lowercase, ``[a-z0-9-]``, collapsed dashes."""
|
| 260 |
cleaned = re.sub(r"[^A-Za-z0-9]+", "-", s).strip("-").lower()
|
| 261 |
return cleaned[:SUBMISSION_ID_SLUG_MAX] or "unnamed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Submit-tab handler for the CADGenBench leaderboard Space.
|
| 2 |
|
| 3 |
+
Step 6 (E) chunks 2 + 3: cheap-sync validation pipeline + pending-row
|
| 4 |
+
write + zip upload. The handler validates the upload, uploads the
|
| 5 |
+
zip to ``submissions/<id>.zip``, appends a ``status: pending`` row to
|
| 6 |
+
``results.jsonl`` (under a process-wide lock), and returns
|
| 7 |
+
immediately. No eval and no worker yet, the row stays pending
|
| 8 |
+
forever until later chunks add the background thread.
|
| 9 |
|
| 10 |
Validation gates, in order:
|
| 11 |
|
|
|
|
| 24 |
geometry. Per-fixture validity (watertight, manifold, etc) is
|
| 25 |
*not* checked here, that's the evaluator's job and contributes to
|
| 26 |
the per-fixture score; this gate only rejects "not actually STEP".
|
| 27 |
+
|
| 28 |
+
Hub-write ordering (after validation passes):
|
| 29 |
+
|
| 30 |
+
1. Upload ``submissions/<id>.zip``. Unique path per submission, no
|
| 31 |
+
lock needed.
|
| 32 |
+
2. Build pending row (metadata + null scores + ``submission_blob_url``).
|
| 33 |
+
3. Acquire ``_HUB_LOCK``; download current ``results.jsonl`` (or
|
| 34 |
+
start empty); append the pending row; re-upload.
|
| 35 |
+
|
| 36 |
+
If step 1 fails the user sees a clean rejection. If step 3 fails the
|
| 37 |
+
zip is left orphaned in ``submissions/`` and the user sees a clean
|
| 38 |
+
rejection; an orphan-zip sweep is a future-chunk concern.
|
| 39 |
"""
|
| 40 |
from __future__ import annotations
|
| 41 |
|
| 42 |
import json
|
| 43 |
+
import logging
|
| 44 |
import re
|
| 45 |
import tempfile
|
| 46 |
+
import threading
|
| 47 |
import zipfile
|
| 48 |
from datetime import datetime, timezone
|
| 49 |
from pathlib import Path
|
| 50 |
from typing import Any
|
| 51 |
|
| 52 |
+
import cadgenbench
|
| 53 |
from cadgenbench.common.paths import data_inputs_dir
|
| 54 |
from cadgenbench.common.validity import parse_step
|
| 55 |
+
from huggingface_hub import HfApi
|
| 56 |
+
from huggingface_hub.errors import EntryNotFoundError
|
| 57 |
+
|
| 58 |
+
from leaderboard import HF_DATA_REPO, HF_SUBMISSIONS_REPO
|
| 59 |
+
|
| 60 |
+
logger = logging.getLogger(__name__)
|
| 61 |
|
| 62 |
NOTES_MAX_CHARS = 500
|
| 63 |
REQUIRED_META_KEYS: tuple[str, ...] = (
|
|
|
|
| 68 |
"agree_to_publish",
|
| 69 |
)
|
| 70 |
SUBMISSION_ID_SLUG_MAX = 40
|
| 71 |
+
RESULTS_FILENAME = "results.jsonl"
|
| 72 |
+
SUBMISSIONS_DIR = "submissions"
|
| 73 |
+
DATA_REV_SHORT_LEN = 12
|
| 74 |
+
|
| 75 |
+
# One HfApi client per process. HF_TOKEN is picked up from the env at
|
| 76 |
+
# construction time and reused for every call.
|
| 77 |
+
_HF_API = HfApi()
|
| 78 |
+
|
| 79 |
+
# Process-wide lock guarding the read-modify-write of results.jsonl.
|
| 80 |
+
# The Space is single-process so a threading.Lock is sufficient; held
|
| 81 |
+
# only for the duration of the RMW cycle (~1-2s), not for eval time.
|
| 82 |
+
_HUB_LOCK = threading.Lock()
|
| 83 |
+
|
| 84 |
+
# Lazily-resolved cadgenbench-data revision, cached per process.
|
| 85 |
+
_DATA_REVISION: str | None = None
|
| 86 |
|
| 87 |
|
| 88 |
class _ValidationError(Exception):
|
| 89 |
"""Internal sentinel that maps to a user-facing rejection message."""
|
| 90 |
|
| 91 |
|
| 92 |
+
class _HubWriteError(Exception):
|
| 93 |
+
"""Raised when a Hub upload fails after validation succeeded."""
|
| 94 |
+
|
| 95 |
+
|
| 96 |
def handle_submit(
|
| 97 |
zip_file,
|
| 98 |
submission_name: str, # noqa: ARG001 - kept for UI compat; meta.json wins
|
|
|
|
| 134 |
submission_id = _mint_submission_id(
|
| 135 |
meta["submitter_name"], meta["submission_name"]
|
| 136 |
)
|
| 137 |
+
try:
|
| 138 |
+
blob_url = _upload_submission_zip(submission_id, zip_path)
|
| 139 |
+
row = _build_pending_row(submission_id, meta, fixture_names, blob_url)
|
| 140 |
+
_append_pending_row(row)
|
| 141 |
+
except _HubWriteError as e:
|
| 142 |
+
return f"**Submission rejected.** {e}"
|
| 143 |
+
|
| 144 |
return (
|
| 145 |
+
f"**Queued.** Submission `{submission_id}` has been accepted and a "
|
| 146 |
+
f"`pending` row added to the leaderboard (submitter: "
|
| 147 |
+
f"`{meta['submitter_name']}`, system: `{meta['submission_name']}`, "
|
| 148 |
+
f"{len(fixture_names)} fixtures). Evaluation will populate the "
|
| 149 |
+
f"score columns once the worker lands in a later chunk."
|
| 150 |
)
|
| 151 |
|
| 152 |
|
|
|
|
| 307 |
"""Filesystem-safe slug. Lowercase, ``[a-z0-9-]``, collapsed dashes."""
|
| 308 |
cleaned = re.sub(r"[^A-Za-z0-9]+", "-", s).strip("-").lower()
|
| 309 |
return cleaned[:SUBMISSION_ID_SLUG_MAX] or "unnamed"
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _upload_submission_zip(submission_id: str, zip_path: Path) -> str:
    """Push the validated zip to the submissions dataset on the Hub.

    The archive is stored at ``submissions/<id>.zip``. That path is unique
    per submission, so this upload does not take ``_HUB_LOCK``.

    Args:
        submission_id: Minted submission id; becomes the file stem on the Hub.
        zip_path: Local path of the zip to upload.

    Returns:
        The ``resolve/main`` URL of the uploaded file.

    Raises:
        _HubWriteError: If the upload fails for any reason. The message is
            short and user-facing; the original exception is chained.
    """
    destination = f"{SUBMISSIONS_DIR}/{submission_id}.zip"
    upload_kwargs = {
        "path_or_fileobj": str(zip_path),
        "path_in_repo": destination,
        "repo_id": HF_SUBMISSIONS_REPO,
        "repo_type": "dataset",
        "commit_message": f"add submission zip for {submission_id}",
    }

    try:
        _HF_API.upload_file(**upload_kwargs)
    except Exception as exc:  # noqa: BLE001 - Hub API surface is broad
        # Full traceback goes to the server log; the user gets a summary.
        logger.exception("Failed to upload submission zip %s", submission_id)
        reason = (
            f"Server-side error uploading submission zip "
            f"({type(exc).__name__}: {exc}). Please try again later."
        )
        raise _HubWriteError(reason) from exc

    repo_root = f"https://huggingface.co/datasets/{HF_SUBMISSIONS_REPO}"
    return f"{repo_root}/resolve/main/{destination}"
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def _build_pending_row(
    submission_id: str,
    meta: dict[str, Any],
    fixture_names: set[str],  # noqa: ARG001 - kept for future per-fixture pre-fill
    blob_url: str,
) -> dict[str, Any]:
    """Assemble the ``results.jsonl`` row for a just-queued submission.

    The row follows the pending regime: identity and metadata fields,
    ``status`` set to ``"pending"``, the zip's URL, and ``None`` for every
    score-shaped field until a later stage fills them in.

    Args:
        submission_id: Minted id for this submission.
        meta: Validated ``meta.json`` contents; ``submitter_name``,
            ``submission_name``, ``agent_url`` and ``notes`` are copied over.
        fixture_names: Currently unused; retained in the signature.
        blob_url: Hub URL of the uploaded submission zip.

    Returns:
        A JSON-serialisable dict with keys in the order they are written.
    """
    header = {
        "submission_id": submission_id,
        "status": "pending",
        "failure_reason": None,
    }
    copied_meta = {
        field: meta[field]
        for field in ("submitter_name", "submission_name", "agent_url", "notes")
    }
    # Second-resolution UTC timestamp with a literal "Z" suffix.
    stamped_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    provenance = {
        "submitted_at": stamped_at,
        "cadgenbench_version": cadgenbench.__version__,
        "cadgenbench_data_revision": _resolve_data_revision(),
    }
    empty_scores = dict.fromkeys(
        (
            "validity_rate",
            "aggregate_score",
            "score_by_task_type",
            "per_task_scores",
            "per_fixture_scores",
            "per_fixture_breakdown",
        ),
        None,
    )
    return {
        **header,
        **copied_meta,
        **provenance,
        **empty_scores,
        "submission_blob_url": blob_url,
    }
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def _append_pending_row(row: dict[str, Any]) -> None:
    """Append one pending row to ``results.jsonl`` on the Hub.

    Read-modify-write under ``_HUB_LOCK``: download the current file (an
    absent file is treated as empty), append ``row`` as a single JSON line,
    and re-upload the whole body. The lock is held only for this cycle, so
    concurrent submitters serialise here.

    Args:
        row: JSON-serialisable row. Must contain ``submission_id``, which is
            used in the commit message and in log output.

    Raises:
        _HubWriteError: If the download or the re-upload fails. The message
            is user-facing and the original exception is chained.
    """
    with _HUB_LOCK:
        try:
            existing = _download_results_jsonl()
        except Exception as e:  # noqa: BLE001 - Hub API surface is broad
            logger.exception("Failed to download results.jsonl for append")
            raise _HubWriteError(
                f"Server-side error reading the submissions table "
                f"({type(e).__name__}: {e}). Please try again later."
            ) from e

        # If the current file does not end with a newline (e.g. a
        # hand-written seed file), a plain concatenation would glue the new
        # row onto the last existing row and corrupt both JSONL records.
        if existing and not existing.endswith("\n"):
            existing += "\n"
        line = json.dumps(row, ensure_ascii=False)
        new_body = f"{existing}{line}\n"

        try:
            _HF_API.upload_file(
                path_or_fileobj=new_body.encode("utf-8"),
                path_in_repo=RESULTS_FILENAME,
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                commit_message=(
                    f"add pending row for {row['submission_id']}"
                ),
            )
        except Exception as e:  # noqa: BLE001 - Hub API surface is broad
            logger.exception(
                "Failed to upload appended results.jsonl for %s",
                row["submission_id"],
            )
            raise _HubWriteError(
                f"Server-side error writing the submissions table "
                f"({type(e).__name__}: {e}). The submission zip was "
                f"uploaded but the row was not; please try again later."
            ) from e
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def _download_results_jsonl() -> str:
    """Return the Hub's current ``results.jsonl`` as text.

    Returns:
        The file body decoded as UTF-8, or ``""`` when the file does not
        exist in the submissions repo yet.

    Any Hub error other than ``EntryNotFoundError`` propagates to the caller.
    """
    from huggingface_hub import hf_hub_download

    try:
        # force_download=True re-fetches the file even if a copy already
        # sits in the local cache.
        local_copy = hf_hub_download(
            repo_id=HF_SUBMISSIONS_REPO,
            filename=RESULTS_FILENAME,
            repo_type="dataset",
            force_download=True,
        )
    except EntryNotFoundError:
        body = ""
    else:
        body = Path(local_copy).read_text(encoding="utf-8")
    return body
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def _resolve_data_revision() -> str:
    """Return a short sha for the cadgenbench-data dataset.

    A successfully resolved sha is cached in ``_DATA_REVISION`` for the rest
    of the process. Failures are deliberately not cached: a single Hub flake
    would otherwise pin ``"unknown"`` onto every later row, so the next call
    simply tries again.

    Returns:
        The first ``DATA_REV_SHORT_LEN`` characters of the dataset sha, or
        ``"unknown"`` when the Hub call fails or reports no sha. This never
        raises, so a metadata field cannot block a submission.
    """
    global _DATA_REVISION
    if _DATA_REVISION is not None:
        return _DATA_REVISION

    try:
        sha = _HF_API.dataset_info(HF_DATA_REPO).sha
    except Exception as e:  # noqa: BLE001 - metadata only, don't fail the submit
        logger.warning(
            "Failed to resolve cadgenbench-data revision (%s: %s)",
            type(e).__name__, e,
        )
        return "unknown"

    if not sha:
        # No revision reported; fall back without caching the placeholder.
        return "unknown"

    _DATA_REVISION = sha[:DATA_REV_SHORT_LEN]
    return _DATA_REVISION
|