Spaces:

HuggingAI4Engineering
/

cadgenbench-eval-gpu

Paused

Michael Rabinovich Cursor committed on Jun 2

Commit

b2e9d3a

1 Parent(s): cff8663

eval_job: add shard mode (--fixtures/--shard-id) for sharded submission eval

When --fixtures is set, prune the run dir to that shard's slice, evaluate
only it, and upload the per-fixture dirs to reports/<id>/shards/<shard_id>/
for the Space to merge. No --fixtures keeps the original whole-submission
path (report.json + html + gallery) unchanged.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show

eval_job.py +151 -3

eval_job.py CHANGED Viewed

@@ -12,9 +12,12 @@ Invoked by the leaderboard Space's worker (see
         --secrets HF_TOKEN \\
         python /opt/eval_job.py <submission_id> <zip_url>
-Pipeline, in order. Synchronous, no fallbacks. Any failure raises
-and the container exits non-zero; the Space's poller catches the
-ERROR stage and flips the submission row to ``failed``.
 1. Download ``submissions/<id>.zip`` from the submissions dataset
    via ``hf_hub_download`` (auth via ``HF_TOKEN``).
@@ -31,6 +34,19 @@ ERROR stage and flips the submission row to ``failed``.
 The Space-side worker then downloads ``reports/<id>.json``, reads
 ``run_summary`` out of it, and flips the row to ``completed``.
 """
 from __future__ import annotations
@@ -56,6 +72,11 @@ REPORT_TIMEOUT_SECONDS = 5 * 60
 REPORTS_DIR_IN_REPO = "reports"
 RENDERS_DIR_IN_REPO = "renders"
 # Single canonical view uploaded per fixture for the leaderboard
 # gallery thumbnail. "iso" matches the GT render the gallery pairs it
 # with, so the gallery columns stay a comparable matrix at one fixed
@@ -79,6 +100,25 @@ def main() -> int:
             "(submission_blob_url from the row)."
         ),
     )
     args = parser.parse_args()
     submission_id: str = args.submission_id
@@ -88,6 +128,28 @@ def main() -> int:
     submissions_repo = _require_env("HF_SUBMISSIONS_REPO")
     worker_count = int(os.environ.get("EVAL_WORKER_COUNT", "8"))
     print(
         f"[eval_job] submission_id={submission_id} "
         f"workers={worker_count} repo={submissions_repo}",
@@ -107,6 +169,28 @@ def main() -> int:
     return 0
 def _require_env(name: str) -> str:
     """Return env var *name* or raise with a clear message."""
     value = os.environ.get(name)
@@ -157,6 +241,70 @@ def _prepare_run_dir(
     print(f"[eval_job] unpacked into {RUN_DIR}", flush=True)
 def _run_eval(run_dir: Path, workers: int) -> None:
     """Invoke ``cadgenbench evaluate`` over *run_dir*; raise on non-zero."""
     cmd = [

         --secrets HF_TOKEN \\
         python /opt/eval_job.py <submission_id> <zip_url>
+Two run modes:
+**Whole-submission (default, no ``--fixtures``)** -- the original path.
+Synchronous, no fallbacks. Any failure raises and the container exits
+non-zero; the Space's poller catches the ERROR stage and flips the
+submission row to ``failed``.
 1. Download ``submissions/<id>.zip`` from the submissions dataset
    via ``hf_hub_download`` (auth via ``HF_TOKEN``).
 The Space-side worker then downloads ``reports/<id>.json``, reads
 ``run_summary`` out of it, and flips the row to ``completed``.
+**Shard (``--fixtures f1,f2,... --shard-id shard_000``)** -- used by
+the Space's sharded submit path (UC3) to fan a large submission across
+several jobs. Steps 1-2 are identical, then the run dir is pruned to
+just this shard's fixtures, ``cadgenbench evaluate`` runs over that
+subset, and the resulting per-fixture dirs (``result.json`` + renders)
+are uploaded *verbatim* under ``reports/<id>/shards/<shard_id>/``. No
+report HTML, ``report.json``, or gallery render is produced per shard:
+the Space downloads every shard's fixture dirs, merges them into one
+run dir, and builds the single ``run_summary`` + report + gallery from
+the merged whole (mirroring the orchestrator's ``_merge_eval``). Exit 0
+on success; any failure exits non-zero and the Space marks that shard
+ERROR and retries it.
 """
 from __future__ import annotations
 REPORTS_DIR_IN_REPO = "reports"
 RENDERS_DIR_IN_REPO = "renders"
+# Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
+# per-fixture dirs in shard mode. The Space merges these and deletes the
+# whole ``shards/`` tree after a successful merge.
+SHARDS_DIR_NAME = "shards"
 # Single canonical view uploaded per fixture for the leaderboard
 # gallery thumbnail. "iso" matches the GT render the gallery pairs it
 # with, so the gallery columns stay a comparable matrix at one fixed
             "(submission_blob_url from the row)."
         ),
     )
+    parser.add_argument(
+        "--fixtures",
+        default=None,
+        help=(
+            "Comma-separated fixture subset for shard mode. When set, the "
+            "run dir is pruned to just these fixtures, evaluated, and the "
+            "per-fixture dirs are uploaded under "
+            "reports/<id>/shards/<shard-id>/ for the Space to merge. "
+            "Omit for the original whole-submission path."
+        ),
+    )
+    parser.add_argument(
+        "--shard-id",
+        default=None,
+        help=(
+            "Shard label (e.g. shard_000) naming this shard's upload prefix. "
+            "Required when --fixtures is set."
+        ),
+    )
     args = parser.parse_args()
     submission_id: str = args.submission_id
     submissions_repo = _require_env("HF_SUBMISSIONS_REPO")
     worker_count = int(os.environ.get("EVAL_WORKER_COUNT", "8"))
+    shard_fixtures = _parse_fixtures_arg(args.fixtures)
+    if shard_fixtures is not None:
+        if not args.shard_id:
+            raise RuntimeError("--shard-id is required when --fixtures is set.")
+        print(
+            f"[eval_job] submission_id={submission_id} shard={args.shard_id} "
+            f"fixtures={len(shard_fixtures)} workers={worker_count} "
+            f"repo={submissions_repo}",
+            flush=True,
+        )
+        _prepare_run_dir(submission_id, zip_url, submissions_repo, token)
+        _prune_run_dir(RUN_DIR, shard_fixtures)
+        _run_eval(RUN_DIR, worker_count)
+        _upload_shard_artifacts(
+            submission_id, args.shard_id, RUN_DIR, submissions_repo, token,
+        )
+        print(
+            f"[eval_job] done: {submission_id} shard={args.shard_id}",
+            flush=True,
+        )
+        return 0
     print(
         f"[eval_job] submission_id={submission_id} "
         f"workers={worker_count} repo={submissions_repo}",
     return 0
+def _parse_fixtures_arg(raw: str | None) -> list[str] | None:
+    """Parse the ``--fixtures`` CSV into a deduped list, or ``None``.
+    ``None`` (flag absent) selects the whole-submission path. A present
+    but empty/whitespace value is a usage error: a shard with no
+    fixtures is never something the Space should dispatch.
+    """
+    if raw is None:  # flag not passed at all: caller takes the non-shard path
+        return None
+    names: list[str] = []  # result list; keeps first-seen order
+    seen: set[str] = set()  # membership set backing the dedupe
+    for part in raw.split(","):
+        name = part.strip()  # tolerate spaces around commas ("a, b")
+        if not name or name in seen:  # skip empty segments ("a,,b") and repeats
+            continue
+        seen.add(name)
+        names.append(name)
+    if not names:  # e.g. "" or " , ": flag present but nothing usable
+        raise RuntimeError("--fixtures was set but resolved to no fixture names.")
+    return names
 def _require_env(name: str) -> str:
     """Return env var *name* or raise with a clear message."""
     value = os.environ.get(name)
     print(f"[eval_job] unpacked into {RUN_DIR}", flush=True)
+def _prune_run_dir(run_dir: Path, fixtures: list[str]) -> None:
+    """Drop every fixture dir under *run_dir* not in *fixtures*.
+    Shard mode unpacks the whole zip (the candidate STEPs for every
+    fixture) but should only evaluate this shard's slice, so we delete
+    the other fixture dirs before ``cadgenbench evaluate`` walks the
+    tree. Non-fixture files at the root (e.g. ``meta.json``) are left
+    untouched. Raises if a requested fixture is absent from the zip,
+    which would mean the Space sharded a name the submission didn't
+    contain (a contract violation worth a loud, retried failure).
+    """
+    wanted = set(fixtures)  # set form for the difference and membership tests below
+    present = {p.name for p in run_dir.iterdir() if p.is_dir()}  # immediate subdirectory names only
+    missing = wanted - present  # requested by the shard but not found in run_dir
+    if missing:  # checked before any deletion, so a bad shard leaves run_dir intact
+        raise RuntimeError(
+            f"Shard fixtures missing from submission zip: "
+            f"{', '.join(sorted(missing))}"
+        )
+    removed = 0  # count of pruned dirs, reported in the log line only
+    for child in run_dir.iterdir():
+        if child.is_dir() and child.name not in wanted:  # plain files are never removed
+            shutil.rmtree(child)  # recursive delete of the unwanted fixture dir
+            removed += 1
+    print(
+        f"[eval_job] pruned run dir to {len(wanted)} shard fixture(s) "
+        f"(removed {removed})",
+        flush=True,
+    )
+def _upload_shard_artifacts(
+    submission_id: str,
+    shard_id: str,  # NOTE(review): interpolated into the repo path as-is; confirm the Space only sends safe labels
+    run_dir: Path,
+    submissions_repo: str,
+    token: str,
+) -> None:
+    """Upload this shard's evaluated per-fixture dirs for the Space to merge.
+    Pushes the pruned ``run_dir`` (each ``<fixture>/`` with its
+    ``result.json`` + ``renders/`` + any overlay PNGs) verbatim to
+    ``reports/<id>/shards/<shard_id>/`` in one commit. The Space
+    downloads every shard's tree, copies the fixture dirs into a single
+    merged run dir, and builds the aggregate ``run_summary`` + report +
+    gallery from the whole. The per-shard ``run_summary.json`` written
+    by ``cadgenbench evaluate`` rides along harmlessly; the merge
+    recomputes it over the union and ignores the partials.
+    """
+    api = HfApi(token=token)  # Hub client constructed with the token passed in by the caller
+    path_in_repo = f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
+    api.upload_folder(  # sends the whole run_dir tree, not a filtered subset
+        folder_path=str(run_dir),
+        path_in_repo=path_in_repo,  # destination prefix for this shard's files
+        repo_id=submissions_repo,
+        repo_type="dataset",  # target repo is addressed as a dataset repo
+        commit_message=f"add eval shard {shard_id} for {submission_id}",
+    )
+    print(
+        f"[eval_job] uploaded shard {shard_id} -> {path_in_repo}",
+        flush=True,
+    )
 def _run_eval(run_dir: Path, workers: int) -> None:
     """Invoke ``cadgenbench evaluate`` over *run_dir*; raise on non-zero."""
     cmd = [