Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich Cursor committed on 3 days ago

Commit

293bc3b

1 Parent(s): 8eb8954

submit: shard large submissions across HF Jobs + merge (UC3)

Submissions over SHARD_THRESHOLD (12) fixtures now fan out into
12-fixture shards dispatched at once, polled to terminal with ERROR-only
retry, then merged into one run via write_run_summary (regenerating the
report + gallery from the merged whole) before the row flips. At/under
the threshold a submission stays a single job, so the original path is
unchanged. Eval is CPU-bound, so more shards is the throughput lever.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show

submit.py +452 -10

submit.py CHANGED Viewed

@@ -108,6 +108,7 @@ from huggingface_hub import (
     hf_hub_download,
     inspect_job,
     run_job,
 )
 from huggingface_hub.errors import EntryNotFoundError
@@ -127,6 +128,12 @@ SUBMISSION_ID_SLUG_MAX = 40
 RESULTS_FILENAME = "results.jsonl"
 SUBMISSIONS_DIR = "submissions"
 REPORTS_DIR = "reports"
 DATA_REV_SHORT_LEN = 12
 FAILURE_REASON_MAX_CHARS = 200
 SHA256_BLOCK_SIZE = 64 * 1024
@@ -160,6 +167,31 @@ JOB_POLL_DEADLINE_SECONDS = 35 * 60
 JOB_LOG_TAIL_LINES = 30
 JOB_POLL_MAX_CONSECUTIVE_ERRORS = 5
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
 _HF_API = HfApi()
@@ -270,7 +302,7 @@ def handle_submit(
         except _HubWriteError as e:
             raise gr.Error(f"Submission rejected: {e}")
-        _spawn_worker(submission_id, blob_url)
         gr.Info(
             f"Submission {submission_id} queued for evaluation "
             f"({len(fixture_names)} fixtures). The eval runs on an "
@@ -688,31 +720,48 @@ def _resolve_data_revision() -> str:
 # ---------------------------------------------------------------------------
-def _spawn_worker(submission_id: str, submission_blob_url: str) -> None:
     """Start the dispatch+poll worker thread.
     Fire-and-forget; daemon=True so a Space restart doesn't block on
     in-flight workers (the boot-time sweep below flips any rows their
     workers didn't finish to failed). The worker no longer owns any
-    Space-local files; the Job downloads the zip itself from the Hub.
     """
     t = threading.Thread(
         target=_run_worker,
-        args=(submission_id, submission_blob_url),
         name=f"cgb-worker-{submission_id}",
         daemon=True,
     )
     t.start()
-def _run_worker(submission_id: str, submission_blob_url: str) -> None:
-    """Dispatch the eval Job, poll to completion, flip the row.
-    Any exception (dispatch, poll, fetch_run_summary, flip) maps to a
-    ``failed`` row with a short ``failure_reason`` (full traceback goes
-    to the Space's runtime logs).
     """
     try:
         job_id = _dispatch_eval_job(submission_id, submission_blob_url)
         logger.info("Dispatched eval job %s for %s", job_id, submission_id)
         stage, status_message = _poll_until_done(job_id, submission_id)
@@ -741,15 +790,88 @@ def _run_worker(submission_id: str, submission_blob_url: str) -> None:
             )
 def _dispatch_eval_job(
     submission_id: str, submission_blob_url: str,
 ) -> str:
-    """Dispatch the per-submission eval Job and return its id.
     Passes through every env var ``eval_job.py`` needs to resolve the
     Hub data + GT repos and the target submissions repo; the Job's
     HF_TOKEN secret comes from the Space's own HF_TOKEN env (which
     needs Jobs + repo R/W scopes, see space-setup/jobs-migration.md).
     """
     token = os.environ.get("HF_TOKEN")
     if not token:
@@ -768,6 +890,7 @@ def _dispatch_eval_job(
         image=f"hf.co/spaces/{EVAL_GPU_SPACE}",
         command=[
             "python", "/opt/eval_job.py", submission_id, submission_blob_url,
         ],
         flavor=EVAL_JOB_FLAVOR,
         namespace=EVAL_JOB_NAMESPACE,
@@ -779,6 +902,117 @@ def _dispatch_eval_job(
     return job.id
 def _poll_until_done(
     job_id: str, submission_id: str,
 ) -> tuple[str, str | None]:
@@ -868,6 +1102,214 @@ def _fetch_run_summary_from_report(submission_id: str) -> dict[str, Any]:
     return summary
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
     """Merge ``run_summary.json`` fields into the pending row."""
     updates: dict[str, Any] = {

     hf_hub_download,
     inspect_job,
     run_job,
+    snapshot_download,
 )
 from huggingface_hub.errors import EntryNotFoundError
 RESULTS_FILENAME = "results.jsonl"
 SUBMISSIONS_DIR = "submissions"
 REPORTS_DIR = "reports"
+RENDERS_DIR = "renders"
+# Single canonical view staged per fixture for the leaderboard gallery
+# thumbnail (matches eval_job.py's GALLERY_THUMB_VIEW). The merged-shard
+# path stages these from the merged run dir, exactly as the single-job
+# eval_job does from its in-job run dir.
+GALLERY_THUMB_VIEW = "iso"
 DATA_REV_SHORT_LEN = 12
 FAILURE_REASON_MAX_CHARS = 200
 SHA256_BLOCK_SIZE = 64 * 1024
 JOB_LOG_TAIL_LINES = 30
 JOB_POLL_MAX_CONSECUTIVE_ERRORS = 5
+# Sharded eval (UC3). A submission with more than SHARD_THRESHOLD
+# fixtures fans out across several jobs of SHARD_CHUNK_SIZE fixtures
+# each, dispatched all at once (HF queues any overflow past the
+# account's ~8 concurrent slots; queueing is a speed variable, never a
+# failure). Each shard uploads its per-fixture dirs under
+# ``reports/<id>/shards/<shard_id>/``; the Space merges them into one
+# run dir, recomputes the aggregate run_summary + report + gallery, and
+# deletes the shards tree. Eval is CPU-bound (tessellation + Manifold
+# booleans), so more machines is the throughput lever. At/under the
+# threshold a submission stays a single job (the original path), so the
+# extra dispatch/merge machinery only kicks in when it pays off.
+SHARD_THRESHOLD = 12
+SHARD_CHUNK_SIZE = 12
+SHARDS_SUBDIR = "shards"
+# ERROR-only retries per shard before the whole submission fails. A
+# shard re-run is idempotent (it re-evals its own fixture slice and
+# overwrites its upload prefix), so one cheap retry absorbs a transient
+# job/runtime blip without re-running the shards that already passed.
+SHARD_MAX_RETRIES = 1
+# Whole-fan-out poll deadline. Each shard job carries its own
+# ``EVAL_JOB_TIMEOUT``; this guards the Space-side poll loop. Generous
+# vs. the per-shard ceiling because queued shards (past the ~8
+# concurrent slots) wait their turn before their own timeout starts.
+SHARD_POLL_DEADLINE_SECONDS = 45 * 60
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
 _HF_API = HfApi()
         except _HubWriteError as e:
             raise gr.Error(f"Submission rejected: {e}")
+        _spawn_worker(submission_id, blob_url, sorted(fixture_names))
         gr.Info(
             f"Submission {submission_id} queued for evaluation "
             f"({len(fixture_names)} fixtures). The eval runs on an "
 # ---------------------------------------------------------------------------
def _spawn_worker(
    submission_id: str,
    submission_blob_url: str,
    fixture_names: list[str],
) -> None:
    """Launch the background thread that dispatches and polls the eval.

    The thread is started and never joined. It is a daemon thread so a
    Space restart does not block on in-flight workers (the boot-time
    sweep flips any row its worker did not finish to failed). The worker
    holds no Space-local files; the Job(s) download the zip from the Hub
    themselves.

    Args:
        submission_id: Id of the submission; also used in the thread name.
        submission_blob_url: Hub URL of the submission zip, handed to the
            worker unchanged.
        fixture_names: The validated, dataset-matched fixture set. The
            worker uses it to choose single-job vs. sharded dispatch and
            to split the shards.
    """
    worker_args = (submission_id, submission_blob_url, fixture_names)
    threading.Thread(
        name=f"cgb-worker-{submission_id}",
        target=_run_worker,
        args=worker_args,
        daemon=True,
    ).start()
+def _run_worker(
+    submission_id: str,
+    submission_blob_url: str,
+    fixture_names: list[str],
+) -> None:
+    """Dispatch the eval Job(s), poll to completion, flip the row.
+    Submissions at/under :data:`SHARD_THRESHOLD` fixtures run as a
+    single job (the original path); larger ones fan out across shards
+    and merge. Any exception (dispatch, poll, fetch/merge, flip) maps to
+    a ``failed`` row with a short ``failure_reason`` (full traceback
+    goes to the Space's runtime logs).
     """
     try:
+        if len(fixture_names) > SHARD_THRESHOLD:
+            _run_worker_sharded(
+                submission_id, submission_blob_url, fixture_names,
+            )
+            return
         job_id = _dispatch_eval_job(submission_id, submission_blob_url)
         logger.info("Dispatched eval job %s for %s", job_id, submission_id)
         stage, status_message = _poll_until_done(job_id, submission_id)
             )
def _run_worker_sharded(
    submission_id: str,
    submission_blob_url: str,
    fixture_names: list[str],
) -> None:
    """Fan a large submission out across shard jobs, then merge and flip.

    Steps, in order:

    1. Split *fixture_names* into chunks of ``SHARD_CHUNK_SIZE`` and
       create one state record per shard (``shard_000``, ``shard_001``...).
    2. Dispatch every shard up front.
    3. Poll all shards to a terminal state (ERROR shards are retried by
       the poll helper).
    4. If any shard failed, flip the row to ``failed`` with a truncated
       reason and stop; shard artifacts are left in place for inspection.
    5. Otherwise merge the shard outputs, flip the row to ``completed``
       with the merged summary, and delete the shards tree.

    Exceptions are not caught here; they propagate to the caller
    (:func:`_run_worker`), whose handler maps them to a failed row.
    """
    # Per-shard mutable state, keyed by shard id. ``job_id``/``attempts``
    # are filled in by ``_dispatch_shard``; ``stage``/``message`` by the
    # poll loop.
    shards: dict[str, dict[str, Any]] = {}
    for index, chunk in enumerate(
        _chunk_fixtures(fixture_names, SHARD_CHUNK_SIZE)
    ):
        shards[f"shard_{index:03d}"] = {
            "fixtures": chunk,
            "job_id": None,
            "attempts": 0,
            "stage": None,
            "message": None,
        }
    logger.info(
        "Sharded eval for %s: %d fixtures -> %d shard(s)",
        submission_id, len(fixture_names), len(shards),
    )

    for shard_id, shard_state in shards.items():
        _dispatch_shard(
            submission_id, submission_blob_url, shard_id, shard_state,
        )

    failures = _poll_shards_until_done(
        submission_id, submission_blob_url, shards,
    )
    if not failures:
        summary = _merge_shards_and_publish(
            submission_id, list(shards.keys()), fixture_names,
        )
        _flip_row_to_completed(submission_id, summary)
        logger.info("Sharded worker completed for %s", submission_id)
        # Cleanup runs only after the row is flipped.
        _cleanup_shard_artifacts(submission_id)
        return

    # At least one shard ended FAILED: report a bounded-length reason.
    full_reason = "sharded eval failed: " + "; ".join(failures)
    reason = full_reason[:FAILURE_REASON_MAX_CHARS]
    _flip_row_to_failed(submission_id, reason)
    logger.warning("Sharded eval for %s failed: %s", submission_id, reason)
def _chunk_fixtures(fixtures: list[str], chunk_size: int) -> list[list[str]]:
    """Split *fixtures* into contiguous chunks of at most *chunk_size*.

    Order is preserved and every fixture lands in exactly one chunk; the
    last chunk may be shorter. An empty *fixtures* list yields ``[]``.

    Raises:
        ValueError: if *chunk_size* is less than 1. A negative step would
            otherwise make ``range`` empty and silently drop every
            fixture, and zero would surface as an opaque ``range()``
            error.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        fixtures[start:start + chunk_size]
        for start in range(0, len(fixtures), chunk_size)
    ]
 def _dispatch_eval_job(
     submission_id: str, submission_blob_url: str,
 ) -> str:
+    """Dispatch the whole-submission eval Job and return its id."""
+    return _dispatch_eval_command(submission_id, submission_blob_url, [])
+def _dispatch_eval_command(
+    submission_id: str,
+    submission_blob_url: str,
+    extra_args: list[str],
+) -> str:
+    """Dispatch an eval Job (whole-submission or one shard) and return its id.
     Passes through every env var ``eval_job.py`` needs to resolve the
     Hub data + GT repos and the target submissions repo; the Job's
     HF_TOKEN secret comes from the Space's own HF_TOKEN env (which
     needs Jobs + repo R/W scopes, see space-setup/jobs-migration.md).
+    *extra_args* are appended to the entrypoint argv; empty for the
+    whole-submission path, ``--shard-id ... --fixtures ...`` for a shard.
     """
     token = os.environ.get("HF_TOKEN")
     if not token:
         image=f"hf.co/spaces/{EVAL_GPU_SPACE}",
         command=[
             "python", "/opt/eval_job.py", submission_id, submission_blob_url,
+            *extra_args,
         ],
         flavor=EVAL_JOB_FLAVOR,
         namespace=EVAL_JOB_NAMESPACE,
     return job.id
def _dispatch_shard(
    submission_id: str,
    submission_blob_url: str,
    shard_id: str,
    state: dict[str, Any],
) -> None:
    """Dispatch (or re-dispatch) one shard job and record it in *state*.

    The shard job is started with ``--shard-id <shard_id>`` and
    ``--fixtures <comma-joined state["fixtures"]>`` appended to the eval
    entrypoint's arguments.

    *state* is mutated in place: ``job_id`` is set to the new job,
    ``attempts`` is incremented, and ``stage``/``message`` are reset to
    ``None`` so a retried shard is polled fresh. Per the original design
    notes, a shard overwrites its own
    ``reports/<id>/shards/<shard_id>/`` prefix, which is what makes a
    retry idempotent.
    """
    fixtures = state["fixtures"]
    shard_argv = ["--shard-id", shard_id, "--fixtures", ",".join(fixtures)]
    new_job_id = _dispatch_eval_command(
        submission_id, submission_blob_url, shard_argv,
    )

    state["job_id"] = new_job_id
    state["attempts"] += 1
    state["stage"] = None
    state["message"] = None

    logger.info(
        "Dispatched shard %s for %s (attempt %d, job %s, %d fixtures)",
        shard_id, submission_id, state["attempts"], new_job_id,
        len(fixtures),
    )
def _poll_shards_until_done(
    submission_id: str,
    submission_blob_url: str,
    shards: dict[str, dict[str, Any]],
) -> list[str]:
    """Poll every shard to a terminal state, retrying only ERROR shards.

    A single thread sweeps all unsettled shards each tick. Per shard:

    * ``COMPLETED`` -> the shard is settled as ``COMPLETED``.
    * ``ERROR`` -> re-dispatched while ``attempts <= SHARD_MAX_RETRIES``
      (``attempts`` counts the initial dispatch, so this allows
      ``SHARD_MAX_RETRIES`` retries); otherwise settled as ``FAILED``.
    * ``CANCELED`` / ``DELETED`` -> settled as ``FAILED`` immediately,
      without a retry. These jobs will never progress, so waiting for the
      poll deadline would only delay the failure.
    * anything else (queued / running) -> wait for the next tick.

    Transient ``inspect_job`` failures abort the current sweep and are
    retried on the next tick; the exception is re-raised after
    ``JOB_POLL_MAX_CONSECUTIVE_ERRORS`` consecutive failures (the counter
    is shared across shards and reset by any successful call). When
    ``SHARD_POLL_DEADLINE_SECONDS`` elapses, every unsettled shard is
    marked ``FAILED`` with a deadline message.

    Args:
        submission_id: Submission being evaluated (logging + re-dispatch).
        submission_blob_url: Passed through to re-dispatched shards.
        shards: Shard-id -> state mapping; ``stage``/``message`` (and, on
            retry, ``job_id``/``attempts``) are mutated in place.

    Returns:
        ``"<shard_id>: <reason>"`` for each shard that ended ``FAILED``;
        an empty list means every shard completed.
    """
    settled = ("COMPLETED", "FAILED")
    # Terminal Hub job stages other than COMPLETED/ERROR. The retry
    # policy is ERROR-only, so these fail fast rather than re-dispatch.
    dead_stages = ("CANCELED", "DELETED")
    deadline = time.monotonic() + SHARD_POLL_DEADLINE_SECONDS
    consecutive_errors = 0
    while True:
        running = [
            sid for sid, st in shards.items()
            if st["stage"] not in settled
        ]
        if not running:
            break
        for shard_id in running:
            st = shards[shard_id]
            try:
                info = inspect_job(job_id=st["job_id"])
                consecutive_errors = 0
            except Exception as e:  # noqa: BLE001 - retry transient API errors
                consecutive_errors += 1
                logger.warning(
                    "inspect_job(%s) for shard %s failed (%d/%d): %s",
                    st["job_id"], shard_id, consecutive_errors,
                    JOB_POLL_MAX_CONSECUTIVE_ERRORS, e,
                )
                if consecutive_errors >= JOB_POLL_MAX_CONSECUTIVE_ERRORS:
                    raise
                break  # stop this sweep; sleep then retry
            stage = info.status.stage
            if stage == "COMPLETED":
                st["stage"] = "COMPLETED"
                logger.info("Shard %s COMPLETED for %s", shard_id, submission_id)
            elif stage == "ERROR" and st["attempts"] <= SHARD_MAX_RETRIES:
                logger.warning(
                    "Shard %s ERROR; retry %d/%d",
                    shard_id, st["attempts"], SHARD_MAX_RETRIES,
                )
                # Resets st["stage"] to None, so the shard stays unsettled
                # and is polled again under its new job id.
                _dispatch_shard(
                    submission_id, submission_blob_url, shard_id, st,
                )
            elif stage == "ERROR" or stage in dead_stages:
                st["stage"] = "FAILED"
                st["message"] = _job_failure_reason(
                    st["job_id"], stage, info.status.message,
                )
                logger.warning(
                    "Shard %s FAILED after %d attempt(s): %s",
                    shard_id, st["attempts"], st["message"],
                )
        # Everything settled during this sweep: return now instead of
        # sleeping one more poll interval just to find nothing running.
        if all(st["stage"] in settled for st in shards.values()):
            break
        if time.monotonic() >= deadline:
            for shard_id, st in shards.items():
                if st["stage"] not in settled:
                    st["stage"] = "FAILED"
                    st["message"] = (
                        f"Space-side poll deadline exceeded "
                        f"({SHARD_POLL_DEADLINE_SECONDS}s)"
                    )
            break
        time.sleep(JOB_POLL_INTERVAL_SECONDS)
    return [
        f"{sid}: {st['message']}"
        for sid, st in shards.items()
        if st["stage"] == "FAILED"
    ]
 def _poll_until_done(
     job_id: str, submission_id: str,
 ) -> tuple[str, str | None]:
     return summary
def _merge_shards_and_publish(
    submission_id: str,
    shard_ids: list[str],
    fixture_names: list[str],
) -> dict[str, Any]:
    """Merge every shard's per-fixture dirs into one run and publish it.

    Downloads ``reports/<id>/shards/**`` from the submissions dataset,
    copies each shard's ``<fixture>/`` dir (those containing a
    ``result.json``) into a single merged run dir, then rebuilds the
    aggregate from that dir: ``write_run_summary``, the ``report.json``
    bundle, the HTML report, and the gallery thumbnails. Uploads
    ``reports/<id>.{html,json}`` plus the gallery renders.

    Args:
        submission_id: Submission whose shards are merged.
        shard_ids: Ids of the shards that were dispatched; each must have
            a directory under the downloaded shards tree.
        fixture_names: Every fixture the merged run is expected to cover.

    Returns:
        The merged ``run_summary`` dict, for the row flip.

    Raises:
        RuntimeError: if the shards tree or any expected shard directory
            is missing, a fixture appears in more than one shard, or the
            merged set does not cover every expected fixture. Each of
            these means the fan-out lost or duplicated work, so the row
            should fail rather than publish a partial aggregate.
    """
    # Imported lazily from the Space's own cadgenbench install; these are
    # described in the original notes as public eval APIs.
    from cadgenbench.eval.report.single_run import discover_run, generate_html
    from cadgenbench.eval.run_summary import write_run_summary

    tmp = Path(tempfile.mkdtemp(prefix=f"cgb-merge-{submission_id}-"))
    try:
        download_root = Path(
            snapshot_download(
                repo_id=HF_SUBMISSIONS_REPO,
                repo_type="dataset",
                allow_patterns=[
                    f"{REPORTS_DIR}/{submission_id}/{SHARDS_SUBDIR}/**"
                ],
                local_dir=str(tmp / "dl"),
            )
        )
        shards_root = (
            download_root / REPORTS_DIR / submission_id / SHARDS_SUBDIR
        )
        if not shards_root.is_dir():
            raise RuntimeError(
                f"No shard artifacts found under {shards_root} after download."
            )

        # Name the shard(s) whose upload never landed, instead of only
        # surfacing the loss later as a list of missing fixtures.
        missing_shards = [
            sid for sid in shard_ids if not (shards_root / sid).is_dir()
        ]
        if missing_shards:
            raise RuntimeError(
                f"Missing shard artifact tree(s) for {submission_id}: "
                f"{', '.join(missing_shards[:5])}"
                + ("..." if len(missing_shards) > 5 else "")
            )

        merged_run = tmp / "run"
        merged_run.mkdir()
        seen: set[str] = set()
        for shard_dir in sorted(p for p in shards_root.iterdir() if p.is_dir()):
            for fixture_dir in sorted(
                p for p in shard_dir.iterdir() if p.is_dir()
            ):
                # Only real fixture dirs carry result.json; skip anything
                # else the shard upload may have swept in.
                if not (fixture_dir / "result.json").is_file():
                    continue
                name = fixture_dir.name
                if name in seen:
                    raise RuntimeError(
                        f"Fixture {name!r} present in more than one shard."
                    )
                seen.add(name)
                shutil.copytree(fixture_dir, merged_run / name)

        missing = set(fixture_names) - seen
        if missing:
            raise RuntimeError(
                f"Merged run missing {len(missing)} fixture(s) after shard "
                f"merge: {', '.join(sorted(missing)[:5])}"
                + ("..." if len(missing) > 5 else "")
            )

        write_run_summary(merged_run)
        report_json = _build_report_json(merged_run)
        run_data = discover_run(merged_run)
        html = generate_html(run_data)
        html_path = tmp / f"{submission_id}.html"
        html_path.write_text(html, encoding="utf-8")
        _upload_reports(submission_id, html_path, report_json)
        _upload_gallery_renders_from_dir(submission_id, merged_run)
        return report_json["run_summary"]
    finally:
        # The scratch dir holds the download and the merged run; always
        # remove it, including on failure.
        shutil.rmtree(tmp, ignore_errors=True)
def _build_report_json(run_dir: Path) -> dict[str, Any]:
    """Bundle ``run_summary.json`` with every per-fixture ``result.json``.

    Reads ``<run_dir>/run_summary.json`` and, for each immediate
    subdirectory of *run_dir* (visited in sorted order) that contains a
    ``result.json``, that file's parsed contents keyed by the
    subdirectory name. Subdirectories without a ``result.json`` are
    ignored.

    Returns:
        ``{"run_summary": <summary>, "per_fixture_results": {name: result}}``.

    Raises:
        RuntimeError: if ``run_summary.json`` does not exist under
            *run_dir*.
    """
    summary_file = run_dir / "run_summary.json"
    if not summary_file.is_file():
        raise RuntimeError(
            f"run_summary.json not produced under {run_dir} (merge issue?)"
        )
    run_summary = json.loads(summary_file.read_text(encoding="utf-8"))

    subdirs = sorted(entry for entry in run_dir.iterdir() if entry.is_dir())
    result_files = ((sub.name, sub / "result.json") for sub in subdirs)
    per_fixture: dict[str, dict[str, Any]] = {
        fixture: json.loads(result_file.read_text(encoding="utf-8"))
        for fixture, result_file in result_files
        if result_file.is_file()
    }
    return {"run_summary": run_summary, "per_fixture_results": per_fixture}
def _upload_reports(
    submission_id: str,
    html_path: Path,
    report_json: dict[str, Any],
) -> None:
    """Upload the merged HTML and JSON reports to the submissions dataset.

    Two uploads through the process-wide ``_HF_API`` client, in this
    order:

    * the file at *html_path* -> ``reports/<id>.html``
    * *report_json*, serialized as indented UTF-8 JSON ->
      ``reports/<id>.json``

    The original notes say this mirrors ``eval_job.py``'s uploader so the
    merged artifacts land where the leaderboard and row flip expect them.
    """
    target = {"repo_id": HF_SUBMISSIONS_REPO, "repo_type": "dataset"}
    report_stem = f"{REPORTS_DIR}/{submission_id}"

    _HF_API.upload_file(
        path_or_fileobj=str(html_path),
        path_in_repo=f"{report_stem}.html",
        commit_message=f"add merged HTML report for {submission_id}",
        **target,
    )

    # Serialized only after the HTML upload, preserving the original
    # order of side effects.
    json_bytes = json.dumps(
        report_json, ensure_ascii=False, indent=2,
    ).encode("utf-8")
    _HF_API.upload_file(
        path_or_fileobj=json_bytes,
        path_in_repo=f"{report_stem}.json",
        commit_message=f"add merged JSON report for {submission_id}",
        **target,
    )
    logger.info("Uploaded merged reports/%s.{html,json}", submission_id)
def _upload_gallery_renders_from_dir(
    submission_id: str,
    run_dir: Path,
) -> None:
    """Stage one thumbnail per fixture for the leaderboard gallery.

    Every ``<run_dir>/<fixture>/renders/<GALLERY_THUMB_VIEW>.png`` is
    uploaded as ``renders/<id>/<fixture>.png``. A fixture directory
    without that PNG is skipped and is not an error. If no fixture has a
    thumbnail, nothing is uploaded.

    All thumbnails go up in a single Hub commit rather than one commit
    per fixture. This path only runs for sharded (large) submissions, so
    batching saves a commit round-trip per fixture and keeps the gallery
    all-or-nothing if the upload fails partway.
    """
    # Function-scope import: huggingface_hub is already a dependency of
    # this module, and this keeps the edit self-contained.
    from huggingface_hub import CommitOperationAdd

    operations = []
    for fixture_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
        thumb = fixture_dir / "renders" / f"{GALLERY_THUMB_VIEW}.png"
        if not thumb.is_file():
            continue
        operations.append(
            CommitOperationAdd(
                path_in_repo=(
                    f"{RENDERS_DIR}/{submission_id}/{fixture_dir.name}.png"
                ),
                path_or_fileobj=str(thumb),
            )
        )
    if not operations:
        logger.info("No gallery renders to upload for %s", submission_id)
        return
    _HF_API.create_commit(
        repo_id=HF_SUBMISSIONS_REPO,
        repo_type="dataset",
        operations=operations,
        commit_message=(
            f"add {len(operations)} gallery render(s) for {submission_id}"
        ),
    )
    logger.info(
        "Uploaded %d gallery render(s) under %s/%s/",
        len(operations), RENDERS_DIR, submission_id,
    )
def _cleanup_shard_artifacts(submission_id: str) -> None:
    """Delete ``reports/<id>/shards/`` from the submissions dataset.

    Best-effort by design: any exception from the delete is logged as a
    warning and swallowed, so a failed cleanup leaves scratch data behind
    but never fails a submission that has already completed.
    """
    shards_prefix = f"{REPORTS_DIR}/{submission_id}/{SHARDS_SUBDIR}"
    try:
        _HF_API.delete_folder(
            path_in_repo=shards_prefix,
            repo_id=HF_SUBMISSIONS_REPO,
            repo_type="dataset",
            commit_message=f"clean up eval shards for {submission_id}",
        )
    except Exception as exc:  # noqa: BLE001 - cleanup is best-effort
        logger.warning(
            "Shard-artifact cleanup failed for %s (%s: %s); leaving scratch",
            submission_id, type(exc).__name__, exc,
        )
        return
    logger.info("Cleaned up shard artifacts for %s", submission_id)
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
     """Merge ``run_summary.json`` fields into the pending row."""
     updates: dict[str, Any] = {