Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

App Files Files Community

michaelr27 HF Staff committed 7 days ago

Commit

fd61157

verified ·

1 Parent(s): ed2a486

step 10 / jobs-migration: submit.py -> dispatch eval to HF Jobs

Browse files

Files changed (1) hide show

submit.py +220 -162

submit.py CHANGED Viewed

@@ -1,17 +1,22 @@
 """Submit-tab handler for the CADGenBench leaderboard Space.
-Step 6 (E) chunks 2 + 3 + 4 + 6: cheap-sync validation + pending-row
-write + zip upload + background-thread eval + boot-time stuck-pending
-sweep. The handler validates the upload, uploads the zip to
 ``submissions/<id>.zip``, appends a ``status: pending`` row to
 ``results.jsonl`` (under a process-wide lock), spawns a daemon thread
-to run ``cadgenbench evaluate`` + ``cadgenbench report single``, and
-returns immediately. The worker uploads ``reports/<id>.{html,json}``
-and flips the row ``pending -> completed`` (or ``failed`` with a
-``failure_reason``). At module import a one-shot daemon sweep flips
-any ``pending`` row whose ``submitted_at`` is older than 30 min to
-``failed`` with a "Space restart" reason, so rows stranded by a deploy
-/ OOM / crash don't sit pending forever.
 Validation gates, in order:
@@ -46,18 +51,21 @@ rejection; an orphan-zip sweep is a future-chunk concern.
 Background worker, per submission:
-1. ``cadgenbench evaluate <run_dir>`` (subprocess; runs
-   per-fixture eval in parallel via the CLI's ProcessPoolExecutor;
-   writes ``run_summary.json`` at the run-dir root).
-2. ``cadgenbench report single <run_dir> -o <report.html>``
-   (subprocess; self-contained HTML with embedded renders).
-3. Upload ``reports/<id>.html`` + ``reports/<id>.json``. The JSON
-   bundles ``run_summary.json`` + every per-fixture ``result.json``.
-4. Read ``run_summary.json``; under ``_HUB_LOCK`` flip the row's
-   ``status`` to ``"completed"`` and merge the score fields.
-5. On any worker-side exception, flip the row to ``"failed"`` with
-   a short ``failure_reason``. Tempdir cleanup runs in ``finally``
-   either way.
 """
 from __future__ import annotations
@@ -67,10 +75,9 @@ import logging
 import os
 import re
 import shutil
-import subprocess
-import sys
 import tempfile
 import threading
 import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timedelta, timezone
@@ -81,7 +88,13 @@ import cadgenbench
 import gradio as gr
 from cadgenbench.common.paths import data_inputs_dir
 from cadgenbench.common.validity import parse_step
-from huggingface_hub import HfApi
 from huggingface_hub.errors import EntryNotFoundError
 from leaderboard import HF_DATA_REPO, HF_SUBMISSIONS_REPO
@@ -102,22 +115,33 @@ SUBMISSIONS_DIR = "submissions"
 REPORTS_DIR = "reports"
 DATA_REV_SHORT_LEN = 12
 FAILURE_REASON_MAX_CHARS = 200
-EVAL_TIMEOUT_SECONDS = 15 * 60
-REPORT_TIMEOUT_SECONDS = 2 * 60
-# Per-fixture eval workers. Set to "1" (sequential) because 4 parallel
-# workers OOM-kill on the Space's cpu-upgrade tier: cadquery-ocp +
-# manifold3d + VTK osmesa state per worker, times 4, exceeds the box's
-# RAM and the kernel sends a SIGKILL that surfaces in the parent as
-# concurrent.futures.process.BrokenProcessPool after ~5 min of render.
-# Revisit once we have a hardware tier with more RAM, or once the
-# per-worker footprint shrinks (e.g. lazier OCC imports).
-EVAL_WORKER_COUNT = "1"
 SHA256_BLOCK_SIZE = 64 * 1024
 STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
 SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
 STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
 BOOT_SWEEP_ENV = "CADGENBENCH_DISABLE_BOOT_SWEEP"
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
 _HF_API = HfApi()
@@ -178,10 +202,11 @@ def handle_submit(
     zip_path = Path(zip_file.name)
-    # Manual tempdir lifecycle: cleaned up here on any rejection, but
-    # ownership passes to the worker on a successful spawn (the worker
-    # cleans up in its own finally). TemporaryDirectory's context
-    # manager doesn't fit because the dir has to outlive this function.
     tmp = Path(tempfile.mkdtemp(prefix="cadgenbench-submit-"))
     run_dir = tmp / "run"
     run_dir.mkdir()
@@ -229,18 +254,16 @@ def handle_submit(
         except _HubWriteError as e:
             raise gr.Error(f"Submission rejected: {e}")
-        _spawn_worker(submission_id, tmp, run_dir)
-        tmp = None  # ownership transferred; skip cleanup below
         progress(1.0, desc="Queued")
         gr.Info(
             f"Submission {submission_id} queued for evaluation "
-            f"({len(fixture_names)} fixtures). Evaluation typically "
-            f"takes 2-5 minutes; the row flips to completed when the "
-            f"worker finishes."
         )
     finally:
-        if tmp is not None:
-            shutil.rmtree(tmp, ignore_errors=True)
 def _validate_form(zip_file) -> str | None:
@@ -646,153 +669,188 @@ def _resolve_data_revision() -> str:
 # ---------------------------------------------------------------------------
-# Background worker (eval + report + row flip)
 # ---------------------------------------------------------------------------
-def _spawn_worker(submission_id: str, tmp: Path, run_dir: Path) -> None:
-    """Start the eval worker thread. Fire-and-forget; daemon=True so a
-    Space restart doesn't block on in-flight workers (chunk 6's
-    boot-time sweep flips any rows their workers didn't finish to
-    failed).
     """
     t = threading.Thread(
         target=_run_worker,
-        args=(submission_id, tmp, run_dir),
         name=f"cgb-worker-{submission_id}",
         daemon=True,
     )
     t.start()
-def _run_worker(submission_id: str, tmp: Path, run_dir: Path) -> None:
-    """Top-level worker entry: run eval, build + upload reports, flip row.
-    Any exception in the pipeline flips the row to ``failed`` with a
-    short ``failure_reason`` (full traceback goes to the Space's
-    runtime logs). The tempdir is always cleaned up.
     """
     try:
-        try:
-            _run_eval(run_dir)
-            report_html = tmp / f"{submission_id}.html"
-            _run_report(run_dir, report_html)
-            report_json = _build_report_json(run_dir)
-            _upload_reports(submission_id, report_html, report_json)
-            summary = json.loads(
-                (run_dir / "run_summary.json").read_text(encoding="utf-8")
-            )
             _flip_row_to_completed(submission_id, summary)
             logger.info("Worker completed for %s", submission_id)
-        except Exception as e:  # noqa: BLE001 - broad on purpose; we map to row state
-            logger.exception("Worker failed for %s", submission_id)
-            reason = f"{type(e).__name__}: {str(e)}"[:FAILURE_REASON_MAX_CHARS]
-            try:
-                _flip_row_to_failed(submission_id, reason)
-            except Exception:
-                # If even the row-flip fails, the row stays pending.
-                # Chunk 6's stuck-pending sweep will catch it on the
-                # next Space boot.
-                logger.exception(
-                    "Failed to flip row to failed for %s; row stays pending",
-                    submission_id,
-                )
-    finally:
-        shutil.rmtree(tmp, ignore_errors=True)
-def _run_eval(run_dir: Path) -> None:
-    """Invoke ``cadgenbench evaluate`` over the run_dir; raise on non-zero."""
-    cmd = [
-        sys.executable, "-m", "cadgenbench.cli", "evaluate", str(run_dir),
-        "--workers", EVAL_WORKER_COUNT,
-    ]
-    logger.info("Running eval: %s", " ".join(cmd))
-    proc = subprocess.run(
-        cmd,
-        capture_output=True,
-        text=True,
-        timeout=EVAL_TIMEOUT_SECONDS,
-        env=os.environ.copy(),
-        check=False,
-    )
-    if proc.returncode != 0:
-        # Dump the full subprocess output to the container log so the
-        # actual child crash (segfault, OSMesa init error, etc.) is
-        # recoverable via the Space's run logs. The user-facing
-        # failure_reason field stays short (200 chars cap downstream).
-        logger.error(
-            "cadgenbench evaluate exited %s\n--- STDERR ---\n%s\n--- STDOUT ---\n%s",
-            proc.returncode, proc.stderr or "", proc.stdout or "",
-        )
-        tail = (proc.stderr or proc.stdout or "")[-500:].strip()
         raise RuntimeError(
-            f"cadgenbench evaluate exited {proc.returncode}: {tail}"
         )
-def _run_report(run_dir: Path, html_out: Path) -> None:
-    """Invoke ``cadgenbench report single`` for the run_dir; raise on non-zero."""
-    cmd = [
-        sys.executable, "-m", "cadgenbench.cli", "report", "single",
-        str(run_dir), "-o", str(html_out),
-    ]
-    logger.info("Running report: %s", " ".join(cmd))
-    proc = subprocess.run(
-        cmd,
-        capture_output=True,
-        text=True,
-        timeout=REPORT_TIMEOUT_SECONDS,
-        env=os.environ.copy(),
-        check=False,
-    )
-    if proc.returncode != 0 or not html_out.is_file():
-        logger.error(
-            "cadgenbench report single exited %s\n--- STDERR ---\n%s\n--- STDOUT ---\n%s",
-            proc.returncode, proc.stderr or "", proc.stdout or "",
-        )
-        tail = (proc.stderr or proc.stdout or "")[-500:].strip()
-        raise RuntimeError(
-            f"cadgenbench report single exited {proc.returncode}: {tail}"
-        )
-def _build_report_json(run_dir: Path) -> dict[str, Any]:
-    """Bundle ``run_summary.json`` + every per-fixture ``result.json``."""
-    summary_path = run_dir / "run_summary.json"
-    if not summary_path.is_file():
-        raise RuntimeError(
-            f"run_summary.json not produced under {run_dir} (eval issue?)"
-        )
-    summary = json.loads(summary_path.read_text(encoding="utf-8"))
-    per_fixture: dict[str, dict[str, Any]] = {}
-    for fixture_dir in sorted(d for d in run_dir.iterdir() if d.is_dir()):
-        rp = fixture_dir / "result.json"
-        if rp.is_file():
-            per_fixture[fixture_dir.name] = json.loads(
-                rp.read_text(encoding="utf-8")
             )
-    return {"run_summary": summary, "per_fixture_results": per_fixture}
-def _upload_reports(
-    submission_id: str, html_path: Path, report_json: dict[str, Any],
-) -> None:
-    """Upload ``reports/<id>.html`` and ``reports/<id>.json`` to the Hub."""
-    _HF_API.upload_file(
-        path_or_fileobj=str(html_path),
-        path_in_repo=f"{REPORTS_DIR}/{submission_id}.html",
-        repo_id=HF_SUBMISSIONS_REPO,
-        repo_type="dataset",
-        commit_message=f"add HTML report for {submission_id}",
-    )
-    _HF_API.upload_file(
-        path_or_fileobj=json.dumps(report_json, ensure_ascii=False, indent=2).encode("utf-8"),
-        path_in_repo=f"{REPORTS_DIR}/{submission_id}.json",
         repo_id=HF_SUBMISSIONS_REPO,
         repo_type="dataset",
-        commit_message=f"add JSON report for {submission_id}",
     )
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:

 """Submit-tab handler for the CADGenBench leaderboard Space.
+Step 6 (E) chunks 2 + 3 + 4 + 6 + Step 10 (jobs migration): cheap-sync
+validation + pending-row write + zip upload + background dispatch +
+poll of an HF Jobs GPU eval + boot-time stuck-pending sweep. The
+handler validates the upload, uploads the zip to
 ``submissions/<id>.zip``, appends a ``status: pending`` row to
 ``results.jsonl`` (under a process-wide lock), spawns a daemon thread
+that dispatches a per-submission HF Job against the
+``HuggingAI4Engineering/cadgenbench-eval-gpu`` image and polls
+``inspect_job`` until the job's stage is terminal. On COMPLETED the
+worker downloads ``reports/<id>.json`` (the Job already uploaded
+``reports/<id>.{html,json}`` to the submissions dataset), reads
+``run_summary`` out of it, and flips the row ``pending -> completed``.
+On ERROR (or any dispatch / poll exception) the row flips to ``failed``
+with a short ``failure_reason``. At module import a one-shot daemon
+sweep flips any ``pending`` row whose ``submitted_at`` is older than
+30 min to ``failed`` with a "Space restart" reason, so rows stranded by
+a deploy / OOM / crash / orphaned Job don't sit pending forever.
 Validation gates, in order:
 Background worker, per submission:
+1. ``huggingface_hub.run_job(...)`` dispatches an HF Job against
+   the ``cadgenbench-eval-gpu`` Space image on ``a10g-large``,
+   passing the submission_id + zip blob URL as command args and
+   ``HF_TOKEN`` as a secret.
+2. Poll ``inspect_job(job_id)`` every few seconds until the job's
+   stage is terminal (``COMPLETED`` or ``ERROR``). Outer deadline
+   guards against an unresponsive poll surface.
+3. On ``COMPLETED``: download ``reports/<id>.json`` from the
+   submissions dataset (the Job uploaded both
+   ``reports/<id>.{html,json}`` before exiting), read
+   ``run_summary`` out of the bundled payload, under ``_HUB_LOCK``
+   flip the row to ``"completed"`` and merge the score fields.
+4. On ``ERROR`` (or any dispatch / poll exception), flip the row to
+   ``"failed"`` with a short ``failure_reason`` (the job's
+   ``status.message`` plus the last N lines of ``fetch_job_logs``).
 """
 from __future__ import annotations
 import os
 import re
 import shutil
 import tempfile
 import threading
+import time
 import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timedelta, timezone
 import gradio as gr
 from cadgenbench.common.paths import data_inputs_dir
 from cadgenbench.common.validity import parse_step
+from huggingface_hub import (
+    HfApi,
+    fetch_job_logs,
+    hf_hub_download,
+    inspect_job,
+    run_job,
+)
 from huggingface_hub.errors import EntryNotFoundError
 from leaderboard import HF_DATA_REPO, HF_SUBMISSIONS_REPO
 REPORTS_DIR = "reports"
 DATA_REV_SHORT_LEN = 12
 FAILURE_REASON_MAX_CHARS = 200
 SHA256_BLOCK_SIZE = 64 * 1024
 STUCK_PENDING_THRESHOLD_SECONDS = 30 * 60
 SUBMITTED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
 STUCK_PENDING_REASON = "evaluation interrupted by Space restart"
 BOOT_SWEEP_ENV = "CADGENBENCH_DISABLE_BOOT_SWEEP"
+# HF Jobs target. The eval-gpu image is hosted as a Docker Space
+# (paused; image-only) at HuggingAI4Engineering/cadgenbench-eval-gpu.
+# Jobs run under the personal `michaelr27` namespace (no-bill for
+# HF employees per Round 6 of space-setup/leandro.md). a10g-large
+# fits cadgenbench evaluate --workers 8 comfortably in 46 GB RAM.
+EVAL_GPU_SPACE = "HuggingAI4Engineering/cadgenbench-eval-gpu"
+EVAL_JOB_FLAVOR = "a10g-large"
+EVAL_JOB_NAMESPACE = "michaelr27"
+EVAL_JOB_TIMEOUT = "30m"
+EVAL_JOB_WORKER_COUNT = "8"
+# Poll cadence + outer deadline guarding inspect_job. 5 s is fast
+# enough that a 60 s eval is picked up within 10 s of completing,
+# slow enough that we don't hammer the API. The deadline is the
+# Job's own 30 min timeout plus 5 min of slack; the Job is the
+# source of truth, this is just a belt for an unresponsive
+# inspect_job surface.
+JOB_POLL_INTERVAL_SECONDS = 5
+JOB_POLL_DEADLINE_SECONDS = 35 * 60
+JOB_LOG_TAIL_LINES = 30
+JOB_POLL_MAX_CONSECUTIVE_ERRORS = 5
 # One HfApi client per process. HF_TOKEN is picked up from the env at
 # construction time and reused for every call.
 _HF_API = HfApi()
     zip_path = Path(zip_file.name)
+    # The tempdir lives only for the cheap-sync validation pass
+    # (unpack zip, validate meta + fixture set + STEP parseability).
+    # The Job downloads the zip itself from the Hub, so the
+    # Space-local unpack is throwaway and the tempdir gets cleaned
+    # up unconditionally in the outer finally.
     tmp = Path(tempfile.mkdtemp(prefix="cadgenbench-submit-"))
     run_dir = tmp / "run"
     run_dir.mkdir()
         except _HubWriteError as e:
             raise gr.Error(f"Submission rejected: {e}")
+        _spawn_worker(submission_id, blob_url)
         progress(1.0, desc="Queued")
         gr.Info(
             f"Submission {submission_id} queued for evaluation "
+            f"({len(fixture_names)} fixtures). The eval runs on an "
+            f"HF Jobs GPU; the row flips to completed when the job "
+            f"finishes (typically 1-3 minutes)."
         )
     finally:
+        shutil.rmtree(tmp, ignore_errors=True)
 def _validate_form(zip_file) -> str | None:
 # ---------------------------------------------------------------------------
+# Background worker (dispatch eval to HF Jobs, poll, flip row)
 # ---------------------------------------------------------------------------
+def _spawn_worker(submission_id: str, submission_blob_url: str) -> None:
+    """Launch the background thread that dispatches and polls the eval Job.
+    The thread runs ``_run_worker(submission_id, submission_blob_url)``
+    under the name ``cgb-worker-<submission_id>``. It is a daemon
+    thread, so the process can exit without waiting for it. This
+    function returns as soon as the thread has been started; nothing
+    is joined and nothing is returned.
     """
+    worker_args = (submission_id, submission_blob_url)
+    worker_name = f"cgb-worker-{submission_id}"
+    threading.Thread(
         target=_run_worker,
+        args=worker_args,
+        name=worker_name,
         daemon=True,
+    ).start()
+def _run_worker(submission_id: str, submission_blob_url: str) -> None:
+    """Drive one submission from Job dispatch to its final row state.
+    Dispatches the eval Job, waits for a terminal stage, then either
+    merges the run summary into a ``completed`` row or records a
+    ``failed`` row with a short reason. Any exception raised along the
+    way is logged with its traceback and also mapped to a ``failed``
+    row; if that last flip raises too, the error is logged and the row
+    stays ``pending``.
     """
     try:
+        job_id = _dispatch_eval_job(submission_id, submission_blob_url)
+        logger.info("Dispatched eval job %s for %s", job_id, submission_id)
+        stage, status_message = _poll_until_done(job_id, submission_id)
+        if stage != "COMPLETED":
+            # Non-success terminal stage: record why and stop here.
+            reason = _job_failure_reason(job_id, stage, status_message)
+            _flip_row_to_failed(submission_id, reason)
+            logger.warning(
+                "Eval job %s for %s ended %s: %s",
+                job_id, submission_id, stage, reason,
+            )
+            return
+        summary = _fetch_run_summary_from_report(submission_id)
+        _flip_row_to_completed(submission_id, summary)
+        logger.info("Worker completed for %s", submission_id)
+    except Exception as exc:  # noqa: BLE001 - broad on purpose; we map to row state
+        logger.exception("Worker failed for %s", submission_id)
+        short_reason = f"{type(exc).__name__}: {str(exc)}"[:FAILURE_REASON_MAX_CHARS]
+        try:
+            _flip_row_to_failed(submission_id, short_reason)
+        except Exception:
+            # Nothing more can be done from this thread; log it and
+            # leave the row as it is.
+            logger.exception(
+                "Failed to flip row to failed for %s; row stays pending",
+                submission_id,
+            )
+def _dispatch_eval_job(
+    submission_id: str, submission_blob_url: str,
+) -> str:
+    """Launch the eval Job for one submission and return the Job id.
+    The Job runs ``python /opt/eval_job.py <submission_id> <blob_url>``
+    on the ``EVAL_GPU_SPACE`` image with flavor ``EVAL_JOB_FLAVOR``,
+    namespace ``EVAL_JOB_NAMESPACE`` and timeout ``EVAL_JOB_TIMEOUT``.
+    Its environment carries ``HF_SUBMISSIONS_REPO`` and
+    ``EVAL_WORKER_COUNT``, plus ``CADGENBENCH_DATA_REPO`` /
+    ``CADGENBENCH_DATA_GT_REPO`` when those are set (non-empty) on the
+    Space. The Space's ``HF_TOKEN`` both authenticates the dispatch
+    call and is handed to the Job as its ``HF_TOKEN`` secret.
+    Raises ``RuntimeError`` when ``HF_TOKEN`` is unset or empty.
+    """
+    hf_token = os.environ.get("HF_TOKEN")
+    if not hf_token:
         raise RuntimeError(
+            "HF_TOKEN is unset on the Space; cannot dispatch eval job."
         )
+    optional_keys = ("CADGENBENCH_DATA_REPO", "CADGENBENCH_DATA_GT_REPO")
+    job_env: dict[str, str] = {
+        "HF_SUBMISSIONS_REPO": HF_SUBMISSIONS_REPO,
+        "EVAL_WORKER_COUNT": EVAL_JOB_WORKER_COUNT,
+        # Forward the optional repo overrides only when they are set.
+        **{
+            key: os.environ[key]
+            for key in optional_keys
+            if os.environ.get(key)
+        },
+    }
+    job_command = [
+        "python", "/opt/eval_job.py", submission_id, submission_blob_url,
+    ]
+    job = run_job(
+        image=f"hf.co/spaces/{EVAL_GPU_SPACE}",
+        command=job_command,
+        flavor=EVAL_JOB_FLAVOR,
+        namespace=EVAL_JOB_NAMESPACE,
+        env=job_env,
+        secrets={"HF_TOKEN": hf_token},
+        timeout=EVAL_JOB_TIMEOUT,
+        token=hf_token,
+    )
+    return job.id
+def _poll_until_done(
+    job_id: str, submission_id: str,
+) -> tuple[str, str | None]:
+    """Poll ``inspect_job`` until terminal; return (stage, message).
+    Terminal stages: ``COMPLETED``, ``ERROR``, ``CANCELED``, ``DELETED``.
+    A canceled or deleted Job never reaches ``COMPLETED`` / ``ERROR``,
+    so it is returned right away instead of being polled until the
+    deadline; the caller treats every stage other than ``COMPLETED``
+    as a failure. Anything else after the outer deadline counts as a
+    synthetic ``ERROR`` with a "deadline exceeded" message; we do not
+    try to cancel the Job from here (the Job carries its own
+    ``timeout`` and HF will reap it). The Job is looked up in
+    ``EVAL_JOB_NAMESPACE``, the namespace it was dispatched to.
+    Transient ``inspect_job`` errors retry up to
+    ``JOB_POLL_MAX_CONSECUTIVE_ERRORS`` consecutive failures before
+    the last one is re-raised. ``submission_id`` is accepted for the
+    caller's convenience and is currently unused.
+    """
+    terminal_stages = ("COMPLETED", "ERROR", "CANCELED", "DELETED")
+    deadline = time.monotonic() + JOB_POLL_DEADLINE_SECONDS
+    consecutive_errors = 0
+    while True:
+        try:
+            # Look the Job up in the namespace run_job created it in,
+            # not in the token owner's default namespace.
+            info = inspect_job(job_id=job_id, namespace=EVAL_JOB_NAMESPACE)
+            consecutive_errors = 0
+        except Exception as e:  # noqa: BLE001 - retry transient API errors
+            consecutive_errors += 1
+            logger.warning(
+                "inspect_job(%s) failed (%d/%d): %s",
+                job_id, consecutive_errors,
+                JOB_POLL_MAX_CONSECUTIVE_ERRORS, e,
+            )
+            if consecutive_errors >= JOB_POLL_MAX_CONSECUTIVE_ERRORS:
+                raise
+            time.sleep(JOB_POLL_INTERVAL_SECONDS)
+            continue
+        stage = info.status.stage
+        message = info.status.message
+        if stage in terminal_stages:
+            return stage, message
+        if time.monotonic() >= deadline:
+            return "ERROR", (
+                f"Space-side poll deadline exceeded "
+                f"({JOB_POLL_DEADLINE_SECONDS}s); last stage={stage}"
             )
+        time.sleep(JOB_POLL_INTERVAL_SECONDS)
+def _job_failure_reason(
+    job_id: str, stage: str, status_message: str | None,
+) -> str:
+    """Build a short ``failure_reason`` for a non-completed Job.
+    Joins, with ``": "``, the lower-cased stage (``eval job <stage>``),
+    the job's own ``status.message`` (if any) and the last
+    ``JOB_LOG_TAIL_LINES`` of ``fetch_job_logs``, then truncates the
+    result to ``FAILURE_REASON_MAX_CHARS``. Logs are fetched from
+    ``EVAL_JOB_NAMESPACE``, the namespace the Job was dispatched to.
+    Log fetch is best-effort: a failure there is logged as a warning
+    and the reason is built without the log tail.
+    """
+    parts: list[str] = [f"eval job {stage.lower()}"]
+    if status_message:
+        parts.append(status_message)
+    try:
+        # Same namespace as run_job; without it the lookup falls back
+        # to the token owner's default namespace.
+        tail = list(
+            fetch_job_logs(job_id=job_id, namespace=EVAL_JOB_NAMESPACE)
+        )[-JOB_LOG_TAIL_LINES:]
+        if tail:
+            parts.append("logs: " + " | ".join(tail))
+    except Exception as e:  # noqa: BLE001 - logs are best-effort
+        logger.warning("fetch_job_logs(%s) failed: %s", job_id, e)
+    return ": ".join(parts)[:FAILURE_REASON_MAX_CHARS]
+def _fetch_run_summary_from_report(submission_id: str) -> dict[str, Any]:
+    """Return the ``run_summary`` dict stored in ``reports/<id>.json``.
+    Downloads the report from the ``HF_SUBMISSIONS_REPO`` dataset with
+    ``force_download=True``, parses it as UTF-8 JSON, and returns the
+    value under ``run_summary``. Raises ``RuntimeError`` when that
+    value is absent or is not a dict; download and JSON errors
+    propagate unchanged.
+    """
+    report_name = f"{REPORTS_DIR}/{submission_id}.json"
+    local_path = hf_hub_download(
         repo_id=HF_SUBMISSIONS_REPO,
+        filename=report_name,
         repo_type="dataset",
+        force_download=True,
     )
+    report = json.loads(Path(local_path).read_text(encoding="utf-8"))
+    run_summary = report.get("run_summary")
+    if isinstance(run_summary, dict):
+        return run_summary
+    raise RuntimeError(
+        f"reports/{submission_id}.json missing or malformed "
+        f"`run_summary` block (got {type(run_summary).__name__})"
+    )
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None: