Michael Rabinovich Cursor committed on
Commit
b2e9d3a
·
1 Parent(s): cff8663

eval_job: add shard mode (--fixtures/--shard-id) for sharded submission eval

Browse files

When --fixtures is set, prune the run dir to that shard's slice, evaluate
only it, and upload the per-fixture dirs to reports/<id>/shards/<shard_id>/
for the Space to merge. No --fixtures keeps the original whole-submission
path (report.json + html + gallery) unchanged.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show
  1. eval_job.py +151 -3
eval_job.py CHANGED
@@ -12,9 +12,12 @@ Invoked by the leaderboard Space's worker (see
12
  --secrets HF_TOKEN \\
13
  python /opt/eval_job.py <submission_id> <zip_url>
14
 
15
- Pipeline, in order. Synchronous, no fallbacks. Any failure raises
16
- and the container exits non-zero; the Space's poller catches the
17
- ERROR stage and flips the submission row to ``failed``.
 
 
 
18
 
19
  1. Download ``submissions/<id>.zip`` from the submissions dataset
20
  via ``hf_hub_download`` (auth via ``HF_TOKEN``).
@@ -31,6 +34,19 @@ ERROR stage and flips the submission row to ``failed``.
31
 
32
  The Space-side worker then downloads ``reports/<id>.json``, reads
33
  ``run_summary`` out of it, and flips the row to ``completed``.
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
  from __future__ import annotations
36
 
@@ -56,6 +72,11 @@ REPORT_TIMEOUT_SECONDS = 5 * 60
56
  REPORTS_DIR_IN_REPO = "reports"
57
  RENDERS_DIR_IN_REPO = "renders"
58
 
 
 
 
 
 
59
  # Single canonical view uploaded per fixture for the leaderboard
60
  # gallery thumbnail. "iso" matches the GT render the gallery pairs it
61
  # with, so the gallery columns stay a comparable matrix at one fixed
@@ -79,6 +100,25 @@ def main() -> int:
79
  "(submission_blob_url from the row)."
80
  ),
81
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  args = parser.parse_args()
83
 
84
  submission_id: str = args.submission_id
@@ -88,6 +128,28 @@ def main() -> int:
88
  submissions_repo = _require_env("HF_SUBMISSIONS_REPO")
89
  worker_count = int(os.environ.get("EVAL_WORKER_COUNT", "8"))
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  print(
92
  f"[eval_job] submission_id={submission_id} "
93
  f"workers={worker_count} repo={submissions_repo}",
@@ -107,6 +169,28 @@ def main() -> int:
107
  return 0
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def _require_env(name: str) -> str:
111
  """Return env var *name* or raise with a clear message."""
112
  value = os.environ.get(name)
@@ -157,6 +241,70 @@ def _prepare_run_dir(
157
  print(f"[eval_job] unpacked into {RUN_DIR}", flush=True)
158
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  def _run_eval(run_dir: Path, workers: int) -> None:
161
  """Invoke ``cadgenbench evaluate`` over *run_dir*; raise on non-zero."""
162
  cmd = [
 
12
  --secrets HF_TOKEN \\
13
  python /opt/eval_job.py <submission_id> <zip_url>
14
 
15
+ Two run modes:
16
+
17
+ **Whole-submission (default, no ``--fixtures``)** -- the original path.
18
+ Synchronous, no fallbacks. Any failure raises and the container exits
19
+ non-zero; the Space's poller catches the ERROR stage and flips the
20
+ submission row to ``failed``.
21
 
22
  1. Download ``submissions/<id>.zip`` from the submissions dataset
23
  via ``hf_hub_download`` (auth via ``HF_TOKEN``).
 
34
 
35
  The Space-side worker then downloads ``reports/<id>.json``, reads
36
  ``run_summary`` out of it, and flips the row to ``completed``.
37
+
38
+ **Shard (``--fixtures f1,f2,... --shard-id shard_000``)** -- used by
39
+ the Space's sharded submit path (UC3) to fan a large submission across
40
+ several jobs. Steps 1-2 are identical, then the run dir is pruned to
41
+ just this shard's fixtures, ``cadgenbench evaluate`` runs over that
42
+ subset, and the resulting per-fixture dirs (``result.json`` + renders)
43
+ are uploaded *verbatim* under ``reports/<id>/shards/<shard_id>/``. No
44
+ report HTML, ``report.json``, or gallery render is produced per shard:
45
+ the Space downloads every shard's fixture dirs, merges them into one
46
+ run dir, and builds the single ``run_summary`` + report + gallery from
47
+ the merged whole (mirroring the orchestrator's ``_merge_eval``). Exit 0
48
+ on success; any failure exits non-zero and the Space marks that shard
49
+ ERROR and retries it.
50
  """
51
  from __future__ import annotations
52
 
 
72
  REPORTS_DIR_IN_REPO = "reports"
73
  RENDERS_DIR_IN_REPO = "renders"
74
 
75
+ # Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
76
+ # per-fixture dirs in shard mode. The Space merges these and deletes the
77
+ # whole ``shards/`` tree after a successful merge.
78
+ SHARDS_DIR_NAME = "shards"
79
+
80
  # Single canonical view uploaded per fixture for the leaderboard
81
  # gallery thumbnail. "iso" matches the GT render the gallery pairs it
82
  # with, so the gallery columns stay a comparable matrix at one fixed
 
100
  "(submission_blob_url from the row)."
101
  ),
102
  )
103
+ parser.add_argument(
104
+ "--fixtures",
105
+ default=None,
106
+ help=(
107
+ "Comma-separated fixture subset for shard mode. When set, the "
108
+ "run dir is pruned to just these fixtures, evaluated, and the "
109
+ "per-fixture dirs are uploaded under "
110
+ "reports/<id>/shards/<shard-id>/ for the Space to merge. "
111
+ "Omit for the original whole-submission path."
112
+ ),
113
+ )
114
+ parser.add_argument(
115
+ "--shard-id",
116
+ default=None,
117
+ help=(
118
+ "Shard label (e.g. shard_000) naming this shard's upload prefix. "
119
+ "Required when --fixtures is set."
120
+ ),
121
+ )
122
  args = parser.parse_args()
123
 
124
  submission_id: str = args.submission_id
 
128
  submissions_repo = _require_env("HF_SUBMISSIONS_REPO")
129
  worker_count = int(os.environ.get("EVAL_WORKER_COUNT", "8"))
130
 
131
+ shard_fixtures = _parse_fixtures_arg(args.fixtures)
132
+ if shard_fixtures is not None:
133
+ if not args.shard_id:
134
+ raise RuntimeError("--shard-id is required when --fixtures is set.")
135
+ print(
136
+ f"[eval_job] submission_id={submission_id} shard={args.shard_id} "
137
+ f"fixtures={len(shard_fixtures)} workers={worker_count} "
138
+ f"repo={submissions_repo}",
139
+ flush=True,
140
+ )
141
+ _prepare_run_dir(submission_id, zip_url, submissions_repo, token)
142
+ _prune_run_dir(RUN_DIR, shard_fixtures)
143
+ _run_eval(RUN_DIR, worker_count)
144
+ _upload_shard_artifacts(
145
+ submission_id, args.shard_id, RUN_DIR, submissions_repo, token,
146
+ )
147
+ print(
148
+ f"[eval_job] done: {submission_id} shard={args.shard_id}",
149
+ flush=True,
150
+ )
151
+ return 0
152
+
153
  print(
154
  f"[eval_job] submission_id={submission_id} "
155
  f"workers={worker_count} repo={submissions_repo}",
 
169
  return 0
170
 
171
 
172
def _parse_fixtures_arg(raw: str | None) -> list[str] | None:
    """Turn the ``--fixtures`` CSV value into an ordered, unique name list.

    Returns ``None`` when *raw* is ``None`` (flag not given), which the
    caller treats as the whole-submission path. Otherwise every
    comma-separated entry is stripped of surrounding whitespace, blank
    entries are skipped, and repeated names collapse onto their first
    occurrence so the original order is kept.

    Raises ``RuntimeError`` when the flag was given but no names survive
    the cleanup: an empty shard is a usage error, not a no-op.
    """
    if raw is None:
        return None
    stripped = (piece.strip() for piece in raw.split(","))
    # Dict keys keep insertion order, so this dedupes without reordering.
    unique = list(dict.fromkeys(token for token in stripped if token))
    if not unique:
        raise RuntimeError("--fixtures was set but resolved to no fixture names.")
    return unique
192
+
193
+
194
  def _require_env(name: str) -> str:
195
  """Return env var *name* or raise with a clear message."""
196
  value = os.environ.get(name)
 
241
  print(f"[eval_job] unpacked into {RUN_DIR}", flush=True)
242
 
243
 
244
def _prune_run_dir(run_dir: Path, fixtures: list[str]) -> None:
    """Delete every sub-directory of *run_dir* whose name is not in *fixtures*.

    Shard mode unpacks the full submission zip but must evaluate only this
    shard's slice, so the other fixture directories are removed before the
    evaluator walks the tree. Entries at the root that are not directories
    (e.g. ``meta.json``) are never touched.

    Raises ``RuntimeError`` -- before anything is deleted -- when a requested
    fixture has no directory under *run_dir*; the message lists the missing
    names in sorted order.
    """
    keep = set(fixtures)
    subdirs = [entry for entry in run_dir.iterdir() if entry.is_dir()]

    # Validate first so a bad shard spec leaves the run dir intact.
    absent = keep.difference(entry.name for entry in subdirs)
    if absent:
        raise RuntimeError(
            "Shard fixtures missing from submission zip: "
            + ", ".join(sorted(absent))
        )

    doomed = [entry for entry in subdirs if entry.name not in keep]
    for entry in doomed:
        shutil.rmtree(entry)

    print(
        f"[eval_job] pruned run dir to {len(keep)} shard fixture(s) "
        f"(removed {len(doomed)})",
        flush=True,
    )
273
+
274
+
275
def _upload_shard_artifacts(
    submission_id: str,
    shard_id: str,
    run_dir: Path,
    submissions_repo: str,
    token: str,
) -> None:
    """Push the evaluated shard's run directory to the submissions dataset.

    The whole of *run_dir* is uploaded as-is via ``HfApi.upload_folder`` to
    ``reports/<submission_id>/shards/<shard_id>/`` in the dataset repo
    *submissions_repo*, authenticated with *token*. Nothing is filtered
    here, so any root-level files left in the run dir (for example a
    per-shard summary written by the evaluator) are uploaded too.

    The merge of the shards into one report happens outside this function;
    it only performs the upload and logs the destination prefix.
    """
    destination = (
        f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
    )
    hub = HfApi(token=token)
    hub.upload_folder(
        repo_id=submissions_repo,
        repo_type="dataset",
        folder_path=str(run_dir),
        path_in_repo=destination,
        commit_message=f"add eval shard {shard_id} for {submission_id}",
    )
    print(f"[eval_job] uploaded shard {shard_id} -> {destination}", flush=True)
306
+
307
+
308
  def _run_eval(run_dir: Path, workers: int) -> None:
309
  """Invoke ``cadgenbench evaluate`` over *run_dir*; raise on non-zero."""
310
  cmd = [