Michael Rabinovich committed on
Commit
8bfcf51
·
1 Parent(s): f1dcca9

stage eval shards through mounted bucket

Browse files
Files changed (1) hide show
  1. eval_job.py +38 -14
eval_job.py CHANGED
@@ -40,13 +40,15 @@ the Space's sharded submit path (UC3) to fan a large submission across
40
  several jobs. Steps 1-2 are identical, then the run dir is pruned to
41
  just this shard's fixtures, ``cadgenbench evaluate`` runs over that
42
  subset, and the resulting per-fixture dirs (``result.json`` + renders)
43
- are uploaded *verbatim* under ``reports/<id>/shards/<shard_id>/``. No
 
 
44
  report HTML, ``report.json``, or gallery render is produced per shard:
45
- the Space downloads every shard's fixture dirs, merges them into one
46
- run dir, and builds the single ``run_summary`` + report + gallery from
47
- the merged whole (mirroring the orchestrator's ``_merge_eval``). Exit 0
48
- on success; any failure exits non-zero and the Space marks that shard
49
- ERROR and retries it.
50
  """
51
  from __future__ import annotations
52
 
@@ -71,6 +73,8 @@ REPORT_TIMEOUT_SECONDS = 5 * 60
71
 
72
  REPORTS_DIR_IN_REPO = "reports"
73
  RENDERS_DIR_IN_REPO = "renders"
 
 
74
 
75
  # Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
76
  # per-fixture dirs in shard mode. The Space merges these and deletes the
@@ -280,15 +284,35 @@ def _upload_shard_artifacts(
280
  ) -> None:
281
  """Upload this shard's evaluated per-fixture dirs for the Space to merge.
282
 
283
- Pushes the pruned ``run_dir`` (each ``<fixture>/`` with its
284
- ``result.json`` + ``renders/`` + any overlay PNGs) verbatim to
285
- ``reports/<id>/shards/<shard_id>/`` in one commit. The Space
286
- downloads every shard's tree, copies the fixture dirs into a single
287
- merged run dir, and builds the aggregate ``run_summary`` + report +
288
- gallery from the whole. The per-shard ``run_summary.json`` written
289
- by ``cadgenbench evaluate`` rides along harmlessly; the merge
290
- recomputes it over the union and ignores the partials.
 
 
291
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  api = HfApi(token=token)
293
  path_in_repo = f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
294
  api.upload_folder(
 
40
  several jobs. Steps 1-2 are identical, then the run dir is pruned to
41
  just this shard's fixtures, ``cadgenbench evaluate`` runs over that
42
  subset, and the resulting per-fixture dirs (``result.json`` + renders)
43
+ are staged *verbatim*. If ``CADGENBENCH_SHARD_BUCKET_MOUNT`` is set, the
44
+ shard copies them into that mounted bucket; otherwise it uploads under
45
+ ``reports/<id>/shards/<shard_id>/`` in the submissions dataset. No
46
  report HTML, ``report.json``, or gallery render is produced per shard:
47
+ the Space reads every shard's fixture dirs, merges them into one run dir,
48
+ and builds the single ``run_summary`` + report + gallery from the merged
49
+ whole (mirroring the orchestrator's ``_merge_eval``). Exit 0 on success;
50
+ any failure exits non-zero and the Space marks that shard ERROR and
51
+ retries it.
52
  """
53
  from __future__ import annotations
54
 
 
73
 
74
  REPORTS_DIR_IN_REPO = "reports"
75
  RENDERS_DIR_IN_REPO = "renders"
76
+ SHARD_BUCKET_MOUNT_ENV = "CADGENBENCH_SHARD_BUCKET_MOUNT"
77
+ SHARD_BUCKET_PREFIX_ENV = "CADGENBENCH_SHARD_BUCKET_PREFIX"
78
 
79
  # Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
80
  # per-fixture dirs in shard mode. The Space merges these and deletes the
 
284
  ) -> None:
285
  """Upload this shard's evaluated per-fixture dirs for the Space to merge.
286
 
287
+ Persists the pruned ``run_dir`` (each ``<fixture>/`` with its
288
+ ``result.json`` + ``renders/`` + any overlay PNGs) verbatim. In
289
+ bucket mode, this is a filesystem copy into the mounted bucket; in
290
+ legacy mode, it is one dataset-repo commit under
291
+ ``reports/<id>/shards/<shard_id>/``. The Space reads every shard's
292
+ tree, copies the fixture dirs into a single merged run dir, and
293
+ builds the aggregate ``run_summary`` + report + gallery from the
294
+ whole. The per-shard ``run_summary.json`` written by
295
+ ``cadgenbench evaluate`` rides along harmlessly; the merge recomputes
296
+ it over the union and ignores the partials.
297
  """
298
+ bucket_mount = os.environ.get(SHARD_BUCKET_MOUNT_ENV)
299
+ if bucket_mount:
300
+ prefix = os.environ.get(SHARD_BUCKET_PREFIX_ENV, "submissions").strip("/")
301
+ dest = Path(bucket_mount) / prefix / submission_id / SHARDS_DIR_NAME / shard_id
302
+ if not Path(bucket_mount).is_dir():
303
+ raise RuntimeError(
304
+ f"{SHARD_BUCKET_MOUNT_ENV}={bucket_mount!r} is not a mounted directory."
305
+ )
306
+ if dest.exists():
307
+ shutil.rmtree(dest)
308
+ dest.parent.mkdir(parents=True, exist_ok=True)
309
+ shutil.copytree(run_dir, dest)
310
+ print(
311
+ f"[eval_job] staged shard {shard_id} -> {dest}",
312
+ flush=True,
313
+ )
314
+ return
315
+
316
  api = HfApi(token=token)
317
  path_in_repo = f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
318
  api.upload_folder(