Michael Rabinovich committed on
Commit ·
8bfcf51
1
Parent(s): f1dcca9
stage eval shards through mounted bucket
Browse files
- eval_job.py +38 -14
eval_job.py
CHANGED
|
@@ -40,13 +40,15 @@ the Space's sharded submit path (UC3) to fan a large submission across
|
|
| 40 |
several jobs. Steps 1-2 are identical, then the run dir is pruned to
|
| 41 |
just this shard's fixtures, ``cadgenbench evaluate`` runs over that
|
| 42 |
subset, and the resulting per-fixture dirs (``result.json`` + renders)
|
| 43 |
-
are
|
|
|
|
|
|
|
| 44 |
report HTML, ``report.json``, or gallery render is produced per shard:
|
| 45 |
-
the Space
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
"""
|
| 51 |
from __future__ import annotations
|
| 52 |
|
|
@@ -71,6 +73,8 @@ REPORT_TIMEOUT_SECONDS = 5 * 60
|
|
| 71 |
|
| 72 |
REPORTS_DIR_IN_REPO = "reports"
|
| 73 |
RENDERS_DIR_IN_REPO = "renders"
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
|
| 76 |
# per-fixture dirs in shard mode. The Space merges these and deletes the
|
|
@@ -280,15 +284,35 @@ def _upload_shard_artifacts(
|
|
| 280 |
) -> None:
|
| 281 |
"""Upload this shard's evaluated per-fixture dirs for the Space to merge.
|
| 282 |
|
| 283 |
-
|
| 284 |
-
``result.json`` + ``renders/`` + any overlay PNGs) verbatim
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
|
|
|
|
|
|
| 291 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
api = HfApi(token=token)
|
| 293 |
path_in_repo = f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
|
| 294 |
api.upload_folder(
|
|
|
|
| 40 |
several jobs. Steps 1-2 are identical, then the run dir is pruned to
|
| 41 |
just this shard's fixtures, ``cadgenbench evaluate`` runs over that
|
| 42 |
subset, and the resulting per-fixture dirs (``result.json`` + renders)
|
| 43 |
+
are staged *verbatim*. If ``CADGENBENCH_SHARD_BUCKET_MOUNT`` is set, the
|
| 44 |
+
shard copies them into that mounted bucket; otherwise it uploads under
|
| 45 |
+
``reports/<id>/shards/<shard_id>/`` in the submissions dataset. No
|
| 46 |
report HTML, ``report.json``, or gallery render is produced per shard:
|
| 47 |
+
the Space reads every shard's fixture dirs, merges them into one run dir,
|
| 48 |
+
and builds the single ``run_summary`` + report + gallery from the merged
|
| 49 |
+
whole (mirroring the orchestrator's ``_merge_eval``). Exit 0 on success;
|
| 50 |
+
any failure exits non-zero and the Space marks that shard ERROR and
|
| 51 |
+
retries it.
|
| 52 |
"""
|
| 53 |
from __future__ import annotations
|
| 54 |
|
|
|
|
| 73 |
|
| 74 |
REPORTS_DIR_IN_REPO = "reports"
|
| 75 |
RENDERS_DIR_IN_REPO = "renders"
|
| 76 |
+
SHARD_BUCKET_MOUNT_ENV = "CADGENBENCH_SHARD_BUCKET_MOUNT"
|
| 77 |
+
SHARD_BUCKET_PREFIX_ENV = "CADGENBENCH_SHARD_BUCKET_PREFIX"
|
| 78 |
|
| 79 |
# Sub-prefix under ``reports/<id>/`` where each shard uploads its raw
|
| 80 |
# per-fixture dirs in shard mode. The Space merges these and deletes the
|
|
|
|
| 284 |
) -> None:
|
| 285 |
"""Upload this shard's evaluated per-fixture dirs for the Space to merge.
|
| 286 |
|
| 287 |
+
Persists the pruned ``run_dir`` (each ``<fixture>/`` with its
|
| 288 |
+
``result.json`` + ``renders/`` + any overlay PNGs) verbatim. In
|
| 289 |
+
bucket mode, this is a filesystem copy into the mounted bucket; in
|
| 290 |
+
legacy mode, it is one dataset-repo commit under
|
| 291 |
+
``reports/<id>/shards/<shard_id>/``. The Space reads every shard's
|
| 292 |
+
tree, copies the fixture dirs into a single merged run dir, and
|
| 293 |
+
builds the aggregate ``run_summary`` + report + gallery from the
|
| 294 |
+
whole. The per-shard ``run_summary.json`` written by
|
| 295 |
+
``cadgenbench evaluate`` rides along harmlessly; the merge recomputes
|
| 296 |
+
it over the union and ignores the partials.
|
| 297 |
"""
|
| 298 |
+
bucket_mount = os.environ.get(SHARD_BUCKET_MOUNT_ENV)
|
| 299 |
+
if bucket_mount:
|
| 300 |
+
prefix = os.environ.get(SHARD_BUCKET_PREFIX_ENV, "submissions").strip("/")
|
| 301 |
+
dest = Path(bucket_mount) / prefix / submission_id / SHARDS_DIR_NAME / shard_id
|
| 302 |
+
if not Path(bucket_mount).is_dir():
|
| 303 |
+
raise RuntimeError(
|
| 304 |
+
f"{SHARD_BUCKET_MOUNT_ENV}={bucket_mount!r} is not a mounted directory."
|
| 305 |
+
)
|
| 306 |
+
if dest.exists():
|
| 307 |
+
shutil.rmtree(dest)
|
| 308 |
+
dest.parent.mkdir(parents=True, exist_ok=True)
|
| 309 |
+
shutil.copytree(run_dir, dest)
|
| 310 |
+
print(
|
| 311 |
+
f"[eval_job] staged shard {shard_id} -> {dest}",
|
| 312 |
+
flush=True,
|
| 313 |
+
)
|
| 314 |
+
return
|
| 315 |
+
|
| 316 |
api = HfApi(token=token)
|
| 317 |
path_in_repo = f"{REPORTS_DIR_IN_REPO}/{submission_id}/{SHARDS_DIR_NAME}/{shard_id}"
|
| 318 |
api.upload_folder(
|