Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

Michael Rabinovich committed on 1 day ago

Commit

920b1b4

1 Parent(s): dd284f3

stage shards via bucket API instead of volume mount

Avoids Space volume-mount issues on an existing Space: shard jobs sync
artifacts to the HF bucket via the bucket API, and the Space syncs them
back down to merge. Gated on CADGENBENCH_SHARD_BUCKET.

Files changed (3) hide show

requirements.txt +5 -4
submit.py +52 -48
tests/test_submit.py +42 -24

requirements.txt CHANGED Viewed

@@ -12,9 +12,10 @@
 gradio[oauth]==5.50.0
 gradio-leaderboard==0.0.14
 pandas>=2.0
-# huggingface_hub >=1.8 for the Jobs Python API plus bucket volume
-# mounts. Used by submit.py to dispatch + poll per-submission GPU evals
-# and stage sharded artifacts through HF Buckets.
-huggingface_hub>=1.8.0
 datasets>=3.0
 requests>=2.31

 gradio[oauth]==5.50.0
 gradio-leaderboard==0.0.14
 pandas>=2.0
+# huggingface_hub >=1.16 for the Jobs Python API plus the bucket API
+# (sync_bucket / list_bucket_tree / batch_bucket_files). Used by submit.py
+# to dispatch + poll per-submission GPU evals and to stage/merge sharded
+# artifacts through an HF Storage Bucket (no volume mounts).
+huggingface_hub>=1.16.0
 datasets>=3.0
 requests>=2.31

submit.py CHANGED Viewed

@@ -115,11 +115,6 @@ from huggingface_hub import (
 )
 from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError
-try:
-    from huggingface_hub import Volume
-except ImportError:  # pragma: no cover - exercised only on old deploy images
-    Volume = None  # type: ignore[assignment]
 import progress
 from leaderboard import HF_DATA_REPO, HF_ORG, HF_SUBMISSIONS_REPO
@@ -204,22 +199,23 @@ JOB_POLL_MAX_CONSECUTIVE_ERRORS = 5
 # fixtures fans out across several jobs of SHARD_CHUNK_SIZE fixtures
 # each, dispatched all at once (HF queues any overflow past the
 # account's ~8 concurrent slots; queueing is a speed variable, never a
-# failure). Each shard stages its per-fixture dirs into a mounted bucket
-# when CADGENBENCH_SHARD_BUCKET is set, or under
-# ``reports/<id>/shards/<shard_id>/`` in the submissions dataset
-# otherwise; the Space merges them into one run dir, recomputes the
-# aggregate run_summary + report + gallery, and deletes the shards tree.
-# Eval is CPU-bound (tessellation + Manifold booleans), so more machines
-# is the throughput lever. At/under the threshold a submission stays a
-# single job (the original path), so the extra dispatch/merge machinery
-# only kicks in when it pays off.
 SHARD_THRESHOLD = 12
 SHARD_CHUNK_SIZE = 12
 SHARDS_SUBDIR = "shards"
 SHARD_BUCKET = os.getenv("CADGENBENCH_SHARD_BUCKET", "").strip()
-SHARD_BUCKET_MOUNT = os.getenv(
-    "CADGENBENCH_SHARD_BUCKET_MOUNT", "/mnt/cadgenbench-shards",
-).strip()
 SHARD_BUCKET_PREFIX = os.getenv(
     "CADGENBENCH_SHARD_BUCKET_PREFIX", SUBMISSIONS_DIR,
 ).strip("/")
@@ -270,32 +266,30 @@ def _retry_after_seconds(error: HfHubHTTPError) -> float | None:
 def _shard_bucket_enabled() -> bool:
-    """Whether shard scratch should be staged through a mounted bucket."""
     return bool(SHARD_BUCKET)
-def _shard_bucket_source() -> str:
-    """Return the bucket id accepted by ``huggingface_hub.Volume``."""
     source = SHARD_BUCKET
     if source.startswith("hf://buckets/"):
         source = source[len("hf://buckets/"):]
     return source.rstrip("/")
-def _shard_bucket_relative_root(submission_id: str) -> Path:
-    """Relative bucket path containing one directory per shard."""
     parts = [p for p in SHARD_BUCKET_PREFIX.split("/") if p]
-    return Path(*parts, submission_id, SHARDS_SUBDIR)
-def _shard_bucket_root(submission_id: str) -> Path:
-    """Mounted bucket path containing staged shard artifacts."""
-    if not SHARD_BUCKET_MOUNT:
-        raise RuntimeError(
-            "CADGENBENCH_SHARD_BUCKET is set but "
-            "CADGENBENCH_SHARD_BUCKET_MOUNT is empty."
-        )
-    return Path(SHARD_BUCKET_MOUNT) / _shard_bucket_relative_root(submission_id)
 def _jobs_token() -> str | None:
@@ -1186,26 +1180,15 @@ def _dispatch_eval_command(
         value = os.environ.get(key)
         if value:
             env[key] = value
-    run_kwargs: dict[str, Any] = {}
     if _shard_bucket_enabled() and "--shard-id" in extra_args:
-        if Volume is None:
-            raise RuntimeError(
-                "CADGENBENCH_SHARD_BUCKET requires huggingface_hub>=1.8.0 "
-                "for HF Jobs volume mounts."
-            )
         env.update(
             {
-                "CADGENBENCH_SHARD_BUCKET_MOUNT": SHARD_BUCKET_MOUNT,
                 "CADGENBENCH_SHARD_BUCKET_PREFIX": SHARD_BUCKET_PREFIX,
             }
         )
-        run_kwargs["volumes"] = [
-            Volume(
-                type="bucket",
-                source=_shard_bucket_source(),
-                mount_path=SHARD_BUCKET_MOUNT,
-            )
-        ]
     job = run_job(
         image=f"hf.co/spaces/{EVAL_GPU_SPACE}",
         command=[
@@ -1218,7 +1201,6 @@ def _dispatch_eval_command(
         secrets={"HF_TOKEN": token},
         timeout=EVAL_JOB_TIMEOUT,
         token=token,
-        **run_kwargs,
     )
     return job.id
@@ -1492,7 +1474,13 @@ def _merge_shards_and_publish(
     tmp = Path(tempfile.mkdtemp(prefix=f"cgb-merge-{submission_id}-"))
     try:
         if _shard_bucket_enabled():
-            shards_root = _shard_bucket_root(submission_id)
         else:
             download_root = Path(
                 snapshot_download(
@@ -1657,7 +1645,7 @@ def _cleanup_shard_artifacts(submission_id: str) -> None:
     """
     try:
         if _shard_bucket_enabled():
-            shutil.rmtree(_shard_bucket_root(submission_id), ignore_errors=True)
         else:
             _with_hub_retries(
                 lambda: _HF_API.delete_folder(
@@ -1676,6 +1664,22 @@ def _cleanup_shard_artifacts(submission_id: str) -> None:
         )
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
     """Merge ``run_summary.json`` fields into the pending row."""
     updates: dict[str, Any] = {

 )
 from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError
 import progress
 from leaderboard import HF_DATA_REPO, HF_ORG, HF_SUBMISSIONS_REPO
 # fixtures fans out across several jobs of SHARD_CHUNK_SIZE fixtures
 # each, dispatched all at once (HF queues any overflow past the
 # account's ~8 concurrent slots; queueing is a speed variable, never a
+# failure). When CADGENBENCH_SHARD_BUCKET is set, each shard job syncs its
+# per-fixture dirs into that HF Storage Bucket (via the bucket API, no
+# volume mount) and the Space syncs them back down to merge; otherwise the
+# shard uploads under ``reports/<id>/shards/<shard_id>/`` in the
+# submissions dataset. The bucket path avoids the dataset commit-queue
+# 429s that strand concurrent shard commits. The Space merges into one run
+# dir, recomputes the aggregate run_summary + report + gallery, then
+# deletes the staged shards. Eval is CPU-bound (tessellation + Manifold
+# booleans), so more machines is the throughput lever. At/under the
+# threshold a submission stays a single job (the original path), so the
+# extra dispatch/merge machinery only kicks in when it pays off.
 SHARD_THRESHOLD = 12
 SHARD_CHUNK_SIZE = 12
 SHARDS_SUBDIR = "shards"
+# Bucket id (``namespace/bucket-name``, with or without an ``hf://buckets/``
+# prefix). Empty disables bucket staging and keeps the dataset-repo path.
 SHARD_BUCKET = os.getenv("CADGENBENCH_SHARD_BUCKET", "").strip()
 SHARD_BUCKET_PREFIX = os.getenv(
     "CADGENBENCH_SHARD_BUCKET_PREFIX", SUBMISSIONS_DIR,
 ).strip("/")
 def _shard_bucket_enabled() -> bool:
+    """Whether shard scratch should be staged through an HF bucket."""
     return bool(SHARD_BUCKET)
+def _shard_bucket_id() -> str:
+    """Return the bucket id (``namespace/bucket-name``), prefix stripped."""
     source = SHARD_BUCKET
     if source.startswith("hf://buckets/"):
         source = source[len("hf://buckets/"):]
     return source.rstrip("/")
+def _shard_bucket_prefix_path(submission_id: str) -> str:
+    """Bucket-relative path holding one directory per shard for *submission_id*."""
     parts = [p for p in SHARD_BUCKET_PREFIX.split("/") if p]
+    return "/".join([*parts, submission_id, SHARDS_SUBDIR])
+def _shard_bucket_uri(submission_id: str) -> str:
+    """``hf://buckets/...`` URI of the shards tree for *submission_id*."""
+    return (
+        f"hf://buckets/{_shard_bucket_id()}/"
+        f"{_shard_bucket_prefix_path(submission_id)}"
+    )
 def _jobs_token() -> str | None:
         value = os.environ.get(key)
         if value:
             env[key] = value
     if _shard_bucket_enabled() and "--shard-id" in extra_args:
+        # The shard job syncs its artifacts straight to the bucket via the
+        # bucket API (it already has HF_TOKEN); no volume mount is involved.
         env.update(
             {
+                "CADGENBENCH_SHARD_BUCKET": _shard_bucket_id(),
                 "CADGENBENCH_SHARD_BUCKET_PREFIX": SHARD_BUCKET_PREFIX,
             }
         )
     job = run_job(
         image=f"hf.co/spaces/{EVAL_GPU_SPACE}",
         command=[
         secrets={"HF_TOKEN": token},
         timeout=EVAL_JOB_TIMEOUT,
         token=token,
     )
     return job.id
     tmp = Path(tempfile.mkdtemp(prefix=f"cgb-merge-{submission_id}-"))
     try:
         if _shard_bucket_enabled():
+            shards_root = tmp / "dl"
+            shards_root.mkdir(parents=True, exist_ok=True)
+            _HF_API.sync_bucket(
+                source=_shard_bucket_uri(submission_id),
+                dest=str(shards_root),
+                token=_jobs_token(),
+            )
         else:
             download_root = Path(
                 snapshot_download(
     """
     try:
         if _shard_bucket_enabled():
+            _delete_shard_bucket_prefix(submission_id)
         else:
             _with_hub_retries(
                 lambda: _HF_API.delete_folder(
         )
+def _delete_shard_bucket_prefix(submission_id: str) -> None:
+    """Remove every staged file under the submission's bucket shards prefix."""
+    bucket_id = _shard_bucket_id()
+    prefix = _shard_bucket_prefix_path(submission_id)
+    token = _jobs_token()
+    files = [
+        item.path
+        for item in _HF_API.list_bucket_tree(
+            bucket_id, prefix=prefix, recursive=True, token=token,
+        )
+        if not getattr(item, "is_folder", False) and getattr(item, "path", None)
+    ]
+    if files:
+        _HF_API.batch_bucket_files(bucket_id, delete=files, token=token)
 def _flip_row_to_completed(submission_id: str, summary: dict[str, Any]) -> None:
     """Merge ``run_summary.json`` fields into the pending row."""
     updates: dict[str, Any] = {

tests/test_submit.py CHANGED Viewed

@@ -105,14 +105,10 @@ def test_retry_after_header_is_honored(monkeypatch):
     assert slept and slept[0] >= 7.0
-def test_dispatch_shard_mounts_configured_bucket(monkeypatch):
-    """Bucket-configured shard jobs get a read/write bucket volume."""
     captured: dict = {}
-    class FakeVolume:
-        def __init__(self, **kwargs):
-            self.kwargs = kwargs
     def fake_run_job(**kwargs):
         captured.update(kwargs)
         return SimpleNamespace(id="job-123")
@@ -122,9 +118,7 @@ def test_dispatch_shard_mounts_configured_bucket(monkeypatch):
         submit, "SHARD_BUCKET",
         "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
     )
-    monkeypatch.setattr(submit, "SHARD_BUCKET_MOUNT", "/mnt/cgb-shards")
     monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
-    monkeypatch.setattr(submit, "Volume", FakeVolume)
     monkeypatch.setattr(submit, "run_job", fake_run_job)
     job_id = submit._dispatch_eval_command(
@@ -133,17 +127,15 @@ def test_dispatch_shard_mounts_configured_bucket(monkeypatch):
     )
     assert job_id == "job-123"
-    assert captured["env"]["CADGENBENCH_SHARD_BUCKET_MOUNT"] == "/mnt/cgb-shards"
     assert captured["env"]["CADGENBENCH_SHARD_BUCKET_PREFIX"] == "submissions"
-    volume = captured["volumes"][0]
-    assert volume.kwargs == {
-        "type": "bucket",
-        "source": "HuggingAI4Engineering/cadgenbench-eval-staging",
-        "mount_path": "/mnt/cgb-shards",
-    }
-def test_dispatch_whole_submission_does_not_mount_bucket(monkeypatch):
     """Configured bucket staging is only for sharded eval jobs."""
     captured: dict = {}
@@ -161,11 +153,27 @@ def test_dispatch_whole_submission_does_not_mount_bucket(monkeypatch):
     assert job_id == "job-456"
     assert "volumes" not in captured
-    assert "CADGENBENCH_SHARD_BUCKET_MOUNT" not in captured["env"]
-def test_eval_job_stages_shard_to_mounted_bucket(tmp_path: Path, monkeypatch):
-    """In bucket mode the eval job copies shard outputs to the mount."""
     eval_job_path = (
         Path(__file__).resolve().parents[2]
         / "cadgenbench-eval-gpu"
@@ -180,18 +188,28 @@ def test_eval_job_stages_shard_to_mounted_bucket(tmp_path: Path, monkeypatch):
     fixture_dir = run_dir / "101"
     fixture_dir.mkdir(parents=True)
     (fixture_dir / "result.json").write_text("{}", encoding="utf-8")
-    bucket_mount = tmp_path / "bucket"
-    bucket_mount.mkdir()
-    monkeypatch.setenv(eval_job.SHARD_BUCKET_MOUNT_ENV, str(bucket_mount))
     monkeypatch.setenv(eval_job.SHARD_BUCKET_PREFIX_ENV, "submissions")
     eval_job._upload_shard_artifacts(
         "sub-1", "shard_000", run_dir, "ignored/submissions", "ignored-token",
     )
-    staged = bucket_mount / "submissions" / "sub-1" / "shards" / "shard_000"
-    assert (staged / "101" / "result.json").read_text(encoding="utf-8") == "{}"
 def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):

     assert slept and slept[0] >= 7.0
+def test_dispatch_shard_passes_bucket_env(monkeypatch):
+    """Bucket-configured shard jobs get the bucket env, no volume mount."""
     captured: dict = {}
     def fake_run_job(**kwargs):
         captured.update(kwargs)
         return SimpleNamespace(id="job-123")
         submit, "SHARD_BUCKET",
         "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
     )
     monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
     monkeypatch.setattr(submit, "run_job", fake_run_job)
     job_id = submit._dispatch_eval_command(
     )
     assert job_id == "job-123"
+    assert captured["env"]["CADGENBENCH_SHARD_BUCKET"] == (
+        "HuggingAI4Engineering/cadgenbench-eval-staging"
+    )
     assert captured["env"]["CADGENBENCH_SHARD_BUCKET_PREFIX"] == "submissions"
+    # Mount-free: no volume is attached to the job.
+    assert "volumes" not in captured
+def test_dispatch_whole_submission_no_bucket_env(monkeypatch):
     """Configured bucket staging is only for sharded eval jobs."""
     captured: dict = {}
     assert job_id == "job-456"
     assert "volumes" not in captured
+    assert "CADGENBENCH_SHARD_BUCKET" not in captured["env"]
+def test_shard_bucket_uri_built_from_id_and_prefix(monkeypatch):
+    """The bucket URI strips any hf:// prefix and nests submission/shards."""
+    monkeypatch.setattr(
+        submit, "SHARD_BUCKET",
+        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
+    )
+    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
+    assert submit._shard_bucket_id() == (
+        "HuggingAI4Engineering/cadgenbench-eval-staging"
+    )
+    assert submit._shard_bucket_uri("sub-1") == (
+        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
+        "submissions/sub-1/shards"
+    )
+def test_eval_job_syncs_shard_to_bucket(tmp_path: Path, monkeypatch):
+    """In bucket mode the eval job syncs shard outputs to the bucket URI."""
     eval_job_path = (
         Path(__file__).resolve().parents[2]
         / "cadgenbench-eval-gpu"
     fixture_dir = run_dir / "101"
     fixture_dir.mkdir(parents=True)
     (fixture_dir / "result.json").write_text("{}", encoding="utf-8")
+    captured: dict = {}
+    def fake_sync_bucket(self, *, source, dest, token=None):
+        captured.update(source=source, dest=dest)
+    monkeypatch.setattr(eval_job.HfApi, "sync_bucket", fake_sync_bucket)
+    monkeypatch.setenv(
+        eval_job.SHARD_BUCKET_ENV,
+        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
+    )
     monkeypatch.setenv(eval_job.SHARD_BUCKET_PREFIX_ENV, "submissions")
     eval_job._upload_shard_artifacts(
         "sub-1", "shard_000", run_dir, "ignored/submissions", "ignored-token",
     )
+    assert captured["source"] == str(run_dir)
+    assert captured["dest"] == (
+        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
+        "submissions/sub-1/shards/shard_000"
+    )
 def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):