Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

App Files Files Community

Michael Rabinovich committed 1 day ago

Commit

dd284f3

1 Parent(s): 5fb3ebc

fix HF Jobs polling namespace

Browse files

Files changed (2) hide show

submit.py +22 -3
tests/test_submit.py +70 -0

submit.py CHANGED Viewed

@@ -298,6 +298,11 @@ def _shard_bucket_root(submission_id: str) -> Path:
     return Path(SHARD_BUCKET_MOUNT) / _shard_bucket_relative_root(submission_id)
 def _with_hub_retries(fn, *, what: str):
     """Run *fn* (a Hub commit) retrying transient HTTP errors with backoff.
@@ -1287,7 +1292,11 @@ def _poll_shards_until_done(
         for shard_id in running:
             st = shards[shard_id]
             try:
-                info = inspect_job(job_id=st["job_id"])
                 consecutive_errors = 0
             except Exception as e:  # noqa: BLE001 - retry transient API errors
                 consecutive_errors += 1
@@ -1359,7 +1368,11 @@ def _poll_until_done(
     last_stage: str | None = None
     while True:
         try:
-            info = inspect_job(job_id=job_id)
             consecutive_errors = 0
         except Exception as e:  # noqa: BLE001 - retry transient API errors
             consecutive_errors += 1
@@ -1408,7 +1421,13 @@ def _job_failure_reason(
     if status_message:
         parts.append(status_message)
     try:
-        tail = list(fetch_job_logs(job_id=job_id))[-JOB_LOG_TAIL_LINES:]
         if tail:
             parts.append("logs: " + " | ".join(tail))
     except Exception as e:  # noqa: BLE001 - logs are best-effort

     return Path(SHARD_BUCKET_MOUNT) / _shard_bucket_relative_root(submission_id)
+def _jobs_token() -> str | None:
+    """Token used for HF Jobs control-plane calls."""
+    return os.environ.get("HF_TOKEN")
 def _with_hub_retries(fn, *, what: str):
     """Run *fn* (a Hub commit) retrying transient HTTP errors with backoff.
         for shard_id in running:
             st = shards[shard_id]
             try:
+                info = inspect_job(
+                    job_id=st["job_id"],
+                    namespace=EVAL_JOB_NAMESPACE,
+                    token=_jobs_token(),
+                )
                 consecutive_errors = 0
             except Exception as e:  # noqa: BLE001 - retry transient API errors
                 consecutive_errors += 1
     last_stage: str | None = None
     while True:
         try:
+            info = inspect_job(
+                job_id=job_id,
+                namespace=EVAL_JOB_NAMESPACE,
+                token=_jobs_token(),
+            )
             consecutive_errors = 0
         except Exception as e:  # noqa: BLE001 - retry transient API errors
             consecutive_errors += 1
     if status_message:
         parts.append(status_message)
     try:
+        tail = list(
+            fetch_job_logs(
+                job_id=job_id,
+                namespace=EVAL_JOB_NAMESPACE,
+                token=_jobs_token(),
+            )
+        )[-JOB_LOG_TAIL_LINES:]
         if tail:
             parts.append("logs: " + " | ".join(tail))
     except Exception as e:  # noqa: BLE001 - logs are best-effort

tests/test_submit.py CHANGED Viewed

@@ -194,6 +194,76 @@ def test_eval_job_stages_shard_to_mounted_bucket(tmp_path: Path, monkeypatch):
     assert (staged / "101" / "result.json").read_text(encoding="utf-8") == "{}"
 def _stub_meta() -> dict:
     """Minimum meta.json shape that survives ``_load_and_validate_meta``."""
     return {

     assert (staged / "101" / "result.json").read_text(encoding="utf-8") == "{}"
+def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):
+    """Polling must target the namespace where Jobs were dispatched."""
+    captured: dict = {}
+    def fake_inspect_job(**kwargs):
+        captured.update(kwargs)
+        return SimpleNamespace(
+            status=SimpleNamespace(stage="COMPLETED", message=None),
+        )
+    monkeypatch.setenv("HF_TOKEN", "hf_test")
+    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)
+    assert submit._poll_until_done("job-123", "sub-1") == ("COMPLETED", None)
+    assert captured == {
+        "job_id": "job-123",
+        "namespace": submit.EVAL_JOB_NAMESPACE,
+        "token": "hf_test",
+    }
+def test_shard_poll_uses_jobs_namespace_and_token(monkeypatch):
+    """Sharded polling uses the same Jobs namespace/token as dispatch."""
+    captured: dict = {}
+    def fake_inspect_job(**kwargs):
+        captured.update(kwargs)
+        return SimpleNamespace(
+            status=SimpleNamespace(stage="COMPLETED", message=None),
+        )
+    monkeypatch.setenv("HF_TOKEN", "hf_test")
+    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)
+    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
+    failures = submit._poll_shards_until_done(
+        "sub-1",
+        "https://example.test/sub-1.zip",
+        {"shard_000": {"job_id": "job-123", "stage": None, "message": None}},
+    )
+    assert failures == []
+    assert captured == {
+        "job_id": "job-123",
+        "namespace": submit.EVAL_JOB_NAMESPACE,
+        "token": "hf_test",
+    }
+def test_job_failure_reason_fetches_logs_with_namespace_and_token(monkeypatch):
+    """Failure diagnostics fetch logs from the same Jobs namespace."""
+    captured: dict = {}
+    def fake_fetch_job_logs(**kwargs):
+        captured.update(kwargs)
+        return ["line 1\n", "line 2\n"]
+    monkeypatch.setenv("HF_TOKEN", "hf_test")
+    monkeypatch.setattr(submit, "fetch_job_logs", fake_fetch_job_logs)
+    reason = submit._job_failure_reason("job-123", "ERROR", "boom")
+    assert "line 2" in reason
+    assert captured == {
+        "job_id": "job-123",
+        "namespace": submit.EVAL_JOB_NAMESPACE,
+        "token": "hf_test",
+    }
 def _stub_meta() -> dict:
     """Minimum meta.json shape that survives ``_load_and_validate_meta``."""
     return {