cadgenbench-leaderboard / tests /test_submit.py
Michael Rabinovich
stage shards via bucket API instead of volume mount
920b1b4
"""Unit tests for the submit-tab pending-row builder.
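The same file also covers Hub retry handling (``_with_hub_retries``),
eval-job dispatch and shard-bucket staging, job polling and failure
diagnostics, and ``_validate_steps_parseable``.

C4 contract: ``_build_pending_row`` defaults the three Bundle 1+2
schema fields (``validation_status="unvalidated"``,
``validation_method=None``, ``hf_username=None``) and keeps the
existing metadata + sha256 fields intact.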
Hub I/O (``_resolve_data_revision`` reads from
``HfApi.dataset_info``) is monkeypatched out, so the suite has zero
network traffic.
"""
from __future__ import annotations
import importlib.util
from pathlib import Path
from types import SimpleNamespace
import pytest
import submit
def _hub_http_error(status: int, headers: dict | None = None) -> submit.HfHubHTTPError:
    """Build an ``HfHubHTTPError`` whose stand-in response reports *status*.

    Nothing touches the real Hub: a ``SimpleNamespace`` plays the role
    of the httpx response, exposing the ``status_code`` / ``headers``
    attributes that ``_with_hub_retries`` reads. Newer
    ``huggingface_hub`` requires ``response`` as a keyword-only
    constructor argument, so it is supplied there and then assigned
    again on the instance to cover the older positional-optional
    signature as well.

    Args:
        status: HTTP status code the fake response should carry.
        headers: Optional response headers; a falsy value becomes ``{}``.

    Returns:
        The constructed error, with ``response`` set to the stand-in.
    """
    fake_response = SimpleNamespace(
        status_code=status,
        headers=headers if headers else {},
        request=None,
    )
    error = submit.HfHubHTTPError(f"HTTP {status}", response=fake_response)
    error.response = fake_response
    return error
def test_with_hub_retries_recovers_after_transient(monkeypatch):
    """Transient 429 and 503 failures are retried until the call succeeds."""
    # Neutralise the backoff sleep so the test runs instantly.
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    pending_failures = [429, 503]
    attempts = 0

    def flaky():
        nonlocal attempts
        attempts += 1
        # Fail once per queued status, then succeed.
        if pending_failures:
            raise _hub_http_error(pending_failures.pop(0))
        return "ok"

    outcome = submit._with_hub_retries(flaky, what="test")
    assert outcome == "ok"
    # Two failures plus the final successful call.
    assert attempts == 3
def test_with_hub_retries_reraises_non_retryable(monkeypatch):
    """A 403 propagates immediately: the callable is invoked exactly once."""
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    attempts = 0

    def forbidden():
        nonlocal attempts
        attempts += 1
        raise _hub_http_error(403)

    with pytest.raises(submit.HfHubHTTPError):
        submit._with_hub_retries(forbidden, what="test")

    assert attempts == 1
def test_with_hub_retries_gives_up_after_wall_cap(monkeypatch):
    """Once the wall-clock cap is exceeded, a persistent 429 is raised."""
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)

    # Fake clock: the first read is t=0, every later read is already
    # beyond the retry budget, so the deadline check trips right after
    # the first failure.
    early_reads = [0.0]

    def fake_monotonic():
        if early_reads:
            return early_reads.pop(0)
        return submit.HUB_RETRY_MAX_SECONDS + 1

    monkeypatch.setattr(submit.time, "monotonic", fake_monotonic)

    attempts = 0

    def always_429():
        nonlocal attempts
        attempts += 1
        raise _hub_http_error(429)

    with pytest.raises(submit.HfHubHTTPError):
        submit._with_hub_retries(always_429, what="test")

    assert attempts == 1
def test_retry_after_header_is_honored(monkeypatch):
    """The first sleep is at least as long as the ``Retry-After`` seconds."""
    delays: list[float] = []

    def record_sleep(seconds):
        delays.append(seconds)

    monkeypatch.setattr(submit.time, "sleep", record_sleep)
    attempts = 0

    def flaky():
        nonlocal attempts
        attempts += 1
        if attempts > 1:
            return "ok"
        # Only the very first call is rate-limited.
        raise _hub_http_error(429, headers={"Retry-After": "7"})

    outcome = submit._with_hub_retries(flaky, what="test")
    assert outcome == "ok"
    assert delays
    assert delays[0] >= 7.0
def test_dispatch_shard_passes_bucket_env(monkeypatch):
    """A shard job with a configured bucket receives bucket env vars only."""
    job_kwargs: dict = {}

    def fake_run_job(**kwargs):
        # Record everything the dispatcher hands to ``run_job``.
        job_kwargs.update(kwargs)
        return SimpleNamespace(id="job-123")

    monkeypatch.setattr(submit, "run_job", fake_run_job)
    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
    monkeypatch.setattr(
        submit,
        "SHARD_BUCKET",
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
    )
    monkeypatch.setenv("HF_TOKEN", "hf_test")

    shard_args = ["--shard-id", "shard_000", "--fixtures", "101,102"]
    job_id = submit._dispatch_eval_command(
        "sub-1",
        "https://example.test/sub-1.zip",
        shard_args,
    )

    assert job_id == "job-123"
    job_env = job_kwargs["env"]
    # The env carries the bare bucket id, without the hf://buckets/ scheme.
    assert job_env["CADGENBENCH_SHARD_BUCKET"] == (
        "HuggingAI4Engineering/cadgenbench-eval-staging"
    )
    assert job_env["CADGENBENCH_SHARD_BUCKET_PREFIX"] == "submissions"
    # Mount-free: the dispatcher must not attach a volume to the job.
    assert "volumes" not in job_kwargs
def test_dispatch_whole_submission_no_bucket_env(monkeypatch):
    """Without shard arguments, no bucket env is passed even if configured."""
    job_kwargs: dict = {}

    def fake_run_job(**kwargs):
        job_kwargs.update(kwargs)
        return SimpleNamespace(id="job-456")

    monkeypatch.setattr(submit, "run_job", fake_run_job)
    monkeypatch.setattr(submit, "SHARD_BUCKET", "org/bucket")
    monkeypatch.setenv("HF_TOKEN", "hf_test")

    # An empty extra-args list means a whole-submission (unsharded) job.
    job_id = submit._dispatch_eval_command(
        "sub-1",
        "https://example.test/sub-1.zip",
        [],
    )

    assert job_id == "job-456"
    assert "volumes" not in job_kwargs
    assert "CADGENBENCH_SHARD_BUCKET" not in job_kwargs["env"]
def test_shard_bucket_uri_built_from_id_and_prefix(monkeypatch):
    """Bucket id drops the ``hf://`` scheme; the URI nests prefix/submission/shards."""
    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
    monkeypatch.setattr(
        submit,
        "SHARD_BUCKET",
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
    )

    bucket_id = submit._shard_bucket_id()
    assert bucket_id == "HuggingAI4Engineering/cadgenbench-eval-staging"

    shard_uri = submit._shard_bucket_uri("sub-1")
    assert shard_uri == (
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
        "submissions/sub-1/shards"
    )
def test_eval_job_syncs_shard_to_bucket(tmp_path: Path, monkeypatch):
    """In bucket mode the eval job syncs shard outputs to the bucket URI.

    ``eval_job.py`` lives in a sibling checkout
    (``cadgenbench-eval-gpu``) next to this repository and is loaded
    straight from its file path. When that checkout is not present the
    test is skipped with an explicit reason instead of erroring out
    with a bare ``FileNotFoundError`` from the module loader.
    """
    eval_job_path = (
        Path(__file__).resolve().parents[2]
        / "cadgenbench-eval-gpu"
        / "eval_job.py"
    )
    if not eval_job_path.is_file():
        pytest.skip(f"eval job source not found at {eval_job_path}")

    # Load the module under a private name so it cannot clash with
    # anything already imported.
    spec = importlib.util.spec_from_file_location("eval_job_for_test", eval_job_path)
    assert spec and spec.loader
    eval_job = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(eval_job)

    # Minimal run directory: one fixture folder with an empty result.
    run_dir = tmp_path / "run"
    fixture_dir = run_dir / "101"
    fixture_dir.mkdir(parents=True)
    (fixture_dir / "result.json").write_text("{}", encoding="utf-8")

    captured: dict = {}

    def fake_sync_bucket(self, *, source, dest, token=None):
        # Record the sync request instead of talking to the Hub.
        captured.update(source=source, dest=dest)

    monkeypatch.setattr(eval_job.HfApi, "sync_bucket", fake_sync_bucket)
    monkeypatch.setenv(
        eval_job.SHARD_BUCKET_ENV,
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
    )
    monkeypatch.setenv(eval_job.SHARD_BUCKET_PREFIX_ENV, "submissions")

    eval_job._upload_shard_artifacts(
        "sub-1", "shard_000", run_dir, "ignored/submissions", "ignored-token",
    )

    assert captured["source"] == str(run_dir)
    assert captured["dest"] == (
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
        "submissions/sub-1/shards/shard_000"
    )
def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):
    """``inspect_job`` is called with the dispatch namespace and HF token."""
    seen_kwargs: dict = {}

    def fake_inspect_job(**kwargs):
        seen_kwargs.update(kwargs)
        # Report the job as already finished so polling stops at once.
        finished = SimpleNamespace(stage="COMPLETED", message=None)
        return SimpleNamespace(status=finished)

    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)
    monkeypatch.setenv("HF_TOKEN", "hf_test")

    outcome = submit._poll_until_done("job-123", "sub-1")

    assert outcome == ("COMPLETED", None)
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert seen_kwargs == expected_kwargs
def test_shard_poll_uses_jobs_namespace_and_token(monkeypatch):
    """Shard polling passes the same namespace/token to ``inspect_job``."""
    seen_kwargs: dict = {}

    def fake_inspect_job(**kwargs):
        seen_kwargs.update(kwargs)
        finished = SimpleNamespace(stage="COMPLETED", message=None)
        return SimpleNamespace(status=finished)

    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)
    monkeypatch.setenv("HF_TOKEN", "hf_test")

    # A single in-flight shard whose stage has not been observed yet.
    shard_jobs = {
        "shard_000": {"job_id": "job-123", "stage": None, "message": None},
    }
    failures = submit._poll_shards_until_done(
        "sub-1",
        "https://example.test/sub-1.zip",
        shard_jobs,
    )

    assert failures == []
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert seen_kwargs == expected_kwargs
def test_job_failure_reason_fetches_logs_with_namespace_and_token(monkeypatch):
    """Log fetching for a failed job uses the Jobs namespace and HF token."""
    seen_kwargs: dict = {}

    def fake_fetch_job_logs(**kwargs):
        seen_kwargs.update(kwargs)
        return ["line 1\n", "line 2\n"]

    monkeypatch.setattr(submit, "fetch_job_logs", fake_fetch_job_logs)
    monkeypatch.setenv("HF_TOKEN", "hf_test")

    reason = submit._job_failure_reason("job-123", "ERROR", "boom")

    # The fetched log output must show up in the reported reason.
    assert "line 2" in reason
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert seen_kwargs == expected_kwargs
def _stub_meta() -> dict:
    """Return a fresh, minimal meta.json payload for the row-builder tests.

    The shape is the minimum that survives ``_load_and_validate_meta``.
    """
    return dict(
        submitter_name="team-test",
        submission_name="Stub Agent v1",
        agent_url="https://github.com/example/stub-agent",
        notes="test row, not a real submission",
        agree_to_publish=True,
    )
def test_pending_row_defaults_new_fields(monkeypatch):
    """The three Bundle 1+2 fields take their schema defaults on a new row."""
    # Stub the Hub lookup so building the row needs no network.
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")

    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
    }
    row = submit._build_pending_row(**builder_kwargs)

    assert row["validation_status"] == "unvalidated"
    assert row["validation_method"] is None
    assert row["hf_username"] is None
def test_pending_row_preserves_sha256(monkeypatch):
    """Row-level half of the dedup path: the sha256 is stamped on the row."""
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    digest = "f" * 64

    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": digest,
    }
    row = submit._build_pending_row(**builder_kwargs)

    assert row["submission_sha256"] == digest
def test_pending_row_populates_hf_username_when_provided(monkeypatch):
    """C10 OAuth path: the ``hf_username`` kwarg lands on the row.

    The submit handler receives a ``gr.OAuthProfile`` from Gradio and
    forwards ``profile.username`` as a keyword argument. Only the row
    builder's end of that handoff is exercised here, so a refactor
    that drops the kwarg is caught.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    username = "alice"

    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
        "hf_username": username,
    }
    row = submit._build_pending_row(**builder_kwargs)

    assert row["hf_username"] == username
def test_pending_row_hf_username_defaults_to_none(monkeypatch):
    """Leaving out the ``hf_username`` kwarg yields a null field.

    This is the path for pre-OAuth callers (test fixtures, scripts)
    with no profile in scope; pre-C10 row writers and any future
    non-OAuth caller default cleanly.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")

    # Deliberately no ``hf_username`` entry here.
    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
    }
    row = submit._build_pending_row(**builder_kwargs)

    assert row["hf_username"] is None
def test_pending_row_preserves_existing_metadata(monkeypatch):
    """Pre-Bundle-1+2 fields are carried over from meta and arguments.

    Regression guard: if a refactor of ``_build_pending_row`` dropped
    one of these keys, the schema of every row the Space writes would
    change silently.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    blob_url = "https://example.test/sub-test-x.zip"
    meta = _stub_meta()

    row = submit._build_pending_row(
        submission_id="sub-test-x",
        meta=meta,
        blob_url=blob_url,
        submission_sha256="0" * 64,
    )

    assert row["submission_id"] == "sub-test-x"
    assert row["status"] == "pending"
    assert row["failure_reason"] is None
    # Fields copied verbatim from meta.json.
    for meta_key in ("submitter_name", "submission_name", "agent_url", "notes"):
        assert row[meta_key] == meta[meta_key]
    assert row["submission_blob_url"] == blob_url
    assert row["cadgenbench_data_revision"] == "test-rev"

    # Score-shaped fields are null on a fresh pending row.
    score_fields = (
        "aggregate_score",
        "validity_rate",
        "score_by_task_type",
        "per_task_scores",
        "per_fixture_scores",
        "per_fixture_breakdown",
    )
    for score_key in score_fields:
        assert row[score_key] is None
def test_validate_steps_allows_missing_output_step(tmp_path: Path, monkeypatch):
    """Fixture dirs without an output file pass; the parser is never called."""
    for fixture_id in ("101", "102"):
        (tmp_path / fixture_id).mkdir()

    parsed: list[Path] = []

    def fake_parse_step(path):
        # Record every path the validator tries to parse.
        parsed.append(path)

    monkeypatch.setattr(submit, "parse_step", fake_parse_step)

    submit._validate_steps_parseable(tmp_path, {"101", "102"})

    assert parsed == []
def test_validate_steps_checks_present_output_stp(tmp_path: Path, monkeypatch):
    """Present candidate files are still cheap-parse checked.

    A fixture directory containing ``output.stp`` must have exactly
    that file handed to ``parse_step``.
    """
    fixture = tmp_path / "101"
    fixture.mkdir()
    candidate = fixture / "output.stp"
    # Explicit encoding keeps the written bytes independent of the
    # platform's default locale.
    candidate.write_text("ISO-10303-21;\n", encoding="utf-8")

    calls: list[Path] = []
    # The stub only records the path; it never actually parses STEP data.
    monkeypatch.setattr(submit, "parse_step", lambda p: calls.append(p))

    submit._validate_steps_parseable(tmp_path, {"101"})

    assert calls == [candidate]