Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

cadgenbench-leaderboard / tests /test_submit.py

Michael Rabinovich

stage shards via bucket API instead of volume mount

920b1b4 1 day ago

14.6 kB

	"""Unit tests for the submit-tab pending-row builder.

	C4 contract: ``_build_pending_row`` defaults the three Bundle 1+2
	schema fields (``validation_status="unvalidated"``,
	``validation_method=None``, ``hf_username=None``) and keeps the
	existing metadata + sha256 fields intact.

	Hub I/O (``_resolve_data_revision`` reads from
	``HfApi.dataset_info``) is monkeypatched out, so the suite has zero
	network traffic.
	"""
	from __future__ import annotations

	import importlib.util
	from pathlib import Path
	from types import SimpleNamespace

	import pytest

	import submit


	def _hub_http_error(status: int, headers: dict | None = None) -> submit.HfHubHTTPError:
	    """Build an ``HfHubHTTPError`` around a stand-in response object.

	    Nothing here talks to the real Hub: a ``SimpleNamespace`` plays the
	    role of the httpx response so ``_with_hub_retries`` can read
	    ``response.status_code`` / ``response.headers``. Newer
	    ``huggingface_hub`` makes ``response`` a required keyword-only
	    constructor argument, so it is passed in directly (and re-assigned
	    afterwards for the older positional-optional signature too).

	    Args:
	        status: HTTP status code exposed as ``response.status_code``.
	        headers: Response headers exposed as ``response.headers``;
	            ``None`` (or any falsy value) becomes an empty dict.

	    Returns:
	        The constructed error. It is returned, not raised, so the caller
	        decides when to ``raise`` it.
	    """
	    response = SimpleNamespace(
	        status_code=status, headers=headers or {}, request=None,
	    )
	    err = submit.HfHubHTTPError(f"HTTP {status}", response=response)
	    # Set the attribute explicitly as well, so the stub is reachable via
	    # ``err.response`` regardless of what the constructor did with it.
	    err.response = response
	    return err


	def test_with_hub_retries_recovers_after_transient(monkeypatch):
	    """Two transient failures (429, then 503) are retried until success."""
	    # Neutralise the back-off sleep so the retries run instantly.
	    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
	    pending_failures = [429, 503]
	    attempts = 0

	    def fails_twice_then_ok():
	        nonlocal attempts
	        attempts += 1
	        if pending_failures:
	            raise _hub_http_error(pending_failures.pop(0))
	        return "ok"

	    outcome = submit._with_hub_retries(fails_twice_then_ok, what="test")

	    assert outcome == "ok"
	    # One call per queued failure, plus the final successful call.
	    assert attempts == 3


	def test_with_hub_retries_reraises_non_retryable(monkeypatch):
	    """A 403 is expected to propagate immediately, with no second attempt."""
	    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
	    attempts = []

	    def always_forbidden():
	        attempts.append(403)
	        raise _hub_http_error(403)

	    with pytest.raises(submit.HfHubHTTPError):
	        submit._with_hub_retries(always_forbidden, what="test")

	    # Exactly one invocation: the wrapper must not have retried.
	    assert len(attempts) == 1


	def test_with_hub_retries_gives_up_after_wall_cap(monkeypatch):
	    """A persistent 429 is re-raised once the wall-clock cap is exceeded."""
	    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)

	    # Fake clock: the first read reports 0.0, every later read reports a
	    # time already beyond the cap, so the deadline check trips right
	    # after the first failure.
	    past_deadline = submit.HUB_RETRY_MAX_SECONDS + 1
	    clock_reads = []

	    def fake_monotonic():
	        clock_reads.append(None)
	        if len(clock_reads) == 1:
	            return 0.0
	        return past_deadline

	    monkeypatch.setattr(submit.time, "monotonic", fake_monotonic)
	    attempts = 0

	    def throttled_forever():
	        nonlocal attempts
	        attempts += 1
	        raise _hub_http_error(429)

	    with pytest.raises(submit.HfHubHTTPError):
	        submit._with_hub_retries(throttled_forever, what="test")

	    assert attempts == 1


	def test_retry_after_header_is_honored(monkeypatch):
	    """The first sleep is at least as long as the ``Retry-After`` seconds."""
	    recorded_delays: list[float] = []
	    # Record each requested delay instead of actually sleeping.
	    monkeypatch.setattr(submit.time, "sleep", recorded_delays.append)
	    attempts = 0

	    def throttled_once():
	        nonlocal attempts
	        attempts += 1
	        if attempts > 1:
	            return "ok"
	        raise _hub_http_error(429, headers={"Retry-After": "7"})

	    outcome = submit._with_hub_retries(throttled_once, what="test")

	    assert outcome == "ok"
	    assert recorded_delays
	    assert recorded_delays[0] >= 7.0


	def test_dispatch_shard_passes_bucket_env(monkeypatch):
	    """A sharded dispatch forwards the bucket settings via env, no volume.

	    ``run_job`` is replaced by a recorder, so no Job is launched; the
	    assertions inspect the keyword arguments the dispatcher passed to it.
	    """
	    seen_kwargs: dict = {}

	    def record_run_job(**kwargs):
	        seen_kwargs.update(kwargs)
	        return SimpleNamespace(id="job-123")

	    monkeypatch.setenv("HF_TOKEN", "hf_test")
	    monkeypatch.setattr(
	        submit,
	        "SHARD_BUCKET",
	        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
	    )
	    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")
	    monkeypatch.setattr(submit, "run_job", record_run_job)

	    shard_args = ["--shard-id", "shard_000", "--fixtures", "101,102"]
	    dispatched_id = submit._dispatch_eval_command(
	        "sub-1", "https://example.test/sub-1.zip", shard_args,
	    )

	    assert dispatched_id == "job-123"
	    job_env = seen_kwargs["env"]
	    # The env carries the bare bucket id, without the hf://buckets/ scheme.
	    assert job_env["CADGENBENCH_SHARD_BUCKET"] == (
	        "HuggingAI4Engineering/cadgenbench-eval-staging"
	    )
	    assert job_env["CADGENBENCH_SHARD_BUCKET_PREFIX"] == "submissions"
	    # Mount-free: no volume is attached to the job.
	    assert "volumes" not in seen_kwargs


	def test_dispatch_whole_submission_no_bucket_env(monkeypatch):
	    """With no shard arguments, the job gets neither bucket env nor volume.

	    A bucket is configured on ``submit`` on purpose: the point is that it
	    must not leak into a whole-submission (unsharded) dispatch.
	    """
	    seen_kwargs: dict = {}

	    def record_run_job(**kwargs):
	        seen_kwargs.update(kwargs)
	        return SimpleNamespace(id="job-456")

	    monkeypatch.setenv("HF_TOKEN", "hf_test")
	    monkeypatch.setattr(submit, "SHARD_BUCKET", "org/bucket")
	    monkeypatch.setattr(submit, "run_job", record_run_job)

	    no_extra_args: list = []
	    dispatched_id = submit._dispatch_eval_command(
	        "sub-1", "https://example.test/sub-1.zip", no_extra_args,
	    )

	    assert dispatched_id == "job-456"
	    assert "volumes" not in seen_kwargs
	    assert "CADGENBENCH_SHARD_BUCKET" not in seen_kwargs["env"]


	def test_shard_bucket_uri_built_from_id_and_prefix(monkeypatch):
	    """Bucket id drops the ``hf://buckets/`` scheme; the URI nests the shards.

	    Expected layout of the URI: ``<bucket>/<prefix>/<submission>/shards``.
	    """
	    monkeypatch.setattr(
	        submit,
	        "SHARD_BUCKET",
	        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
	    )
	    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")

	    bucket_id = submit._shard_bucket_id()
	    assert bucket_id == "HuggingAI4Engineering/cadgenbench-eval-staging"

	    shards_uri = submit._shard_bucket_uri("sub-1")
	    assert shards_uri == (
	        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
	        "submissions/sub-1/shards"
	    )


	def test_eval_job_syncs_shard_to_bucket(tmp_path: Path, monkeypatch):
	    """With the bucket env set, shard outputs are synced to the bucket URI.

	    ``eval_job.py`` is loaded straight from its file path (it is not
	    imported as a package), and ``HfApi.sync_bucket`` is replaced by a
	    recorder so nothing is uploaded.
	    """
	    # The script is looked up under "cadgenbench-eval-gpu", located two
	    # levels above the directory that contains this test file.
	    workspace_root = Path(__file__).resolve().parents[2]
	    eval_job_path = workspace_root.joinpath("cadgenbench-eval-gpu", "eval_job.py")
	    spec = importlib.util.spec_from_file_location("eval_job_for_test", eval_job_path)
	    assert spec and spec.loader
	    eval_job = importlib.util.module_from_spec(spec)
	    spec.loader.exec_module(eval_job)

	    # Minimal run directory: one fixture folder holding an empty result.
	    run_dir = tmp_path / "run"
	    result_file = run_dir / "101" / "result.json"
	    result_file.parent.mkdir(parents=True)
	    result_file.write_text("{}", encoding="utf-8")

	    sync_call: dict = {}

	    def record_sync(self, *, source, dest, token=None):
	        sync_call["source"] = source
	        sync_call["dest"] = dest

	    monkeypatch.setattr(eval_job.HfApi, "sync_bucket", record_sync)
	    monkeypatch.setenv(
	        eval_job.SHARD_BUCKET_ENV,
	        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
	    )
	    monkeypatch.setenv(eval_job.SHARD_BUCKET_PREFIX_ENV, "submissions")

	    eval_job._upload_shard_artifacts(
	        "sub-1", "shard_000", run_dir, "ignored/submissions", "ignored-token",
	    )

	    assert sync_call["source"] == str(run_dir)
	    assert sync_call["dest"] == (
	        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
	        "submissions/sub-1/shards/shard_000"
	    )


	def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):
	    """``inspect_job`` is called with the eval namespace and the HF token."""
	    inspect_kwargs: dict = {}

	    def record_inspect_job(**kwargs):
	        inspect_kwargs.update(kwargs)
	        # Report a terminal stage straight away so polling ends at once.
	        finished = SimpleNamespace(stage="COMPLETED", message=None)
	        return SimpleNamespace(status=finished)

	    monkeypatch.setenv("HF_TOKEN", "hf_test")
	    monkeypatch.setattr(submit, "inspect_job", record_inspect_job)

	    outcome = submit._poll_until_done("job-123", "sub-1")

	    assert outcome == ("COMPLETED", None)
	    expected_call = {
	        "job_id": "job-123",
	        "namespace": submit.EVAL_JOB_NAMESPACE,
	        "token": "hf_test",
	    }
	    assert inspect_kwargs == expected_call


	def test_shard_poll_uses_jobs_namespace_and_token(monkeypatch):
	    """The sharded poller passes the same namespace/token to ``inspect_job``."""
	    inspect_kwargs: dict = {}

	    def record_inspect_job(**kwargs):
	        inspect_kwargs.update(kwargs)
	        finished = SimpleNamespace(stage="COMPLETED", message=None)
	        return SimpleNamespace(status=finished)

	    monkeypatch.setenv("HF_TOKEN", "hf_test")
	    monkeypatch.setattr(submit, "inspect_job", record_inspect_job)
	    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)

	    # A single shard whose job has not been inspected yet.
	    shard_jobs = {
	        "shard_000": {"job_id": "job-123", "stage": None, "message": None},
	    }
	    failures = submit._poll_shards_until_done(
	        "sub-1", "https://example.test/sub-1.zip", shard_jobs,
	    )

	    assert failures == []
	    expected_call = {
	        "job_id": "job-123",
	        "namespace": submit.EVAL_JOB_NAMESPACE,
	        "token": "hf_test",
	    }
	    assert inspect_kwargs == expected_call


	def test_job_failure_reason_fetches_logs_with_namespace_and_token(monkeypatch):
	    """Log fetching for a failed job uses the eval namespace and HF token.

	    Also checks that text from the fetched log lines shows up in the
	    reason string that is returned.
	    """
	    fetch_kwargs: dict = {}

	    def record_fetch_job_logs(**kwargs):
	        fetch_kwargs.update(kwargs)
	        return ["line 1\n", "line 2\n"]

	    monkeypatch.setenv("HF_TOKEN", "hf_test")
	    monkeypatch.setattr(submit, "fetch_job_logs", record_fetch_job_logs)

	    failure_text = submit._job_failure_reason("job-123", "ERROR", "boom")

	    assert "line 2" in failure_text
	    expected_call = {
	        "job_id": "job-123",
	        "namespace": submit.EVAL_JOB_NAMESPACE,
	        "token": "hf_test",
	    }
	    assert fetch_kwargs == expected_call


	def _stub_meta() -> dict:
	"""Minimum meta.json shape that survives ``_load_and_validate_meta``."""
	return {
	"submitter_name": "team-test",
	"submission_name": "Stub Agent v1",
	"agent_url": "https://github.com/example/stub-agent",
	"notes": "test row, not a real submission",
	"agree_to_publish": True,
	}


	def test_pending_row_defaults_new_fields(monkeypatch):
	    """A fresh pending row carries the defaults for the three new fields."""
	    # Keep the data-revision lookup off the network.
	    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
	    builder_args = {
	        "submission_id": "sub-test-x",
	        "meta": _stub_meta(),
	        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
	        "submission_sha256": "a" * 64,
	    }

	    pending = submit._build_pending_row(**builder_args)

	    assert pending["validation_status"] == "unvalidated"
	    assert pending["validation_method"] is None
	    assert pending["hf_username"] is None


	def test_pending_row_preserves_sha256(monkeypatch):
	    """The digest passed to the builder is stamped on the row unchanged.

	    This is the row-level half of the existing dedup path.
	    """
	    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
	    digest = "f" * 64

	    pending = submit._build_pending_row(
	        submission_id="sub-test-x",
	        meta=_stub_meta(),
	        blob_url="https://huggingface.co/datasets/example/sub-test-x.zip",
	        submission_sha256=digest,
	    )

	    assert pending["submission_sha256"] == digest


	def test_pending_row_populates_hf_username_when_provided(monkeypatch):
	    """C10 OAuth path: an explicit ``hf_username`` kwarg lands on the row.

	    The submit handler reads ``gr.OAuthProfile`` (injected by Gradio) and
	    forwards ``profile.username`` as a keyword argument. Only the row
	    builder's side of that handoff is exercised here, so a refactor that
	    drops the kwarg gets caught.
	    """
	    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
	    username = "alice"

	    pending = submit._build_pending_row(
	        submission_id="sub-test-x",
	        meta=_stub_meta(),
	        blob_url="https://huggingface.co/datasets/example/sub-test-x.zip",
	        submission_sha256="a" * 64,
	        hf_username=username,
	    )

	    assert pending["hf_username"] == "alice"


	def test_pending_row_hf_username_defaults_to_none(monkeypatch):
	    """Without the ``hf_username`` kwarg, the row's field stays ``None``.

	    Covers callers that have no OAuth profile in scope (test fixtures,
	    scripts, pre-C10 row writers and any future non-OAuth caller).
	    """
	    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
	    # Deliberately no "hf_username" entry among the arguments.
	    builder_args = {
	        "submission_id": "sub-test-x",
	        "meta": _stub_meta(),
	        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
	        "submission_sha256": "a" * 64,
	    }

	    pending = submit._build_pending_row(**builder_args)

	    assert pending["hf_username"] is None


	def test_pending_row_preserves_existing_metadata(monkeypatch):
	    """Pre-Bundle-1+2 fields are copied from ``meta`` and the arguments.

	    Regression guard: if a refactor of ``_build_pending_row`` dropped one
	    of these keys, the schema of every row the Space writes would change
	    silently.
	    """
	    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
	    meta = _stub_meta()
	    blob_url = "https://example.test/sub-test-x.zip"

	    pending = submit._build_pending_row(
	        submission_id="sub-test-x",
	        meta=meta,
	        blob_url=blob_url,
	        submission_sha256="0" * 64,
	    )

	    assert pending["submission_id"] == "sub-test-x"
	    assert pending["status"] == "pending"
	    assert pending["failure_reason"] is None
	    # Fields that must be carried over verbatim from meta.json.
	    for meta_key in ("submitter_name", "submission_name", "agent_url", "notes"):
	        assert pending[meta_key] == meta[meta_key]
	    assert pending["submission_blob_url"] == "https://example.test/sub-test-x.zip"
	    assert pending["cadgenbench_data_revision"] == "test-rev"
	    # Score-shaped fields are null on a fresh pending row.
	    score_fields = (
	        "aggregate_score",
	        "validity_rate",
	        "score_by_task_type",
	        "per_task_scores",
	        "per_fixture_scores",
	        "per_fixture_breakdown",
	    )
	    for score_key in score_fields:
	        assert pending[score_key] is None


	def test_validate_steps_allows_missing_output_step(tmp_path: Path, monkeypatch):
	    """Fixture folders without an output file pass and trigger no parsing.

	    Missing outputs are accepted at this stage; the evaluator scores
	    them as missing later.
	    """
	    for fixture_id in ("101", "102"):
	        (tmp_path / fixture_id).mkdir()
	    parsed: list[Path] = []
	    # Recorder standing in for the real parser.
	    monkeypatch.setattr(submit, "parse_step", parsed.append)

	    submit._validate_steps_parseable(tmp_path, {"101", "102"})

	    assert not parsed


	def test_validate_steps_checks_present_output_stp(tmp_path: Path, monkeypatch):
	    """An ``output.stp`` that exists is handed to the parser exactly once."""
	    step_file = tmp_path / "101" / "output.stp"
	    step_file.parent.mkdir()
	    step_file.write_text("ISO-10303-21;\n")
	    parsed: list[Path] = []
	    # Recorder standing in for the real parser.
	    monkeypatch.setattr(submit, "parse_step", parsed.append)

	    submit._validate_steps_parseable(tmp_path, {"101"})

	    assert parsed == [step_file]