Spaces:

HuggingAI4Engineering
/

cadgenbench-leaderboard

Running

File size: 14,607 Bytes

"""Unit tests for the submit-tab pending-row builder.

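Beyond the pending-row builder, this file also exercises the Hub retry
wrapper (``_with_hub_retries``), eval-job dispatch and polling, the
shard-bucket helpers, and the STEP parse pre-check.

C4 contract: ``_build_pending_row`` defaults the three Bundle 1+2
schema fields (``validation_status="unvalidated"``,
``validation_method=None``, ``hf_username=None``) and keeps the
existing metadata + sha256 fields intact.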

Hub I/O (``_resolve_data_revision`` reads from
``HfApi.dataset_info``) is monkeypatched out, so the suite has zero
network traffic.
"""
from __future__ import annotations

import importlib.util
from pathlib import Path
from types import SimpleNamespace

import pytest

import submit


def _hub_http_error(status: int, headers: dict | None = None) -> submit.HfHubHTTPError:
    """Fabricate an ``HfHubHTTPError`` whose response reports *status*.

    No real Hub request is made: a ``SimpleNamespace`` plays the part
    of the httpx response, exposing the ``status_code`` / ``headers``
    attributes that ``_with_hub_retries`` reads. ``response`` is handed
    to the constructor as a keyword (newer ``huggingface_hub`` requires
    it) and then set on the instance again so the older
    positional-optional signature ends up with the same attribute.

    Args:
        status: HTTP status code the stub response carries.
        headers: Response headers; a falsy value becomes an empty dict.

    Returns:
        The constructed error with ``.response`` set to the stub.
    """
    fake_response = SimpleNamespace(
        status_code=status,
        headers=headers if headers else {},
        request=None,
    )
    error = submit.HfHubHTTPError(f"HTTP {status}", response=fake_response)
    # Set explicitly as well, covering the older constructor signature.
    error.response = fake_response
    return error


def test_with_hub_retries_recovers_after_transient(monkeypatch):
    """Two transient failures (429, then 503) are retried until success."""
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    # Statuses still to be raised, consumed front-to-back; once empty
    # the callable succeeds.
    pending_failures = [429, 503]
    attempts = []

    def flaky():
        attempts.append(None)
        if pending_failures:
            raise _hub_http_error(pending_failures.pop(0))
        return "ok"

    outcome = submit._with_hub_retries(flaky, what="test")

    assert outcome == "ok"
    # Two failed attempts plus the successful one.
    assert len(attempts) == 3


def test_with_hub_retries_reraises_non_retryable(monkeypatch):
    """A 403 propagates immediately: the callable runs exactly once."""
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    attempts = 0

    def forbidden():
        nonlocal attempts
        attempts += 1
        raise _hub_http_error(403)

    with pytest.raises(submit.HfHubHTTPError):
        submit._with_hub_retries(forbidden, what="test")

    assert attempts == 1


def test_with_hub_retries_gives_up_after_wall_cap(monkeypatch):
    """A 429 that outlives the wall cap is raised instead of retried."""
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)
    past_cap = submit.HUB_RETRY_MAX_SECONDS + 1
    # Fake clock: the first monotonic() read returns 0.0 and every later
    # read is already beyond the cap, so the deadline check trips right
    # after the first failure.
    early_reads = [0.0]

    def fake_monotonic():
        return early_reads.pop() if early_reads else past_cap

    monkeypatch.setattr(submit.time, "monotonic", fake_monotonic)
    attempts = 0

    def always_429():
        nonlocal attempts
        attempts += 1
        raise _hub_http_error(429)

    with pytest.raises(submit.HfHubHTTPError):
        submit._with_hub_retries(always_429, what="test")

    assert attempts == 1


def test_retry_after_header_is_honored(monkeypatch):
    """The first sleep is at least as long as the ``Retry-After`` value."""
    delays: list[float] = []

    def record_sleep(d):
        delays.append(d)

    monkeypatch.setattr(submit.time, "sleep", record_sleep)
    first_call = True

    def flaky():
        nonlocal first_call
        if first_call:
            # Only the very first attempt fails, advertising a 7 s wait.
            first_call = False
            raise _hub_http_error(429, headers={"Retry-After": "7"})
        return "ok"

    outcome = submit._with_hub_retries(flaky, what="test")

    assert outcome == "ok"
    assert delays
    assert delays[0] >= 7.0


def test_dispatch_shard_passes_bucket_env(monkeypatch):
    """A shard dispatch with a bucket configured exports the bucket env.

    The job must receive the bucket id and prefix as environment
    variables and must not be given a ``volumes`` argument.
    """
    run_job_kwargs: dict = {}

    def fake_run_job(**kwargs):
        # Record what the dispatcher asked for and return a stub job.
        run_job_kwargs.update(kwargs)
        return SimpleNamespace(id="job-123")

    monkeypatch.setenv("HF_TOKEN", "hf_test")
    patched_attrs = (
        (
            "SHARD_BUCKET",
            "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
        ),
        ("SHARD_BUCKET_PREFIX", "submissions"),
        ("run_job", fake_run_job),
    )
    for attr_name, attr_value in patched_attrs:
        monkeypatch.setattr(submit, attr_name, attr_value)

    shard_args = ["--shard-id", "shard_000", "--fixtures", "101,102"]
    job_id = submit._dispatch_eval_command(
        "sub-1", "https://example.test/sub-1.zip", shard_args,
    )

    assert job_id == "job-123"
    job_env = run_job_kwargs["env"]
    assert job_env["CADGENBENCH_SHARD_BUCKET"] == (
        "HuggingAI4Engineering/cadgenbench-eval-staging"
    )
    assert job_env["CADGENBENCH_SHARD_BUCKET_PREFIX"] == "submissions"
    # Mount-free: the dispatcher passes no volume to the job.
    assert "volumes" not in run_job_kwargs


def test_dispatch_whole_submission_no_bucket_env(monkeypatch):
    """A dispatch with no extra args gets neither bucket env nor volumes,
    even though a bucket is configured."""
    run_job_kwargs: dict = {}

    def fake_run_job(**kwargs):
        run_job_kwargs.update(kwargs)
        return SimpleNamespace(id="job-456")

    monkeypatch.setenv("HF_TOKEN", "hf_test")
    monkeypatch.setattr(submit, "SHARD_BUCKET", "org/bucket")
    monkeypatch.setattr(submit, "run_job", fake_run_job)

    # Empty extra-args list: no shard arguments are passed.
    no_extra_args: list = []
    job_id = submit._dispatch_eval_command(
        "sub-1", "https://example.test/sub-1.zip", no_extra_args,
    )

    assert job_id == "job-456"
    assert "volumes" not in run_job_kwargs
    job_env = run_job_kwargs["env"]
    assert "CADGENBENCH_SHARD_BUCKET" not in job_env


def test_shard_bucket_uri_built_from_id_and_prefix(monkeypatch):
    """Bucket id drops the ``hf://buckets/`` scheme; the per-submission
    URI is ``<bucket>/<prefix>/<submission>/shards``."""
    monkeypatch.setattr(
        submit, "SHARD_BUCKET",
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
    )
    monkeypatch.setattr(submit, "SHARD_BUCKET_PREFIX", "submissions")

    bucket_id = submit._shard_bucket_id()
    assert bucket_id == "HuggingAI4Engineering/cadgenbench-eval-staging"

    shards_uri = submit._shard_bucket_uri("sub-1")
    assert shards_uri == (
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
        "submissions/sub-1/shards"
    )


def test_eval_job_syncs_shard_to_bucket(tmp_path: Path, monkeypatch):
    """With the bucket env set, shard artifacts are synced to the
    per-shard bucket URI rather than anywhere else."""
    # Load eval_job.py directly from its path under a throwaway module
    # name, since it is not importable as a regular package from here.
    base_dir = Path(__file__).resolve().parents[2]
    module_path = base_dir / "cadgenbench-eval-gpu" / "eval_job.py"
    spec = importlib.util.spec_from_file_location("eval_job_for_test", module_path)
    assert spec and spec.loader
    eval_job = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(eval_job)

    # A run directory holding a single fixture result to upload.
    run_dir = tmp_path / "run"
    (run_dir / "101").mkdir(parents=True)
    (run_dir / "101" / "result.json").write_text("{}", encoding="utf-8")

    sync_call: dict = {}

    def fake_sync_bucket(self, *, source, dest, token=None):
        sync_call["source"] = source
        sync_call["dest"] = dest

    monkeypatch.setattr(eval_job.HfApi, "sync_bucket", fake_sync_bucket)
    bucket_env = (
        (
            eval_job.SHARD_BUCKET_ENV,
            "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging",
        ),
        (eval_job.SHARD_BUCKET_PREFIX_ENV, "submissions"),
    )
    for env_name, env_value in bucket_env:
        monkeypatch.setenv(env_name, env_value)

    eval_job._upload_shard_artifacts(
        "sub-1", "shard_000", run_dir, "ignored/submissions", "ignored-token",
    )

    expected_dest = (
        "hf://buckets/HuggingAI4Engineering/cadgenbench-eval-staging/"
        "submissions/sub-1/shards/shard_000"
    )
    assert sync_call["source"] == str(run_dir)
    assert sync_call["dest"] == expected_dest


def test_poll_until_done_uses_jobs_namespace_and_token(monkeypatch):
    """``inspect_job`` is called with the eval Jobs namespace and the
    ``HF_TOKEN`` value, and nothing else."""
    inspect_kwargs: dict = {}

    def fake_inspect_job(**kwargs):
        inspect_kwargs.update(kwargs)
        # Report a finished job on the very first poll.
        finished = SimpleNamespace(stage="COMPLETED", message=None)
        return SimpleNamespace(status=finished)

    monkeypatch.setenv("HF_TOKEN", "hf_test")
    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)

    outcome = submit._poll_until_done("job-123", "sub-1")

    assert outcome == ("COMPLETED", None)
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert inspect_kwargs == expected_kwargs


def test_shard_poll_uses_jobs_namespace_and_token(monkeypatch):
    """Shard polling calls ``inspect_job`` with the eval Jobs namespace
    and the ``HF_TOKEN`` value, and reports no failures for a completed
    shard."""
    inspect_kwargs: dict = {}

    def fake_inspect_job(**kwargs):
        inspect_kwargs.update(kwargs)
        finished = SimpleNamespace(stage="COMPLETED", message=None)
        return SimpleNamespace(status=finished)

    monkeypatch.setenv("HF_TOKEN", "hf_test")
    monkeypatch.setattr(submit, "inspect_job", fake_inspect_job)
    monkeypatch.setattr(submit.time, "sleep", lambda *_: None)

    # One shard, already dispatched, with no stage recorded yet.
    shard_jobs = {
        "shard_000": {"job_id": "job-123", "stage": None, "message": None},
    }
    failures = submit._poll_shards_until_done(
        "sub-1", "https://example.test/sub-1.zip", shard_jobs,
    )

    assert failures == []
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert inspect_kwargs == expected_kwargs


def test_job_failure_reason_fetches_logs_with_namespace_and_token(monkeypatch):
    """The failure reason includes fetched log text, and the log fetch
    uses the eval Jobs namespace and the ``HF_TOKEN`` value."""
    fetch_kwargs: dict = {}
    stub_log_lines = ["line 1\n", "line 2\n"]

    def fake_fetch_job_logs(**kwargs):
        fetch_kwargs.update(kwargs)
        # Fresh list per call, so the callee may consume or mutate it.
        return list(stub_log_lines)

    monkeypatch.setenv("HF_TOKEN", "hf_test")
    monkeypatch.setattr(submit, "fetch_job_logs", fake_fetch_job_logs)

    reason = submit._job_failure_reason("job-123", "ERROR", "boom")

    assert "line 2" in reason
    expected_kwargs = {
        "job_id": "job-123",
        "namespace": submit.EVAL_JOB_NAMESPACE,
        "token": "hf_test",
    }
    assert fetch_kwargs == expected_kwargs


def _stub_meta() -> dict:
    """Minimum meta.json shape that survives ``_load_and_validate_meta``."""
    return {
        "submitter_name": "team-test",
        "submission_name": "Stub Agent v1",
        "agent_url": "https://github.com/example/stub-agent",
        "notes": "test row, not a real submission",
        "agree_to_publish": True,
    }


def test_pending_row_defaults_new_fields(monkeypatch):
    """A fresh pending row carries the Bundle 1+2 schema defaults:
    ``validation_status`` is "unvalidated", the other two are null."""
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
    }

    row = submit._build_pending_row(**builder_kwargs)

    assert row["validation_status"] == "unvalidated"
    for null_field in ("validation_method", "hf_username"):
        assert row[null_field] is None


def test_pending_row_preserves_sha256(monkeypatch):
    """The sha256 passed to the builder is stamped on the row unchanged
    (the row-level half of the existing dedup path)."""
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    digest = "f" * 64
    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": digest,
    }

    row = submit._build_pending_row(**builder_kwargs)

    assert row["submission_sha256"] == digest


def test_pending_row_populates_hf_username_when_provided(monkeypatch):
    """C10 OAuth path: an explicit ``hf_username`` kwarg lands on the row.

    The submit handler takes ``profile.username`` from the
    ``gr.OAuthProfile`` that Gradio injects and forwards it as a kwarg.
    Only the row builder's end of that handoff is checked here, so a
    refactor that drops the kwarg is caught.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    username = "alice"
    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
        "hf_username": username,
    }

    row = submit._build_pending_row(**builder_kwargs)

    assert row["hf_username"] == username


def test_pending_row_hf_username_defaults_to_none(monkeypatch):
    """Leaving out the ``hf_username`` kwarg yields a null field.

    This is the path for callers with no OAuth profile in scope (test
    fixtures, scripts, pre-C10 row writers): they default cleanly
    without passing anything.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    # Deliberately no "hf_username" key.
    builder_kwargs = {
        "submission_id": "sub-test-x",
        "meta": _stub_meta(),
        "blob_url": "https://huggingface.co/datasets/example/sub-test-x.zip",
        "submission_sha256": "a" * 64,
    }

    row = submit._build_pending_row(**builder_kwargs)

    assert row["hf_username"] is None


def test_pending_row_preserves_existing_metadata(monkeypatch):
    """Fields that predate Bundle 1+2 still come through from meta + args.

    Regression guard: if a refactor of ``_build_pending_row`` drops one
    of these keys, the schema of every row the Space writes changes
    silently.
    """
    monkeypatch.setattr(submit, "_resolve_data_revision", lambda: "test-rev")
    meta = _stub_meta()
    blob_url = "https://example.test/sub-test-x.zip"

    row = submit._build_pending_row(
        submission_id="sub-test-x",
        meta=meta,
        blob_url=blob_url,
        submission_sha256="0" * 64,
    )

    # Identity and lifecycle fields.
    assert row["submission_id"] == "sub-test-x"
    assert row["status"] == "pending"
    assert row["failure_reason"] is None
    # Values copied straight from meta.
    for meta_key in ("submitter_name", "submission_name", "agent_url", "notes"):
        assert row[meta_key] == meta[meta_key]
    assert row["submission_blob_url"] == blob_url
    assert row["cadgenbench_data_revision"] == "test-rev"
    # Score-shaped fields are null on a fresh pending row.
    score_fields = (
        "aggregate_score",
        "validity_rate",
        "score_by_task_type",
        "per_task_scores",
        "per_fixture_scores",
        "per_fixture_breakdown",
    )
    populated = [name for name in score_fields if row[name] is not None]
    assert populated == []


def test_validate_steps_allows_missing_output_step(tmp_path: Path, monkeypatch):
    """Fixture directories without an output file pass validation and
    never reach ``parse_step``; scoring them as missing is left to the
    evaluator."""
    for fixture_id in ("101", "102"):
        (tmp_path / fixture_id).mkdir()
    parsed: list[Path] = []

    def record_parse(p):
        parsed.append(p)

    monkeypatch.setattr(submit, "parse_step", record_parse)

    submit._validate_steps_parseable(tmp_path, {"101", "102"})

    assert parsed == []


def test_validate_steps_checks_present_output_stp(tmp_path: Path, monkeypatch):
    """An ``output.stp`` that exists is handed to ``parse_step`` once."""
    fixture_dir = tmp_path / "101"
    fixture_dir.mkdir()
    step_file = fixture_dir / "output.stp"
    step_file.write_text("ISO-10303-21;\n")
    parsed: list[Path] = []

    def record_parse(p):
        parsed.append(p)

    monkeypatch.setattr(submit, "parse_step", record_parse)

    submit._validate_steps_parseable(tmp_path, {"101"})

    assert parsed == [step_file]