Spaces:

HuggingAI4Engineering
/

CADGenBench

Running

File size: 9,971 Bytes

"""Unit tests for the admin promote / demote helpers.

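C5 contract: the admin gate distinguishes in-set from out-of-set
users, ``promote_row`` and ``demote_row`` flip the two validation
fields on the right row, and a repeat promotion is idempotent. The
bulk variants (``promote_rows`` / ``demote_rows``), ``delete_rows``
and ``stop_and_delete_rows`` are exercised as well. Every
``results.jsonl`` read and write is mocked, as are ``delete_file``
and the Jobs API calls (``list_jobs`` / ``cancel_job``), so the suite
makes zero Hub calls.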
"""
from __future__ import annotations

import json
import os
from types import SimpleNamespace

import pytest

# submit.py kicks off a Hub-touching stuck-pending sweep at import.
# Disable it before importing (admin imports submit) so running this
# file in isolation stays offline. ``setdefault`` only fills the
# variable in when it is absent, so a value already exported by the
# caller is left as-is.
os.environ.setdefault("CADGENBENCH_DISABLE_BOOT_SWEEP", "1")

import admin  # noqa: E402
import submit  # noqa: E402

# Baseline ``results.jsonl`` content the ``hub`` fixture copies for each
# test: "alpha" starts unvalidated with no method, "beta" starts
# validated with method "code".
SEED_ROWS = [
    {
        "submission_id": "alpha",
        "validation_status": "unvalidated",
        "validation_method": None,
        "aggregate_score": 0.5,
    },
    {
        "submission_id": "beta",
        "validation_status": "validated",
        "validation_method": "code",
        "aggregate_score": 0.9,
    },
]


def _jsonl(rows: list[dict]) -> str:
    return "\n".join(json.dumps(r) for r in rows) + "\n"


def _row(rows: list[dict], submission_id: str) -> dict:
    return next(r for r in rows if r["submission_id"] == submission_id)


@pytest.fixture
def hub(monkeypatch):
    """Stand in for the Hub calls behind the ``results.jsonl`` helpers.

    Patches ``submit._download_results_jsonl`` plus ``upload_file`` and
    ``delete_file`` on ``submit._HF_API``, and returns the dict the
    fakes share:

    * ``"rows"`` - begins as per-row copies of :data:`SEED_ROWS`; each
      upload replaces it with the uploaded body parsed back into dicts.
    * ``"uploads"`` - number of ``upload_file`` calls so far.
    * ``"deleted_paths"`` - every ``path_in_repo`` given to
      ``delete_file``, in call order.
    """
    tracker: dict = {
        "rows": [seed.copy() for seed in SEED_ROWS],
        "uploads": 0,
        "deleted_paths": [],
    }

    def _serve_results() -> str:
        # Always serialise the current rows so a read after a write
        # observes that write.
        return _jsonl(tracker["rows"])

    def _record_upload(*, path_or_fileobj, **kwargs) -> None:
        if isinstance(path_or_fileobj, bytes):
            text = path_or_fileobj.decode("utf-8")
        else:
            text = path_or_fileobj
        parsed = []
        for line in text.splitlines():
            # Blank lines (e.g. around the trailing newline) carry no row.
            if line.strip():
                parsed.append(json.loads(line))
        tracker["rows"] = parsed
        tracker["uploads"] += 1

    def _record_delete(*, path_in_repo, **kwargs) -> None:
        tracker["deleted_paths"].append(path_in_repo)

    monkeypatch.setattr(submit, "_download_results_jsonl", _serve_results)
    monkeypatch.setattr(submit._HF_API, "upload_file", _record_upload)
    monkeypatch.setattr(submit._HF_API, "delete_file", _record_delete)
    return tracker


def test_admin_gate_in_set_vs_out_of_set(monkeypatch):
    """``is_admin`` accepts users listed in CADGENBENCH_ADMINS and nobody else."""

    def _user(name):
        # Minimal profile stand-in exposing only ``username``.
        return SimpleNamespace(username=name)

    monkeypatch.setenv("CADGENBENCH_ADMINS", "michaelr27, lvwerra")
    for listed in ("michaelr27", "lvwerra"):
        assert admin.is_admin(_user(listed)) is True
    assert admin.is_admin(_user("someone-else")) is False
    # Logged-out (no profile at all) is never admin.
    assert admin.is_admin(None) is False
    # Once the variable is removed, a previously listed user is rejected.
    monkeypatch.delenv("CADGENBENCH_ADMINS", raising=False)
    assert admin.is_admin(_user("michaelr27")) is False


def test_promote_happy_path(hub):
    """A single promote validates the target row and leaves its neighbour be."""
    admin.promote_row("alpha", "manual")
    alpha = _row(hub["rows"], "alpha")
    status, method = alpha["validation_status"], alpha["validation_method"]
    assert status == "validated"
    assert method == "manual"
    # beta keeps the method it was seeded with.
    beta = _row(hub["rows"], "beta")
    assert beta["validation_method"] == "code"
    # Exactly one write went out.
    assert hub["uploads"] == 1


def test_demote_happy_path(hub):
    """A single demote resets status to unvalidated and nulls the method."""
    admin.demote_row("beta")
    beta = _row(hub["rows"], "beta")
    status, method = beta["validation_status"], beta["validation_method"]
    assert status == "unvalidated"
    assert method is None
    # Exactly one write went out.
    assert hub["uploads"] == 1


def test_promote_idempotent(hub):
    """Promoting an already-validated row twice yields the same row both times."""
    admin.promote_row("beta", "code")
    # Snapshot a copy so the later comparison is against frozen values.
    first_pass = _row(hub["rows"], "beta").copy()
    assert first_pass["validation_status"] == "validated"
    assert first_pass["validation_method"] == "code"
    # Repeat the exact same promotion and compare against the snapshot.
    admin.promote_row("beta", "code")
    second_pass = _row(hub["rows"], "beta")
    assert second_pass == first_pass


def test_promote_rows_bulk(hub):
    """Bulk promote validates every selected row with a single upload."""
    admin.promote_rows(["alpha", "beta"], "traces")
    promoted = [_row(hub["rows"], sid) for sid in ("alpha", "beta")]
    assert all(r["validation_status"] == "validated" for r in promoted)
    assert all(r["validation_method"] == "traces" for r in promoted)
    assert hub["uploads"] == 1


def test_demote_rows_bulk(hub):
    """Bulk demote un-validates every selected row with a single upload."""
    admin.demote_rows(["alpha", "beta"])
    demoted = [_row(hub["rows"], sid) for sid in ("alpha", "beta")]
    assert all(r["validation_status"] == "unvalidated" for r in demoted)
    assert all(r["validation_method"] is None for r in demoted)
    assert hub["uploads"] == 1


def test_promote_rows_missing_id_raises_without_write(hub):
    """One unknown id in the batch raises LookupError and nothing is uploaded."""
    selection = ["alpha", "ghost"]
    with pytest.raises(LookupError):
        admin.promote_rows(selection, "code")
    assert hub["uploads"] == 0
    # No write happened, so the known row still has its seeded status.
    alpha = _row(hub["rows"], "alpha")
    assert alpha["validation_status"] == "unvalidated"


def test_empty_selection_raises(hub):
    """An empty or all-falsy selection is a ValueError for each bulk helper."""
    with pytest.raises(ValueError):
        admin.promote_rows([], "code")
    # Falsy entries only: treated the same as an empty selection.
    with pytest.raises(ValueError):
        admin.demote_rows([None, ""])
    with pytest.raises(ValueError):
        admin.delete_rows([])
    # None of the rejected calls reached the upload step.
    assert hub["uploads"] == 0


def test_delete_rows_removes_rows_and_artifacts(hub):
    """Deleting a row drops it from the results and targets its artifacts."""
    admin.delete_rows(["alpha"])
    surviving_ids = set()
    for row in hub["rows"]:
        surviving_ids.add(row["submission_id"])
    assert surviving_ids == {"beta"}
    # The archive and both report files were targeted, in this order.
    expected_paths = [
        "submissions/alpha.zip",
        "reports/alpha.html",
        "reports/alpha.json",
    ]
    assert hub["deleted_paths"] == expected_paths
    assert hub["uploads"] == 1


def _job(job_id: str, stage: str, *args: str) -> SimpleNamespace:
    """A minimal JobInfo stand-in: id, status.stage, and a command argv."""
    return SimpleNamespace(
        id=job_id,
        status=SimpleNamespace(stage=stage, message=None),
        command=["python", "/opt/eval_job.py", *args],
        arguments=None,
    )


@pytest.fixture
def jobs(monkeypatch):
    """Replace ``admin.list_jobs`` and ``admin.cancel_job`` with recording fakes.

    Returns the dict both fakes share: ``"jobs"`` holds whatever
    ``list_jobs`` should hand back (tests overwrite it), and
    ``"cancelled"`` accumulates the ``job_id`` of every ``cancel_job``
    call in order.
    """
    record: dict = {"jobs": [], "cancelled": []}

    def _list_stub(*, namespace=None, token=None):
        # Looked up at call time so a test can swap the list in after setup.
        return record["jobs"]

    def _cancel_stub(*, job_id, namespace=None, token=None):
        record["cancelled"].append(job_id)

    for attr, stub in (("list_jobs", _list_stub), ("cancel_job", _cancel_stub)):
        monkeypatch.setattr(admin, attr, stub)
    return record


def test_stop_and_delete_cancels_running_then_deletes(hub, jobs):
    """Only the running job naming the selected id is cancelled; its row is removed."""
    jobs["jobs"] = [
        _job(f"job-{sid}", "RUNNING", sid, f"https://blob/{sid}.zip")
        for sid in ("alpha", "beta")
    ]
    admin.stop_and_delete_rows(["alpha"])
    # beta's job was left running.
    assert jobs["cancelled"] == ["job-alpha"]
    # alpha's row is gone and its archive was targeted; beta's row survives.
    surviving_ids = {row["submission_id"] for row in hub["rows"]}
    assert surviving_ids == {"beta"}
    assert "submissions/alpha.zip" in hub["deleted_paths"]


def test_stop_and_delete_catches_all_shard_jobs(hub, jobs):
    """Each shard job carrying the submission id in its argv gets cancelled."""
    jobs["jobs"] = [
        _job(f"job-a{n}", "RUNNING", "alpha", "url", "--shard-id", f"shard_{n:03d}")
        for n in range(2)
    ]
    admin.stop_and_delete_rows(["alpha"])
    # Cancellation order is not part of the contract, hence the sort.
    assert sorted(jobs["cancelled"]) == ["job-a0", "job-a1"]


def test_stop_and_delete_skips_terminal_jobs(hub, jobs):
    """A COMPLETED job is left alone while the row is still deleted."""
    finished = _job("job-alpha", "COMPLETED", "alpha", "url")
    jobs["jobs"] = [finished]
    admin.stop_and_delete_rows(["alpha"])
    assert jobs["cancelled"] == []
    surviving_ids = {row["submission_id"] for row in hub["rows"]}
    assert surviving_ids == {"beta"}


def test_stop_and_delete_tolerates_list_jobs_failure(hub, monkeypatch):
    """The row delete still goes through when ``list_jobs`` raises."""

    def _listing_down(*, namespace=None, token=None):
        raise RuntimeError("jobs API down")

    monkeypatch.setattr(admin, "list_jobs", _listing_down)
    admin.stop_and_delete_rows(["alpha"])
    surviving_ids = {row["submission_id"] for row in hub["rows"]}
    assert surviving_ids == {"beta"}


def test_stop_and_delete_tolerates_cancel_failure(hub, jobs, monkeypatch):
    """The row delete still goes through when ``cancel_job`` raises."""

    def _cancel_rejected(*, job_id, namespace=None, token=None):
        raise RuntimeError("cancel rejected")

    jobs["jobs"] = [_job("job-alpha", "RUNNING", "alpha", "url")]
    # Override the recording fake installed by the ``jobs`` fixture.
    monkeypatch.setattr(admin, "cancel_job", _cancel_rejected)
    admin.stop_and_delete_rows(["alpha"])
    surviving_ids = {row["submission_id"] for row in hub["rows"]}
    assert surviving_ids == {"beta"}


def test_stop_and_delete_empty_selection_raises(hub, jobs):
    """An empty selection raises ValueError with no cancel and no upload."""
    nothing_selected: list = []
    with pytest.raises(ValueError):
        admin.stop_and_delete_rows(nothing_selected)
    # Neither the Jobs API fake nor the Hub fake recorded any activity.
    assert jobs["cancelled"] == []
    assert hub["uploads"] == 0