| """Unit tests for the admin promote / demote helpers. |
| |
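C5 contract: the admin gate distinguishes in-set from out-of-set
users, ``promote_row`` and ``demote_row`` flip the two validation
fields on the right row, and a repeat promotion is idempotent. The
suite also covers the bulk promote / demote, delete, stop-and-delete,
and rescore helpers. Every ``results.jsonl`` read and write is mocked,
and the stop-and-delete and rescore tests stub the Jobs API and the
worker dispatch, so the suite makes zero Hub calls.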
| """ |
| from __future__ import annotations |
|
|
| import json |
| import os |
| from types import SimpleNamespace |
|
|
| import pytest |
|
|
| |
| |
| |
| os.environ.setdefault("CADGENBENCH_DISABLE_BOOT_SWEEP", "1") |
|
|
| import admin |
| import submit |
|
|
# Baseline results rows, one per validation state. The ``hub`` fixture
# copies them into its own state, so tests never mutate this list.
SEED_ROWS = [
    # "alpha": not validated yet, hence no validation method.
    {
        "submission_id": "alpha",
        "validation_status": "unvalidated",
        "validation_method": None,
        "aggregate_score": 0.5,
    },
    # "beta": already validated, with "code" recorded as the method.
    {
        "submission_id": "beta",
        "validation_status": "validated",
        "validation_method": "code",
        "aggregate_score": 0.9,
    },
]
|
|
|
|
| def _jsonl(rows: list[dict]) -> str: |
| return "\n".join(json.dumps(r) for r in rows) + "\n" |
|
|
|
|
| def _row(rows: list[dict], submission_id: str) -> dict: |
| return next(r for r in rows if r["submission_id"] == submission_id) |
|
|
|
|
@pytest.fixture
def hub(monkeypatch):
    """Stand in for the Hub: results.jsonl I/O plus artifact deletion.

    The returned dict is the fake's whole state:

    * ``rows`` starts as a copy of :data:`SEED_ROWS` and is replaced by
      the parsed content of every ``upload_file`` call.
    * ``uploads`` counts ``upload_file`` calls, so a test can assert
      how many commits an operation produced.
    * ``deleted_paths`` collects each ``path_in_repo`` given to
      ``delete_file``.
    * ``bucket_listed_prefixes`` / ``bucket_deleted_paths`` record the
      prefixes listed and the paths deleted through the bucket calls.
    """
    tracker: dict = {
        "rows": [dict(seed) for seed in SEED_ROWS],
        "uploads": 0,
        "deleted_paths": [],
        "bucket_listed_prefixes": [],
        "bucket_deleted_paths": [],
    }

    def _serve_results() -> str:
        # Always serialise the current rows, so a read sees prior writes.
        return _jsonl(tracker["rows"])

    def _record_upload(*, path_or_fileobj, **kwargs) -> None:
        # Accept the payload either as UTF-8 bytes or as text.
        if isinstance(path_or_fileobj, bytes):
            text = path_or_fileobj.decode("utf-8")
        else:
            text = path_or_fileobj
        parsed = []
        for line in text.splitlines():
            if line.strip():  # ignore blank lines between records
                parsed.append(json.loads(line))
        tracker["rows"] = parsed
        tracker["uploads"] += 1

    def _record_delete(*, path_in_repo, **kwargs) -> None:
        tracker["deleted_paths"].append(path_in_repo)

    def _list_bucket(bucket_id, *, prefix, recursive=False, **kwargs):
        # Report exactly one artifact under whichever prefix is listed.
        tracker["bucket_listed_prefixes"].append(prefix)
        only_entry = SimpleNamespace(path=f"{prefix}/101/rotating.webp")
        return [only_entry]

    def _batch_bucket(bucket_id, *, add=None, delete=None, **kwargs):
        tracker["bucket_deleted_paths"].extend(delete or [])

    # The download hook is patched on both modules, so a lookup through
    # either ``submit`` or ``admin`` reaches the fake.
    for module in (submit, admin):
        monkeypatch.setattr(module, "_download_results_jsonl", _serve_results)

    api_fakes = {
        "upload_file": _record_upload,
        "delete_file": _record_delete,
        "list_bucket_tree": _list_bucket,
        "batch_bucket_files": _batch_bucket,
    }
    for method_name, fake in api_fakes.items():
        monkeypatch.setattr(submit._HF_API, method_name, fake)

    return tracker
|
|
|
|
def test_admin_gate_in_set_vs_out_of_set(monkeypatch):
    """``is_admin`` is True only for users named in ``CADGENBENCH_ADMINS``."""

    def as_user(name):
        return SimpleNamespace(username=name)

    monkeypatch.setenv("CADGENBENCH_ADMINS", "michaelr27, lvwerra")

    # Both names from the comma-separated value are admitted.
    for listed in ("michaelr27", "lvwerra"):
        assert admin.is_admin(as_user(listed)) is True
    assert admin.is_admin(as_user("someone-else")) is False

    # No user object at all is rejected rather than raising.
    assert admin.is_admin(None) is False

    # With the variable removed, even a previously listed user is rejected.
    monkeypatch.delenv("CADGENBENCH_ADMINS", raising=False)
    assert admin.is_admin(as_user("michaelr27")) is False
|
|
|
|
def test_promote_happy_path(hub):
    """A single promote validates the target row and touches nothing else."""
    admin.promote_row("alpha", "manual")

    rows_after = hub["rows"]
    alpha = _row(rows_after, "alpha")
    assert (alpha["validation_status"], alpha["validation_method"]) == (
        "validated",
        "manual",
    )

    # The row that was not targeted keeps its original method.
    bystander = _row(rows_after, "beta")
    assert bystander["validation_method"] == "code"

    # One promote is expected to produce exactly one write.
    assert hub["uploads"] == 1
|
|
|
|
def test_demote_happy_path(hub):
    """A single demote resets status to unvalidated and drops the method."""
    admin.demote_row("beta")

    beta = _row(hub["rows"], "beta")
    assert beta["validation_status"] == "unvalidated"
    assert beta["validation_method"] is None

    # One demote is expected to produce exactly one write.
    assert hub["uploads"] == 1
|
|
|
|
def test_promote_idempotent(hub):
    """Promoting an already-validated row twice leaves it unchanged."""
    admin.promote_row("beta", "code")
    # Snapshot a copy so the second call cannot alias the comparison target.
    first_pass = dict(_row(hub["rows"], "beta"))
    assert first_pass["validation_status"] == "validated"
    assert first_pass["validation_method"] == "code"

    admin.promote_row("beta", "code")
    second_pass = _row(hub["rows"], "beta")
    assert second_pass == first_pass
|
|
|
|
def test_promote_rows_bulk(hub):
    """Bulk promote validates every listed row with a single write."""
    admin.promote_rows(["alpha", "beta"], "traces")

    for submission_id in ("alpha", "beta"):
        updated = _row(hub["rows"], submission_id)
        assert (updated["validation_status"], updated["validation_method"]) == (
            "validated",
            "traces",
        )

    # Both rows are expected to land in one commit, not one per row.
    assert hub["uploads"] == 1
|
|
|
|
def test_demote_rows_bulk(hub):
    """Bulk demote un-validates every listed row with a single write."""
    admin.demote_rows(["alpha", "beta"])

    for submission_id in ("alpha", "beta"):
        updated = _row(hub["rows"], submission_id)
        assert updated["validation_status"] == "unvalidated"
        assert updated["validation_method"] is None

    # Both rows are expected to land in one commit, not one per row.
    assert hub["uploads"] == 1
|
|
|
|
def test_promote_rows_missing_id_raises_without_write(hub):
    """One unknown id fails the whole batch and nothing is uploaded."""
    with pytest.raises(LookupError):
        admin.promote_rows(["alpha", "ghost"], "code")

    # No commit was made...
    assert hub["uploads"] == 0

    # ...and the valid id from the same batch was left untouched.
    alpha = _row(hub["rows"], "alpha")
    assert alpha["validation_status"] == "unvalidated"
|
|
|
|
def test_empty_selection_raises(hub):
    """Each bulk helper raises ``ValueError`` on an empty or all-falsy list."""
    rejected_calls = (
        (admin.promote_rows, ([], "code")),
        (admin.demote_rows, ([None, ""],)),
        (admin.delete_rows, ([],)),
    )
    for helper, call_args in rejected_calls:
        with pytest.raises(ValueError):
            helper(*call_args)

    # None of the rejected calls may have written anything.
    assert hub["uploads"] == 0
|
|
|
|
def test_delete_rows_removes_rows_and_artifacts(hub):
    """Deleting a row removes it and the artifacts stored under its id."""
    admin.delete_rows(["alpha"])

    surviving_ids = {entry["submission_id"] for entry in hub["rows"]}
    assert surviving_ids == {"beta"}

    # Repo-side artifacts: exactly these three paths, in this order.
    expected_repo_deletes = [
        "submissions/alpha.zip",
        "reports/alpha.html",
        "reports/alpha.json",
    ]
    assert hub["deleted_paths"] == expected_repo_deletes

    # Bucket-side artifacts: the id's render prefix is listed, and the
    # single entry the fake reports under it is deleted.
    assert hub["bucket_listed_prefixes"] == ["renders/alpha"]
    assert hub["bucket_deleted_paths"] == ["renders/alpha/101/rotating.webp"]

    assert hub["uploads"] == 1
|
|
|
|
| def _job(job_id: str, stage: str, *args: str) -> SimpleNamespace: |
| """A minimal JobInfo stand-in: id, status.stage, and a command argv.""" |
| return SimpleNamespace( |
| id=job_id, |
| status=SimpleNamespace(stage=stage, message=None), |
| command=["python", "/opt/eval_job.py", *args], |
| arguments=None, |
| ) |
|
|
|
|
@pytest.fixture
def jobs(monkeypatch):
    """Replace ``admin.list_jobs`` and ``admin.cancel_job`` with fakes.

    The returned dict holds ``jobs``, the list the fake ``list_jobs``
    hands back (tests assign it before acting), and ``cancelled``, the
    ``job_id`` of every fake ``cancel_job`` call in call order.
    """
    tracker: dict = {"jobs": [], "cancelled": []}

    def _listed(*, namespace=None, token=None):
        # Read at call time, so a test's later assignment is honoured.
        return tracker["jobs"]

    def _cancelled(*, job_id, namespace=None, token=None):
        tracker["cancelled"].append(job_id)

    monkeypatch.setattr(admin, "list_jobs", _listed)
    monkeypatch.setattr(admin, "cancel_job", _cancelled)
    return tracker
|
|
|
|
def test_stop_and_delete_cancels_running_then_deletes(hub, jobs):
    """Only the running job for the selected id is cancelled; its row is deleted."""
    running_alpha = _job("job-alpha", "RUNNING", "alpha", "https://blob/alpha.zip")
    running_beta = _job("job-beta", "RUNNING", "beta", "https://blob/beta.zip")
    jobs["jobs"] = [running_alpha, running_beta]

    admin.stop_and_delete_rows(["alpha"])

    # The other submission's job must be left running.
    assert jobs["cancelled"] == ["job-alpha"]

    # The row is gone, and so is its stored zip.
    remaining_ids = {entry["submission_id"] for entry in hub["rows"]}
    assert remaining_ids == {"beta"}
    assert "submissions/alpha.zip" in hub["deleted_paths"]
|
|
|
|
def test_stop_and_delete_catches_all_shard_jobs(hub, jobs):
    """All running shard jobs that name the submission id are cancelled."""
    shard_jobs = [
        _job("job-a0", "RUNNING", "alpha", "url", "--shard-id", "shard_000"),
        _job("job-a1", "RUNNING", "alpha", "url", "--shard-id", "shard_001"),
    ]
    jobs["jobs"] = shard_jobs

    admin.stop_and_delete_rows(["alpha"])

    # Sorted so the check does not depend on cancellation order.
    cancelled_ids = sorted(jobs["cancelled"])
    assert cancelled_ids == ["job-a0", "job-a1"]
|
|
|
|
def test_stop_and_delete_skips_terminal_jobs(hub, jobs):
    """A COMPLETED job is left alone while its row is still removed."""
    finished = _job("job-alpha", "COMPLETED", "alpha", "url")
    jobs["jobs"] = [finished]

    admin.stop_and_delete_rows(["alpha"])

    assert jobs["cancelled"] == []
    remaining_ids = {entry["submission_id"] for entry in hub["rows"]}
    assert remaining_ids == {"beta"}
|
|
|
|
def test_stop_and_delete_tolerates_list_jobs_failure(hub, jobs, monkeypatch):
    """A Jobs-API listing failure must not block the row delete.

    The ``jobs`` fixture is requested so ``admin.cancel_job`` is
    stubbed as well. Without it, a regression that reached
    ``cancel_job`` would invoke the un-stubbed function instead of
    being recorded and failing the assertion below.
    """

    def boom(*, namespace=None, token=None):
        raise RuntimeError("jobs API down")

    # Override the fixture's list_jobs stub with one that always fails.
    monkeypatch.setattr(admin, "list_jobs", boom)
    admin.stop_and_delete_rows(["alpha"])

    # Nothing could be listed, so nothing may have been cancelled.
    assert jobs["cancelled"] == []
    assert {r["submission_id"] for r in hub["rows"]} == {"beta"}
|
|
|
|
def test_stop_and_delete_tolerates_cancel_failure(hub, jobs, monkeypatch):
    """A cancel that errors is swallowed; the row still deletes.

    The failing stub records every ``job_id`` it is called with, so the
    test cannot pass vacuously when ``cancel_job`` is never attempted.
    """
    jobs["jobs"] = [_job("job-alpha", "RUNNING", "alpha", "url")]
    attempted: list[str] = []

    def boom(*, job_id, namespace=None, token=None):
        attempted.append(job_id)
        raise RuntimeError("cancel rejected")

    # Override the fixture's cancel_job stub with one that always fails.
    monkeypatch.setattr(admin, "cancel_job", boom)
    admin.stop_and_delete_rows(["alpha"])

    # The failure path was exercised, for the right job only. Compared
    # as a set so a retrying implementation is not rejected.
    assert set(attempted) == {"job-alpha"}
    assert {r["submission_id"] for r in hub["rows"]} == {"beta"}
|
|
|
|
def test_stop_and_delete_empty_selection_raises(hub, jobs):
    """An empty selection raises ``ValueError`` with no side effects."""
    with pytest.raises(ValueError):
        admin.stop_and_delete_rows([])

    # Neither a write nor a cancel may have happened.
    assert hub["uploads"] == 0
    assert jobs["cancelled"] == []
|
|
|
|
| |
|
|
| |
| |
| |
# Rows used by the rescore tests, one per combination of status and
# stored-zip availability that those tests distinguish.
RESCORE_ROWS = [
    # Completed, with a stored zip and a full set of score fields.
    {
        "submission_id": "done",
        "status": "completed",
        "failure_reason": None,
        "submission_blob_url": "https://blob/done.zip",
        "submitted_at": "2026-01-01T00:00:00Z",
        "aggregate_score": 0.7,
        "validity_rate": 1.0,
        "score_by_task_type": {"generation": 0.7},
        "per_task_scores": {"generation": {"score": 0.7}},
        "per_fixture_scores": {"f1": {"cad_score": 0.7}},
        "per_fixture_breakdown": {"f1": {"validity": 1.0}},
    },
    # Failed, with a stored zip and no score.
    {
        "submission_id": "broke",
        "status": "failed",
        "failure_reason": "boom",
        "submission_blob_url": "https://blob/broke.zip",
        "submitted_at": "2026-01-02T00:00:00Z",
        "aggregate_score": None,
    },
    # Completed, but with no stored zip URL.
    {
        "submission_id": "legacy",
        "status": "completed",
        "submission_blob_url": None,
        "submitted_at": "2025-01-01T00:00:00Z",
        "aggregate_score": 0.4,
    },
    # Still pending, with a stored zip.
    {
        "submission_id": "inflight",
        "status": "pending",
        "submission_blob_url": "https://blob/inflight.zip",
        "submitted_at": "2026-02-01T00:00:00Z",
        "aggregate_score": None,
    },
]
|
|
|
|
@pytest.fixture
def dispatch(monkeypatch):
    """Record rescore dispatches instead of running them.

    ``admin._dispatch_rescore_workers`` is swapped for a synchronous
    recorder and ``admin._current_fixture_names`` for a stub returning
    ``["f1", "f2"]``. NOTE(review): the real helpers are not visible
    from this file; they presumably start background workers and read
    the data repo respectively.

    The returned dict exposes ``targets`` (a copy of the mapping last
    passed to dispatch), ``fixtures`` (a copy of the fixture names last
    passed) and ``calls`` (how many times dispatch was invoked).
    """
    recorded: dict = {"targets": None, "fixtures": None, "calls": 0}

    def _record_dispatch(targets, fixture_names):
        # Copy both arguments so later caller-side mutation is invisible.
        recorded["targets"] = dict(targets)
        recorded["fixtures"] = list(fixture_names)
        recorded["calls"] += 1

    def _fixed_fixture_names():
        return ["f1", "f2"]

    monkeypatch.setattr(admin, "_dispatch_rescore_workers", _record_dispatch)
    monkeypatch.setattr(admin, "_current_fixture_names", _fixed_fixture_names)
    return recorded
|
|
|
|
def test_rescore_rows_flips_to_pending_and_dispatches(hub, dispatch):
    """Rescoring a completed row resets it to pending and dispatches once."""
    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]

    dispatched, skipped = admin.rescore_rows(["done"])
    assert (dispatched, skipped) == (1, [])

    rescored = _row(hub["rows"], "done")
    assert rescored["status"] == "pending"
    assert rescored["failure_reason"] is None

    # Every field admin lists as cleared-on-rescore must now be None.
    still_set = [
        name
        for name in admin._RESCORE_CLEARED_SCORE_FIELDS
        if rescored[name] is not None
    ]
    assert not still_set

    # The original submission timestamp is preserved.
    assert rescored["submitted_at"] == "2026-01-01T00:00:00Z"

    # Dispatch received the id -> zip URL mapping and the stubbed fixtures.
    assert dispatch["targets"] == {"done": "https://blob/done.zip"}
    assert dispatch["fixtures"] == ["f1", "f2"]
    assert hub["uploads"] == 1
|
|
|
|
def test_rescore_rows_skips_rows_without_zip(hub, dispatch):
    """A row with no stored zip URL is reported as skipped and left intact."""
    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]

    dispatched, skipped = admin.rescore_rows(["legacy"])
    assert (dispatched, skipped) == (0, ["legacy"])

    # The skipped row keeps both its status and its score.
    legacy = _row(hub["rows"], "legacy")
    assert legacy["status"] == "completed"
    assert legacy["aggregate_score"] == 0.4

    # No worker dispatch happened...
    assert dispatch["calls"] == 0

    # ...yet the test still expects exactly one results write.
    assert hub["uploads"] == 1
|
|
|
|
def test_rescore_rows_missing_id_raises_without_dispatch(hub, dispatch):
    """One unknown id fails the whole rescore batch and nothing is dispatched."""
    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]

    with pytest.raises(LookupError):
        admin.rescore_rows(["done", "ghost"])

    # Nothing was dispatched, and the valid id from the same batch
    # was not flipped to pending.
    assert dispatch["calls"] == 0
    known = _row(hub["rows"], "done")
    assert known["status"] == "completed"
|
|
|
|
def test_rescore_rows_empty_selection_raises(hub, dispatch):
    """``rescore_rows`` raises ``ValueError`` on an empty list, no dispatch."""
    nothing_selected: list = []
    with pytest.raises(ValueError):
        admin.rescore_rows(nothing_selected)

    assert dispatch["calls"] == 0
|
|
|
|
def test_rescore_all_targets_completed_and_failed_only(hub, dispatch):
    """Rescore-all dispatches the completed and failed rows that have a zip."""
    hub["rows"] = [dict(seed) for seed in RESCORE_ROWS]

    dispatched, _skipped = admin.rescore_all()

    assert dispatched == 2
    assert set(dispatch["targets"]) == {"done", "broke"}

    expected_status = {
        # Rescored rows move to pending.
        "done": "pending",
        "broke": "pending",
        # The row that was already pending stays pending.
        "inflight": "pending",
        # The row with no stored zip keeps its completed status.
        "legacy": "completed",
    }
    for submission_id, status in expected_status.items():
        assert _row(hub["rows"], submission_id)["status"] == status
|
|
|
|
def test_rescore_all_empty_board_raises(hub, dispatch):
    """Rescore-all raises ``ValueError`` when no row qualifies, with no write."""
    # One row is still pending, the other has no stored zip.
    pending_row = {
        "submission_id": "inflight",
        "status": "pending",
        "submission_blob_url": "https://blob/x.zip",
    }
    zipless_row = {
        "submission_id": "legacy",
        "status": "completed",
        "submission_blob_url": None,
    }
    hub["rows"] = [pending_row, zipless_row]

    with pytest.raises(ValueError):
        admin.rescore_all()

    assert dispatch["calls"] == 0
    assert hub["uploads"] == 0
|
|