"""Smoke + dispatch tests for scripts/run_calibration.py. Two failure modes this guards against: 1. Silent broken imports inside cmd_generate_outputs. The runner has no module-level test coverage; a missing symbol like build_default_registry will pass CI and fail at first invocation. test_module_imports asserts the runner is importable. 2. Mixed-corpus calibration items routed to the wrong store. The spec includes both k8s and fastapi questions. test_dispatch_routes_per_corpus verifies each item goes to the orchestrator built for its corpus, and test_unknown_corpus_raises verifies a clear error if the spec drifts from the corpora the runner builds. """ from __future__ import annotations import importlib import json import sys from pathlib import Path import pytest REPO_ROOT = Path(__file__).resolve().parents[2] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) class _StubProvider: def __init__(self, *args, **kwargs): pass class _StubEmbedder: def __init__(self, *args, **kwargs): pass class _StubOrchestrator: """Records each run() call and returns a synthetic AgentResponse-shaped object so cmd_generate_outputs can compute its hash + write its record. """ def __init__(self, corpus_name: str, calls: list) -> None: self.corpus_name = corpus_name self.calls = calls async def run(self, *, question: str, system_prompt: str): self.calls.append({"corpus": self.corpus_name, "question": question}) class _Source: def __init__(self, s: str) -> None: self.source = s class _Resp: answer = f"[{self.corpus_name}] answer to: {question}" sources = [_Source(f"{self.corpus_name}/doc.md")] ranked_sources = [f"{self.corpus_name}/doc.md"] source_chunks = ["chunk text"] return _Resp() def test_module_imports(): """Importing the runner must not raise. Catches broken symbol references inside the module before they cost a calibration run.""" mod = importlib.import_module("scripts.run_calibration") assert hasattr(mod, "cmd_generate_outputs") assert hasattr(mod, "_build_corpus_orchestrator") async def test_dispatch_routes_per_corpus(monkeypatch, tmp_path): runner = importlib.import_module("scripts.run_calibration") monkeypatch.setattr( "agent_bench.core.provider.AnthropicProvider", _StubProvider ) monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder) calls: list = [] built_corpora: list = [] def fake_builder(cfg, corpus_name, embedder, provider): built_corpora.append(corpus_name) return _StubOrchestrator(corpus_name, calls) monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder) out_path = tmp_path / "system_outputs.json" monkeypatch.setattr(runner, "SYSTEM_OUTPUTS", out_path) await runner.cmd_generate_outputs(concurrency=2) assert sorted(built_corpora) == ["fastapi", "k8s"] spec = json.loads(runner.CALIBRATION_SPEC.read_text()) expected_corpus_by_id = {i["id"]: i["corpus"] for i in spec["items"]} records = json.loads(out_path.read_text()) assert len(records) == len(expected_corpus_by_id) seen_ids = set() for rec in records: item_id = rec["item_id"] seen_ids.add(item_id) expected = expected_corpus_by_id[item_id] assert rec["corpus"] == expected assert rec["answer"].startswith(f"[{expected}]") assert rec["sources"] == [f"{expected}/doc.md"] assert isinstance(rec["system_output_hash"], str) assert len(rec["system_output_hash"]) == 64 assert seen_ids == set(expected_corpus_by_id.keys()) by_corpus: dict[str, int] = {} for c in calls: by_corpus[c["corpus"]] = by_corpus.get(c["corpus"], 0) + 1 expected_counts: dict[str, int] = {} for cor in expected_corpus_by_id.values(): expected_counts[cor] = expected_counts.get(cor, 0) + 1 assert by_corpus == expected_counts async def test_unknown_corpus_raises(monkeypatch, tmp_path): runner = importlib.import_module("scripts.run_calibration") monkeypatch.setattr( "agent_bench.core.provider.AnthropicProvider", _StubProvider ) monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder) calls: list = [] def fake_builder(cfg, corpus_name, embedder, provider): return _StubOrchestrator(corpus_name, calls) monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder) monkeypatch.setattr( runner, "SYSTEM_OUTPUTS", tmp_path / "system_outputs.json" ) spec = json.loads(runner.CALIBRATION_SPEC.read_text()) bogus_spec = { "items": [ {**spec["items"][0], "corpus": "phantom_corpus"}, ] } bogus_spec_path = tmp_path / "calibration_v1.json" bogus_spec_path.write_text(json.dumps(bogus_spec)) monkeypatch.setattr(runner, "CALIBRATION_SPEC", bogus_spec_path) with pytest.raises(KeyError) as excinfo: await runner.cmd_generate_outputs(concurrency=1) msg = str(excinfo.value) assert "phantom_corpus" in msg assert "not in cfg.corpora" in msg assert spec["items"][0]["id"] in msg def _write_jsonl(path: Path, rows: list[dict]) -> None: path.write_text("\n".join(json.dumps(r) for r in rows) + "\n") def test_compute_kappa_weights_returns_real_kappa(monkeypatch, tmp_path): """Two judges over 4 items with known agreement patterns. Cohen's κ must come out positive for the high-agreement judge and zero for the chance-agreement judge. """ runner = importlib.import_module("scripts.run_calibration") labels_path = tmp_path / "labels.jsonl" _write_jsonl( labels_path, [ { "item_id": f"i{n}", "dimension": "completeness", "score": gold, "abstained": False, "system_output_hash": f"h{n}", } for n, gold in enumerate([2, 2, 1, 1]) ], ) monkeypatch.setattr(runner, "LABELS_PATH", labels_path) sidecar = tmp_path / "predictions.jsonl" rows = [] for n, gold in enumerate([2, 2, 1, 1]): rows.append( { "judge_id": "good_completeness", "system_output_hash": f"h{n}", "score": gold, } ) rows.append( { "judge_id": "bad_completeness", "system_output_hash": f"h{n}", "score": 1, } ) _write_jsonl(sidecar, rows) weights = runner._compute_kappa_weights( sidecar, "completeness", expected_judge_ids={"good_completeness", "bad_completeness"}, ) assert weights["good_completeness"] == pytest.approx(1.0) assert weights["bad_completeness"] == 0.0 # negative κ clipped to 0 def test_compute_kappa_weights_hard_errors_on_missing_member(monkeypatch, tmp_path): """Asymmetric coverage in the weights source must hard-error, not silently return partial weights — that was the v1 bug that masked gpt-4o-mini's exclusion.""" runner = importlib.import_module("scripts.run_calibration") labels_path = tmp_path / "labels.jsonl" _write_jsonl( labels_path, [ { "item_id": "i0", "dimension": "completeness", "score": 2, "abstained": False, "system_output_hash": "h0", } ], ) monkeypatch.setattr(runner, "LABELS_PATH", labels_path) sidecar = tmp_path / "predictions.jsonl" _write_jsonl( sidecar, [ { "judge_id": "haiku_completeness", "system_output_hash": "h0", "score": 2, } ], ) with pytest.raises(ValueError, match="symmetric coverage"): runner._compute_kappa_weights( sidecar, "completeness", expected_judge_ids={"haiku_completeness", "gpt4o_completeness"}, )