# agentbench/tests/scripts/test_run_calibration_dispatch.py
"""Smoke + dispatch tests for scripts/run_calibration.py.
Two failure modes this guards against:
1. Silently broken imports inside cmd_generate_outputs. The runner has no
   module-level test coverage, so a reference to a missing symbol like
   build_default_registry passes CI and only fails at the first real
   invocation. test_module_imports asserts the runner is importable.
2. Mixed-corpus calibration items routed to the wrong store. The spec
includes both k8s and fastapi questions. test_dispatch_routes_per_corpus
verifies each item goes to the orchestrator built for its corpus, and
test_unknown_corpus_raises verifies a clear error if the spec drifts
from the corpora the runner builds.
"""
from __future__ import annotations
import importlib
import json
import sys
from pathlib import Path
import pytest
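
# Ensure the repo root is on sys.path so "scripts.run_calibration" resolves
# no matter which directory pytest is launched from.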
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


class _StubProvider:
def __init__(self, *args, **kwargs):
        pass


class _StubEmbedder:
def __init__(self, *args, **kwargs):
        pass


class _StubOrchestrator:
"""Records each run() call and returns a synthetic AgentResponse-shaped
object so cmd_generate_outputs can compute its hash + write its record.
"""
def __init__(self, corpus_name: str, calls: list) -> None:
self.corpus_name = corpus_name
self.calls = calls
async def run(self, *, question: str, system_prompt: str):
self.calls.append({"corpus": self.corpus_name, "question": question})
class _Source:
def __init__(self, s: str) -> None:
self.source = s
class _Resp:
answer = f"[{self.corpus_name}] answer to: {question}"
sources = [_Source(f"{self.corpus_name}/doc.md")]
ranked_sources = [f"{self.corpus_name}/doc.md"]
source_chunks = ["chunk text"]
        return _Resp()


def test_module_imports():
"""Importing the runner must not raise. Catches broken symbol references
inside the module before they cost a calibration run."""
mod = importlib.import_module("scripts.run_calibration")
assert hasattr(mod, "cmd_generate_outputs")
assert hasattr(mod, "_build_corpus_orchestrator")


@pytest.mark.asyncio  # async test; assumes the pytest-asyncio plugin
async def test_dispatch_routes_per_corpus(monkeypatch, tmp_path):
runner = importlib.import_module("scripts.run_calibration")
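    # Stub out anything that would hit the network or load model weights;
    # this test only cares about routing, not real providers or embedders.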
monkeypatch.setattr(
"agent_bench.core.provider.AnthropicProvider", _StubProvider
)
monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)
calls: list = []
built_corpora: list = []
def fake_builder(cfg, corpus_name, embedder, provider):
built_corpora.append(corpus_name)
return _StubOrchestrator(corpus_name, calls)
monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)
out_path = tmp_path / "system_outputs.json"
monkeypatch.setattr(runner, "SYSTEM_OUTPUTS", out_path)
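    # Drive the real cmd_generate_outputs end to end; only the network-facing
    # pieces above are stubbed.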
await runner.cmd_generate_outputs(concurrency=2)
assert sorted(built_corpora) == ["fastapi", "k8s"]
spec = json.loads(runner.CALIBRATION_SPEC.read_text())
expected_corpus_by_id = {i["id"]: i["corpus"] for i in spec["items"]}
records = json.loads(out_path.read_text())
assert len(records) == len(expected_corpus_by_id)
seen_ids = set()
for rec in records:
item_id = rec["item_id"]
seen_ids.add(item_id)
expected = expected_corpus_by_id[item_id]
assert rec["corpus"] == expected
assert rec["answer"].startswith(f"[{expected}]")
assert rec["sources"] == [f"{expected}/doc.md"]
assert isinstance(rec["system_output_hash"], str)
assert len(rec["system_output_hash"]) == 64
assert seen_ids == set(expected_corpus_by_id.keys())
by_corpus: dict[str, int] = {}
for c in calls:
by_corpus[c["corpus"]] = by_corpus.get(c["corpus"], 0) + 1
expected_counts: dict[str, int] = {}
for cor in expected_corpus_by_id.values():
expected_counts[cor] = expected_counts.get(cor, 0) + 1
assert by_corpus == expected_counts


@pytest.mark.asyncio  # async test; assumes the pytest-asyncio plugin
async def test_unknown_corpus_raises(monkeypatch, tmp_path):
runner = importlib.import_module("scripts.run_calibration")
monkeypatch.setattr(
"agent_bench.core.provider.AnthropicProvider", _StubProvider
)
monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)
calls: list = []
def fake_builder(cfg, corpus_name, embedder, provider):
return _StubOrchestrator(corpus_name, calls)
monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)
monkeypatch.setattr(
runner, "SYSTEM_OUTPUTS", tmp_path / "system_outputs.json"
)
spec = json.loads(runner.CALIBRATION_SPEC.read_text())
bogus_spec = {
"items": [
{**spec["items"][0], "corpus": "phantom_corpus"},
]
}
bogus_spec_path = tmp_path / "calibration_v1.json"
bogus_spec_path.write_text(json.dumps(bogus_spec))
monkeypatch.setattr(runner, "CALIBRATION_SPEC", bogus_spec_path)
with pytest.raises(KeyError) as excinfo:
await runner.cmd_generate_outputs(concurrency=1)
msg = str(excinfo.value)
assert "phantom_corpus" in msg
assert "not in cfg.corpora" in msg
    assert spec["items"][0]["id"] in msg


def _write_jsonl(path: Path, rows: list[dict]) -> None:
    path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")


def test_compute_kappa_weights_returns_real_kappa(monkeypatch, tmp_path):
"""Two judges over 4 items with known agreement patterns. Cohen's κ
must come out positive for the high-agreement judge and zero for the
chance-agreement judge.
"""
runner = importlib.import_module("scripts.run_calibration")
labels_path = tmp_path / "labels.jsonl"
_write_jsonl(
labels_path,
[
{
"item_id": f"i{n}",
"dimension": "completeness",
"score": gold,
"abstained": False,
"system_output_hash": f"h{n}",
}
for n, gold in enumerate([2, 2, 1, 1])
],
)
monkeypatch.setattr(runner, "LABELS_PATH", labels_path)
sidecar = tmp_path / "predictions.jsonl"
rows = []
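    # good_completeness copies the gold label; bad_completeness is a constant
    # predictor of 1, so it agrees with gold only at chance level.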
for n, gold in enumerate([2, 2, 1, 1]):
rows.append(
{
"judge_id": "good_completeness",
"system_output_hash": f"h{n}",
"score": gold,
}
)
rows.append(
{
"judge_id": "bad_completeness",
"system_output_hash": f"h{n}",
"score": 1,
}
)
_write_jsonl(sidecar, rows)
weights = runner._compute_kappa_weights(
sidecar,
"completeness",
expected_judge_ids={"good_completeness", "bad_completeness"},
)
assert weights["good_completeness"] == pytest.approx(1.0)
assert weights["bad_completeness"] == 0.0 # negative κ clipped to 0
def test_compute_kappa_weights_hard_errors_on_missing_member(monkeypatch, tmp_path):
"""Asymmetric coverage in the weights source must hard-error, not
silently return partial weights — that was the v1 bug that masked
gpt-4o-mini's exclusion."""
runner = importlib.import_module("scripts.run_calibration")
labels_path = tmp_path / "labels.jsonl"
_write_jsonl(
labels_path,
[
{
"item_id": "i0",
"dimension": "completeness",
"score": 2,
"abstained": False,
"system_output_hash": "h0",
}
],
)
monkeypatch.setattr(runner, "LABELS_PATH", labels_path)
sidecar = tmp_path / "predictions.jsonl"
_write_jsonl(
sidecar,
[
{
"judge_id": "haiku_completeness",
"system_output_hash": "h0",
"score": 2,
}
],
)
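    # gpt4o_completeness is expected but has no prediction row for h0, so
    # coverage across the labeled hashes is asymmetric.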
with pytest.raises(ValueError, match="symmetric coverage"):
runner._compute_kappa_weights(
sidecar,
"completeness",
expected_judge_ids={"haiku_completeness", "gpt4o_completeness"},
)