"""Smoke + dispatch tests for scripts/run_calibration.py.

Two failure modes this guards against:

1. Silently broken imports inside cmd_generate_outputs. The runner has no
   module-level test coverage, so a stale reference to a symbol such as
   build_default_registry passes CI and only fails at the first real
   invocation. test_module_imports asserts the runner is importable.

2. Mixed-corpus calibration items routed to the wrong store. The spec
   includes both k8s and fastapi questions. test_dispatch_routes_per_corpus
   verifies each item goes to the orchestrator built for its corpus, and
   test_unknown_corpus_raises verifies a clear error if the spec drifts
   from the corpora the runner builds.
"""

from __future__ import annotations

import importlib
import json
import sys
from collections import Counter
from pathlib import Path

import pytest

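# Make the repo root importable so "scripts.run_calibration" resolves
# regardless of the directory pytest is invoked from.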
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


class _StubProvider:
    """Constructor-compatible stand-in for AnthropicProvider; accepts and
    ignores any arguments so tests never build a real API client."""

    def __init__(self, *args, **kwargs):
        pass


class _StubEmbedder:
    """Constructor-compatible stand-in for the embedder; accepts and
    ignores any arguments."""

    def __init__(self, *args, **kwargs):
        pass


class _StubOrchestrator:
    """Records each run() call and returns a synthetic AgentResponse-shaped
    object so cmd_generate_outputs can compute its hash + write its record.
    """

    def __init__(self, corpus_name: str, calls: list) -> None:
        self.corpus_name = corpus_name
        self.calls = calls

    async def run(self, *, question: str, system_prompt: str):
        self.calls.append({"corpus": self.corpus_name, "question": question})

        class _Source:
            def __init__(self, s: str) -> None:
                self.source = s

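        # Class bodies execute in the enclosing scope, so `self` and
        # `question` from run() are visible in _Resp's attribute defaults.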
        class _Resp:
            answer = f"[{self.corpus_name}] answer to: {question}"
            sources = [_Source(f"{self.corpus_name}/doc.md")]
            ranked_sources = [f"{self.corpus_name}/doc.md"]
            source_chunks = ["chunk text"]

        return _Resp()


def test_module_imports():
    """Importing the runner must not raise. Catches broken symbol references
    inside the module before they cost a calibration run."""
    mod = importlib.import_module("scripts.run_calibration")
    assert hasattr(mod, "cmd_generate_outputs")
    assert hasattr(mod, "_build_corpus_orchestrator")


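# Spec shape assumed by these tests, inferred from the fields they read
# (not a schema guarantee):
#   {"items": [{"id": ..., "corpus": "k8s" or "fastapi", ...}, ...]}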
@pytest.mark.asyncio
async def test_dispatch_routes_per_corpus(monkeypatch, tmp_path):
    runner = importlib.import_module("scripts.run_calibration")

    monkeypatch.setattr(
        "agent_bench.core.provider.AnthropicProvider", _StubProvider
    )
    monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)

    calls: list = []
    built_corpora: list = []

    def fake_builder(cfg, corpus_name, embedder, provider):
        built_corpora.append(corpus_name)
        return _StubOrchestrator(corpus_name, calls)

    monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)

    out_path = tmp_path / "system_outputs.json"
    monkeypatch.setattr(runner, "SYSTEM_OUTPUTS", out_path)

    await runner.cmd_generate_outputs(concurrency=2)

    assert sorted(built_corpora) == ["fastapi", "k8s"]

    spec = json.loads(runner.CALIBRATION_SPEC.read_text())
    expected_corpus_by_id = {i["id"]: i["corpus"] for i in spec["items"]}

    records = json.loads(out_path.read_text())
    assert len(records) == len(expected_corpus_by_id)

    seen_ids = set()
    for rec in records:
        item_id = rec["item_id"]
        seen_ids.add(item_id)
        expected = expected_corpus_by_id[item_id]
        assert rec["corpus"] == expected
        assert rec["answer"].startswith(f"[{expected}]")
        assert rec["sources"] == [f"{expected}/doc.md"]
        assert isinstance(rec["system_output_hash"], str)
        assert len(rec["system_output_hash"]) == 64

    assert seen_ids == set(expected_corpus_by_id.keys())

    # Every corpus must be exercised exactly as often as the spec demands.
    by_corpus = Counter(c["corpus"] for c in calls)
    expected_counts = Counter(expected_corpus_by_id.values())
    assert by_corpus == expected_counts


@pytest.mark.asyncio
async def test_unknown_corpus_raises(monkeypatch, tmp_path):
    runner = importlib.import_module("scripts.run_calibration")

    monkeypatch.setattr(
        "agent_bench.core.provider.AnthropicProvider", _StubProvider
    )
    monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)

    calls: list = []

    def fake_builder(cfg, corpus_name, embedder, provider):
        return _StubOrchestrator(corpus_name, calls)

    monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)
    monkeypatch.setattr(
        runner, "SYSTEM_OUTPUTS", tmp_path / "system_outputs.json"
    )

    spec = json.loads(runner.CALIBRATION_SPEC.read_text())
    bogus_spec = {
        "items": [
            {**spec["items"][0], "corpus": "phantom_corpus"},
        ]
    }
    bogus_spec_path = tmp_path / "calibration_v1.json"
    bogus_spec_path.write_text(json.dumps(bogus_spec))
    monkeypatch.setattr(runner, "CALIBRATION_SPEC", bogus_spec_path)

    with pytest.raises(KeyError) as excinfo:
        await runner.cmd_generate_outputs(concurrency=1)

    msg = str(excinfo.value)
    assert "phantom_corpus" in msg
    assert "not in cfg.corpora" in msg
    assert spec["items"][0]["id"] in msg


def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write rows as JSON Lines: one object per line, trailing newline."""
    path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")


def test_compute_kappa_weights_returns_real_kappa(monkeypatch, tmp_path):
    """Two judges over 4 items with known agreement patterns. Cohen's κ
    must come out positive for the high-agreement judge and zero for the
    chance-agreement judge.
    """
    runner = importlib.import_module("scripts.run_calibration")

    labels_path = tmp_path / "labels.jsonl"
    _write_jsonl(
        labels_path,
        [
            {
                "item_id": f"i{n}",
                "dimension": "completeness",
                "score": gold,
                "abstained": False,
                "system_output_hash": f"h{n}",
            }
            for n, gold in enumerate([2, 2, 1, 1])
        ],
    )
    monkeypatch.setattr(runner, "LABELS_PATH", labels_path)

    sidecar = tmp_path / "predictions.jsonl"
    rows = []
    for n, gold in enumerate([2, 2, 1, 1]):
        rows.append(
            {
                "judge_id": "good_completeness",
                "system_output_hash": f"h{n}",
                "score": gold,
            }
        )
        rows.append(
            {
                "judge_id": "bad_completeness",
                "system_output_hash": f"h{n}",
                "score": 1,
            }
        )
    _write_jsonl(sidecar, rows)

    weights = runner._compute_kappa_weights(
        sidecar,
        "completeness",
        expected_judge_ids={"good_completeness", "bad_completeness"},
    )
    assert weights["good_completeness"] == pytest.approx(1.0)
    assert weights["bad_completeness"] == 0.0  # negative κ clipped to 0


def test_compute_kappa_weights_hard_errors_on_missing_member(monkeypatch, tmp_path):
    """Asymmetric coverage in the weights source must hard-error, not
    silently return partial weights — that was the v1 bug that masked
    gpt-4o-mini's exclusion."""
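    # "Symmetric coverage" is read here as: every expected judge_id must
    # score every labelled hash. The sidecar below covers only one of the
    # two expected judges, so _compute_kappa_weights must raise.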
    runner = importlib.import_module("scripts.run_calibration")

    labels_path = tmp_path / "labels.jsonl"
    _write_jsonl(
        labels_path,
        [
            {
                "item_id": "i0",
                "dimension": "completeness",
                "score": 2,
                "abstained": False,
                "system_output_hash": "h0",
            }
        ],
    )
    monkeypatch.setattr(runner, "LABELS_PATH", labels_path)

    sidecar = tmp_path / "predictions.jsonl"
    _write_jsonl(
        sidecar,
        [
            {
                "judge_id": "haiku_completeness",
                "system_output_hash": "h0",
                "score": 2,
            }
        ],
    )

    with pytest.raises(ValueError, match="symmetric coverage"):
        runner._compute_kappa_weights(
            sidecar,
            "completeness",
            expected_judge_ids={"haiku_completeness", "gpt4o_completeness"},
        )