Nomearod Claude Opus 4.7 (1M context) committed on
Commit
1d47106
·
1 Parent(s): 4fa7c61

feat(calibration): generate_kappa_table with strict/warn modes

Browse files

Joins predictions ⋈ labels by (item_id, dimension, system_output_hash).
Hash mismatch ALWAYS raises with first-item expected/actual hashes
plus full mismatched-id list — applies in both modes (never warned).
Missing predictions/labels warn-and-exclude by default; --strict
raises (the final-artifact path; make calibrate uses it).

Pairwise abstain exclusion in κ; per-dimension cause breakdown
(schema_parse / out_of_range / provider_exhausted / genuine) via
the abstain-reason constants from judges/base.py. Abstain-rate
flag fires on STRICTLY greater than 20%; 6/30 (=20%) does not
fire, 7/30 does — boundary tested explicitly.

κ undefined → '—' with footnote (insufficient variance, N<3
agreement-eligible items remaining, or all labels+predictions in
a single category — the last condition was load-bearing for the
'all-ones degenerate' test case).

Skips '_members.jsonl' sidecar files in the predictions glob — they
hold per-permutation / per-jury-member detail, not aggregate
predictions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/calibration/report.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """generate_kappa_table — joins predictions ⋈ labels by (item_id, dimension,
2
+ system_output_hash); computes per-row κ + bootstrap CI + abstain breakdown;
3
+ emits markdown table at docs/_generated/kappa_table.md.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import glob as _glob
9
+ import json
10
+ from collections import defaultdict
11
+ from pathlib import Path
12
+
13
+ import structlog
14
+
15
+ from agent_bench.evaluation.calibration.metrics import bootstrap_ci, cohen_kappa
16
+ from agent_bench.evaluation.judges.base import (
17
+ ABSTAIN_REASON_OUT_OF_RANGE,
18
+ ABSTAIN_REASON_PROVIDER_EXHAUSTED,
19
+ ABSTAIN_REASON_SCHEMA_PARSE,
20
+ )
21
+
22
+ logger = structlog.get_logger()
23
+
24
+ ABSTAIN_THRESHOLD = 0.20 # strictly greater than fires the flag
25
+
26
+
27
def _classify_abstain(reasoning: str | None) -> str:
    """Map a judge's abstain ``reasoning`` text to a cause bucket.

    The text is matched by prefix against the abstain-reason constants
    imported from ``judges/base.py``, checked in the order
    provider-exhausted, schema-parse, out-of-range. Anything that matches
    none of them is reported as ``"genuine"``.

    Args:
        reasoning: The prediction's ``reasoning`` field. ``None`` is accepted
            and treated as empty text: ``dict.get("reasoning", "")`` still
            returns ``None`` when the JSON carries an explicit ``null``, and
            calling ``.startswith`` on that would raise ``AttributeError``.

    Returns:
        One of ``"provider_exhausted"``, ``"schema_parse"``,
        ``"out_of_range"`` or ``"genuine"``.
    """
    text = reasoning or ""
    # Order matters only if one constant is a prefix of another; it is kept
    # identical to the original if-chain.
    prefix_to_cause = (
        (ABSTAIN_REASON_PROVIDER_EXHAUSTED, "provider_exhausted"),
        (ABSTAIN_REASON_SCHEMA_PARSE, "schema_parse"),
        (ABSTAIN_REASON_OUT_OF_RANGE, "out_of_range"),
    )
    for prefix, cause in prefix_to_cause:
        if text.startswith(prefix):
            return cause
    return "genuine"
35
+
36
+
37
def _kappa_or_footnote(
    y_lab: list[int],
    y_pred: list[int],
    *,
    n_common: int,
    abstains: int,
    abstain_rate: float,
    abstain_causes: dict[str, int],
) -> tuple[float | None, float | None, float | None, str]:
    """Return ``(kappa, ci_lo, ci_hi, footnote)`` for one (row, dimension) cell.

    ``kappa``/``ci_lo``/``ci_hi`` are all ``None`` when κ is undefined; the
    footnote then says why. When κ is defined the footnote is empty unless the
    abstain rate is strictly above ``ABSTAIN_THRESHOLD``.
    """
    n_eligible = len(y_pred)
    if n_eligible < 3:
        return None, None, None, (
            f"κ undefined: insufficient agreement-eligible "
            f"items (N={n_eligible})"
        )

    try:
        kappa = cohen_kappa(y_lab, y_pred)
        # The bootstrap's own point estimate is not reported; only its CI is.
        _point, lo, hi = bootstrap_ci(
            y_lab, y_pred, cohen_kappa, n_iter=1000, seed=42
        )
    except (ValueError, ZeroDivisionError):
        return None, None, None, (
            "κ undefined: insufficient variance after exclusion"
        )

    # Degenerate κ: labels and predictions each sit in a single category.
    # Per the original note, metrics.py clamps this case to 1.0, but with no
    # observed disagreement the value is statistically meaningless.
    if len(set(y_lab)) <= 1 and len(set(y_pred)) <= 1:
        return None, None, None, (
            "κ undefined: all labels and predictions in a "
            "single category (no variance to measure)"
        )

    if abstain_rate <= ABSTAIN_THRESHOLD:
        return kappa, lo, hi, ""

    # abstain_rate > 0 here, so abstains > 0 and the division is safe.
    # Only prediction-side abstains carry a cause, so the percentages need
    # not sum to 100 when labels abstained too.
    breakdown = ", ".join(
        f"{int(100 * v / abstains)}% {k.replace('_', ' ')}"
        for k, v in abstain_causes.items()
        if v > 0
    )
    footnote = (
        f"κ computed on N={n_eligible} of {n_common} items; "
        f"high abstain rate ({100 * abstain_rate:.1f}% — "
        f"breakdown: {breakdown}) suggests rubric ambiguity."
    )
    return kappa, lo, hi, footnote


def _render_kappa_markdown(rows: list[dict]) -> str:
    """Render the collected row dicts as the markdown table text."""
    out = [
        # The embedded newline leaves a blank line between title and table.
        "# κ ablation table — calibration v1\n",
        "| Row | Dimension | κ (95% CI) | N | Abstain rate | Notes |",
        "|---|---|---|---|---|---|",
    ]
    for r in rows:
        if r["kappa"] is None:
            kcell = " — "
        else:
            kcell = f"{r['kappa']:.3f} ({r['ci_lo']:.3f}, {r['ci_hi']:.3f})"
        rate = f"{100 * r['abstain_rate']:.1f}%"
        out.append(
            f"| {r['row']} | {r['dim']} | {kcell} | {r['n_eligible']} | "
            f"{rate} | {r['footnote']} |"
        )
    return "\n".join(out) + "\n"


def generate_kappa_table(
    *,
    predictions_glob: str,
    labels_path: str,
    output_path: str,
    strict: bool = False,
) -> None:
    """Aggregate predictions across rows + dimensions into one markdown table.

    Each prediction file matched by ``predictions_glob`` contributes one table
    row per dimension it contains. Predictions are joined to labels on
    ``(item_id, dimension)`` and the ``system_output_hash`` of each joined
    pair must agree.

    On hash mismatch: ALWAYS raises (both modes), with first-item expected
    /actual hashes plus full mismatched-id list.
    On missing prediction or label: WARN+exclude in default mode; RAISE in strict.
    On undefined κ: render '—' with a footnote (both modes).
    On abstain rate > 20%: render κ + footnote with cause breakdown (both modes).

    Args:
        predictions_glob: Glob pattern for prediction files, each a JSON array
            of prediction records. Files ending in ``_members.jsonl`` are
            skipped.
        labels_path: Path to the labels file in JSONL form (one record per
            non-blank line).
        output_path: Destination of the markdown table; parent directories
            are created as needed.
        strict: When true, any item missing a prediction or a label raises
            instead of being logged and excluded.

    Raises:
        ValueError: If no file matches ``predictions_glob``, on any hash
            mismatch, or (strict mode only) on missing predictions/labels.
    """
    # Files are read and written as UTF-8 explicitly: the table always
    # contains "κ" and "—", which a locale default such as cp1252 cannot
    # encode.
    labels: list[dict] = [
        json.loads(line)
        for line in Path(labels_path).read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]

    # Both label indexes depend only on the labels file, so they are built
    # once rather than once per prediction file. Later duplicates win.
    label_by_key: dict[tuple[str, str], dict] = {}
    labels_by_dim: dict[str, dict[str, dict]] = defaultdict(dict)
    for label_rec in labels:
        item_id, label_dim = label_rec["item_id"], label_rec["dimension"]
        label_by_key[(item_id, label_dim)] = label_rec
        labels_by_dim[label_dim][item_id] = label_rec

    pred_files = sorted(_glob.glob(predictions_glob))
    if not pred_files:
        raise ValueError(f"No prediction files matched: {predictions_glob}")

    rows: list[dict] = []
    for pf in pred_files:
        # Skip sidecar JSONLs (per-member detail, not aggregate predictions)
        if pf.endswith("_members.jsonl"):
            continue
        row_label = Path(pf).stem.replace("calibration_v1_judge_", "")
        preds = json.loads(Path(pf).read_text(encoding="utf-8"))

        # Hash-mismatch detection (always raises)
        mismatches: list[tuple[str, str, str]] = []
        for p in preds:
            matched = label_by_key.get((p["item_id"], p["dimension"]))
            if matched is None:
                continue
            expected = matched["system_output_hash"]
            actual = p["system_output_hash"]
            if expected != actual:
                mismatches.append((p["item_id"], expected, actual))
        if mismatches:
            first_id, first_exp, first_act = mismatches[0]
            raise ValueError(
                f"Hash mismatch in {pf}: item {first_id!r} "
                f"label.system_output_hash={first_exp!r} but "
                f"prediction.system_output_hash={first_act!r}. "
                f"Full mismatched-id list ({len(mismatches)}): "
                f"{[m[0] for m in mismatches]}. "
                f"Labels are stale relative to predictions — regenerate one or "
                f"the other so hashes align."
            )

        preds_by_dim: dict[str, list[dict]] = defaultdict(list)
        for p in preds:
            preds_by_dim[p["dimension"]].append(p)

        for dim in sorted(preds_by_dim):
            preds_d = {p["item_id"]: p for p in preds_by_dim[dim]}
            # .get avoids inserting empty entries into the defaultdict.
            labs_d = labels_by_dim.get(dim, {})

            common = sorted(preds_d.keys() & labs_d.keys())
            missing_pred = sorted(labs_d.keys() - preds_d.keys())
            missing_lab = sorted(preds_d.keys() - labs_d.keys())
            if missing_pred or missing_lab:
                msg = (
                    f"row={row_label} dim={dim} "
                    f"missing_predictions={missing_pred} "
                    f"missing_labels={missing_lab}"
                )
                if strict:
                    raise ValueError(f"strict mode: missing items: {msg}")
                logger.warning("calibration_report_missing", message=msg)

            # Pairwise exclusion: an item is dropped from κ when either side
            # abstained ("Unknown"); a cause is recorded only for
            # prediction-side abstains.
            y_pred: list[int] = []
            y_lab: list[int] = []
            abstains = 0
            abstain_causes: dict[str, int] = {
                "provider_exhausted": 0,
                "schema_parse": 0,
                "out_of_range": 0,
                "genuine": 0,
            }
            for iid in common:
                p = preds_d[iid]
                label_rec = labs_d[iid]
                pred_abstained = p["score"] == "Unknown"
                if pred_abstained or label_rec["score"] == "Unknown":
                    abstains += 1
                    if pred_abstained:
                        # `or ""` also covers an explicit JSON null.
                        cause = _classify_abstain(p.get("reasoning") or "")
                        abstain_causes[cause] += 1
                    continue
                y_pred.append(int(p["score"]))
                y_lab.append(int(label_rec["score"]))

            abstain_rate = abstains / max(len(common), 1)
            kappa, lo, hi, footnote = _kappa_or_footnote(
                y_lab,
                y_pred,
                n_common=len(common),
                abstains=abstains,
                abstain_rate=abstain_rate,
                abstain_causes=abstain_causes,
            )
            rows.append(
                {
                    "row": row_label,
                    "dim": dim,
                    "kappa": kappa,
                    "ci_lo": lo,
                    "ci_hi": hi,
                    "n_eligible": len(y_pred),
                    "abstains": abstains,
                    "abstain_rate": abstain_rate,
                    "abstain_causes": abstain_causes,
                    "footnote": footnote,
                }
            )

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(_render_kappa_markdown(rows), encoding="utf-8")
    logger.info("kappa_table_written", path=output_path, rows=len(rows))
tests/evaluation/test_calibration_report.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for generate_kappa_table — joins, hash-mismatch raise, strict, abstain flag."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+ import structlog
10
+
11
+ from agent_bench.evaluation.calibration.report import generate_kappa_table
12
+
13
+
14
def _write_predictions(path: Path, records: list[dict]) -> None:
    """Write ``records`` to ``path`` as one indented JSON array.

    Missing parent directories are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(records, indent=2)
    path.write_text(payload)
17
+
18
+
19
def _write_labels(path: Path, records: list[dict]) -> None:
    """Write ``records`` to ``path`` as JSONL, one object per line.

    Missing parent directories are created first; no trailing newline is
    written after the last record.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(rec) for rec in records]
    path.write_text("\n".join(serialized))
22
+
23
+
24
def _pred(
    item_id: str, dim: str, score, sys_hash: str = "h1", reasoning: str = ""
) -> dict:
    """Build one prediction record for the tests.

    Only the item id, dimension, score, output hash and reasoning vary; the
    judge id is derived from ``dim`` and every other field is a fixed
    placeholder.
    """
    join_fields = {"item_id": item_id, "dimension": dim, "score": score}
    judge_fields = {
        "judge_id": "claude-haiku-4-5_" + dim,
        "rubric_version": "abc",
        "system_output_hash": sys_hash,
    }
    run_fields = {
        "prompt_seed": 0,
        "cost_usd": 0.001,
        "latency_ms": 100.0,
        "reasoning": reasoning,
        "evidence_quotes": [],
    }
    # Merge order keeps the same key order as a single literal would.
    return {**join_fields, **judge_fields, **run_fields}
40
+
41
+
42
def _lbl(item_id: str, dim: str, score, sys_hash: str = "h1") -> dict:
    """Build one label record for the tests.

    ``abstained`` is true exactly when ``score`` equals ``"Unknown"``; notes
    and timestamp are fixed placeholders.
    """
    is_abstain = score == "Unknown"
    record = {"item_id": item_id, "dimension": dim, "score": score}
    record["abstained"] = is_abstain
    record["notes"] = ""
    record["label_timestamp"] = "2026-05-04T00:00:00Z"
    record["system_output_hash"] = sys_hash
    return record
52
+
53
+
54
class TestHashMismatch:
    """A prediction/label ``system_output_hash`` mismatch raises in every mode."""

    @staticmethod
    def _run_with_mismatched_hashes(tmp_path, **extra_kwargs) -> None:
        """Write one prediction (hash "A") and its label (hash "B"), then run.

        ``extra_kwargs`` are forwarded to ``generate_kappa_table`` so each
        test can pick the mode.
        """
        preds = [_pred("i1", "groundedness", 1, sys_hash="A")]
        labels = [_lbl("i1", "groundedness", 1, sys_hash="B")]
        _write_predictions(
            tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
        )
        _write_labels(tmp_path / "labels.jsonl", labels)
        generate_kappa_table(
            predictions_glob=str(
                tmp_path / "results" / "calibration_v1_judge_*.json"
            ),
            labels_path=str(tmp_path / "labels.jsonl"),
            output_path=str(tmp_path / "kappa.md"),
            **extra_kwargs,
        )

    def test_raises_with_first_item_detail_and_full_list(self, tmp_path):
        with pytest.raises(ValueError) as exc_info:
            self._run_with_mismatched_hashes(tmp_path)
        msg = str(exc_info.value)
        # Match the repr-quoted fields rather than bare "A"/"B": the message
        # embeds the prediction file path, which may itself contain those
        # letters and make a bare substring check pass vacuously.
        assert "item 'i1'" in msg
        assert "label.system_output_hash='B'" in msg
        assert "prediction.system_output_hash='A'" in msg
        # The full mismatched-id list is rendered as a Python list.
        assert "(1): ['i1']" in msg

    def test_hash_mismatch_raises_in_strict_mode_too(self, tmp_path):
        # `match` pins the error to the hash check rather than to any other
        # ValueError strict mode can raise.
        with pytest.raises(ValueError, match="Hash mismatch"):
            self._run_with_mismatched_hashes(tmp_path, strict=True)
90
+
91
+
92
class TestMissingPredictionLabel:
    """Labels without a prediction warn by default and raise in strict mode."""

    def test_default_warns_and_excludes(self, tmp_path):
        preds = [
            _pred("i1", "groundedness", 1),
            _pred("i3", "groundedness", 0),
            _pred("i4", "groundedness", 1),
        ]
        labels = [
            _lbl("i1", "groundedness", 1),
            _lbl("i2", "groundedness", 0),  # label without prediction
            _lbl("i3", "groundedness", 0),
            _lbl("i4", "groundedness", 1),
        ]
        _write_predictions(
            tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
        )
        _write_labels(tmp_path / "labels.jsonl", labels)
        out = tmp_path / "kappa.md"
        with structlog.testing.capture_logs() as logs:
            generate_kappa_table(
                predictions_glob=str(
                    tmp_path / "results" / "calibration_v1_judge_*.json"
                ),
                labels_path=str(tmp_path / "labels.jsonl"),
                output_path=str(out),
            )
        assert out.exists()
        missing_events = [
            entry
            for entry in logs
            if entry.get("event") == "calibration_report_missing"
        ]
        assert missing_events, f"no missing-warning log in {logs!r}"
        # The warning must name the label that has no prediction.
        assert "missing_predictions=['i2']" in missing_events[0]["message"]
        # Exclusion: the row is scored on the 3 joined items only, none of
        # which abstained (N column followed by the abstain-rate column).
        assert "| 3 | 0.0% |" in out.read_text(encoding="utf-8")

    def test_strict_raises_on_missing_prediction(self, tmp_path):
        preds = [_pred("i1", "groundedness", 1)]
        labels = [
            _lbl("i1", "groundedness", 1),
            _lbl("i2", "groundedness", 0),
        ]
        _write_predictions(
            tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
        )
        _write_labels(tmp_path / "labels.jsonl", labels)
        out = tmp_path / "kappa.md"
        with pytest.raises(ValueError, match="missing"):
            generate_kappa_table(
                predictions_glob=str(
                    tmp_path / "results" / "calibration_v1_judge_*.json"
                ),
                labels_path=str(tmp_path / "labels.jsonl"),
                output_path=str(out),
                strict=True,
            )
        # Strict mode fails before anything is written: no partial artifact.
        assert not out.exists()
141
+
142
+
143
class TestAbstainRateFlag:
    """The high-abstain footnote fires only strictly above the 20% threshold."""

    def _setup(self, tmp_path: Path, abstain_count: int) -> Path:
        """Write 30 items whose first ``abstain_count`` predictions abstain.

        Returns the path of the generated table.
        """
        dim = "groundedness"
        pred_records: list[dict] = []
        label_records: list[dict] = []
        for idx in range(30):
            item = f"i{idx}"
            if idx < abstain_count:
                # Abstaining prediction, tagged with a schema-parse reason.
                pred_records.append(
                    _pred(
                        item,
                        dim,
                        "Unknown",
                        reasoning="schema_parse_failed_after_retry: x",
                    )
                )
                label_records.append(_lbl(item, dim, 1))
                continue
            pred_records.append(_pred(item, dim, 1, reasoning=""))
            # Half of non-abstain labels score 0 to ensure variance
            label_records.append(_lbl(item, dim, 0 if idx % 2 == 0 else 1))

        results_dir = tmp_path / "results"
        labels_file = tmp_path / "labels.jsonl"
        _write_predictions(
            results_dir / "calibration_v1_judge_baseline.json", pred_records
        )
        _write_labels(labels_file, label_records)

        table_path = tmp_path / "kappa.md"
        generate_kappa_table(
            predictions_glob=str(results_dir / "calibration_v1_judge_*.json"),
            labels_path=str(labels_file),
            output_path=str(table_path),
        )
        return table_path

    def test_at_20_percent_boundary_does_not_fire(self, tmp_path):
        # 6/30 = exactly 20% — flag is ">" (strictly greater), so not fired.
        rendered = self._setup(tmp_path, abstain_count=6).read_text()
        assert "high abstain rate" not in rendered.lower()

    def test_above_20_percent_fires(self, tmp_path):
        # 7/30 = 23.3% — flag fires
        rendered = self._setup(tmp_path, abstain_count=7).read_text().lower()
        assert "high abstain rate" in rendered
        assert "schema parse" in rendered
183
+
184
+
185
class TestKappaUndefined:
    """Undefined κ renders as a dash cell plus an explanatory footnote."""

    def test_renders_dash_with_footnote(self, tmp_path):
        # All same label → degenerate; report renders ' — '
        preds = [_pred(f"i{i}", "groundedness", 1) for i in range(5)]
        labels = [_lbl(f"i{i}", "groundedness", 1) for i in range(5)]
        _write_predictions(
            tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
        )
        _write_labels(tmp_path / "labels.jsonl", labels)
        out = tmp_path / "kappa.md"
        generate_kappa_table(
            predictions_glob=str(
                tmp_path / "results" / "calibration_v1_judge_*.json"
            ),
            labels_path=str(tmp_path / "labels.jsonl"),
            output_path=str(out),
        )
        text = out.read_text(encoding="utf-8")
        # Check the data row, not the whole document: the table's title line
        # itself contains " — ", so a document-wide substring test would pass
        # even if the κ cell were rendered as a number.
        data_rows = [
            line for line in text.splitlines() if line.startswith("| baseline |")
        ]
        assert len(data_rows) == 1
        # The κ cell is " — " wrapped in the "| " / " |" column separators.
        assert "|  —  |" in data_rows[0]
        assert "κ undefined" in data_rows[0]