Nomearod Claude Opus 4.7 (1M context) committed on
Commit
6ef2e0e
·
1 Parent(s): c038a7d

feat(calibration): hand-rolled cohen_kappa, gwets_ac2, bootstrap_ci

Browse files

Hand-rolled (not sklearn) per the design's 'built from primitives'
discipline. Cohen's κ: (P_o - P_e) / (1 - P_e), supports unweighted,
linear, and quadratic weight matrices for ordinal scales. Gwet's
AC1/AC2: chance term = (1/(q-1)) Σ pi_k(1-pi_k) on mean marginals,
more robust to skewed distributions than Cohen's κ. Bootstrap CI:
seed=42 for reproducibility, drops degenerate resamples (all-one-
label) silently rather than raising.

Three hand-computed test cases per metric (perfect agreement κ=1,
complete disagreement κ=-1, chance agreement κ=0) include worked-
out arithmetic in comments so a reader can verify formula
correctness without running the test.

sklearn-parity scaffold included with placeholder fixtures and a
skip until scripts/_dev/generate_kappa_fixtures.py runs in a venv
with sklearn==1.5.2; the cross-check test is also gated on the
JSON sidecar existing. Phase 4 Task 4.2 lands the generator.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/calibration/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """Hand-rolled inter-rater agreement metrics + calibration report generator."""
2
+
3
+ from agent_bench.evaluation.calibration.metrics import (
4
+ bootstrap_ci,
5
+ cohen_kappa,
6
+ gwets_ac2,
7
+ )
8
+
9
+ __all__ = ["bootstrap_ci", "cohen_kappa", "gwets_ac2"]
agent_bench/evaluation/calibration/metrics.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI.
2
+
3
+ Hand-rolled (not sklearn) for two reasons:
4
+ 1. agent-bench's identity is "built from primitives" — adding sklearn
5
+ for one function (and transitively numpy + scipy + threadpoolctl +
6
+ joblib) contradicts that.
7
+ 2. The hand-roll demonstrates formula understanding in a way that
8
+ sklearn.metrics.cohen_kappa_score does not.
9
+
10
+ Fixture-tested against sklearn run *outside* the project venv —
11
+ see tests/evaluation/test_calibration_metrics.py and
12
+ scripts/_dev/generate_kappa_fixtures.py.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import random
18
+ from collections.abc import Callable
19
+ from typing import Literal
20
+
21
+
22
def cohen_kappa(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Return Cohen's kappa, κ = (P_o - P_e) / (1 - P_e), for two raters.

    P_o is the (weighted) observed agreement; P_e is the (weighted)
    agreement expected from the product of each rater's own marginals.

    Args:
        y1: Labels assigned by the first rater (hashable values).
        y2: Labels assigned by the second rater, parallel to ``y1``.
        weights: ``None`` credits exact matches only. ``"linear"`` and
            ``"quadratic"`` give partial credit that shrinks with the
            distance between the two labels' ranks (ordinal scales).

    Returns:
        κ as a float. When P_e reaches 1 (only one label occurs in both
        lists) the ratio is undefined and 1.0 is returned instead.

    Raises:
        ValueError: If the inputs differ in length, are empty, or
            ``weights`` is not one of the supported values.
    """
    if len(y1) != len(y2):
        raise ValueError(
            f"y1 and y2 must have same length; got {len(y1)} vs {len(y2)}"
        )
    if not y1:
        raise ValueError("Empty input — kappa undefined")

    # Rank labels by their natural ordering so that 10 comes after 9.
    # Ordering by str() would place 10 before 2 and silently corrupt the
    # distance-based weights. Labels that cannot be compared with each
    # other (e.g. ints mixed with strs) fall back to str() ordering.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        labels = sorted(observed, key=str)
    k = len(labels)
    label_idx = {lab: rank for rank, lab in enumerate(labels)}

    # Agreement weights: 1.0 on the diagonal, decaying with rank distance.
    if weights is None:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights in ("linear", "quadratic"):
        power = 1 if weights == "linear" else 2
        if k <= 1:
            # A single category has no distance to normalise by.
            w = [[1.0]]
        else:
            w = [
                [1.0 - (abs(i - j) / (k - 1)) ** power for j in range(k)]
                for i in range(k)
            ]
    else:
        raise ValueError(f"Invalid weights {weights!r}")

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1

    n = len(y1)
    cells = [(i, j) for i in range(k) for j in range(k)]
    row_marg = [sum(row) / n for row in cm]
    col_marg = [sum(col) / n for col in zip(*cm)]

    p_o = sum(w[i][j] * cm[i][j] for i, j in cells) / n
    p_e = sum(w[i][j] * row_marg[i] * col_marg[j] for i, j in cells)

    if p_e >= 1.0:
        # Degenerate case: chance agreement is already total.
        return 1.0
    return (p_o - p_e) / (1.0 - p_e)
81
+
82
+
83
def gwets_ac2(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Return Gwet's agreement coefficient for two raters.

    AC2 = (P_o - P_e_AC2) / (1 - P_e_AC2)
    where P_e_AC2 = (1/(q-1)) * Σ p_k * (1 - p_k), q is the number of
    categories and p_k is the mean of the two raters' marginal
    probabilities for category k.

    Args:
        y1: Labels assigned by the first rater (hashable values).
        y2: Labels assigned by the second rater, parallel to ``y1``.
        weights: ``None`` credits exact matches only. ``"linear"`` and
            ``"quadratic"`` give partial credit in P_o based on the
            distance between the two labels' ranks.

    Returns:
        The coefficient as a float; 1.0 when only one category occurs.

    Raises:
        ValueError: If the inputs differ in length, are empty, or
            ``weights`` is not one of the supported values.

    Note:
        ``weights`` only affects P_o; the chance term is always the
        unweighted one shown above.
        NOTE(review): Gwet's published AC2 also scales the chance term by
        the sum of the weights; the two agree for unweighted and
        two-category inputs — confirm before relying on weighted results
        with three or more categories.
    """
    if len(y1) != len(y2):
        raise ValueError("y1 and y2 length mismatch")
    if not y1:
        raise ValueError("Empty input")

    # Rank labels by their natural ordering so that 10 comes after 9.
    # Ordering by str() would place 10 before 2 and distort the weighted
    # P_o. Non-comparable mixed labels fall back to str() ordering.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        labels = sorted(observed, key=str)
    k = len(labels)
    label_idx = {lab: rank for rank, lab in enumerate(labels)}

    # Agreement weights: 1.0 on the diagonal, decaying with rank distance.
    if weights is None:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights in ("linear", "quadratic"):
        power = 1 if weights == "linear" else 2
        if k <= 1:
            w = [[1.0]]
        else:
            w = [
                [1.0 - (abs(i - j) / (k - 1)) ** power for j in range(k)]
                for i in range(k)
            ]
    else:
        raise ValueError(f"Invalid weights {weights!r}")

    if k <= 1:
        # One category only: the raters necessarily agree everywhere, and
        # the 1/(q-1) factor below would divide by zero.
        return 1.0

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1
    n = len(y1)

    p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n

    row_marg = [sum(row) / n for row in cm]
    col_marg = [sum(col) / n for col in zip(*cm)]
    pi = [(r + c) / 2 for r, c in zip(row_marg, col_marg)]

    # Gwet's chance term on mean marginals: (1/(q-1)) * Σ pi_k * (1 - pi_k).
    p_e_ac2 = sum(p * (1 - p) for p in pi) / (k - 1)

    if p_e_ac2 >= 1.0:
        # Defensive guard against a zero denominator.
        return 1.0
    return (p_o - p_e_ac2) / (1.0 - p_e_ac2)
144
+
145
+
146
def bootstrap_ci(
    y1: list,
    y2: list,
    metric_fn: Callable[[list, list], float],
    n_iter: int = 1000,
    ci: float = 0.95,
    seed: int = 42,
) -> tuple[float, float, float]:
    """Bootstrap a percentile confidence interval for an inter-rater metric.

    Item pairs are resampled with replacement ``n_iter`` times; the
    interval is read off the sorted resample scores at the (1-ci)/2 and
    (1+ci)/2 positions.

    Args:
        y1: Labels from the first rater.
        y2: Labels from the second rater, parallel to ``y1``.
        metric_fn: Called as ``metric_fn(labels_1, labels_2)``; returns
            the agreement score.
        n_iter: Number of resamples to draw.
        ci: Confidence level, in the half-open range (0, 1].
        seed: Seed for the private random generator, so equal inputs
            give equal intervals.

    Returns:
        ``(point_estimate, ci_lo, ci_hi)`` with ``ci_lo <= ci_hi``. The
        point estimate is ``metric_fn`` on the full data. If no resample
        yields a score, both bounds equal the point estimate.

    Raises:
        ValueError: If the inputs differ in length or are empty, or
            ``ci`` is outside (0, 1].
    """
    if len(y1) != len(y2):
        raise ValueError("length mismatch")
    if not 0.0 < ci <= 1.0:
        # Out-of-range levels would index outside the sample list.
        raise ValueError(f"ci must be in (0, 1]; got {ci!r}")
    n = len(y1)
    if n == 0:
        raise ValueError("Empty input — bootstrap undefined")

    rng = random.Random(seed)
    point = metric_fn(y1, y2)

    samples: list[float] = []
    for _ in range(n_iter):
        idx = [rng.randrange(n) for _ in range(n)]
        s1 = [y1[i] for i in idx]
        s2 = [y2[i] for i in idx]
        try:
            score = metric_fn(s1, s2)
        except (ValueError, ZeroDivisionError):
            # Degenerate resample (e.g., all one label) — deliberately
            # skipped, so the interval covers surviving resamples only.
            continue
        samples.append(score)

    if not samples:
        return point, point, point

    samples.sort()
    lo_idx = int(((1 - ci) / 2) * len(samples))
    # With few samples or a narrow level the upper position can land
    # below the lower one (or at -1); clamp so the bounds never invert.
    hi_idx = max(int(((1 + ci) / 2) * len(samples)) - 1, lo_idx)
    return point, samples[lo_idx], samples[hi_idx]
tests/evaluation/test_calibration_metrics.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json as _json
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ from agent_bench.evaluation.calibration.metrics import (
11
+ bootstrap_ci,
12
+ cohen_kappa,
13
+ gwets_ac2,
14
+ )
15
+
16
+
17
class TestCohenKappaHandComputed:
    """Cohen's κ on tiny binary inputs whose value can be verified by hand."""

    def test_perfect_agreement_kappa_one(self):
        # Both raters: five 0s followed by five 1s.
        #   P_o = 10/10 = 1.0
        #   P_e = (5/10 * 5/10) + (5/10 * 5/10) = 0.5
        #   κ   = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        first = [0] * 5 + [1] * 5
        second = [0] * 5 + [1] * 5
        kappa = cohen_kappa(first, second)
        assert kappa == pytest.approx(1.0)

    def test_complete_disagreement_kappa_negative(self):
        # Same 5/5 marginals, but the second rater inverts every label.
        #   P_o = 0.0, P_e = 0.5  →  κ = (0.0 - 0.5) / (1.0 - 0.5) = -1.0
        first = [0] * 5 + [1] * 5
        second = [1] * 5 + [0] * 5
        kappa = cohen_kappa(first, second)
        assert kappa == pytest.approx(-1.0)

    def test_chance_agreement_kappa_zero(self):
        # Every cell of the 2x2 confusion matrix holds one item (0.25 each),
        # so observed agreement equals chance agreement.
        #   P(0) = P(1) = 0.5 for both raters
        #   P_o = 0.5, P_e = 0.5  →  κ = 0
        first = [0, 0, 1, 1]
        second = [0, 1, 0, 1]
        kappa = cohen_kappa(first, second)
        assert kappa == pytest.approx(0.0)
41
+
42
+
43
class TestGwetsAC2HandComputed:
    """Gwet's AC2 on tiny binary inputs whose value can be verified by hand."""

    def test_perfect_agreement(self):
        # P_o = 1.0; mean marginals pi = [0.5, 0.5]
        # P_e = (0.5*0.5 + 0.5*0.5) / (2 - 1) = 0.5
        # AC2 = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 1]
        assert gwets_ac2(y1, y2) == pytest.approx(1.0)

    def test_complete_disagreement(self):
        # P_o = 0.0; both raters split 2/2, so pi = [0.5, 0.5]
        # P_e = (0.5*0.5 + 0.5*0.5) / (2 - 1) = 0.5
        # AC2 = (0.0 - 0.5) / (1.0 - 0.5) = -1.0
        y1 = [0, 0, 1, 1]
        y2 = [1, 1, 0, 0]
        assert gwets_ac2(y1, y2) == pytest.approx(-1.0)

    def test_mid_range(self):
        # 3/4 agree → P_o = 0.75
        # rater 1 marginals = [0.5, 0.5]; rater 2 marginals = [0.75, 0.25]
        # pi = [0.625, 0.375]
        # P_e = (0.625*0.375 + 0.375*0.625) / (2 - 1) = 0.46875
        # AC2 = (0.75 - 0.46875) / (1 - 0.46875) = 0.28125 / 0.53125 = 9/17
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 0]
        result = gwets_ac2(y1, y2)
        assert -1.0 <= result <= 1.0
        assert result > 0
        # Pin the exact value so a formula regression cannot hide inside
        # the (0, 1) range check above.
        assert result == pytest.approx(9 / 17)
63
+
64
+
65
class TestBootstrapCI:
    """Shape and determinism checks for the bootstrap interval."""

    def test_returns_point_lo_hi_tuple(self):
        # The result unpacks into three values and the interval brackets
        # the point estimate for this input.
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        result = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=100, seed=42)
        assert len(result) == 3
        point, lo, hi = result
        assert lo <= point <= hi

    def test_seed_reproducibility(self):
        # Two runs with the same seed must return identical tuples.
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        first_run = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=200, seed=42)
        second_run = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=200, seed=42)
        assert first_run == second_run
80
+
81
+
82
+ # --- sklearn-parity fixtures ---
83
+ #
84
+ # To be generated against scikit-learn==1.5.2 cohen_kappa_score; current values are placeholders.
85
+ # To regenerate: scripts/_dev/generate_kappa_fixtures.py
86
+ # DO NOT add scikit-learn to the project's runtime dependencies — these
87
+ # constants are the contract; the project hand-rolls the implementation.
88
+
89
# Expected cohen_kappa_score outputs, keyed by case name.
SKLEARN_KAPPA_FIXTURES: dict[str, float] = dict(
    # PASTE OUTPUT FROM scripts/_dev/generate_kappa_fixtures.py HERE
    # Placeholder values — replace by running the generator script in a
    # venv with sklearn installed (see Phase 4 Task 4.2 Step 2).
    imbalanced_binary=0.0,
    three_point_one_diagonal_swap=0.0,
    weighted_ordinal_drift_linear=0.0,
)

# Rater labels and weighting scheme for each case, keyed like the
# expected-value table above.
SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
    "imbalanced_binary": dict(
        y1=[1, 1, 1, 0, 1, 1, 0, 1, 1, 1],
        y2=[1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        weights=None,
    ),
    "three_point_one_diagonal_swap": dict(
        y1=[0, 0, 1, 1, 2, 2, 0, 1, 2, 0],
        y2=[0, 1, 1, 1, 2, 2, 0, 1, 2, 0],
        weights=None,
    ),
    "weighted_ordinal_drift_linear": dict(
        y1=[0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
        y2=[0, 1, 2, 1, 1, 2, 0, 2, 2, 1],
        weights="linear",
    ),
}
115
+
116
+
117
@pytest.mark.skip(
    reason="Placeholder fixtures — regenerate via scripts/_dev/generate_kappa_fixtures.py "
    "in a venv with sklearn==1.5.2, paste output above, then unskip."
)
class TestSklearnKappaParity:
    """Compares hand-rolled cohen_kappa with the recorded sklearn values."""

    @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES))
    def test_matches_sklearn(self, case_name: str):
        inputs = SKLEARN_KAPPA_INPUTS[case_name]
        expected = SKLEARN_KAPPA_FIXTURES[case_name]
        actual = cohen_kappa(
            inputs["y1"],
            inputs["y2"],
            weights=inputs["weights"],
        )
        # Tolerance is absolute: the recorded constants are the contract.
        failure_note = (
            f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
            f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
        )
        assert actual == pytest.approx(expected, abs=1e-9), failure_note
131
+
132
+
133
class TestSklearnInputsCrossCheck:
    """Catches 'updated CASES list, forgot to regenerate' failure mode."""

    def test_inputs_match_committed_json(self):
        # The JSON sidecar sits in a "fixtures" directory next to this file.
        fixtures_dir = Path(__file__).parent / "fixtures"
        json_path = fixtures_dir / "sklearn_kappa_inputs.json"
        if not json_path.exists():
            pytest.skip(
                "sklearn_kappa_inputs.json not yet generated — see "
                "scripts/_dev/generate_kappa_fixtures.py"
            )

        on_disk = _json.loads(json_path.read_text())

        # First the case names, then each case's payload.
        in_module = set(SKLEARN_KAPPA_INPUTS.keys())
        in_file = set(on_disk.keys())
        assert in_module == in_file, (
            "SKLEARN_KAPPA_INPUTS keys diverge from sklearn_kappa_inputs.json — "
            "regenerate via scripts/_dev/generate_kappa_fixtures.py"
        )
        for name, module_case in SKLEARN_KAPPA_INPUTS.items():
            assert module_case == on_disk[name], (
                f"Input mismatch for case {name!r} — regenerate fixtures"
            )