Spaces:
Running
feat(calibration): hand-rolled cohen_kappa, gwets_ac2, bootstrap_ci
Browse files
Hand-rolled (not sklearn) per the design's 'built from primitives'
discipline. Cohen's κ: (P_o - P_e) / (1 - P_e), supports unweighted,
linear, and quadratic weight matrices for ordinal scales. Gwet's
AC1/AC2: chance term = (1/(q-1)) Σ pi_k(1-pi_k) on mean marginals,
more robust to skewed distributions than Cohen's κ. Bootstrap CI:
seed=42 for reproducibility, drops degenerate resamples (all-one-
label) silently rather than raising.
Three hand-computed test cases per metric (perfect agreement κ=1,
complete disagreement κ=-1, chance agreement κ=0) include worked-
out arithmetic in comments so a reader can verify formula
correctness without running the test.
sklearn-parity scaffold included with placeholder fixtures and a
skip until scripts/_dev/generate_kappa_fixtures.py runs in a venv
with sklearn==1.5.2; the cross-check test is also gated on the
JSON sidecar existing. Phase 4 Task 4.2 lands the generator.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hand-rolled inter-rater agreement metrics + calibration report generator."""
|
| 2 |
+
|
| 3 |
+
from agent_bench.evaluation.calibration.metrics import (
|
| 4 |
+
bootstrap_ci,
|
| 5 |
+
cohen_kappa,
|
| 6 |
+
gwets_ac2,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
__all__ = ["bootstrap_ci", "cohen_kappa", "gwets_ac2"]
|
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI.
|
| 2 |
+
|
| 3 |
+
Hand-rolled (not sklearn) for two reasons:
|
| 4 |
+
1. agent-bench's identity is "built from primitives" — adding sklearn
|
| 5 |
+
for one function (and transitively numpy + scipy + threadpoolctl +
|
| 6 |
+
joblib) contradicts that.
|
| 7 |
+
2. The hand-roll demonstrates formula understanding in a way that
|
| 8 |
+
sklearn.metrics.cohen_kappa_score does not.
|
| 9 |
+
|
| 10 |
+
Fixture-tested against sklearn run *outside* the project venv —
|
| 11 |
+
see tests/evaluation/test_calibration_metrics.py and
|
| 12 |
+
scripts/_dev/generate_kappa_fixtures.py.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import random
|
| 18 |
+
from collections.abc import Callable
|
| 19 |
+
from typing import Literal
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def cohen_kappa(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Compute Cohen's κ = (P_o - P_e) / (1 - P_e) for two raters.

    Supports unweighted, linear-weighted, and quadratic-weighted variants
    for ordinal scales. ``y1`` and ``y2`` must be parallel lists of hashable
    label values (int or str) of the same length.

    Category order only matters for the weighted variants. Labels are ranked
    by their natural ordering (so 2 comes before 10); ordering by ``str()``
    is used only as a fallback when the labels cannot be compared with each
    other (e.g. a mix of int and str).

    Args:
        y1: Labels assigned by the first rater.
        y2: Labels assigned by the second rater.
        weights: ``None`` for exact-match agreement, ``"linear"`` for
            ``1 - |i - j| / (k - 1)``, or ``"quadratic"`` for
            ``1 - ((i - j) / (k - 1)) ** 2``, where ``i``/``j`` are category
            ranks and ``k`` is the number of distinct labels observed.

    Returns:
        κ as a float. When the chance-agreement term P_e reaches 1 (for
        example both raters used one identical label throughout) the ratio
        would be 0/0, so 1.0 is returned instead.

    Raises:
        ValueError: If the inputs differ in length, are empty, or ``weights``
            is not one of the supported values.
    """
    if len(y1) != len(y2):
        raise ValueError(
            f"y1 and y2 must have same length; got {len(y1)} vs {len(y2)}"
        )
    if not y1:
        raise ValueError("Empty input — kappa undefined")

    # Rank categories by natural order: sorting by str() would place 10
    # before 2 and corrupt the ordinal distances used by the weights.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        # Labels are not mutually comparable; fall back to a stable
        # string-based order.
        labels = sorted(observed, key=str)
    k = len(labels)
    label_idx = {lab: i for i, lab in enumerate(labels)}

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1

    n = len(y1)
    # With a single category there is no distance to scale by; using 1
    # keeps the lone weight at 1.0 and avoids dividing by zero.
    span = max(k - 1, 1)

    if weights is None:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights == "linear":
        w = [[1.0 - abs(i - j) / span for j in range(k)] for i in range(k)]
    elif weights == "quadratic":
        w = [[1.0 - ((i - j) / span) ** 2 for j in range(k)] for i in range(k)]
    else:
        raise ValueError(f"Invalid weights {weights!r}")

    # Observed (weighted) agreement.
    p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n

    # Per-rater marginal label distributions.
    row_marg = [sum(row) / n for row in cm]
    col_marg = [sum(col) / n for col in zip(*cm)]

    # Agreement expected if the raters labelled independently.
    p_e = sum(
        w[i][j] * row_marg[i] * col_marg[j] for i in range(k) for j in range(k)
    )

    if p_e >= 1.0:
        return 1.0
    return (p_o - p_e) / (1.0 - p_e)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def gwets_ac2(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Compute Gwet's agreement coefficient from mean marginals.

    AC2 = (P_o - P_e_AC2) / (1 - P_e_AC2)
    where P_e_AC2 = (1/(q-1)) * Σ p_k * (1 - p_k)
    and p_k is the mean of the two raters' marginal probabilities for
    category k.

    Category order only matters when ``weights`` is set. Labels are ranked
    by their natural ordering (so 2 comes before 10); ordering by ``str()``
    is used only as a fallback when the labels cannot be compared with each
    other.

    Args:
        y1: Labels assigned by the first rater.
        y2: Labels assigned by the second rater.
        weights: ``None`` for exact-match agreement, or ``"linear"`` /
            ``"quadratic"`` to give partial credit in P_o by rank distance.
            NOTE: the weights are applied to P_o only; the chance term is
            always the unweighted AC1 form shown above.

    Returns:
        The coefficient as a float; 1.0 when only one distinct label was
        observed.

    Raises:
        ValueError: If the inputs differ in length, are empty, or ``weights``
            is not one of the supported values.
    """
    if len(y1) != len(y2):
        raise ValueError("y1 and y2 length mismatch")
    if not y1:
        raise ValueError("Empty input")

    # Rank categories by natural order: sorting by str() would place 10
    # before 2 and corrupt the ordinal distances used by the weights.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        # Labels are not mutually comparable; fall back to a stable
        # string-based order.
        labels = sorted(observed, key=str)
    k = len(labels)
    label_idx = {lab: i for i, lab in enumerate(labels)}

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1
    n = len(y1)

    # With a single category there is no distance to scale by; using 1
    # keeps the lone weight at 1.0 and avoids dividing by zero.
    span = max(k - 1, 1)

    if weights is None:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights == "linear":
        w = [[1.0 - abs(i - j) / span for j in range(k)] for i in range(k)]
    elif weights == "quadratic":
        w = [[1.0 - ((i - j) / span) ** 2 for j in range(k)] for i in range(k)]
    else:
        raise ValueError(f"Invalid weights {weights!r}")

    # Observed (weighted) agreement.
    p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n

    # Mean marginal per category: average of the two raters' usage rates.
    row_marg = [sum(row) / n for row in cm]
    col_marg = [sum(col) / n for col in zip(*cm)]
    pi = [(r + c) / 2 for r, c in zip(row_marg, col_marg)]

    if k <= 1:
        return 1.0
    # Gwet's chance term: P_e = (1/(q-1)) * Σ pi_k * (1 - pi_k)
    # (the standard AC1 formula on mean marginals; weighted variant is
    # achieved by passing weights to P_o while keeping the unweighted
    # chance term — sufficient for v1's binary/three-point use).
    p_e_ac2 = sum(p * (1 - p) for p in pi) / (k - 1)

    if p_e_ac2 >= 1.0:
        return 1.0
    return (p_o - p_e_ac2) / (1.0 - p_e_ac2)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def bootstrap_ci(
    y1: list,
    y2: list,
    metric_fn: Callable[[list, list], float],
    n_iter: int = 1000,
    ci: float = 0.95,
    seed: int = 42,
) -> tuple[float, float, float]:
    """Bootstrap a percentile confidence interval for an inter-rater metric.

    Item pairs are resampled with replacement ``n_iter`` times using a
    private ``random.Random(seed)``, so results are reproducible for a given
    seed. Resamples on which ``metric_fn`` raises ``ValueError`` or
    ``ZeroDivisionError`` are skipped.

    Args:
        y1: Labels assigned by the first rater.
        y2: Labels assigned by the second rater (parallel to ``y1``).
        metric_fn: Callable taking two label lists and returning a float.
        n_iter: Number of bootstrap resamples to draw.
        ci: Confidence level as a fraction in [0, 1] (0.95 means 95%).
        seed: Seed for the resampling generator.

    Returns:
        ``(point_estimate, ci_lo, ci_hi)`` where the bounds are the
        (1-ci)/2 and (1+ci)/2 percentiles of the resampled metric. If no
        resample produced a value, all three entries are the point estimate.

    Raises:
        ValueError: If the inputs differ in length or ``ci`` lies outside
            [0, 1]. Exceptions raised by ``metric_fn`` on the full input
            propagate unchanged.
    """
    if len(y1) != len(y2):
        raise ValueError("length mismatch")
    # An out-of-range level would otherwise index past the sample list or
    # wrap around through negative indices and return misleading bounds.
    if not 0.0 <= ci <= 1.0:
        raise ValueError(f"ci must be within [0, 1]; got {ci!r}")

    n = len(y1)
    rng = random.Random(seed)
    point = metric_fn(y1, y2)

    samples: list[float] = []
    for _ in range(n_iter):
        idx = [rng.randrange(n) for _ in range(n)]
        s1 = [y1[i] for i in idx]
        s2 = [y2[i] for i in idx]
        try:
            samples.append(metric_fn(s1, s2))
        except (ValueError, ZeroDivisionError):
            # Degenerate resample (e.g., all one label) — skip
            continue

    if not samples:
        return point, point, point

    samples.sort()
    lo_idx = int(((1 - ci) / 2) * len(samples))
    # Never let the upper index fall below the lower one (or go negative
    # when there are only a handful of samples).
    hi_idx = max(lo_idx, int(((1 + ci) / 2) * len(samples)) - 1)
    return point, samples[lo_idx], samples[hi_idx]
|
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json as _json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from agent_bench.evaluation.calibration.metrics import (
|
| 11 |
+
bootstrap_ci,
|
| 12 |
+
cohen_kappa,
|
| 13 |
+
gwets_ac2,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestCohenKappaHandComputed:
    """Cohen's κ on tiny inputs whose expected value is worked out by hand."""

    def test_perfect_agreement_kappa_one(self):
        # Both raters: five 0s then five 1s, identical item by item.
        #   P_o = 1.0
        #   P_e = (5/10 * 5/10) + (5/10 * 5/10) = 0.5
        #   κ   = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        rater_a = [0] * 5 + [1] * 5
        rater_b = [0] * 5 + [1] * 5
        kappa = cohen_kappa(rater_a, rater_b)
        assert kappa == pytest.approx(1.0)

    def test_complete_disagreement_kappa_negative(self):
        # Same 5/5 split for each rater, but every item is flipped.
        #   P_o = 0.0, P_e = 0.5  →  κ = (0.0 - 0.5) / (1.0 - 0.5) = -1.0
        rater_a = [0] * 5 + [1] * 5
        rater_b = [1] * 5 + [0] * 5
        kappa = cohen_kappa(rater_a, rater_b)
        assert kappa == pytest.approx(-1.0)

    def test_chance_agreement_kappa_zero(self):
        # Every cell of the 2x2 confusion matrix holds one item (0.25 each),
        # so P(0) = P(1) = 0.5 for both raters.
        #   P_o = 0.5, P_e = 0.5  →  κ = 0.
        rater_a = [0, 0, 1, 1]
        rater_b = [0, 1, 0, 1]
        kappa = cohen_kappa(rater_a, rater_b)
        assert kappa == pytest.approx(0.0)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class TestGwetsAC2HandComputed:
    """Gwet's coefficient on tiny binary inputs with hand-worked values."""

    def test_perfect_agreement(self):
        # Mean marginals: pi = [0.5, 0.5]
        #   P_o = 1.0
        #   P_e = (1/(2-1)) * (0.5*0.5 + 0.5*0.5) = 0.5
        #   AC  = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 1]
        assert gwets_ac2(y1, y2) == pytest.approx(1.0)

    def test_complete_disagreement(self):
        y1 = [0, 0, 1, 1]
        y2 = [1, 1, 0, 0]
        # AC2 with q=2 categories: observed agreement = 0;
        # chance term = (1/1) * sum p_k(1-p_k) computed from mean marginals
        #   pi  = [0.5, 0.5]  →  P_e = 0.5
        #   AC  = (0.0 - 0.5) / (1.0 - 0.5) = -1.0
        assert gwets_ac2(y1, y2) == pytest.approx(-1.0)

    def test_mid_range(self):
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 0]
        # 3/4 agree → P_o = 0.75
        # Rater 1 marginals: [2/4, 2/4]; rater 2 marginals: [3/4, 1/4]
        #   pi  = [0.625, 0.375]
        #   P_e = (1/1) * (0.625*0.375 + 0.375*0.625) = 0.46875
        #   AC  = (0.75 - 0.46875) / (1 - 0.46875) = 0.28125 / 0.53125 = 9/17
        result = gwets_ac2(y1, y2)
        assert -1.0 <= result <= 1.0
        assert result > 0
        # Pin the exact value so a wrong chance term cannot slip through.
        assert result == pytest.approx(9 / 17)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class TestBootstrapCI:
    """Shape and determinism checks for bootstrap_ci."""

    def test_returns_point_lo_hi_tuple(self):
        # The result is a 3-tuple whose bounds bracket the point estimate.
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        outcome = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=100, seed=42)
        assert len(outcome) == 3
        estimate, lower, upper = outcome
        assert lower <= estimate <= upper

    def test_seed_reproducibility(self):
        # Two runs with the same seed must return identical tuples.
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        first_run = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=200, seed=42)
        second_run = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=200, seed=42)
        assert first_run == second_run
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# --- sklearn-parity fixtures ---
#
# Expected values are meant to be generated against scikit-learn==1.5.2
# cohen_kappa_score (this header was dated 2026-05-04). The 0.0 entries
# below are still placeholders, which is why the parity test that consumes
# them is skipped.
# To regenerate: scripts/_dev/generate_kappa_fixtures.py
# DO NOT add scikit-learn to the project's runtime dependencies — these
# constants are the contract; the project hand-rolls the implementation.

# Expected κ per case name; keys must match SKLEARN_KAPPA_INPUTS.
SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
    # PASTE OUTPUT FROM scripts/_dev/generate_kappa_fixtures.py HERE
    # Placeholder values — replace by running the generator script in a
    # venv with sklearn installed (see Phase 4 Task 4.2 Step 2).
    "imbalanced_binary": 0.0,
    "three_point_one_diagonal_swap": 0.0,
    "weighted_ordinal_drift_linear": 0.0,
}

# Rater label lists and weighting scheme for each case, passed straight to
# cohen_kappa(y1, y2, weights=...).
SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
    # Binary labels where both raters mostly answer 1.
    "imbalanced_binary": {
        "y1": [1, 1, 1, 0, 1, 1, 0, 1, 1, 1],
        "y2": [1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        "weights": None,
    },
    # Three labels; the raters differ on exactly one item (index 1).
    "three_point_one_diagonal_swap": {
        "y1": [0, 0, 1, 1, 2, 2, 0, 1, 2, 0],
        "y2": [0, 1, 1, 1, 2, 2, 0, 1, 2, 0],
        "weights": None,
    },
    # Three ordinal labels; every disagreement is off by one step.
    "weighted_ordinal_drift_linear": {
        "y1": [0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
        "y2": [0, 1, 2, 1, 1, 2, 0, 2, 2, 1],
        "weights": "linear",
    },
}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@pytest.mark.skip(
    reason="Placeholder fixtures — regenerate via scripts/_dev/generate_kappa_fixtures.py "
    "in a venv with sklearn==1.5.2, paste output above, then unskip."
)
class TestSklearnKappaParity:
    """Compare hand-rolled cohen_kappa with the recorded sklearn values."""

    @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES.keys()))
    def test_matches_sklearn(self, case_name: str):
        # Look up the recorded reference value and the inputs behind it.
        reference = SKLEARN_KAPPA_FIXTURES[case_name]
        inputs = SKLEARN_KAPPA_INPUTS[case_name]
        computed = cohen_kappa(inputs["y1"], inputs["y2"], weights=inputs["weights"])
        failure_note = (
            f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
            f"{case_name!r}: hand-rolled={computed} sklearn={reference}"
        )
        assert computed == pytest.approx(reference, abs=1e-9), failure_note
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class TestSklearnInputsCrossCheck:
    """Catches 'updated CASES list, forgot to regenerate' failure mode."""

    def test_inputs_match_committed_json(self):
        # The JSON sidecar lives in a fixtures/ directory beside this file.
        sidecar = Path(__file__).parent / "fixtures" / "sklearn_kappa_inputs.json"
        if not sidecar.exists():
            pytest.skip(
                "sklearn_kappa_inputs.json not yet generated — see "
                "scripts/_dev/generate_kappa_fixtures.py"
            )
        on_disk = _json.loads(sidecar.read_text())

        # The set of case names must agree first ...
        expected_names = set(SKLEARN_KAPPA_INPUTS.keys())
        committed_names = set(on_disk.keys())
        assert expected_names == committed_names, (
            "SKLEARN_KAPPA_INPUTS keys diverge from sklearn_kappa_inputs.json — "
            "regenerate via scripts/_dev/generate_kappa_fixtures.py"
        )

        # ... then each case's inputs must be identical to the committed copy.
        for name, inputs in SKLEARN_KAPPA_INPUTS.items():
            assert inputs == on_disk[name], (
                f"Input mismatch for case {name!r} — regenerate fixtures"
            )
|