Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed 23 days ago

Commit

6ef2e0e

1 Parent(s): c038a7d

feat(calibration): hand-rolled cohen_kappa, gwets_ac2, bootstrap_ci

Hand-rolled (not sklearn) per the design's 'built from primitives'
discipline. Cohen's κ: (P_o - P_e) / (1 - P_e), supports unweighted,
linear, and quadratic weight matrices for ordinal scales. Gwet's
AC1/AC2: chance term = (1/(q-1)) Σ pi_k(1-pi_k) on mean marginals,
more robust to skewed distributions than Cohen's κ. Bootstrap CI:
seed=42 for reproducibility, drops degenerate resamples (all-one-
label) silently rather than raising.

Three hand-computed test cases per metric (perfect agreement κ=1,
complete disagreement κ=-1, chance agreement κ=0) include worked-
out arithmetic in comments so a reader can verify formula
correctness without running the test.

sklearn-parity scaffold included with placeholder fixtures and a
skip until scripts/_dev/generate_kappa_fixtures.py runs in a venv
with sklearn==1.5.2; the cross-check test is also gated on the
JSON sidecar existing. Phase 4 Task 4.2 lands the generator.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

agent_bench/evaluation/calibration/__init__.py +9 -0
agent_bench/evaluation/calibration/metrics.py +179 -0
tests/evaluation/test_calibration_metrics.py +151 -0

agent_bench/evaluation/calibration/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Hand-rolled inter-rater agreement metrics + calibration report generator."""
+from agent_bench.evaluation.calibration.metrics import (
+    bootstrap_ci,
+    cohen_kappa,
+    gwets_ac2,
+)
+__all__ = ["bootstrap_ci", "cohen_kappa", "gwets_ac2"]

agent_bench/evaluation/calibration/metrics.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""Hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI.
+Hand-rolled (not sklearn) for two reasons:
+1. agent-bench's identity is "built from primitives" — adding sklearn
+   for one function (and transitively numpy + scipy + threadpoolctl +
+   joblib) contradicts that.
+2. The hand-roll demonstrates formula understanding in a way that
+   sklearn.metrics.cohen_kappa_score does not.
+Fixture-tested against sklearn run *outside* the project venv —
+see tests/evaluation/test_calibration_metrics.py and
+scripts/_dev/generate_kappa_fixtures.py.
+"""
+from __future__ import annotations
+import random
+from collections.abc import Callable
+from typing import Literal
def cohen_kappa(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Return Cohen's kappa, κ = (P_o - P_e) / (1 - P_e), for two raters.

    P_o is the (optionally weighted) observed agreement and P_e the
    agreement expected from the product of each rater's own marginal
    label distribution.

    Args:
        y1: Labels from the first rater (hashable values, e.g. int or str).
        y2: Labels from the second rater, parallel to ``y1``.
        weights: ``None`` scores only exact matches. ``"linear"`` and
            ``"quadratic"`` give partial credit that shrinks with the
            distance between label ranks, for ordinal scales. Ranks come
            from the labels' natural sort order.

    Returns:
        κ as a float. When both raters used one single label throughout
        (P_e == 1, so the ratio is 0/0) the function returns 1.0 instead
        of raising.

    Raises:
        ValueError: If the inputs differ in length, are empty, or
            ``weights`` is not one of the supported values.
    """
    if len(y1) != len(y2):
        raise ValueError(
            f"y1 and y2 must have same length; got {len(y1)} vs {len(y2)}"
        )
    if not y1:
        raise ValueError("Empty input — kappa undefined")
    if weights not in (None, "linear", "quadratic"):
        raise ValueError(f"Invalid weights {weights!r}")

    # Rank labels by their natural order so ordinal weights see 2 < 10 and
    # -2 < -1. Sorting by str() would rank "10" before "2". Fall back to
    # string order only when the labels are not mutually comparable.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        labels = sorted(observed, key=str)
    k = len(labels)
    label_idx = {lab: i for i, lab in enumerate(labels)}

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1
    n = len(y1)

    # Agreement weights in [0, 1]; 1.0 on the diagonal in every mode.
    if weights is None or k == 1:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights == "linear":
        w = [[1.0 - abs(i - j) / (k - 1) for j in range(k)] for i in range(k)]
    else:  # "quadratic"
        w = [
            [1.0 - ((i - j) / (k - 1)) ** 2 for j in range(k)] for i in range(k)
        ]

    p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n
    row_marg = [sum(cm[i]) / n for i in range(k)]
    col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
    p_e = sum(
        w[i][j] * row_marg[i] * col_marg[j] for i in range(k) for j in range(k)
    )
    if p_e >= 1.0:
        # Only reachable when a single label was used by both raters.
        return 1.0
    return (p_o - p_e) / (1.0 - p_e)
def gwets_ac2(
    y1: list,
    y2: list,
    weights: Literal[None, "linear", "quadratic"] = None,
) -> float:
    """Return Gwet's AC1/AC2 agreement coefficient for two raters.

    AC2 = (P_o - P_e) / (1 - P_e), where

        P_e = (T_w / (q * (q - 1))) * Σ π_k * (1 - π_k)

    q is the number of observed categories, π_k the mean of the two
    raters' marginal probabilities for category k, and T_w the sum of
    all agreement weights. With identity weights T_w == q, so the chance
    term reduces to the AC1 form (1/(q-1)) * Σ π_k * (1 - π_k).

    Args:
        y1: Labels from the first rater (hashable values, e.g. int or str).
        y2: Labels from the second rater, parallel to ``y1``.
        weights: ``None`` for unweighted AC1. ``"linear"`` or
            ``"quadratic"`` for ordinal AC2; label ranks come from the
            labels' natural sort order.

    Returns:
        The coefficient as a float; 1.0 when only one category is observed.

    Raises:
        ValueError: If the inputs differ in length, are empty, or
            ``weights`` is not one of the supported values.
    """
    if len(y1) != len(y2):
        raise ValueError("y1 and y2 length mismatch")
    if not y1:
        raise ValueError("Empty input")
    if weights not in (None, "linear", "quadratic"):
        raise ValueError(f"Invalid weights {weights!r}")

    # Rank labels by natural order so ordinal weights see 2 < 10; use
    # string order only when the labels are not mutually comparable.
    observed = {*y1, *y2}
    try:
        labels = sorted(observed)
    except TypeError:
        labels = sorted(observed, key=str)
    k = len(labels)
    if k <= 1:
        # One category only: the 1/(q-1) factor is undefined, report
        # perfect agreement.
        return 1.0
    label_idx = {lab: i for i, lab in enumerate(labels)}

    # Confusion matrix: rows index rater 1's label, columns rater 2's.
    cm = [[0] * k for _ in range(k)]
    for a, b in zip(y1, y2):
        cm[label_idx[a]][label_idx[b]] += 1
    n = len(y1)

    if weights is None:
        w = [[1.0 if i == j else 0.0 for j in range(k)] for i in range(k)]
    elif weights == "linear":
        w = [[1.0 - abs(i - j) / (k - 1) for j in range(k)] for i in range(k)]
    else:  # "quadratic"
        w = [
            [1.0 - ((i - j) / (k - 1)) ** 2 for j in range(k)] for i in range(k)
        ]

    p_o = sum(w[i][j] * cm[i][j] for i in range(k) for j in range(k)) / n
    row_marg = [sum(cm[i]) / n for i in range(k)]
    col_marg = [sum(cm[i][j] for i in range(k)) / n for j in range(k)]
    pi = [(row_marg[i] + col_marg[i]) / 2 for i in range(k)]

    # The chance term must be scaled by the same weights that inflate P_o.
    # T_w / q is exactly 1.0 for identity weights, so unweighted results
    # are unchanged by the scaling.
    t_w = sum(sum(row) for row in w)
    p_e_ac2 = (t_w / k) * sum(p * (1 - p) for p in pi) / (k - 1)
    if p_e_ac2 >= 1.0:
        return 1.0
    return (p_o - p_e_ac2) / (1.0 - p_e_ac2)
def bootstrap_ci(
    y1: list,
    y2: list,
    metric_fn: Callable[[list, list], float],
    n_iter: int = 1000,
    ci: float = 0.95,
    seed: int = 42,
) -> tuple[float, float, float]:
    """Return a percentile-bootstrap confidence interval for a metric.

    Item pairs are resampled with replacement ``n_iter`` times, the
    metric is recomputed on each resample, and the (1-ci)/2 and (1+ci)/2
    percentiles of the resulting values form the interval.

    Args:
        y1: Labels from the first rater.
        y2: Labels from the second rater, parallel to ``y1``.
        metric_fn: Callable taking the two label lists and returning a
            float.
        n_iter: Number of bootstrap resamples.
        ci: Confidence level, in (0, 1].
        seed: Seed for the private ``random.Random`` instance, so results
            are reproducible and the global RNG is left untouched.

    Returns:
        ``(point_estimate, ci_lo, ci_hi)``. If no resample yields a value
        (for example ``n_iter`` is 0) all three are the point estimate.

    Raises:
        ValueError: If the inputs differ in length or are empty, or if
            ``ci`` is outside (0, 1].
    """
    if len(y1) != len(y2):
        raise ValueError("length mismatch")
    # Out-of-range levels would otherwise give wrap-around or
    # out-of-bounds percentile indices. NaN also fails this comparison.
    if not 0.0 < ci <= 1.0:
        raise ValueError(f"ci must be in (0, 1]; got {ci!r}")
    n = len(y1)
    if n == 0:
        raise ValueError("Empty input — nothing to resample")

    rng = random.Random(seed)
    point = metric_fn(y1, y2)
    samples: list[float] = []
    for _ in range(n_iter):
        idx = [rng.randrange(n) for _ in range(n)]
        s1 = [y1[i] for i in idx]
        s2 = [y2[i] for i in idx]
        try:
            samples.append(metric_fn(s1, s2))
        except (ValueError, ZeroDivisionError):
            # Resample on which the metric is undefined — deliberately
            # dropped so one degenerate draw does not abort the run.
            continue
    if not samples:
        return point, point, point

    samples.sort()
    lo_idx = int(((1 - ci) / 2) * len(samples))
    # Clamp: with few samples or a narrow ci the floored upper index can
    # fall below lo_idx (or to -1), which would invert the interval.
    hi_idx = max(int(((1 + ci) / 2) * len(samples)) - 1, lo_idx)
    return point, samples[lo_idx], samples[hi_idx]

tests/evaluation/test_calibration_metrics.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""Tests for hand-rolled Cohen's kappa, Gwet's AC2, bootstrap CI."""
+from __future__ import annotations
+import json as _json
+from pathlib import Path
+import pytest
+from agent_bench.evaluation.calibration.metrics import (
+    bootstrap_ci,
+    cohen_kappa,
+    gwets_ac2,
+)
class TestCohenKappaHandComputed:
    """Unweighted κ on tiny balanced binary inputs, checkable by hand."""

    def test_perfect_agreement_kappa_one(self):
        # Balanced binary labels, both raters identical item by item.
        #   P_o = 10/10 = 1.0
        #   P_e = 0.5 * 0.5 + 0.5 * 0.5 = 0.5
        #   κ   = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        ratings = [0] * 5 + [1] * 5
        kappa = cohen_kappa(ratings, list(ratings))
        assert kappa == pytest.approx(1.0)

    def test_complete_disagreement_kappa_negative(self):
        # Second rater flips every label of the first.
        #   P_o = 0.0, P_e = 0.5  →  κ = (0 - 0.5) / 0.5 = -1.0
        first = [0] * 5 + [1] * 5
        flipped = [1 - label for label in first]
        kappa = cohen_kappa(first, flipped)
        assert kappa == pytest.approx(-1.0)

    def test_chance_agreement_kappa_zero(self):
        # One item in each cell of the 2x2 table (0.25 apiece), so the
        # observed agreement equals the chance agreement:
        #   P_o = 0.5, P_e = 0.5  →  κ = 0.
        cells = [(a, b) for a in (0, 1) for b in (0, 1)]
        first = [a for a, _ in cells]
        second = [b for _, b in cells]
        kappa = cohen_kappa(first, second)
        assert kappa == pytest.approx(0.0)
class TestGwetsAC2HandComputed:
    """Unweighted Gwet coefficient on four-item binary inputs.

    Every expected value is worked out in the comments so the formula
    can be checked by hand.
    """

    def test_perfect_agreement(self):
        # P_o = 4/4 = 1.0
        # pi = [0.5, 0.5] → P_e = (1/1) * (0.25 + 0.25) = 0.5
        # AC = (1.0 - 0.5) / (1.0 - 0.5) = 1.0
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 1]
        assert gwets_ac2(y1, y2) == pytest.approx(1.0)

    def test_complete_disagreement(self):
        # P_o = 0/4 = 0.0
        # Both raters' marginals are [0.5, 0.5] → pi = [0.5, 0.5]
        # P_e = (1/1) * (0.25 + 0.25) = 0.5
        # AC = (0.0 - 0.5) / (1.0 - 0.5) = -1.0
        y1 = [0, 0, 1, 1]
        y2 = [1, 1, 0, 0]
        assert gwets_ac2(y1, y2) == pytest.approx(-1.0)

    def test_mid_range(self):
        # P_o = 3/4
        # Rater 1 marginals [1/2, 1/2], rater 2 marginals [3/4, 1/4]
        #   → pi = [5/8, 3/8]
        # P_e = (1/1) * (5/8 * 3/8 + 3/8 * 5/8) = 15/32
        # AC = (3/4 - 15/32) / (1 - 15/32) = (9/32) / (17/32) = 9/17
        y1 = [0, 0, 1, 1]
        y2 = [0, 0, 1, 0]
        result = gwets_ac2(y1, y2)
        assert -1.0 <= result <= 1.0
        assert result > 0
        # Pin the exact value: a bare range check passes for many wrong
        # formulas.
        assert result == pytest.approx(9 / 17)
class TestBootstrapCI:
    """Shape and determinism checks for bootstrap_ci driven by cohen_kappa."""

    def test_returns_point_lo_hi_tuple(self):
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        outcome = bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=100, seed=42)
        # Three values, with the point estimate inside the interval.
        assert len(outcome) == 3
        estimate, lower, upper = outcome
        assert lower <= estimate <= upper

    def test_seed_reproducibility(self):
        rater_a = [0, 0, 1, 1, 1, 0, 1, 0]
        rater_b = [0, 1, 1, 1, 1, 0, 1, 0]
        # Two runs with the same seed must give the same triple.
        first_run, second_run = (
            bootstrap_ci(rater_a, rater_b, cohen_kappa, n_iter=200, seed=42)
            for _ in range(2)
        )
        assert first_run == second_run
# --- sklearn-parity fixtures ---
#
# Expected values are meant to come from scikit-learn==1.5.2
# cohen_kappa_score (header dated 2026-05-04). The numbers below are still
# placeholders until scripts/_dev/generate_kappa_fixtures.py has been run.
# DO NOT add scikit-learn to the project's runtime dependencies — these
# constants are the contract; the project hand-rolls the implementation.
SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
    # PASTE OUTPUT FROM scripts/_dev/generate_kappa_fixtures.py HERE
    # (run it in a venv with sklearn installed — see Phase 4 Task 4.2
    # Step 2). Every 0.0 below is a stand-in, not a real kappa.
    "imbalanced_binary": 0.0,
    "three_point_one_diagonal_swap": 0.0,
    "weighted_ordinal_drift_linear": 0.0,
}

# Rater inputs for each fixture case, keyed identically to the table above.
SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
    "imbalanced_binary": dict(
        y1=[1, 1, 1, 0, 1, 1, 0, 1, 1, 1],
        y2=[1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        weights=None,
    ),
    "three_point_one_diagonal_swap": dict(
        y1=[0, 0, 1, 1, 2, 2, 0, 1, 2, 0],
        y2=[0, 1, 1, 1, 2, 2, 0, 1, 2, 0],
        weights=None,
    ),
    "weighted_ordinal_drift_linear": dict(
        y1=[0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
        y2=[0, 1, 2, 1, 1, 2, 0, 2, 2, 1],
        weights="linear",
    ),
}
@pytest.mark.skip(
    reason=(
        "Placeholder fixtures — regenerate via "
        "scripts/_dev/generate_kappa_fixtures.py "
        "in a venv with sklearn==1.5.2, paste output above, then unskip."
    )
)
class TestSklearnKappaParity:
    """Compare cohen_kappa with the recorded sklearn value for each case.

    Skipped for as long as the fixture table holds placeholder values.
    """

    @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES))
    def test_matches_sklearn(self, case_name: str):
        inputs = SKLEARN_KAPPA_INPUTS[case_name]
        expected = SKLEARN_KAPPA_FIXTURES[case_name]
        actual = cohen_kappa(
            inputs["y1"],
            inputs["y2"],
            weights=inputs["weights"],
        )
        # Absolute tolerance: the two implementations should differ only
        # by floating-point rounding.
        assert actual == pytest.approx(expected, abs=1e-9), (
            f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
            f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
        )
class TestSklearnInputsCrossCheck:
    """Catches 'updated CASES list, forgot to regenerate' failure mode."""

    def test_inputs_match_committed_json(self):
        # The JSON sidecar sits in a "fixtures" directory beside this file.
        sidecar = Path(__file__).with_name("fixtures") / "sklearn_kappa_inputs.json"
        if not sidecar.exists():
            pytest.skip(
                "sklearn_kappa_inputs.json not yet generated — see "
                "scripts/_dev/generate_kappa_fixtures.py"
            )
        on_disk = _json.loads(sidecar.read_text())

        # Same set of case names on both sides ...
        assert set(SKLEARN_KAPPA_INPUTS) == set(on_disk.keys()), (
            "SKLEARN_KAPPA_INPUTS keys diverge from sklearn_kappa_inputs.json — "
            "regenerate via scripts/_dev/generate_kappa_fixtures.py"
        )
        # ... and identical inputs for every case.
        for name, in_module in SKLEARN_KAPPA_INPUTS.items():
            assert in_module == on_disk[name], (
                f"Input mismatch for case {name!r} — regenerate fixtures"
            )