Spaces:

Nomearod
/

agentbench

Sleeping

Nomearod Claude Opus 4.7 (1M context) committed on May 4

Commit

1d47106

1 Parent(s): 4fa7c61

feat(calibration): generate_kappa_table with strict/warn modes

Joins predictions ⋈ labels by (item_id, dimension, system_output_hash).
Hash mismatch ALWAYS raises with first-item expected/actual hashes
plus full mismatched-id list — applies in both modes (never warned).
Missing predictions/labels warn-and-exclude by default; --strict
raises (the final-artifact path; make calibrate uses it).

Pairwise abstain exclusion in κ; per-dimension cause breakdown
(schema_parse / out_of_range / provider_exhausted / genuine) via
the abstain-reason constants from judges/base.py. Abstain-rate
flag fires on STRICTLY greater than 20%; 6/30 (=20%) does not
fire, 7/30 does — boundary tested explicitly.

κ undefined → '—' with footnote (insufficient variance, N<3
agreement-eligible items remaining, or all labels+predictions in
a single category — the last condition was load-bearing for the
'all-ones degenerate' test case).

Skips '_members.jsonl' sidecar files in the predictions glob — they
hold per-permutation / per-jury-member detail, not aggregate
predictions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

agent_bench/evaluation/calibration/report.py +264 -0
tests/evaluation/test_calibration_report.py +203 -0

agent_bench/evaluation/calibration/report.py ADDED Viewed

	@@ -0,0 +1,264 @@

+"""generate_kappa_table — joins predictions ⋈ labels by (item_id, dimension,
+system_output_hash); computes per-row κ + bootstrap CI + abstain breakdown;
+emits markdown table at docs/_generated/kappa_table.md.
+"""
+from __future__ import annotations
+import glob as _glob
+import json
+from collections import defaultdict
+from pathlib import Path
+import structlog
+from agent_bench.evaluation.calibration.metrics import bootstrap_ci, cohen_kappa
+from agent_bench.evaluation.judges.base import (
+    ABSTAIN_REASON_OUT_OF_RANGE,
+    ABSTAIN_REASON_PROVIDER_EXHAUSTED,
+    ABSTAIN_REASON_SCHEMA_PARSE,
+)
+logger = structlog.get_logger()  # module-level structlog logger; emits the missing-item warning and the final summary event
+ABSTAIN_THRESHOLD = 0.20  # abstain-rate footnote fires only when the rate is strictly greater than this value
+def _classify_abstain(reasoning: str | None) -> str:
+    """Map a judge's abstain reasoning text to a cause bucket.
+    The reasoning is matched by prefix against the abstain-reason constants
+    imported from judges/base.py, in the order provider-exhausted,
+    schema-parse, out-of-range. Anything else — including empty or missing
+    (``None``) reasoning — is classified as ``"genuine"``.
+    Returns one of ``"provider_exhausted"``, ``"schema_parse"``,
+    ``"out_of_range"`` or ``"genuine"``.
+    """
+    # A JSON ``"reasoning": null`` arrives here as None; treat it like "".
+    text = reasoning or ""
+    prefix_to_cause = (
+        (ABSTAIN_REASON_PROVIDER_EXHAUSTED, "provider_exhausted"),
+        (ABSTAIN_REASON_SCHEMA_PARSE, "schema_parse"),
+        (ABSTAIN_REASON_OUT_OF_RANGE, "out_of_range"),
+    )
+    for prefix, cause in prefix_to_cause:
+        if text.startswith(prefix):
+            return cause
+    return "genuine"
+def _read_jsonl(path: str) -> list[dict]:
+    """Parse a UTF-8 JSONL file into a list of records, skipping blank lines."""
+    text = Path(path).read_text(encoding="utf-8")
+    return [json.loads(ln) for ln in map(str.strip, text.splitlines()) if ln]
+def _raise_on_hash_mismatch(
+    pred_file: str,
+    preds: list[dict],
+    label_by_key: dict[tuple[str, str], dict],
+) -> None:
+    """Raise ValueError if any prediction's hash differs from its label's hash.
+    Predictions that have no label for their (item_id, dimension) are ignored
+    here; the caller reports those through the missing-item check.
+    """
+    mismatches: list[tuple[str, str, str]] = []
+    for p in preds:
+        label_rec = label_by_key.get((p["item_id"], p["dimension"]))
+        if label_rec is None:
+            continue
+        expected = label_rec["system_output_hash"]
+        actual = p["system_output_hash"]
+        if expected != actual:
+            mismatches.append((p["item_id"], expected, actual))
+    if not mismatches:
+        return
+    first_id, first_exp, first_act = mismatches[0]
+    raise ValueError(
+        f"Hash mismatch in {pred_file}: item {first_id!r} "
+        f"label.system_output_hash={first_exp!r} but "
+        f"prediction.system_output_hash={first_act!r}. "
+        f"Full mismatched-id list ({len(mismatches)}): "
+        f"{[m[0] for m in mismatches]}. "
+        f"Labels are stale relative to predictions — regenerate one or "
+        f"the other so hashes align."
+    )
+def _kappa_row(
+    row_label: str,
+    dim: str,
+    stats: dict,
+    footnote: str,
+    kappa: float | None = None,
+    ci_lo: float | None = None,
+    ci_hi: float | None = None,
+) -> dict:
+    """Build one table row; κ and its CI default to None (rendered as ' — ')."""
+    return {
+        "row": row_label,
+        "dim": dim,
+        "kappa": kappa,
+        "ci_lo": ci_lo,
+        "ci_hi": ci_hi,
+        **stats,
+        "footnote": footnote,
+    }
+def _render_kappa_markdown(rows: list[dict]) -> str:
+    """Render the result rows as a markdown table ending in a newline."""
+    out = ["# κ ablation table — calibration v1\n"]
+    out.append("| Row | Dimension | κ (95% CI) | N | Abstain rate | Notes |")
+    out.append("|---|---|---|---|---|---|")
+    for r in rows:
+        if r["kappa"] is None:
+            kcell = " — "
+        else:
+            kcell = f"{r['kappa']:.3f} ({r['ci_lo']:.3f}, {r['ci_hi']:.3f})"
+        rate = f"{100 * r['abstain_rate']:.1f}%"
+        out.append(
+            f"| {r['row']} | {r['dim']} | {kcell} | {r['n_eligible']} | "
+            f"{rate} | {r['footnote']} |"
+        )
+    return "\n".join(out) + "\n"
+def generate_kappa_table(
+    *,
+    predictions_glob: str,
+    labels_path: str,
+    output_path: str,
+    strict: bool = False,
+) -> None:
+    """Aggregate predictions across rows + dimensions into one markdown table.
+    Labels are read as JSONL from ``labels_path``; each file matched by
+    ``predictions_glob`` is read as a single JSON array and contributes one
+    table row per dimension it contains. Files ending in ``_members.jsonl``
+    are sidecars and are skipped. The table is written to ``output_path``
+    (parent directories are created). All files are read/written as UTF-8.
+    On hash mismatch: ALWAYS raises (both modes), with first-item expected
+    /actual hashes plus full mismatched-id list.
+    On missing prediction or label: WARN+exclude in default mode; RAISE in strict.
+    On undefined κ: render '—' with a footnote (both modes).
+    On abstain rate > 20%: render κ + footnote with cause breakdown (both modes).
+    Raises:
+        ValueError: no non-sidecar prediction file matched the glob; a
+            prediction/label hash mismatch was found; or, in strict mode,
+            a prediction or label is missing for some item.
+    """
+    labels = _read_jsonl(labels_path)
+    label_by_key: dict[tuple[str, str], dict] = {
+        (label_rec["item_id"], label_rec["dimension"]): label_rec
+        for label_rec in labels
+    }
+    # dimension -> item_id -> label; built once instead of once per file.
+    labels_by_dim: dict[str, dict[str, dict]] = defaultdict(dict)
+    for label_rec in labels:
+        labels_by_dim[label_rec["dimension"]][label_rec["item_id"]] = label_rec
+    # Drop sidecar JSONLs (per-member detail, not aggregate predictions)
+    # BEFORE the emptiness check, so a glob that matches only sidecars fails
+    # loudly instead of silently producing an empty table.
+    pred_files = sorted(
+        pf
+        for pf in _glob.glob(predictions_glob)
+        if not pf.endswith("_members.jsonl")
+    )
+    if not pred_files:
+        raise ValueError(f"No prediction files matched: {predictions_glob}")
+    rows: list[dict] = []
+    for pf in pred_files:
+        row_label = Path(pf).stem.replace("calibration_v1_judge_", "")
+        preds = json.loads(Path(pf).read_text(encoding="utf-8"))
+        # Hash-mismatch detection (always raises)
+        _raise_on_hash_mismatch(pf, preds, label_by_key)
+        preds_by_dim: dict[str, dict[str, dict]] = defaultdict(dict)
+        for p in preds:
+            preds_by_dim[p["dimension"]][p["item_id"]] = p
+        for dim in sorted(preds_by_dim):
+            preds_d = preds_by_dim[dim]
+            labs_d = labels_by_dim.get(dim, {})
+            common = sorted(preds_d.keys() & labs_d.keys())
+            missing_pred = sorted(labs_d.keys() - preds_d.keys())
+            missing_lab = sorted(preds_d.keys() - labs_d.keys())
+            if missing_pred or missing_lab:
+                msg = (
+                    f"row={row_label} dim={dim} "
+                    f"missing_predictions={missing_pred} "
+                    f"missing_labels={missing_lab}"
+                )
+                if strict:
+                    raise ValueError(f"strict mode: missing items: {msg}")
+                logger.warning("calibration_report_missing", message=msg)
+            y_pred: list = []
+            y_lab: list = []
+            abstains = 0
+            abstain_causes: dict[str, int] = {
+                "provider_exhausted": 0,
+                "schema_parse": 0,
+                "out_of_range": 0,
+                "genuine": 0,
+            }
+            for iid in common:
+                p = preds_d[iid]
+                label_rec = labs_d[iid]
+                # Pairwise exclusion: an item is dropped from κ when either
+                # side abstained; a cause is recorded only for judge abstains.
+                if p["score"] == "Unknown" or label_rec["score"] == "Unknown":
+                    abstains += 1
+                    if p["score"] == "Unknown":
+                        abstain_causes[
+                            _classify_abstain(p.get("reasoning", ""))
+                        ] += 1
+                    continue
+                y_pred.append(int(p["score"]))
+                y_lab.append(int(label_rec["score"]))
+            n_eligible = len(y_pred)
+            # max(..., 1) avoids division by zero when nothing overlaps.
+            abstain_rate = abstains / max(len(common), 1)
+            stats = {
+                "n_eligible": n_eligible,
+                "abstains": abstains,
+                "abstain_rate": abstain_rate,
+                "abstain_causes": abstain_causes,
+            }
+            if n_eligible < 3:
+                rows.append(
+                    _kappa_row(
+                        row_label,
+                        dim,
+                        stats,
+                        f"κ undefined: insufficient agreement-eligible "
+                        f"items (N={n_eligible})",
+                    )
+                )
+                continue
+            try:
+                kappa = cohen_kappa(y_lab, y_pred)
+                _, lo, hi = bootstrap_ci(
+                    y_lab, y_pred, cohen_kappa, n_iter=1000, seed=42
+                )
+            except (ValueError, ZeroDivisionError):
+                rows.append(
+                    _kappa_row(
+                        row_label,
+                        dim,
+                        stats,
+                        "κ undefined: insufficient variance after "
+                        "exclusion",
+                    )
+                )
+                continue
+            # Detect degenerate κ (perfectly constant labels → P_e=1 → kappa
+            # was clamped to 1.0 in metrics.py, but with no observed
+            # disagreement the result is statistically meaningless)
+            if len(set(y_lab)) <= 1 and len(set(y_pred)) <= 1:
+                rows.append(
+                    _kappa_row(
+                        row_label,
+                        dim,
+                        stats,
+                        "κ undefined: all labels and predictions in a "
+                        "single category (no variance to measure)",
+                    )
+                )
+                continue
+            footnote = ""
+            if abstain_rate > ABSTAIN_THRESHOLD:
+                # abstains > 0 is guaranteed here because the rate is > 0.
+                breakdown = ", ".join(
+                    f"{int(100 * v / abstains)}% {k.replace('_', ' ')}"
+                    for k, v in abstain_causes.items()
+                    if v > 0
+                )
+                footnote = (
+                    f"κ computed on N={n_eligible} of {len(common)} items; "
+                    f"high abstain rate ({100 * abstain_rate:.1f}% — "
+                    f"breakdown: {breakdown}) suggests rubric ambiguity."
+                )
+            rows.append(
+                _kappa_row(
+                    row_label,
+                    dim,
+                    stats,
+                    footnote,
+                    kappa=kappa,
+                    ci_lo=lo,
+                    ci_hi=hi,
+                )
+            )
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    # The table contains 'κ' and '—'; write UTF-8 explicitly so the output
+    # does not depend on (or fail under) the platform's locale encoding.
+    Path(output_path).write_text(_render_kappa_markdown(rows), encoding="utf-8")
+    logger.info("kappa_table_written", path=output_path, rows=len(rows))

tests/evaluation/test_calibration_report.py ADDED Viewed

	@@ -0,0 +1,203 @@

+"""Tests for generate_kappa_table — joins, hash-mismatch raise, strict, abstain flag."""
+from __future__ import annotations
+import json
+from pathlib import Path
+import pytest
+import structlog
+from agent_bench.evaluation.calibration.report import generate_kappa_table
+def _write_predictions(path: Path, records: list[dict]) -> None:
+    """Write *records* to *path* as one indented JSON array, creating parent dirs."""
+    target_dir = path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(records, indent=2)
+    path.write_text(payload)
+def _write_labels(path: Path, records: list[dict]) -> None:
+    """Write *records* to *path* as JSONL (one object per line, no trailing newline)."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    lines = [json.dumps(rec) for rec in records]
+    path.write_text("\n".join(lines))
+def _pred(
+    item_id: str, dim: str, score, sys_hash: str = "h1", reasoning: str = ""
+) -> dict:
+    """Build a synthetic judge prediction record for one (item, dimension).
+    Only the id, dimension, score, hash and reasoning vary; the remaining
+    fields are fixed placeholder values.
+    """
+    judge = "claude-haiku-4-5_" + dim
+    return dict(
+        item_id=item_id,
+        dimension=dim,
+        score=score,
+        judge_id=judge,
+        rubric_version="abc",
+        system_output_hash=sys_hash,
+        prompt_seed=0,
+        cost_usd=0.001,
+        latency_ms=100.0,
+        reasoning=reasoning,
+        evidence_quotes=[],
+    )
+def _lbl(item_id: str, dim: str, score, sys_hash: str = "h1") -> dict:
+    """Build a synthetic human-label record; ``abstained`` is True iff score is "Unknown"."""
+    did_abstain = score == "Unknown"
+    return dict(
+        item_id=item_id,
+        dimension=dim,
+        score=score,
+        abstained=did_abstain,
+        notes="",
+        label_timestamp="2026-05-04T00:00:00Z",
+        system_output_hash=sys_hash,
+    )
+class TestHashMismatch:
+    """A prediction/label hash mismatch must raise in default and strict mode."""
+    @staticmethod
+    def _run(tmp_path: Path, preds: list[dict], labels: list[dict], **kwargs) -> None:
+        """Write the fixtures under *tmp_path* and invoke generate_kappa_table."""
+        results = tmp_path / "results"
+        _write_predictions(results / "calibration_v1_judge_baseline.json", preds)
+        _write_labels(tmp_path / "labels.jsonl", labels)
+        generate_kappa_table(
+            predictions_glob=str(results / "calibration_v1_judge_*.json"),
+            labels_path=str(tmp_path / "labels.jsonl"),
+            output_path=str(tmp_path / "kappa.md"),
+            **kwargs,
+        )
+    def test_raises_with_first_item_detail_and_full_list(self, tmp_path):
+        # Two mismatching items so the "full list" part is actually exercised.
+        preds = [
+            _pred("i1", "groundedness", 1, sys_hash="A"),
+            _pred("i2", "groundedness", 0, sys_hash="C"),
+        ]
+        labels = [
+            _lbl("i1", "groundedness", 1, sys_hash="B"),
+            _lbl("i2", "groundedness", 0, sys_hash="D"),
+        ]
+        with pytest.raises(ValueError) as exc_info:
+            self._run(tmp_path, preds, labels)
+        msg = str(exc_info.value)
+        # Match the hashes in context: a bare "A"/"B" substring check could be
+        # satisfied by unrelated text such as the temp path.
+        assert "item 'i1'" in msg
+        assert "label.system_output_hash='B'" in msg
+        assert "prediction.system_output_hash='A'" in msg
+        assert "(2): ['i1', 'i2']" in msg
+        # The raise happens before any report is written.
+        assert not (tmp_path / "kappa.md").exists()
+    def test_hash_mismatch_raises_in_strict_mode_too(self, tmp_path):
+        preds = [_pred("i1", "groundedness", 1, sys_hash="A")]
+        labels = [_lbl("i1", "groundedness", 1, sys_hash="B")]
+        with pytest.raises(ValueError, match="Hash mismatch"):
+            self._run(tmp_path, preds, labels, strict=True)
+class TestMissingPredictionLabel:
+    """Missing predictions/labels warn-and-exclude by default and raise under strict."""
+    @staticmethod
+    def _run(tmp_path: Path, preds: list[dict], labels: list[dict], **kwargs) -> Path:
+        """Write the fixtures, run generate_kappa_table, and return the report path."""
+        results = tmp_path / "results"
+        out = tmp_path / "kappa.md"
+        _write_predictions(results / "calibration_v1_judge_baseline.json", preds)
+        _write_labels(tmp_path / "labels.jsonl", labels)
+        generate_kappa_table(
+            predictions_glob=str(results / "calibration_v1_judge_*.json"),
+            labels_path=str(tmp_path / "labels.jsonl"),
+            output_path=str(out),
+            **kwargs,
+        )
+        return out
+    def test_default_warns_and_excludes(self, tmp_path):
+        preds = [
+            _pred("i1", "groundedness", 1),
+            _pred("i3", "groundedness", 0),
+            _pred("i4", "groundedness", 1),
+        ]
+        labels = [
+            _lbl("i1", "groundedness", 1),
+            _lbl("i2", "groundedness", 0),  # label without prediction
+            _lbl("i3", "groundedness", 0),
+            _lbl("i4", "groundedness", 1),
+        ]
+        with structlog.testing.capture_logs() as logs:
+            out = self._run(tmp_path, preds, labels)
+        assert out.exists()
+        missing = [
+            entry
+            for entry in logs
+            if entry.get("event") == "calibration_report_missing"
+        ]
+        assert missing, f"no missing-warning log in {logs!r}"
+        # The warning names the label that had no prediction ...
+        assert "missing_predictions=['i2']" in missing[0]["message"]
+        # ... and that item is excluded: N is 3 (not 4) with no abstains.
+        assert "| 3 | 0.0% |" in out.read_text(encoding="utf-8")
+    def test_strict_raises_on_missing_prediction(self, tmp_path):
+        preds = [_pred("i1", "groundedness", 1)]
+        labels = [
+            _lbl("i1", "groundedness", 1),
+            _lbl("i2", "groundedness", 0),
+        ]
+        with pytest.raises(ValueError, match="missing"):
+            self._run(tmp_path, preds, labels, strict=True)
+class TestAbstainRateFlag:
+    """The high-abstain footnote fires only when the abstain rate exceeds 20%."""
+    def _setup(self, tmp_path: Path, abstain_count: int) -> Path:
+        """Generate a report for 30 items whose first *abstain_count* predictions abstain.
+        Abstaining predictions carry a schema-parse reasoning string. Among the
+        remaining items the prediction is always 1 and the label alternates
+        0/1 by index parity so the labels have variance.
+        """
+        dim = "groundedness"
+        preds: list[dict] = []
+        labels: list[dict] = []
+        for idx in range(30):
+            item = f"i{idx}"
+            if idx < abstain_count:
+                preds.append(
+                    _pred(
+                        item,
+                        dim,
+                        "Unknown",
+                        reasoning="schema_parse_failed_after_retry: x",
+                    )
+                )
+                labels.append(_lbl(item, dim, 1))
+                continue
+            preds.append(_pred(item, dim, 1, reasoning=""))
+            # Even-indexed non-abstain labels score 0 to ensure variance
+            labels.append(_lbl(item, dim, 0 if idx % 2 == 0 else 1))
+        results_dir = tmp_path / "results"
+        labels_file = tmp_path / "labels.jsonl"
+        report = tmp_path / "kappa.md"
+        _write_predictions(results_dir / "calibration_v1_judge_baseline.json", preds)
+        _write_labels(labels_file, labels)
+        generate_kappa_table(
+            predictions_glob=str(results_dir / "calibration_v1_judge_*.json"),
+            labels_path=str(labels_file),
+            output_path=str(report),
+        )
+        return report
+    def test_at_20_percent_boundary_does_not_fire(self, tmp_path):
+        # 6/30 is exactly 20%; the comparison is strict, so no footnote.
+        report = self._setup(tmp_path, abstain_count=6)
+        rendered = report.read_text().lower()
+        assert "high abstain rate" not in rendered
+    def test_above_20_percent_fires(self, tmp_path):
+        # 7/30 = 23.3% crosses the threshold, so the footnote appears.
+        report = self._setup(tmp_path, abstain_count=7)
+        rendered = report.read_text().lower()
+        assert "high abstain rate" in rendered
+        assert "schema parse" in rendered
+class TestKappaUndefined:
+    """Degenerate inputs render κ as ' — ' accompanied by an explanatory footnote."""
+    def test_renders_dash_with_footnote(self, tmp_path):
+        # All same label → degenerate; report renders ' — '
+        preds = [_pred(f"i{i}", "groundedness", 1) for i in range(5)]
+        labels = [_lbl(f"i{i}", "groundedness", 1) for i in range(5)]
+        _write_predictions(
+            tmp_path / "results" / "calibration_v1_judge_baseline.json", preds
+        )
+        _write_labels(tmp_path / "labels.jsonl", labels)
+        out = tmp_path / "kappa.md"
+        generate_kappa_table(
+            predictions_glob=str(
+                tmp_path / "results" / "calibration_v1_judge_*.json"
+            ),
+            labels_path=str(tmp_path / "labels.jsonl"),
+            output_path=str(out),
+        )
+        # The report contains non-ASCII characters; decode explicitly so the
+        # assertions do not depend on the platform's locale encoding.
+        text = out.read_text(encoding="utf-8")
+        assert "|  —  |" in text
+        # Every undefined-κ row carries a footnote that starts "κ undefined".
+        assert "κ undefined" in text