Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed on 27 days ago

Commit

3a2ed35

1 Parent(s): 6ef2e0e

test(calibration): sklearn-parity fixtures + cross-check CI test

Four-part discipline per the design:
1. scripts/_dev/generate_kappa_fixtures.py — committed; runs from a
venv outside the project (sklearn is NOT a runtime dep but is
available transitively via sentence-transformers in dev installs).
2. SKLEARN_KAPPA_FIXTURES inline constants in test file — locality
preserved, type-checked, version-pinned (sklearn 1.5.2, 2026-05-04).
3. Load-bearing 'DO NOT add scikit-learn' comment.
4. Cross-check CI test (TestSklearnInputsCrossCheck) compares the
inline SKLEARN_KAPPA_INPUTS against the JSON sidecar written by
the generator; catches 'updated CASES list, forgot to regenerate'
at CI time.

Three real sklearn-parity cases now pass (imbalanced binary,
three-point with one diagonal swap, weighted ordinal with linear
weights). Tolerance 1e-7 accommodates sklearn's 10-decimal printed
precision.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

scripts/_dev/generate_kappa_fixtures.py +94 -0
tests/evaluation/fixtures/sklearn_kappa_inputs.json +83 -0
tests/evaluation/test_calibration_metrics.py +7 -11

scripts/_dev/generate_kappa_fixtures.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Generate sklearn-parity fixtures for tests/evaluation/test_calibration_metrics.py.
+Run from a venv with sklearn installed (NOT the project venv):
+    python -m venv /tmp/sklearn-fixture-venv
+    /tmp/sklearn-fixture-venv/bin/pip install scikit-learn==1.5.2
+    /tmp/sklearn-fixture-venv/bin/python scripts/_dev/generate_kappa_fixtures.py
+The script:
+  1. Defines CASES (input arrays + weight option).
+  2. Computes sklearn.metrics.cohen_kappa_score for each case.
+  3. Prints copy-pasteable Python constants for the test file.
+  4. Writes inputs to tests/evaluation/fixtures/sklearn_kappa_inputs.json
+     for the cross-check CI test (forgot-to-regenerate detection).
+DO NOT add scikit-learn to the project's runtime dependencies — these
+constants are the contract; the project hand-rolls the implementation.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+try:
+    from sklearn.metrics import cohen_kappa_score
+except ImportError as e:
+    raise SystemExit(
+        "scikit-learn not installed. Install in a venv outside this project:\n"
+        "  python -m venv /tmp/sklearn-fixture-venv\n"
+        "  /tmp/sklearn-fixture-venv/bin/pip install scikit-learn==1.5.2\n"
+        "  /tmp/sklearn-fixture-venv/bin/python scripts/_dev/generate_kappa_fixtures.py"
+    ) from e
+CASES: list[dict] = [
+    {
+        "name": "imbalanced_binary",
+        "y1": [1, 1, 1, 0, 1, 1, 0, 1, 1, 1],
+        "y2": [1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
+        "weights": None,
+    },
+    {
+        "name": "three_point_one_diagonal_swap",
+        "y1": [0, 0, 1, 1, 2, 2, 0, 1, 2, 0],
+        "y2": [0, 1, 1, 1, 2, 2, 0, 1, 2, 0],
+        "weights": None,
+    },
+    {
+        "name": "weighted_ordinal_drift_linear",
+        "y1": [0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
+        "y2": [0, 1, 2, 1, 1, 2, 0, 2, 2, 1],
+        "weights": "linear",
+    },
+]
+OUT_INPUTS = (
+    Path(__file__).resolve().parents[2]
+    / "tests"
+    / "evaluation"
+    / "fixtures"
+    / "sklearn_kappa_inputs.json"
+)
+print("# --- Paste into test_calibration_metrics.py ---\n")
+print("SKLEARN_KAPPA_FIXTURES: dict[str, float] = {")
+for case in CASES:
+    expected = cohen_kappa_score(case["y1"], case["y2"], weights=case["weights"])
+    print(f'    "{case["name"]}": {expected:.10f},  # sklearn 1.5.2')
+print("}")
+print("\nSKLEARN_KAPPA_INPUTS: dict[str, dict] = {")
+for case in CASES:
+    print(f'    "{case["name"]}": {{')
+    print(f'        "y1": {case["y1"]},')
+    print(f'        "y2": {case["y2"]},')
+    print(f'        "weights": {case["weights"]!r},')
+    print("    },")
+print("}")
+OUT_INPUTS.parent.mkdir(parents=True, exist_ok=True)
+OUT_INPUTS.write_text(
+    json.dumps(
+        {
+            case["name"]: {
+                "y1": case["y1"],
+                "y2": case["y2"],
+                "weights": case["weights"],
+            }
+            for case in CASES
+        },
+        indent=2,
+    )
+)
+print(f"\n# Wrote {OUT_INPUTS}")

tests/evaluation/fixtures/sklearn_kappa_inputs.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+  "imbalanced_binary": {
+    "y1": [
+      1,
+      1,
+      1,
+      0,
+      1,
+      1,
+      0,
+      1,
+      1,
+      1
+    ],
+    "y2": [
+      1,
+      1,
+      0,
+      0,
+      1,
+      1,
+      1,
+      1,
+      1,
+      0
+    ],
+    "weights": null
+  },
+  "three_point_one_diagonal_swap": {
+    "y1": [
+      0,
+      0,
+      1,
+      1,
+      2,
+      2,
+      0,
+      1,
+      2,
+      0
+    ],
+    "y2": [
+      0,
+      1,
+      1,
+      1,
+      2,
+      2,
+      0,
+      1,
+      2,
+      0
+    ],
+    "weights": null
+  },
+  "weighted_ordinal_drift_linear": {
+    "y1": [
+      0,
+      1,
+      2,
+      0,
+      1,
+      2,
+      0,
+      1,
+      2,
+      0
+    ],
+    "y2": [
+      0,
+      1,
+      2,
+      1,
+      1,
+      2,
+      0,
+      2,
+      2,
+      1
+    ],
+    "weights": "linear"
+  }
+}

tests/evaluation/test_calibration_metrics.py CHANGED Viewed

@@ -87,12 +87,11 @@ class TestBootstrapCI:
 # constants are the contract; the project hand-rolls the implementation.
 SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
-    # PASTE OUTPUT FROM scripts/_dev/generate_kappa_fixtures.py HERE
-    # Placeholder values — replace by running the generator script in a
-    # venv with sklearn installed (see Phase 4 Task 4.2 Step 2).
-    "imbalanced_binary": 0.0,
-    "three_point_one_diagonal_swap": 0.0,
-    "weighted_ordinal_drift_linear": 0.0,
 }
 SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
@@ -114,17 +113,14 @@ SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
 }
-@pytest.mark.skip(
-    reason="Placeholder fixtures — regenerate via scripts/_dev/generate_kappa_fixtures.py "
-    "in a venv with sklearn==1.5.2, paste output above, then unskip."
-)
 class TestSklearnKappaParity:
     @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES.keys()))
     def test_matches_sklearn(self, case_name: str):
         case = SKLEARN_KAPPA_INPUTS[case_name]
         expected = SKLEARN_KAPPA_FIXTURES[case_name]
         actual = cohen_kappa(case["y1"], case["y2"], weights=case["weights"])
-        assert actual == pytest.approx(expected, abs=1e-9), (
             f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
             f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
         )

 # constants are the contract; the project hand-rolls the implementation.
 SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
+    # Generated against scikit-learn==1.5.2 cohen_kappa_score on 2026-05-04.
+    # To regenerate: scripts/_dev/generate_kappa_fixtures.py
+    "imbalanced_binary": 0.2105263158,
+    "three_point_one_diagonal_swap": 0.8507462687,
+    "weighted_ordinal_drift_linear": 0.6666666667,
 }
 SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
 }
 class TestSklearnKappaParity:
     @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES.keys()))
     def test_matches_sklearn(self, case_name: str):
         case = SKLEARN_KAPPA_INPUTS[case_name]
         expected = SKLEARN_KAPPA_FIXTURES[case_name]
         actual = cohen_kappa(case["y1"], case["y2"], weights=case["weights"])
+        # Tolerance 1e-7 accommodates sklearn's printed precision of 10 decimals
+        assert actual == pytest.approx(expected, abs=1e-7), (
             f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
             f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
         )