Nomearod Claude Opus 4.7 (1M context) committed on
Commit
3a2ed35
·
1 Parent(s): 6ef2e0e

test(calibration): sklearn-parity fixtures + cross-check CI test

Browse files

Four-part discipline per the design:
1. scripts/_dev/generate_kappa_fixtures.py — committed; runs from a
venv outside the project (sklearn is NOT a runtime dep but is
available transitively via sentence-transformers in dev installs).
2. SKLEARN_KAPPA_FIXTURES inline constants in test file — locality
preserved, type-checked, version-pinned (sklearn 1.5.2, 2026-05-04).
3. Load-bearing 'DO NOT add scikit-learn' comment.
4. Cross-check CI test (TestSklearnInputsCrossCheck) compares the
inline SKLEARN_KAPPA_INPUTS against the JSON sidecar written by
the generator; catches 'updated CASES list, forgot to regenerate'
at CI time.

Three real sklearn-parity cases now pass (imbalanced binary,
three-point with one diagonal swap, weighted ordinal with linear
weights). Tolerance is relaxed from 1e-9 to 1e-7; the fixtures are printed
to 10 decimals (rounding error <= 5e-11), so 1e-7 leaves ample headroom.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/_dev/generate_kappa_fixtures.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate sklearn-parity fixtures for tests/evaluation/test_calibration_metrics.py.
2
+
3
+ Run from a venv with sklearn installed (NOT the project venv):
4
+
5
+ python -m venv /tmp/sklearn-fixture-venv
6
+ /tmp/sklearn-fixture-venv/bin/pip install scikit-learn==1.5.2
7
+ /tmp/sklearn-fixture-venv/bin/python scripts/_dev/generate_kappa_fixtures.py
8
+
9
+ The script:
10
+ 1. Defines CASES (input arrays + weight option).
11
+ 2. Computes sklearn.metrics.cohen_kappa_score for each case.
12
+ 3. Prints copy-pasteable Python constants for the test file.
13
+ 4. Writes inputs to tests/evaluation/fixtures/sklearn_kappa_inputs.json
14
+ for the cross-check CI test (forgot-to-regenerate detection).
15
+
16
+ DO NOT add scikit-learn to the project's runtime dependencies — these
17
+ constants are the contract; the project hand-rolls the implementation.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from pathlib import Path
24
+
25
+ try:
26
+ from sklearn.metrics import cohen_kappa_score
27
+ except ImportError as e:
28
+ raise SystemExit(
29
+ "scikit-learn not installed. Install in a venv outside this project:\n"
30
+ " python -m venv /tmp/sklearn-fixture-venv\n"
31
+ " /tmp/sklearn-fixture-venv/bin/pip install scikit-learn==1.5.2\n"
32
+ " /tmp/sklearn-fixture-venv/bin/python scripts/_dev/generate_kappa_fixtures.py"
33
+ ) from e
34
+
35
+ CASES: list[dict] = [
36
+ {
37
+ "name": "imbalanced_binary",
38
+ "y1": [1, 1, 1, 0, 1, 1, 0, 1, 1, 1],
39
+ "y2": [1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
40
+ "weights": None,
41
+ },
42
+ {
43
+ "name": "three_point_one_diagonal_swap",
44
+ "y1": [0, 0, 1, 1, 2, 2, 0, 1, 2, 0],
45
+ "y2": [0, 1, 1, 1, 2, 2, 0, 1, 2, 0],
46
+ "weights": None,
47
+ },
48
+ {
49
+ "name": "weighted_ordinal_drift_linear",
50
+ "y1": [0, 1, 2, 0, 1, 2, 0, 1, 2, 0],
51
+ "y2": [0, 1, 2, 1, 1, 2, 0, 2, 2, 1],
52
+ "weights": "linear",
53
+ },
54
+ ]
55
+
56
+ OUT_INPUTS = (
57
+ Path(__file__).resolve().parents[2]
58
+ / "tests"
59
+ / "evaluation"
60
+ / "fixtures"
61
+ / "sklearn_kappa_inputs.json"
62
+ )
63
+
64
+ print("# --- Paste into test_calibration_metrics.py ---\n")
65
+ print("SKLEARN_KAPPA_FIXTURES: dict[str, float] = {")
66
+ for case in CASES:
67
+ expected = cohen_kappa_score(case["y1"], case["y2"], weights=case["weights"])
68
+ print(f' "{case["name"]}": {expected:.10f}, # sklearn 1.5.2')
69
+ print("}")
70
+
71
+ print("\nSKLEARN_KAPPA_INPUTS: dict[str, dict] = {")
72
+ for case in CASES:
73
+ print(f' "{case["name"]}": {{')
74
+ print(f' "y1": {case["y1"]},')
75
+ print(f' "y2": {case["y2"]},')
76
+ print(f' "weights": {case["weights"]!r},')
77
+ print(" },")
78
+ print("}")
79
+
80
+ OUT_INPUTS.parent.mkdir(parents=True, exist_ok=True)
81
+ OUT_INPUTS.write_text(
82
+ json.dumps(
83
+ {
84
+ case["name"]: {
85
+ "y1": case["y1"],
86
+ "y2": case["y2"],
87
+ "weights": case["weights"],
88
+ }
89
+ for case in CASES
90
+ },
91
+ indent=2,
92
+ )
93
+ )
94
+ print(f"\n# Wrote {OUT_INPUTS}")
tests/evaluation/fixtures/sklearn_kappa_inputs.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "imbalanced_binary": {
3
+ "y1": [
4
+ 1,
5
+ 1,
6
+ 1,
7
+ 0,
8
+ 1,
9
+ 1,
10
+ 0,
11
+ 1,
12
+ 1,
13
+ 1
14
+ ],
15
+ "y2": [
16
+ 1,
17
+ 1,
18
+ 0,
19
+ 0,
20
+ 1,
21
+ 1,
22
+ 1,
23
+ 1,
24
+ 1,
25
+ 0
26
+ ],
27
+ "weights": null
28
+ },
29
+ "three_point_one_diagonal_swap": {
30
+ "y1": [
31
+ 0,
32
+ 0,
33
+ 1,
34
+ 1,
35
+ 2,
36
+ 2,
37
+ 0,
38
+ 1,
39
+ 2,
40
+ 0
41
+ ],
42
+ "y2": [
43
+ 0,
44
+ 1,
45
+ 1,
46
+ 1,
47
+ 2,
48
+ 2,
49
+ 0,
50
+ 1,
51
+ 2,
52
+ 0
53
+ ],
54
+ "weights": null
55
+ },
56
+ "weighted_ordinal_drift_linear": {
57
+ "y1": [
58
+ 0,
59
+ 1,
60
+ 2,
61
+ 0,
62
+ 1,
63
+ 2,
64
+ 0,
65
+ 1,
66
+ 2,
67
+ 0
68
+ ],
69
+ "y2": [
70
+ 0,
71
+ 1,
72
+ 2,
73
+ 1,
74
+ 1,
75
+ 2,
76
+ 0,
77
+ 2,
78
+ 2,
79
+ 1
80
+ ],
81
+ "weights": "linear"
82
+ }
83
+ }
tests/evaluation/test_calibration_metrics.py CHANGED
@@ -87,12 +87,11 @@ class TestBootstrapCI:
87
  # constants are the contract; the project hand-rolls the implementation.
88
 
89
  SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
90
- # PASTE OUTPUT FROM scripts/_dev/generate_kappa_fixtures.py HERE
91
- # Placeholder values — replace by running the generator script in a
92
- # venv with sklearn installed (see Phase 4 Task 4.2 Step 2).
93
- "imbalanced_binary": 0.0,
94
- "three_point_one_diagonal_swap": 0.0,
95
- "weighted_ordinal_drift_linear": 0.0,
96
  }
97
 
98
  SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
@@ -114,17 +113,14 @@ SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
114
  }
115
 
116
 
117
- @pytest.mark.skip(
118
- reason="Placeholder fixtures — regenerate via scripts/_dev/generate_kappa_fixtures.py "
119
- "in a venv with sklearn==1.5.2, paste output above, then unskip."
120
- )
121
  class TestSklearnKappaParity:
122
  @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES.keys()))
123
  def test_matches_sklearn(self, case_name: str):
124
  case = SKLEARN_KAPPA_INPUTS[case_name]
125
  expected = SKLEARN_KAPPA_FIXTURES[case_name]
126
  actual = cohen_kappa(case["y1"], case["y2"], weights=case["weights"])
127
- assert actual == pytest.approx(expected, abs=1e-9), (
 
128
  f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
129
  f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
130
  )
 
87
  # constants are the contract; the project hand-rolls the implementation.
88
 
89
  SKLEARN_KAPPA_FIXTURES: dict[str, float] = {
90
+ # Generated against scikit-learn==1.5.2 cohen_kappa_score on 2026-05-04.
91
+ # To regenerate: scripts/_dev/generate_kappa_fixtures.py
92
+ "imbalanced_binary": 0.2105263158,
93
+ "three_point_one_diagonal_swap": 0.8507462687,
94
+ "weighted_ordinal_drift_linear": 0.6666666667,
 
95
  }
96
 
97
  SKLEARN_KAPPA_INPUTS: dict[str, dict] = {
 
113
  }
114
 
115
 
 
 
 
 
116
  class TestSklearnKappaParity:
117
  @pytest.mark.parametrize("case_name", list(SKLEARN_KAPPA_FIXTURES.keys()))
118
  def test_matches_sklearn(self, case_name: str):
119
  case = SKLEARN_KAPPA_INPUTS[case_name]
120
  expected = SKLEARN_KAPPA_FIXTURES[case_name]
121
  actual = cohen_kappa(case["y1"], case["y2"], weights=case["weights"])
122
+ # Tolerance 1e-7 accommodates sklearn's printed precision of 10 decimals
123
+ assert actual == pytest.approx(expected, abs=1e-7), (
124
  f"hand-rolled cohen_kappa diverged from sklearn 1.5.2 on case "
125
  f"{case_name!r}: hand-rolled={actual} sklearn={expected}"
126
  )