Nomearod Claude Opus 4.7 (1M context) committed on
Commit
9255fb5
·
1 Parent(s): 508e5ef

fix(judges): four review-blocking bugs (review items 1–4 + 8)

Browse files

#1 — harness.py CompletenessJudge gate restored. Pre-supersession code
gated correctness on `if q.reference_answer:`; the new per-dimension
loop iterated all dims unconditionally, burning tokens on guaranteed-
noisy verdicts when reference_answer was empty. Now: skip completeness
when reference_answer is falsy, matching the prior contract. Test
asserts the gate by mocking a judge_provider and confirming
'completeness' is absent from judge_scores when reference is "".

#2 — Rubric loader was fence-blind. `## Score N` literals inside
fenced code blocks in anchored examples were counted as structural
level headers, producing arity-mismatch errors on rubrics that wanted
to quote header-shaped strings (which the design encourages). Fix:
mask fenced regions with same-length whitespace before scanning for
level headers, then slice the original body at the masked-text header
positions to recover level bodies with their fenced content intact.
New fixture rubrics_valid_with_fenced_examples.md exercises the case;
test was failing before this change.

#3 — Jury kappa_weighted contradicted ties-to-lower policy. The
`mean` aggregation path discretizes via _aggregate_scores (frac > 0.5
→ ceil, else floor; ties go to floor). The `kappa_weighted` path went
through int(round(weighted_mean)) which is Python's banker's rounding
(0.5 → 0, 1.5 → 2). Result: two judges scoring [1, 2] with equal
weights returned 1 under `mean` and 2 under `kappa_weighted`. Now:
extracted _discretize_mean helper that mirrors _aggregate_scores
exactly. Test pins the equivalence at the half-integer boundary.

#4 — Jury reasoning string concealed the silent weight fallback.
When the kappa_weighted weights dict was missing a member's judge_id,
runtime fell back to 1.0 silently — but the reasoning string printed
the constructor's dict (`list(self.weights.values())`), so anyone
debugging a calibration row saw the configured weights, not the
applied ones. Now: reasoning reports per-successful-member applied
weights; a structlog WARN ('jury_missing_weight_fallback_to_one')
fires for each fallback so operators notice the contract violation.
Two regression tests: applied-weights-in-reasoning, warn-on-missing.

#8 — Hoisted vestigial inline imports in harness.py from the
TYPE_CHECKING attempt. ScoreResult is already module-top imported,
no cycle risk. _JUDGE_CLASS_BY_DIMENSION is now a module-level
constant.

All 514 tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/harness.py CHANGED
@@ -8,9 +8,13 @@ from pathlib import Path
8
  from pydantic import BaseModel, Field
9
 
10
  from agent_bench.agents.orchestrator import Orchestrator
 
11
  from agent_bench.core.provider import LLMProvider
12
  from agent_bench.core.types import TokenUsage
13
- from agent_bench.evaluation.judges.base import ScoreResult
 
 
 
14
  from agent_bench.evaluation.metrics import (
15
  calculator_used_when_expected,
16
  citation_accuracy,
@@ -22,6 +26,12 @@ from agent_bench.evaluation.metrics import (
22
  tool_call_count,
23
  )
24
 
 
 
 
 
 
 
25
 
26
  class GoldenQuestion(BaseModel):
27
  id: str
@@ -155,24 +165,19 @@ async def run_evaluation(
155
  # behavior); the q.category != 'out_of_scope' gate is preserved
156
  # (L2 doesn't apply to refusals — that's L1's job).
157
  if judge_provider is not None and q.category != "out_of_scope":
158
- from agent_bench.core.config import load_config
159
- from agent_bench.evaluation.judges.base import Rubric
160
- from agent_bench.evaluation.judges.completeness import CompletenessJudge
161
- from agent_bench.evaluation.judges.groundedness import GroundednessJudge
162
- from agent_bench.evaluation.judges.relevance import RelevanceJudge
163
-
164
  cfg = load_config()
165
  rubric_dir = Path(__file__).resolve().parent / "rubrics"
166
- judge_class = {
167
- "groundedness": GroundednessJudge,
168
- "relevance": RelevanceJudge,
169
- "completeness": CompletenessJudge,
170
- }
171
  for dim in cfg.evaluation.judge_dimensions:
172
- if dim not in judge_class:
173
  continue # citation_faithfulness opt-in; not in default loop
 
 
 
 
 
 
174
  rubric = Rubric.from_markdown_file(rubric_dir / f"{dim}.md")
175
- judge = judge_class[dim](
176
  judge_provider=judge_provider,
177
  rubric=rubric,
178
  model_id=getattr(judge_provider, "model", "unknown"),
 
8
  from pydantic import BaseModel, Field
9
 
10
  from agent_bench.agents.orchestrator import Orchestrator
11
+ from agent_bench.core.config import load_config
12
  from agent_bench.core.provider import LLMProvider
13
  from agent_bench.core.types import TokenUsage
14
+ from agent_bench.evaluation.judges.base import Rubric, ScoreResult
15
+ from agent_bench.evaluation.judges.completeness import CompletenessJudge
16
+ from agent_bench.evaluation.judges.groundedness import GroundednessJudge
17
+ from agent_bench.evaluation.judges.relevance import RelevanceJudge
18
  from agent_bench.evaluation.metrics import (
19
  calculator_used_when_expected,
20
  citation_accuracy,
 
26
  tool_call_count,
27
  )
28
 
29
+ _JUDGE_CLASS_BY_DIMENSION = {
30
+ "groundedness": GroundednessJudge,
31
+ "relevance": RelevanceJudge,
32
+ "completeness": CompletenessJudge,
33
+ }
34
+
35
 
36
  class GoldenQuestion(BaseModel):
37
  id: str
 
165
  # behavior); the q.category != 'out_of_scope' gate is preserved
166
  # (L2 doesn't apply to refusals — that's L1's job).
167
  if judge_provider is not None and q.category != "out_of_scope":
 
 
 
 
 
 
168
  cfg = load_config()
169
  rubric_dir = Path(__file__).resolve().parent / "rubrics"
 
 
 
 
 
170
  for dim in cfg.evaluation.judge_dimensions:
171
+ if dim not in _JUDGE_CLASS_BY_DIMENSION:
172
  continue # citation_faithfulness opt-in; not in default loop
173
+ # CompletenessJudge is reference-based on q.reference_answer;
174
+ # scoring an empty reference is guaranteed-noisy and burns
175
+ # tokens. Pre-supersession code had the same gate (correctness
176
+ # was conditional on reference_answer being non-empty).
177
+ if dim == "completeness" and not q.reference_answer:
178
+ continue
179
  rubric = Rubric.from_markdown_file(rubric_dir / f"{dim}.md")
180
+ judge = _JUDGE_CLASS_BY_DIMENSION[dim](
181
  judge_provider=judge_provider,
182
  rubric=rubric,
183
  model_id=getattr(judge_provider, "model", "unknown"),
agent_bench/evaluation/judges/base.py CHANGED
@@ -77,6 +77,22 @@ class ScoreResult(BaseModel):
77
  return self.score == "Unknown"
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  class RubricLevel(BaseModel):
81
  """One score level in a rubric, with anchored examples.
82
 
@@ -150,14 +166,25 @@ class Rubric(BaseModel):
150
  f"must be 'binary' or 'three_point'"
151
  )
152
 
153
- # Parse levels by ## Score N headers
 
 
 
 
 
154
  body_no_fm = fm_match.group(2)
155
- level_pattern = re.compile(
156
- r"^## Score (\d+)\n(.*?)(?=^## Score |\Z)", re.MULTILINE | re.DOTALL
157
- )
158
- raw_levels: list[tuple[int, str]] = [
159
- (int(m.group(1)), m.group(2)) for m in level_pattern.finditer(body_no_fm)
160
- ]
 
 
 
 
 
 
161
 
162
  expected_arity = 2 if scale == "binary" else 3
163
  if len(raw_levels) != expected_arity:
 
77
  return self.score == "Unknown"
78
 
79
 
80
+ _FENCE_PATTERN = re.compile(r"^```[^\n]*\n.*?^```\n?", re.MULTILINE | re.DOTALL)
81
+
82
+
83
+ def _mask_code_fences(text: str) -> str:
84
+ """Replace fenced code blocks (``` ... ```) with same-length whitespace,
85
+ preserving newlines so byte offsets align with the original. Used by
86
+ the rubric loader to skip fenced ``## Score N`` literals when scanning
87
+ for structural level headers.
88
+ """
89
+
90
+ def _replace(match: re.Match[str]) -> str:
91
+ return "".join("\n" if c == "\n" else " " for c in match.group(0))
92
+
93
+ return _FENCE_PATTERN.sub(_replace, text)
94
+
95
+
96
  class RubricLevel(BaseModel):
97
  """One score level in a rubric, with anchored examples.
98
 
 
166
  f"must be 'binary' or 'three_point'"
167
  )
168
 
169
+ # Parse levels by ## Score N headers. Mask fenced code blocks first
170
+ # so a literal "## Score N" inside an example's code fence is not
171
+ # interpreted as a structural level header. The mask preserves character
172
+ # offsets (replacing non-newline chars with spaces) so we can slice
173
+ # the original `body_no_fm` at the masked-text header positions to
174
+ # recover level bodies with their fenced content intact.
175
  body_no_fm = fm_match.group(2)
176
+ masked_body = _mask_code_fences(body_no_fm)
177
+ header_pattern = re.compile(r"^## Score (\d+)\n", re.MULTILINE)
178
+ header_matches = list(header_pattern.finditer(masked_body))
179
+ raw_levels: list[tuple[int, str]] = []
180
+ for i, m in enumerate(header_matches):
181
+ start = m.end()
182
+ end = (
183
+ header_matches[i + 1].start()
184
+ if i + 1 < len(header_matches)
185
+ else len(body_no_fm)
186
+ )
187
+ raw_levels.append((int(m.group(1)), body_no_fm[start:end]))
188
 
189
  expected_arity = 2 if scale == "binary" else 3
190
  if len(raw_levels) != expected_arity:
agent_bench/evaluation/variance/jury.py CHANGED
@@ -6,6 +6,8 @@ import asyncio
6
  from pathlib import Path
7
  from typing import TYPE_CHECKING, Literal
8
 
 
 
9
  from agent_bench.evaluation.judges.base import Judge, ScoreResult
10
  from agent_bench.evaluation.variance.rubric_permute import _aggregate_scores
11
 
@@ -15,6 +17,21 @@ if TYPE_CHECKING:
15
 
16
  _DEFAULT_SIDECAR_TEMPLATE = "results/calibration_v1_judge_{aggregation}_members.jsonl"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  class Jury:
20
  """Aggregates a list of Judge instances into one ScoreResult per item.
@@ -94,21 +111,38 @@ class Jury:
94
  # Aggregate over successful members
95
  scores = [int(r.score) for r in successful]
96
  scale = self.judges[0].rubric.scale
 
97
  if self.aggregation == "mean":
98
  agg = _aggregate_scores(scores, scale)
99
  else: # kappa_weighted
100
- # Weight successful members by judge_id; missing weights → 1.0 (mean fallback)
101
- ws = [self.weights.get(r.judge_id, 1.0) for r in successful]
102
- weighted_sum = sum(s * w for s, w in zip(scores, ws))
103
- weight_total = sum(ws)
104
- mean = weighted_sum / weight_total if weight_total > 0 else 0.0
105
- agg = _aggregate_scores([int(round(mean))], scale)
106
-
107
- weights_str = (
108
- list(self.weights.values())
109
- if self.aggregation == "kappa_weighted"
110
- else "n/a"
111
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  return ScoreResult(
113
  reasoning=(
114
  f"jury_{self.aggregation}: "
 
6
  from pathlib import Path
7
  from typing import TYPE_CHECKING, Literal
8
 
9
+ import structlog
10
+
11
  from agent_bench.evaluation.judges.base import Judge, ScoreResult
12
  from agent_bench.evaluation.variance.rubric_permute import _aggregate_scores
13
 
 
17
 
18
  _DEFAULT_SIDECAR_TEMPLATE = "results/calibration_v1_judge_{aggregation}_members.jsonl"
19
 
20
+ logger = structlog.get_logger()
21
+
22
+
23
+ def _discretize_mean(mean: float, scale: str) -> int:
24
+ """Discretize a float mean to a discrete level per scale, ties → lower
25
+ (mirrors `_aggregate_scores`'s policy without going through int(round())
26
+ which would invoke Python's banker's rounding and silently violate the
27
+ tie-breaking contract).
28
+ """
29
+ if scale == "binary":
30
+ return 1 if mean > 0.5 else 0
31
+ floor = int(mean)
32
+ frac = mean - floor
33
+ return floor + 1 if frac > 0.5 else floor
34
+
35
 
36
  class Jury:
37
  """Aggregates a list of Judge instances into one ScoreResult per item.
 
111
  # Aggregate over successful members
112
  scores = [int(r.score) for r in successful]
113
  scale = self.judges[0].rubric.scale
114
+ applied_weights: list[float] = []
115
  if self.aggregation == "mean":
116
  agg = _aggregate_scores(scores, scale)
117
  else: # kappa_weighted
118
+ # Weight successful members by judge_id; missing weights → 1.0
119
+ # (mean fallback). Warn loudly when this fallback fires —
120
+ # `kappa_weighted` is supposed to use explicit weights, and
121
+ # silently substituting 1.0 violates that contract.
122
+ for r in successful:
123
+ if r.judge_id not in self.weights:
124
+ logger.warning(
125
+ "jury_missing_weight_fallback_to_one",
126
+ judge_id=r.judge_id,
127
+ aggregation=self.aggregation,
128
+ configured_weights=sorted(self.weights.keys()),
129
+ )
130
+ applied_weights.append(self.weights.get(r.judge_id, 1.0))
131
+ weighted_sum = sum(s * w for s, w in zip(scores, applied_weights))
132
+ weight_total = sum(applied_weights)
133
+ weighted_mean = (
134
+ weighted_sum / weight_total if weight_total > 0 else 0.0
135
+ )
136
+ # Discretize via the shared ties-to-lower policy (NOT int(round())
137
+ # which uses banker's rounding and would diverge from the `mean`
138
+ # path on half-integer aggregates).
139
+ agg = _discretize_mean(weighted_mean, scale)
140
+
141
+ # Reasoning string reports the per-member weights actually applied
142
+ # (not the constructor's dict — the dict may be missing entries that
143
+ # silently fell back to 1.0; printing the constructor's dict would
144
+ # conceal that fallback from anyone debugging a calibration row).
145
+ weights_str = applied_weights if self.aggregation == "kappa_weighted" else "n/a"
146
  return ScoreResult(
147
  reasoning=(
148
  f"jury_{self.aggregation}: "
tests/evaluation/fixtures/rubrics_valid_with_fenced_examples.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: groundedness
3
+ scale: binary
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Groundedness with fenced code examples
9
+
10
+ ## Score 0
11
+
12
+ Answer adds an unsupported claim.
13
+
14
+ ### Example A — answer references nonexistent score in a code fence
15
+
16
+ The agent's answer might contain markdown that LOOKS like a section header
17
+ but is actually inside a code fence. Example output:
18
+
19
+ ```markdown
20
+ ## Score 7
21
+ This isn't a real rubric level — it's a string that happens to match the
22
+ level-header pattern, embedded in a code-fence example.
23
+ ```
24
+
25
+ Score=0 because the cited claim above is fabricated; the rubric loader
26
+ must not interpret the fenced `## Score 7` as a real level.
27
+
28
+ ## Score 1
29
+
30
+ Every claim is supported.
31
+
32
+ ### Example B — fenced reference excerpt
33
+
34
+ The agent might quote a config snippet with a header inside:
35
+
36
+ ```yaml
37
+ # Config heading
38
+ ## Score handler
39
+ score_handler: default
40
+ ```
41
+
42
+ Score=1 because the fenced YAML is illustrative, not a rubric-structural
43
+ header.
tests/evaluation/test_harness_migration.py CHANGED
@@ -2,7 +2,14 @@
2
 
3
  from __future__ import annotations
4
 
 
 
 
 
 
5
  from agent_bench.core.config import EvaluationConfig
 
 
6
 
7
 
8
  class TestJudgeProviderConfigPreserved:
@@ -33,3 +40,76 @@ class TestEvalResultJudgeScores:
33
  assert "judge_scores" in fields, (
34
  "judge_scores: dict[str, ScoreResult] should be added"
35
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
+ from unittest.mock import AsyncMock
6
+
7
+ import pytest
8
+
9
+ from agent_bench.agents.orchestrator import AgentResponse, SourceReference
10
  from agent_bench.core.config import EvaluationConfig
11
+ from agent_bench.core.provider import LLMProvider
12
+ from agent_bench.core.types import CompletionResponse, TokenUsage
13
 
14
 
15
  class TestJudgeProviderConfigPreserved:
 
40
  assert "judge_scores" in fields, (
41
  "judge_scores: dict[str, ScoreResult] should be added"
42
  )
43
+
44
+
45
+ def _mk_judge_response(score: int) -> CompletionResponse:
46
+ import json
47
+
48
+ return CompletionResponse(
49
+ content=json.dumps(
50
+ {"reasoning": "r", "evidence_quotes": [], "score": score}
51
+ ),
52
+ tool_calls=[],
53
+ usage=TokenUsage(input_tokens=10, output_tokens=10, estimated_cost_usd=0.0),
54
+ provider="mock",
55
+ model="m",
56
+ latency_ms=1.0,
57
+ )
58
+
59
+
60
+ class TestCompletenessGatedOnReferenceAnswer:
61
+ """Regression: pre-supersession code gated correctness on
62
+ `if q.reference_answer:` — the new per-dimension loop must preserve
63
+ that gate so empty references don't burn tokens on guaranteed-noisy
64
+ verdicts.
65
+ """
66
+
67
+ @pytest.mark.asyncio
68
+ async def test_empty_reference_answer_skips_completeness_judge(self, tmp_path):
69
+ from agent_bench.agents.orchestrator import Orchestrator
70
+ from agent_bench.evaluation.harness import run_evaluation
71
+
72
+ # Minimal golden item with an EMPTY reference_answer
73
+ golden_path = tmp_path / "golden.json"
74
+ golden_path.write_text(
75
+ '[{"id": "q1", "question": "?", "expected_answer_keywords": [],'
76
+ ' "expected_sources": [], "category": "retrieval",'
77
+ ' "difficulty": "easy", "requires_calculator": false,'
78
+ ' "reference_answer": ""}]'
79
+ )
80
+
81
+ # Mock orchestrator returning a fixed AgentResponse
82
+ orch = AsyncMock(spec=Orchestrator)
83
+ orch.run.return_value = AgentResponse(
84
+ answer="Some answer.",
85
+ sources=[SourceReference(source="a.md")],
86
+ ranked_sources=["a.md"],
87
+ source_chunks=["chunk a"],
88
+ iterations=1,
89
+ usage=TokenUsage(
90
+ input_tokens=0, output_tokens=0, estimated_cost_usd=0.0
91
+ ),
92
+ latency_ms=0.0,
93
+ )
94
+
95
+ # Track calls to the judge provider
96
+ judge_provider = AsyncMock(spec=LLMProvider)
97
+ judge_provider.complete.return_value = _mk_judge_response(1)
98
+ judge_provider.model = "test-model"
99
+
100
+ results = await run_evaluation(
101
+ orchestrator=orch,
102
+ system_prompt="x",
103
+ golden_path=golden_path,
104
+ judge_provider=judge_provider,
105
+ )
106
+
107
+ assert len(results) == 1
108
+ # Groundedness + relevance should run; completeness must be skipped
109
+ # because reference_answer == ""
110
+ assert "completeness" not in results[0].judge_scores, (
111
+ "CompletenessJudge ran with empty reference_answer — "
112
+ "should be gated on q.reference_answer truthiness"
113
+ )
114
+ assert "groundedness" in results[0].judge_scores
115
+ assert "relevance" in results[0].judge_scores
tests/evaluation/test_jury_aggregation.py CHANGED
@@ -165,6 +165,96 @@ class TestJury:
165
  with pytest.raises(ValueError, match="weights"):
166
  jury(judges=[j1], aggregation="kappa_weighted")
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  @pytest.mark.asyncio
169
  async def test_cancel_on_non_retryable(self, tmp_path):
170
  """Non-retryable exception in any member must propagate immediately."""
 
165
  with pytest.raises(ValueError, match="weights"):
166
  jury(judges=[j1], aggregation="kappa_weighted")
167
 
168
+ @pytest.mark.asyncio
169
+ async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
170
+ """Regression for ties-to-lower divergence between mean and
171
+ kappa_weighted paths. Two judges score [1, 2] with equal weights;
172
+ weighted mean == 1.5. The mean path returns 1 (ties-to-lower); the
173
+ kappa_weighted path must also return 1 — banker's rounding would
174
+ return 2 and silently violate the policy.
175
+ """
176
+ from agent_bench.evaluation.variance.jury import jury
177
+
178
+ j1 = _relevance_judge_with_responses([_vj(1)])
179
+ j1.judge_id = "claude-haiku_relevance"
180
+ j2 = _relevance_judge_with_responses([_vj(2)])
181
+ j2.judge_id = "gpt-4o-mini_relevance"
182
+
183
+ weights = {"claude-haiku_relevance": 1.0, "gpt-4o-mini_relevance": 1.0}
184
+ ju = jury(
185
+ judges=[j1, j2],
186
+ aggregation="kappa_weighted",
187
+ weights=weights,
188
+ sidecar_path=tmp_path / "jury.jsonl",
189
+ )
190
+ result = await ju.score(_item(), _output())
191
+ assert result.score == 1, (
192
+ f"kappa_weighted with equal weights on [1, 2] returned "
193
+ f"{result.score}; expected 1 (ties-to-lower per "
194
+ f"_aggregate_scores policy). banker's-rounding bug?"
195
+ )
196
+
197
+ @pytest.mark.asyncio
198
+ async def test_kappa_weighted_reasoning_reports_applied_weights_not_dict(
199
+ self, tmp_path
200
+ ):
201
+ """Regression: when the weights dict is missing a member's judge_id,
202
+ the runtime applies 1.0 silently. The reasoning string MUST report
203
+ the per-member weights actually used (so the fallback is visible),
204
+ not the constructor's dict (which would conceal it).
205
+ """
206
+ from agent_bench.evaluation.variance.jury import jury
207
+
208
+ j1 = _relevance_judge_with_responses([_vj(2)])
209
+ j1.judge_id = "claude-haiku_relevance"
210
+ j2 = _relevance_judge_with_responses([_vj(2)])
211
+ j2.judge_id = "gpt-4o-mini_relevance"
212
+
213
+ # weights dict only covers j1 — j2 should fall back to 1.0
214
+ weights = {"claude-haiku_relevance": 5.0}
215
+ ju = jury(
216
+ judges=[j1, j2],
217
+ aggregation="kappa_weighted",
218
+ weights=weights,
219
+ sidecar_path=tmp_path / "jury.jsonl",
220
+ )
221
+ result = await ju.score(_item(), _output())
222
+ # Reasoning must surface BOTH applied weights (5.0 and 1.0)
223
+ assert "5.0" in result.reasoning, (
224
+ f"applied weight 5.0 missing from reasoning: {result.reasoning!r}"
225
+ )
226
+ assert "1.0" in result.reasoning, (
227
+ f"fallback weight 1.0 missing from reasoning: {result.reasoning!r}"
228
+ )
229
+
230
+ @pytest.mark.asyncio
231
+ async def test_kappa_weighted_logs_warning_on_missing_weight(self, tmp_path):
232
+ """Regression: silent 1.0 substitution for a missing judge_id should
233
+ emit a structlog WARN so the operator notices a contract violation.
234
+ """
235
+ import structlog
236
+
237
+ from agent_bench.evaluation.variance.jury import jury
238
+
239
+ j1 = _relevance_judge_with_responses([_vj(1)])
240
+ j1.judge_id = "claude-haiku_relevance"
241
+ j2 = _relevance_judge_with_responses([_vj(1)])
242
+ j2.judge_id = "gpt-4o-mini_relevance"
243
+
244
+ weights = {"claude-haiku_relevance": 1.0} # j2 missing
245
+ ju = jury(
246
+ judges=[j1, j2],
247
+ aggregation="kappa_weighted",
248
+ weights=weights,
249
+ sidecar_path=tmp_path / "jury.jsonl",
250
+ )
251
+ with structlog.testing.capture_logs() as logs:
252
+ await ju.score(_item(), _output())
253
+ assert any(
254
+ entry.get("event") == "jury_missing_weight_fallback_to_one"
255
+ for entry in logs
256
+ ), f"no missing-weight warning in {logs!r}"
257
+
258
  @pytest.mark.asyncio
259
  async def test_cancel_on_non_retryable(self, tmp_path):
260
  """Non-retryable exception in any member must propagate immediately."""
tests/evaluation/test_rubric_loading.py CHANGED
@@ -26,6 +26,21 @@ class TestRubricLoading:
26
  assert r.scale == "three_point"
27
  assert len(r.levels) == 3
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class TestRubricValidationErrors:
31
  @pytest.mark.parametrize(
 
26
  assert r.scale == "three_point"
27
  assert len(r.levels) == 3
28
 
29
+ def test_fenced_code_examples_do_not_break_level_count(self):
30
+ """Regression: the level-pattern regex must skip ``## Score N`` strings
31
+ that appear inside fenced code blocks. A binary rubric whose
32
+ Example A contains a code-fenced ``## Score 7`` literal should still
33
+ load as a 2-level binary rubric, not be rejected with arity mismatch.
34
+ """
35
+ r = Rubric.from_markdown_file(
36
+ FIXTURES / "rubrics_valid_with_fenced_examples.md"
37
+ )
38
+ assert r.dimension == "groundedness"
39
+ assert r.scale == "binary"
40
+ assert len(r.levels) == 2, (
41
+ f"fenced ## Score 7 leaked into level count; got {len(r.levels)} levels"
42
+ )
43
+
44
 
45
  class TestRubricValidationErrors:
46
  @pytest.mark.parametrize(