Nomearod Claude Opus 4.7 (1M context) committed on
Commit
7b72b2c
·
1 Parent(s): 76e370c

feat(judges): Rubric markdown loader with aggressive validation

Browse files

Rubric loads from markdown with YAML frontmatter; validates scale,
arity-matches-scale, anchored-example-per-level, frontmatter
required fields. ValidationError raises with file path + field
context so malformed rubrics fail at construction (Day 1) not at
first judge.score call (Day 2 with API budget spent).

source_hash is SHA-256 of body_markdown — immutable per file
content, independent of git state. Used as ScoreResult.rubric_version
so κ aggregation can group by rubric identity without cross-
referencing run metadata.

render_prompt(level_permutation_seed=N) deterministically permutes
the ## Score sections via seeded PRNG. Seed=0 returns canonical
order; this is the variance-control hook used by rubric_permute
in Phase 3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/judges/base.py CHANGED
@@ -8,8 +8,13 @@ rationale and the six-axis comparison table.
8
 
9
  from __future__ import annotations
10
 
11
- from typing import Literal
 
 
 
 
12
 
 
13
  from pydantic import BaseModel, Field
14
 
15
  # --- Abstain-reason constants ---
@@ -53,3 +58,144 @@ class ScoreResult(BaseModel):
53
  @property
54
  def abstained(self) -> bool:
55
  return self.score == "Unknown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  from __future__ import annotations
10
 
11
+ import hashlib
12
+ import random
13
+ import re
14
+ from pathlib import Path
15
+ from typing import Literal, Self
16
 
17
+ import yaml
18
  from pydantic import BaseModel, Field
19
 
20
  # --- Abstain-reason constants ---
 
58
  @property
59
  def abstained(self) -> bool:
60
  return self.score == "Unknown"
61
+
62
+
63
class RubricLevel(BaseModel):
    """A single score level of a rubric together with its anchored examples.

    Each instance corresponds to one `## Score N` section of the rubric
    markdown. The H3 sub-sections (`### Example X`) found inside that
    section are kept verbatim in `examples`.
    """

    # Integer taken from the `## Score N` header.
    score: int
    # Section text that precedes the first `###` sub-header.
    description: str
    # Raw markdown of each `### Example` sub-section, header line included.
    examples: list[str]
75
+
76
+
77
class Rubric(BaseModel):
    """A scoring rubric loaded from a markdown file with YAML frontmatter.

    `from_markdown_file` validates aggressively so a malformed rubric fails
    at load time rather than at the first judge call: the frontmatter must be
    a mapping with all required fields, `scale` must be `binary` or
    `three_point`, the boolean flags must be real YAML booleans, the number
    of `## Score N` sections must match the scale, score headers must be
    unique, and every level needs at least one `### Example` sub-section.
    Each of these failures raises `ValueError` naming the rubric file.
    """

    dimension: Literal[
        "groundedness", "relevance", "completeness", "citation_faithfulness"
    ]
    scale: Literal["binary", "three_point"]
    reference_based: bool
    abstain_allowed: bool
    levels: list[RubricLevel]
    body_markdown: str

    @property
    def source_hash(self) -> str:
        """Return the SHA-256 hex digest of `body_markdown` (UTF-8 encoded).

        `from_markdown_file` stores the complete file text, frontmatter
        included, in `body_markdown`, so the digest depends only on the file
        content.
        """
        return hashlib.sha256(self.body_markdown.encode("utf-8")).hexdigest()

    @staticmethod
    def _score_sections(text: str) -> list[re.Match[str]]:
        """Return one regex match per `## Score N` section found in *text*.

        Group 1 is the score digits, group 2 the section body, and group 0
        the full section including its header line.
        """
        pattern = re.compile(
            r"^## Score (\d+)\n(.*?)(?=^## Score |\Z)", re.MULTILINE | re.DOTALL
        )
        return list(pattern.finditer(text))

    @classmethod
    def from_markdown_file(cls, path: Path | str) -> Self:
        """Load and validate a rubric from the markdown file at *path*.

        Raises:
            ValueError: if the frontmatter block is missing, is not valid
                YAML, is not a mapping, lacks a required field, declares an
                unknown scale or a non-boolean flag; if the number of
                `## Score N` sections does not match the scale; if a score
                header is repeated; or if a level has no `### Example`.
        """
        path = Path(path)
        body = path.read_text(encoding="utf-8")

        # Parse YAML frontmatter delimited by --- ... ---
        fm_match = re.match(r"^---\n(.+?)\n---\n(.*)$", body, re.DOTALL)
        if not fm_match:
            raise ValueError(
                f"Rubric {path.name}: missing YAML frontmatter "
                f"(expected --- ... --- block at top of file)"
            )
        try:
            frontmatter = yaml.safe_load(fm_match.group(1)) or {}
        except yaml.YAMLError as e:
            raise ValueError(
                f"Rubric {path.name}: frontmatter YAML parse error: {e}"
            ) from e
        # A YAML list or scalar parses fine but has no .keys(); report it as
        # a rubric error instead of letting an AttributeError escape.
        if not isinstance(frontmatter, dict):
            raise ValueError(
                f"Rubric {path.name}: frontmatter must be a YAML mapping, "
                f"got {type(frontmatter).__name__}"
            )

        required = {"dimension", "scale", "reference_based", "abstain_allowed"}
        missing = required - frontmatter.keys()
        if missing:
            raise ValueError(
                f"Rubric {path.name}: frontmatter missing fields: {sorted(missing)}"
            )

        scale = frontmatter["scale"]
        if scale not in ("binary", "three_point"):
            raise ValueError(
                f"Rubric {path.name}: invalid scale {scale!r}; "
                f"must be 'binary' or 'three_point'"
            )

        # bool() would turn any non-empty string (including "false") into
        # True, so insist on real YAML booleans.
        for flag in ("reference_based", "abstain_allowed"):
            if not isinstance(frontmatter[flag], bool):
                raise ValueError(
                    f"Rubric {path.name}: frontmatter field {flag!r} must be "
                    f"a boolean, got {frontmatter[flag]!r}"
                )

        # Parse levels by ## Score N headers
        body_no_fm = fm_match.group(2)
        raw_levels: list[tuple[int, str]] = [
            (int(m.group(1)), m.group(2)) for m in cls._score_sections(body_no_fm)
        ]

        expected_arity = 2 if scale == "binary" else 3
        if len(raw_levels) != expected_arity:
            raise ValueError(
                f"Rubric {path.name}: arity mismatch — scale {scale!r} "
                f"requires {expected_arity} levels, found {len(raw_levels)}"
            )

        # The arity check alone would accept the same score header twice.
        seen_scores: set[int] = set()
        for score, _ in raw_levels:
            if score in seen_scores:
                raise ValueError(
                    f"Rubric {path.name}: duplicate level header "
                    f"'## Score {score}'"
                )
            seen_scores.add(score)

        # Parse examples (### Example) per level
        example_pattern = re.compile(
            r"^### (Example .+?)\n(.*?)(?=^### |\Z)", re.MULTILINE | re.DOTALL
        )
        levels: list[RubricLevel] = []
        for score, level_body in raw_levels:
            examples = [m.group(0) for m in example_pattern.finditer(level_body)]
            if not examples:
                raise ValueError(
                    f"Rubric {path.name}: level Score {score} has no "
                    f"anchored example (expected at least one ### Example header)"
                )
            description = level_body.split("###", 1)[0].strip()
            levels.append(
                RubricLevel(score=score, description=description, examples=examples)
            )

        return cls(
            dimension=frontmatter["dimension"],
            scale=scale,
            reference_based=frontmatter["reference_based"],
            abstain_allowed=frontmatter["abstain_allowed"],
            levels=levels,
            body_markdown=body,
        )

    def render_prompt(self, *, level_permutation_seed: int = 0) -> str:
        """Render the rubric body for inclusion in a judge prompt.

        With `level_permutation_seed == 0` the stored markdown is returned
        unchanged. Any other seed reorders the `## Score N` sections using a
        PRNG seeded with that value, so the same seed always yields the same
        order. Frontmatter and the text before the first score section stay
        in place.
        """
        if level_permutation_seed == 0:
            return self.body_markdown
        fm_match = re.match(r"^(---\n.+?\n---\n)(.*)$", self.body_markdown, re.DOTALL)
        if not fm_match:
            return self.body_markdown  # defensive — no frontmatter to anchor on
        head, rest = fm_match.group(1), fm_match.group(2)

        matches = self._score_sections(rest)
        if matches and len(matches) == len(self.levels):
            # Reorder the original section text so that level order is the
            # only thing that changes: whitespace and any non-Example
            # sub-sections are carried along untouched.
            intro = rest[: matches[0].start()]
            # Normalise trailing newlines so a section moved away from the
            # end of the file cannot fuse with the header that follows it.
            sections = [m.group(0).rstrip("\n") + "\n" for m in matches]
        else:
            # body_markdown and levels disagree (e.g. the model was built
            # directly rather than from a file): rebuild from the levels.
            intro = re.split(r"^## Score ", rest, maxsplit=1, flags=re.MULTILINE)[0]
            sections = [
                f"## Score {lvl.score}\n{lvl.description}\n" + "\n".join(lvl.examples)
                for lvl in self.levels
            ]
        random.Random(level_permutation_seed).shuffle(sections)
        return head + intro + "\n".join(sections)
tests/evaluation/fixtures/rubrics_invalid_arity.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: groundedness
3
+ scale: binary
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Wrong arity (binary should have 2 levels, this has 3)
9
+
10
+ ## Score 0
11
+ example A
12
+
13
+ ## Score 1
14
+ example B
15
+
16
+ ## Score 2
17
+ example C
tests/evaluation/fixtures/rubrics_invalid_no_examples.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: groundedness
3
+ scale: binary
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Missing anchored examples
9
+
10
+ ## Score 0
11
+
12
+ Just a description, no anchored example.
13
+
14
+ ## Score 1
15
+
16
+ Same — no anchored example.
tests/evaluation/fixtures/rubrics_invalid_no_frontmatter.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # No frontmatter at all
2
+
3
+ ## Score 0
4
+ example
5
+
6
+ ## Score 1
7
+ example
tests/evaluation/fixtures/rubrics_invalid_scale.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: groundedness
3
+ scale: five_point
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Bad scale
9
+
10
+ ## Score 0
11
+ example
tests/evaluation/fixtures/rubrics_valid_binary.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: groundedness
3
+ scale: binary
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Groundedness (binary)
9
+
10
+ Score whether every claim in the answer is supported by the gold source snippets.
11
+
12
+ ## Score 0
13
+
14
+ Answer contains at least one claim not supported by the snippets.
15
+
16
+ ### Example A — answer cites unsupported fact
17
+
18
+ Question: "What's the default port?"
19
+ Snippets: ["The default is 8080."]
20
+ Answer: "The default is 8080 and supports TLS."
21
+
22
+ Score=0 because the TLS claim has no support in the snippet. The
23
+ unsupported claim is sufficient to fail groundedness regardless of
24
+ how many other claims are correctly grounded — this is the binary
25
+ rubric's strict-conjunction definition.
26
+
27
+ ## Score 1
28
+
29
+ Every claim in the answer is supported by at least one snippet.
30
+
31
+ ### Example B — fully grounded one-sentence answer
32
+
33
+ Question: "What's the default port?"
34
+ Snippets: ["The default is 8080."]
35
+ Answer: "The default port is 8080."
36
+
37
+ Score=1 because the only claim ("default port is 8080") is directly
38
+ supported by the snippet. Paraphrase is allowed; what matters is
39
+ factual entailment.
tests/evaluation/fixtures/rubrics_valid_three_point.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: relevance
3
+ scale: three_point
4
+ reference_based: false
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Relevance (three-point)
9
+
10
+ Does the answer address the user's question?
11
+
12
+ ## Score 0
13
+
14
+ Off-topic. Answer addresses a different question or is unintelligible.
15
+
16
+ ### Example A — wrong topic
17
+
18
+ Question: "How do I deploy to Kubernetes?"
19
+ Answer: "Python virtual environments isolate dependencies."
20
+
21
+ Score=0 because the answer is about Python venvs, not deployment.
22
+
23
+ ## Score 1
24
+
25
+ Partially relevant. Answer touches the question but misses the core ask.
26
+
27
+ ### Example B — adjacent but off-target
28
+
29
+ Question: "How do I deploy to Kubernetes?"
30
+ Answer: "Kubernetes runs containerized workloads on a cluster of nodes."
31
+
32
+ Score=1 because it's about Kubernetes but doesn't say how to deploy.
33
+
34
+ ## Score 2
35
+
36
+ Directly addresses the question.
37
+
38
+ ### Example C — on-target
39
+
40
+ Question: "How do I deploy to Kubernetes?"
41
+ Answer: "Apply a Deployment manifest with kubectl apply -f deployment.yaml."
42
+
43
+ Score=2 because it gives a concrete deployment action.
tests/evaluation/test_rubric_loading.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for Rubric markdown loader: construction validation, hash, permutation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from agent_bench.evaluation.judges.base import Rubric
10
+
11
+ FIXTURES = Path(__file__).parent / "fixtures"
12
+
13
+
14
class TestRubricLoading:
    """Happy-path loading of the two valid rubric fixtures."""

    def test_load_valid_binary(self):
        """The binary fixture loads with its frontmatter values and 2 levels."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
        assert rubric.dimension == "groundedness"
        assert rubric.scale == "binary"
        assert rubric.reference_based is True
        assert rubric.abstain_allowed is True
        assert len(rubric.levels) == 2

    def test_load_valid_three_point(self):
        """The three-point fixture loads with 3 levels."""
        rubric = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )
        assert rubric.dimension == "relevance"
        assert rubric.scale == "three_point"
        assert len(rubric.levels) == 3
28
+
29
+
30
class TestRubricValidationErrors:
    """Each invalid fixture must fail at load time with a descriptive error."""

    @pytest.mark.parametrize(
        "fixture_name,error_substring",
        [
            ("rubrics_invalid_scale.md", "scale"),
            ("rubrics_invalid_arity.md", "arity"),
            ("rubrics_invalid_no_examples.md", "anchored example"),
            ("rubrics_invalid_no_frontmatter.md", "frontmatter"),
        ],
    )
    def test_construction_raises_with_path_and_field(
        self, fixture_name: str, error_substring: str
    ):
        """The ValueError names the fixture file and the failing field."""
        with pytest.raises(ValueError) as exc_info:
            Rubric.from_markdown_file(FIXTURES / fixture_name)
        message = str(exc_info.value)
        # The message must identify the file (by name) and the reason.
        assert fixture_name in message, f"Path missing from error: {message}"
        assert error_substring in message.lower(), (
            f"Expected '{error_substring}' in error message: {message}"
        )
52
+
53
+
54
class TestRubricSourceHash:
    """`source_hash` is stable per file content and differs across files."""

    def test_source_hash_deterministic(self):
        """Loading the same file twice yields the same 64-char hex digest."""
        binary_path = FIXTURES / "rubrics_valid_binary.md"
        first = Rubric.from_markdown_file(binary_path)
        second = Rubric.from_markdown_file(binary_path)
        assert first.source_hash == second.source_hash
        # A SHA-256 hex digest is 64 characters long.
        assert len(first.source_hash) == 64

    def test_source_hash_changes_with_content(self):
        """Two fixtures with different content have different hashes."""
        binary = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
        three_point = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )
        assert binary.source_hash != three_point.source_hash
66
+
67
+
68
class TestRubricPermutation:
    """`render_prompt` level-permutation behaviour.

    Order is compared through the sequence of `## Score N` header lines
    rather than through whole-prompt equality, so a prompt that differs from
    the canonical one only in whitespace cannot pass as a different order.
    """

    @staticmethod
    def _header_order(prompt: str) -> list[str]:
        """Return the `## Score N` header lines of *prompt* in document order."""
        return [ln for ln in prompt.splitlines() if ln.startswith("## Score ")]

    def test_render_prompt_seed_0_unchanged(self):
        """Seed 0 returns the stored markdown byte-for-byte, levels 0, 1, 2."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        prompt = r.render_prompt(level_permutation_seed=0)
        assert prompt == r.body_markdown
        assert self._header_order(prompt) == [
            "## Score 0",
            "## Score 1",
            "## Score 2",
        ]

    def test_render_prompt_seed_reproducibility(self):
        """The same non-zero seed always renders the same prompt."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        p1 = r.render_prompt(level_permutation_seed=42)
        p2 = r.render_prompt(level_permutation_seed=42)
        assert p1 == p2

    def test_render_prompt_different_seed_different_order(self):
        """At least one seed reorders the levels; none adds or drops a level."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        canonical = self._header_order(r.render_prompt(level_permutation_seed=0))
        orders = [
            self._header_order(r.render_prompt(level_permutation_seed=s))
            for s in (1, 2, 3, 7, 13)
        ]
        # A permutation must keep exactly the same set of level headers.
        assert all(sorted(order) == sorted(canonical) for order in orders)
        # With 3! = 6 permutations, the chance that all 5 seeds produce the
        # identity order is (1/6)^5 ≈ 1e-4, negligible.
        assert any(order != canonical for order in orders), (
            "No seed produced a level order different from default"
        )