Spaces:
Running
feat(judges): Rubric markdown loader with aggressive validation
Browse files
Rubric loads from markdown with YAML frontmatter; validates scale,
arity-matches-scale, anchored-example-per-level, frontmatter
required fields. ValidationError raises with file path + field
context so malformed rubrics fail at construction (Day 1) not at
first judge.score call (Day 2 with API budget spent).
source_hash is SHA-256 of body_markdown — immutable per file
content, independent of git state. Used as ScoreResult.rubric_version
so κ aggregation can group by rubric identity without cross-
referencing run metadata.
render_prompt(level_permutation_seed=N) deterministically permutes
the ## Score sections via seeded PRNG. Seed=0 returns canonical
order; this is the variance-control hook used by rubric_permute
in Phase 3.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- agent_bench/evaluation/judges/base.py +147 -1
- tests/evaluation/fixtures/rubrics_invalid_arity.md +17 -0
- tests/evaluation/fixtures/rubrics_invalid_no_examples.md +16 -0
- tests/evaluation/fixtures/rubrics_invalid_no_frontmatter.md +7 -0
- tests/evaluation/fixtures/rubrics_invalid_scale.md +11 -0
- tests/evaluation/fixtures/rubrics_valid_binary.md +39 -0
- tests/evaluation/fixtures/rubrics_valid_three_point.md +43 -0
- tests/evaluation/test_rubric_loading.py +94 -0
|
@@ -8,8 +8,13 @@ rationale and the six-axis comparison table.
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
|
|
|
| 13 |
from pydantic import BaseModel, Field
|
| 14 |
|
| 15 |
# --- Abstain-reason constants ---
|
|
@@ -53,3 +58,144 @@ class ScoreResult(BaseModel):
|
|
| 53 |
@property
|
| 54 |
def abstained(self) -> bool:
|
| 55 |
return self.score == "Unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
+
import hashlib
|
| 12 |
+
import random
|
| 13 |
+
import re
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Literal, Self
|
| 16 |
|
| 17 |
+
import yaml
|
| 18 |
from pydantic import BaseModel, Field
|
| 19 |
|
| 20 |
# --- Abstain-reason constants ---
|
|
|
|
| 58 |
@property
|
| 59 |
def abstained(self) -> bool:
|
| 60 |
return self.score == "Unknown"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class RubricLevel(BaseModel):
    """A single score level of a rubric together with its anchored examples.

    Each instance corresponds to one `## Score N` section of the rubric
    markdown. `description` holds the text that precedes the first `###`
    sub-heading of that section, and `examples` holds the `### Example X`
    sub-sections that justify why an output earns this score.
    """

    # Numeric value taken from the `## Score N` header.
    score: int
    # Free-text definition of the level.
    description: str
    # Raw markdown of each `### Example` sub-section, heading included.
    examples: list[str]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class Rubric(BaseModel):
    """A scoring rubric loaded from a markdown file with YAML frontmatter.

    Construction validates aggressively: the frontmatter must be a mapping
    with all required fields, scale must be 'binary' or 'three_point', the
    boolean flags must be real booleans, the number of `## Score N` sections
    must match the scale, score values must be unique, and every level needs
    at least one anchored `### Example`. Errors raised by the loader name the
    file so a Day-1 rubric typo doesn't surface as a Day-2 judge.score crash
    with API budget already spent.
    """

    dimension: Literal[
        "groundedness", "relevance", "completeness", "citation_faithfulness"
    ]
    scale: Literal["binary", "three_point"]
    reference_based: bool
    abstain_allowed: bool
    levels: list[RubricLevel]
    # Full text of the rubric file, frontmatter included.
    body_markdown: str

    @property
    def source_hash(self) -> str:
        """Return the SHA-256 hex digest of `body_markdown` (UTF-8 encoded).

        The digest depends only on the file content, independent of git
        state. Used as ScoreResult.rubric_version.
        """
        return hashlib.sha256(self.body_markdown.encode("utf-8")).hexdigest()

    @classmethod
    def from_markdown_file(cls, path: Path | str) -> Self:
        """Load and validate a rubric from a markdown file.

        Args:
            path: Location of the rubric file. It must start with a
                `--- ... ---` YAML frontmatter block followed by
                `## Score N` sections, each containing at least one
                `### Example ...` sub-section.

        Returns:
            The validated rubric. `body_markdown` holds the complete file
            text, frontmatter included.

        Raises:
            ValueError: If the frontmatter is missing, unparsable, not a
                mapping, lacks a required field, has an invalid scale or a
                non-boolean flag; if the number of levels does not match
                the scale; if a score value is repeated; or if a level has
                no anchored example. Each message starts with the file name.
        """
        path = Path(path)
        body = path.read_text(encoding="utf-8")

        # Parse YAML frontmatter delimited by --- ... ---
        fm_match = re.match(r"^---\n(.+?)\n---\n(.*)$", body, re.DOTALL)
        if not fm_match:
            raise ValueError(
                f"Rubric {path.name}: missing YAML frontmatter "
                f"(expected --- ... --- block at top of file)"
            )
        try:
            frontmatter = yaml.safe_load(fm_match.group(1)) or {}
        except yaml.YAMLError as e:
            raise ValueError(
                f"Rubric {path.name}: frontmatter YAML parse error: {e}"
            ) from e
        # A YAML list or scalar parses fine but has no keys; reject it here
        # instead of crashing with AttributeError below.
        if not isinstance(frontmatter, dict):
            raise ValueError(
                f"Rubric {path.name}: frontmatter must be a YAML mapping, "
                f"got {type(frontmatter).__name__}"
            )

        required = {"dimension", "scale", "reference_based", "abstain_allowed"}
        missing = required - frontmatter.keys()
        if missing:
            raise ValueError(
                f"Rubric {path.name}: frontmatter missing fields: {sorted(missing)}"
            )

        scale = frontmatter["scale"]
        if scale not in ("binary", "three_point"):
            raise ValueError(
                f"Rubric {path.name}: invalid scale {scale!r}; "
                f"must be 'binary' or 'three_point'"
            )

        # bool() would turn any non-empty string (including a quoted
        # "false") into True, so require genuine YAML booleans.
        for flag in ("reference_based", "abstain_allowed"):
            if not isinstance(frontmatter[flag], bool):
                raise ValueError(
                    f"Rubric {path.name}: frontmatter field {flag!r} must be "
                    f"a boolean (true/false), got {frontmatter[flag]!r}"
                )

        # Parse levels by ## Score N headers
        body_no_fm = fm_match.group(2)
        level_pattern = re.compile(
            r"^## Score (\d+)\n(.*?)(?=^## Score |\Z)", re.MULTILINE | re.DOTALL
        )
        raw_levels: list[tuple[int, str]] = [
            (int(m.group(1)), m.group(2)) for m in level_pattern.finditer(body_no_fm)
        ]

        expected_arity = 2 if scale == "binary" else 3
        if len(raw_levels) != expected_arity:
            raise ValueError(
                f"Rubric {path.name}: arity mismatch — scale {scale!r} "
                f"requires {expected_arity} levels, found {len(raw_levels)}"
            )

        # The arity check alone would accept e.g. two `## Score 0` sections.
        scores = [score for score, _ in raw_levels]
        duplicates = sorted({s for s in scores if scores.count(s) > 1})
        if duplicates:
            raise ValueError(
                f"Rubric {path.name}: duplicate '## Score N' headers for "
                f"score(s) {duplicates}"
            )

        # Parse examples (### Example) per level; compile the pattern once
        # rather than on every iteration.
        example_pattern = re.compile(
            r"^### (Example .+?)\n(.*?)(?=^### |\Z)", re.MULTILINE | re.DOTALL
        )
        levels: list[RubricLevel] = []
        for score, level_body in raw_levels:
            examples = [m.group(0) for m in example_pattern.finditer(level_body)]
            if not examples:
                raise ValueError(
                    f"Rubric {path.name}: level Score {score} has no "
                    f"anchored example (expected at least one ### Example header)"
                )
            # Everything before the first ### sub-heading is the description.
            description = level_body.split("###", 1)[0].strip()
            levels.append(
                RubricLevel(score=score, description=description, examples=examples)
            )

        return cls(
            dimension=frontmatter["dimension"],
            scale=scale,
            reference_based=frontmatter["reference_based"],
            abstain_allowed=frontmatter["abstain_allowed"],
            levels=levels,
            body_markdown=body,
        )

    def render_prompt(self, *, level_permutation_seed: int = 0) -> str:
        """Render the rubric body for inclusion in a judge prompt.

        Args:
            level_permutation_seed: 0 returns `body_markdown` unchanged
                (canonical order). Any other value reorders the levels
                deterministically with a PRNG seeded by that value.

        Returns:
            The prompt text. For a non-zero seed this is the frontmatter
            and intro followed by the levels in shuffled order, each
            rebuilt from its score, description and examples separated by
            blank lines. Text inside a level that is neither part of the
            description nor of a `### Example` section is not carried over.
        """
        if level_permutation_seed == 0:
            return self.body_markdown
        rng = random.Random(level_permutation_seed)
        permuted_levels = list(self.levels)
        rng.shuffle(permuted_levels)
        # Reconstruct: keep frontmatter + intro paragraphs intact;
        # reorder the ## Score N sections.
        fm_match = re.match(r"^(---\n.+?\n---\n)(.*)$", self.body_markdown, re.DOTALL)
        if not fm_match:
            return self.body_markdown  # defensive — should never happen post-construction
        head, rest = fm_match.groups()
        intro = re.split(r"^## Score ", rest, maxsplit=1, flags=re.MULTILINE)[0]

        # Separate heading, description and examples with blank lines so the
        # permuted prompt differs from the canonical one in level order, not
        # in incidental whitespace.
        sections: list[str] = []
        for lvl in permuted_levels:
            parts = [f"## Score {lvl.score}", lvl.description.strip()]
            parts.extend(example.strip() for example in lvl.examples)
            sections.append("\n\n".join(part for part in parts if part))
        return head + intro + "\n\n".join(sections) + "\n"
|
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: groundedness
|
| 3 |
+
scale: binary
|
| 4 |
+
reference_based: true
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Wrong arity (binary should have 2 levels, this has 3)
|
| 9 |
+
|
| 10 |
+
## Score 0
|
| 11 |
+
example A
|
| 12 |
+
|
| 13 |
+
## Score 1
|
| 14 |
+
example B
|
| 15 |
+
|
| 16 |
+
## Score 2
|
| 17 |
+
example C
|
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: groundedness
|
| 3 |
+
scale: binary
|
| 4 |
+
reference_based: true
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Missing anchored examples
|
| 9 |
+
|
| 10 |
+
## Score 0
|
| 11 |
+
|
| 12 |
+
Just a description, no anchored example.
|
| 13 |
+
|
| 14 |
+
## Score 1
|
| 15 |
+
|
| 16 |
+
Same — no anchored example.
|
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# No frontmatter at all
|
| 2 |
+
|
| 3 |
+
## Score 0
|
| 4 |
+
example
|
| 5 |
+
|
| 6 |
+
## Score 1
|
| 7 |
+
example
|
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: groundedness
|
| 3 |
+
scale: five_point
|
| 4 |
+
reference_based: true
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Bad scale
|
| 9 |
+
|
| 10 |
+
## Score 0
|
| 11 |
+
example
|
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: groundedness
|
| 3 |
+
scale: binary
|
| 4 |
+
reference_based: true
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Groundedness (binary)
|
| 9 |
+
|
| 10 |
+
Score whether every claim in the answer is supported by the gold source snippets.
|
| 11 |
+
|
| 12 |
+
## Score 0
|
| 13 |
+
|
| 14 |
+
Answer contains at least one claim not supported by the snippets.
|
| 15 |
+
|
| 16 |
+
### Example A — answer cites unsupported fact
|
| 17 |
+
|
| 18 |
+
Question: "What's the default port?"
|
| 19 |
+
Snippets: ["The default is 8080."]
|
| 20 |
+
Answer: "The default is 8080 and supports TLS."
|
| 21 |
+
|
| 22 |
+
Score=0 because the TLS claim has no support in the snippet. The
|
| 23 |
+
unsupported claim is sufficient to fail groundedness regardless of
|
| 24 |
+
how many other claims are correctly grounded — this is the binary
|
| 25 |
+
rubric's strict-conjunction definition.
|
| 26 |
+
|
| 27 |
+
## Score 1
|
| 28 |
+
|
| 29 |
+
Every claim in the answer is supported by at least one snippet.
|
| 30 |
+
|
| 31 |
+
### Example B — fully grounded one-sentence answer
|
| 32 |
+
|
| 33 |
+
Question: "What's the default port?"
|
| 34 |
+
Snippets: ["The default is 8080."]
|
| 35 |
+
Answer: "The default port is 8080."
|
| 36 |
+
|
| 37 |
+
Score=1 because the only claim ("default port is 8080") is directly
|
| 38 |
+
supported by the snippet. Paraphrase is allowed; what matters is
|
| 39 |
+
factual entailment.
|
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: relevance
|
| 3 |
+
scale: three_point
|
| 4 |
+
reference_based: false
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Relevance (three-point)
|
| 9 |
+
|
| 10 |
+
Does the answer address the user's question?
|
| 11 |
+
|
| 12 |
+
## Score 0
|
| 13 |
+
|
| 14 |
+
Off-topic. Answer addresses a different question or is unintelligible.
|
| 15 |
+
|
| 16 |
+
### Example A — wrong topic
|
| 17 |
+
|
| 18 |
+
Question: "How do I deploy to Kubernetes?"
|
| 19 |
+
Answer: "Python virtual environments isolate dependencies."
|
| 20 |
+
|
| 21 |
+
Score=0 because the answer is about Python venvs, not deployment.
|
| 22 |
+
|
| 23 |
+
## Score 1
|
| 24 |
+
|
| 25 |
+
Partially relevant. Answer touches the question but misses the core ask.
|
| 26 |
+
|
| 27 |
+
### Example B — adjacent but off-target
|
| 28 |
+
|
| 29 |
+
Question: "How do I deploy to Kubernetes?"
|
| 30 |
+
Answer: "Kubernetes runs containerized workloads on a cluster of nodes."
|
| 31 |
+
|
| 32 |
+
Score=1 because it's about Kubernetes but doesn't say how to deploy.
|
| 33 |
+
|
| 34 |
+
## Score 2
|
| 35 |
+
|
| 36 |
+
Directly addresses the question.
|
| 37 |
+
|
| 38 |
+
### Example C — on-target
|
| 39 |
+
|
| 40 |
+
Question: "How do I deploy to Kubernetes?"
|
| 41 |
+
Answer: "Apply a Deployment manifest with kubectl apply -f deployment.yaml."
|
| 42 |
+
|
| 43 |
+
Score=2 because it gives a concrete deployment action.
|
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for Rubric markdown loader: construction validation, hash, permutation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from agent_bench.evaluation.judges.base import Rubric
|
| 10 |
+
|
| 11 |
+
FIXTURES = Path(__file__).parent / "fixtures"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestRubricLoading:
    """Happy-path loading of the well-formed rubric fixtures."""

    def test_load_valid_binary(self):
        """The binary fixture parses into two levels with its frontmatter."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
        summary = (rubric.dimension, rubric.scale, len(rubric.levels))
        assert summary == ("groundedness", "binary", 2)
        assert rubric.reference_based is True
        assert rubric.abstain_allowed is True

    def test_load_valid_three_point(self):
        """The three-point fixture parses into three levels."""
        rubric = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )
        summary = (rubric.dimension, rubric.scale, len(rubric.levels))
        assert summary == ("relevance", "three_point", 3)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TestRubricValidationErrors:
    """Malformed fixtures must fail at load time with a descriptive error."""

    @pytest.mark.parametrize(
        "fixture_name,error_substring",
        [
            ("rubrics_invalid_scale.md", "scale"),
            ("rubrics_invalid_arity.md", "arity"),
            ("rubrics_invalid_no_examples.md", "anchored example"),
            ("rubrics_invalid_no_frontmatter.md", "frontmatter"),
        ],
    )
    def test_construction_raises_with_path_and_field(
        self, fixture_name: str, error_substring: str
    ):
        """Each bad fixture raises ValueError naming the file and the cause."""
        with pytest.raises(ValueError) as caught:
            Rubric.from_markdown_file(FIXTURES / fixture_name)

        message = str(caught.value)
        # The message has to identify both the offending file and the reason.
        assert fixture_name in message, f"Path missing from error: {message}"
        assert error_substring in message.lower(), (
            f"Expected '{error_substring}' in error message: {message}"
        )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TestRubricSourceHash:
    """source_hash is stable per file content and distinguishes files."""

    def test_source_hash_deterministic(self):
        """Loading the same file twice yields the same 64-char hex digest."""
        fixture = FIXTURES / "rubrics_valid_binary.md"
        first_hash = Rubric.from_markdown_file(fixture).source_hash
        second_hash = Rubric.from_markdown_file(fixture).source_hash
        assert first_hash == second_hash
        # A SHA-256 digest rendered as hex is 64 characters long.
        assert len(first_hash) == 64

    def test_source_hash_changes_with_content(self):
        """Two different rubric files must not share a digest."""
        binary = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
        three_point = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )
        assert binary.source_hash != three_point.source_hash
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class TestRubricPermutation:
    """render_prompt keeps canonical order for seed 0 and permutes otherwise."""

    @staticmethod
    def _level_order(prompt: str) -> list[str]:
        """Return the `## Score N` heading lines in the order they appear."""
        return [line for line in prompt.splitlines() if line.startswith("## Score ")]

    def test_render_prompt_seed_0_unchanged(self):
        """Seed 0 returns the file text as-is, levels in 0, 1, 2 order."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        prompt = r.render_prompt(level_permutation_seed=0)
        assert prompt == r.body_markdown
        assert self._level_order(prompt) == [
            "## Score 0",
            "## Score 1",
            "## Score 2",
        ]

    def test_render_prompt_seed_reproducibility(self):
        """The same seed always produces the same prompt."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        p1 = r.render_prompt(level_permutation_seed=42)
        p2 = r.render_prompt(level_permutation_seed=42)
        assert p1 == p2

    def test_render_prompt_different_seed_different_order(self):
        """At least one non-zero seed actually reorders the levels."""
        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        canonical = self._level_order(r.render_prompt(level_permutation_seed=0))
        # Compare heading order rather than whole strings: a whitespace-only
        # difference in the rendered text must not count as a permutation.
        orders = [
            self._level_order(r.render_prompt(level_permutation_seed=s))
            for s in (1, 2, 3, 7, 13)
        ]
        # Every permuted prompt must still contain each level exactly once.
        for order in orders:
            assert sorted(order) == sorted(canonical)
        # With 3! = 6 orderings, the chance that five independent shuffles
        # are all the identity is (1/6)^5 ≈ 1e-4; the seeds are fixed, so the
        # outcome is deterministic.
        assert any(order != canonical for order in orders), (
            "No seed produced a permutation different from default"
        )
|