Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed 25 days ago

Commit

7b72b2c

1 Parent(s): 76e370c

feat(judges): Rubric markdown loader with aggressive validation

Rubric loads from markdown with YAML frontmatter; validates scale,
arity-matches-scale, anchored-example-per-level, frontmatter
required fields. ValidationError raises with file path + field
context so malformed rubrics fail at construction (Day 1) not at
first judge.score call (Day 2 with API budget spent).

source_hash is SHA-256 of body_markdown — immutable per file
content, independent of git state. Used as ScoreResult.rubric_version
so κ aggregation can group by rubric identity without cross-
referencing run metadata.

render_prompt(level_permutation_seed=N) deterministically permutes
the ## Score sections via seeded PRNG. Seed=0 returns canonical
order; this is the variance-control hook used by rubric_permute
in Phase 3.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (8) hide show

agent_bench/evaluation/judges/base.py +147 -1
tests/evaluation/fixtures/rubrics_invalid_arity.md +17 -0
tests/evaluation/fixtures/rubrics_invalid_no_examples.md +16 -0
tests/evaluation/fixtures/rubrics_invalid_no_frontmatter.md +7 -0
tests/evaluation/fixtures/rubrics_invalid_scale.md +11 -0
tests/evaluation/fixtures/rubrics_valid_binary.md +39 -0
tests/evaluation/fixtures/rubrics_valid_three_point.md +43 -0
tests/evaluation/test_rubric_loading.py +94 -0

agent_bench/evaluation/judges/base.py CHANGED Viewed

@@ -8,8 +8,13 @@ rationale and the six-axis comparison table.
 from __future__ import annotations
-from typing import Literal
 from pydantic import BaseModel, Field
 # --- Abstain-reason constants ---
@@ -53,3 +58,144 @@ class ScoreResult(BaseModel):
     @property
     def abstained(self) -> bool:
         return self.score == "Unknown"

 from __future__ import annotations
+import hashlib
+import random
+import re
+from pathlib import Path
+from typing import Literal, Self
+import yaml
 from pydantic import BaseModel, Field
 # --- Abstain-reason constants ---
     @property
     def abstained(self) -> bool:
         return self.score == "Unknown"
+class RubricLevel(BaseModel):
+    """One score level in a rubric, with anchored examples.
+    Parsed from markdown sections under `## Score N` headers. The
+    `examples` list contains the H3 sub-sections (`### Example X`)
+    each with a thinking-trace explanation of why that output got
+    that score.
+    """
+    score: int
+    description: str
+    examples: list[str]  # raw markdown of `### Example` sections
+class Rubric(BaseModel):
+    """A scoring rubric loaded from a markdown file with YAML frontmatter.
+    Construction validates aggressively: scale ∈ {binary, three_point},
+    levels arity matches scale, every level has at least one anchored
+    example. ValidationError raises with file path + field path so a
+    Day-1 rubric typo doesn't surface as a Day-2 judge.score crash with
+    API budget already spent.
+    """
+    dimension: Literal[
+        "groundedness", "relevance", "completeness", "citation_faithfulness"
+    ]
+    scale: Literal["binary", "three_point"]
+    reference_based: bool
+    abstain_allowed: bool
+    levels: list[RubricLevel]
+    body_markdown: str
+    @property
+    def source_hash(self) -> str:
+        """SHA-256 of the canonical body. Immutable per file content,
+        independent of git state. Used as ScoreResult.rubric_version.
+        """
+        return hashlib.sha256(self.body_markdown.encode("utf-8")).hexdigest()
+    @classmethod
+    def from_markdown_file(cls, path: Path | str) -> Self:
+        path = Path(path)
+        body = path.read_text(encoding="utf-8")
+        # Parse YAML frontmatter delimited by --- ... ---
+        fm_match = re.match(r"^---\n(.+?)\n---\n(.*)$", body, re.DOTALL)
+        if not fm_match:
+            raise ValueError(
+                f"Rubric {path.name}: missing YAML frontmatter "
+                f"(expected --- ... --- block at top of file)"
+            )
+        try:
+            frontmatter = yaml.safe_load(fm_match.group(1)) or {}
+        except yaml.YAMLError as e:
+            raise ValueError(
+                f"Rubric {path.name}: frontmatter YAML parse error: {e}"
+            ) from e
+        required = {"dimension", "scale", "reference_based", "abstain_allowed"}
+        missing = required - frontmatter.keys()
+        if missing:
+            raise ValueError(
+                f"Rubric {path.name}: frontmatter missing fields: {sorted(missing)}"
+            )
+        scale = frontmatter["scale"]
+        if scale not in ("binary", "three_point"):
+            raise ValueError(
+                f"Rubric {path.name}: invalid scale {scale!r}; "
+                f"must be 'binary' or 'three_point'"
+            )
+        # Parse levels by ## Score N headers
+        body_no_fm = fm_match.group(2)
+        level_pattern = re.compile(
+            r"^## Score (\d+)\n(.*?)(?=^## Score |\Z)", re.MULTILINE | re.DOTALL
+        )
+        raw_levels: list[tuple[int, str]] = [
+            (int(m.group(1)), m.group(2)) for m in level_pattern.finditer(body_no_fm)
+        ]
+        expected_arity = 2 if scale == "binary" else 3
+        if len(raw_levels) != expected_arity:
+            raise ValueError(
+                f"Rubric {path.name}: arity mismatch — scale {scale!r} "
+                f"requires {expected_arity} levels, found {len(raw_levels)}"
+            )
+        # Parse examples (### Example) per level
+        levels: list[RubricLevel] = []
+        for score, level_body in raw_levels:
+            example_pattern = re.compile(
+                r"^### (Example .+?)\n(.*?)(?=^### |\Z)", re.MULTILINE | re.DOTALL
+            )
+            examples = [m.group(0) for m in example_pattern.finditer(level_body)]
+            if not examples:
+                raise ValueError(
+                    f"Rubric {path.name}: level Score {score} has no "
+                    f"anchored example (expected at least one ### Example header)"
+                )
+            description = level_body.split("###", 1)[0].strip()
+            levels.append(
+                RubricLevel(score=score, description=description, examples=examples)
+            )
+        return cls(
+            dimension=frontmatter["dimension"],
+            scale=scale,
+            reference_based=bool(frontmatter["reference_based"]),
+            abstain_allowed=bool(frontmatter["abstain_allowed"]),
+            levels=levels,
+            body_markdown=body,
+        )
+    def render_prompt(self, *, level_permutation_seed: int = 0) -> str:
+        """Render the rubric body for inclusion in a judge prompt.
+        If level_permutation_seed > 0, levels are reordered deterministically
+        using a seeded PRNG. seed=0 returns the canonical order.
+        """
+        if level_permutation_seed == 0:
+            return self.body_markdown
+        rng = random.Random(level_permutation_seed)
+        permuted_levels = list(self.levels)
+        rng.shuffle(permuted_levels)
+        # Reconstruct: keep frontmatter + intro paragraphs intact;
+        # reorder the ## Score N sections.
+        fm_match = re.match(r"^(---\n.+?\n---\n)(.*)$", self.body_markdown, re.DOTALL)
+        if not fm_match:
+            return self.body_markdown  # defensive — should never happen post-construction
+        head = fm_match.group(1)
+        rest = fm_match.group(2)
+        intro = re.split(r"^## Score ", rest, maxsplit=1, flags=re.MULTILINE)[0]
+        permuted_body = head + intro + "\n".join(
+            f"## Score {lvl.score}\n{lvl.description}\n" + "\n".join(lvl.examples)
+            for lvl in permuted_levels
+        )
+        return permuted_body

tests/evaluation/fixtures/rubrics_invalid_arity.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+dimension: groundedness
+scale: binary
+reference_based: true
+abstain_allowed: true
+---
+# Wrong arity (binary should have 2 levels, this has 3)
+## Score 0
+example A
+## Score 1
+example B
+## Score 2
+example C

tests/evaluation/fixtures/rubrics_invalid_no_examples.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+dimension: groundedness
+scale: binary
+reference_based: true
+abstain_allowed: true
+---
+# Missing anchored examples
+## Score 0
+Just a description, no anchored example.
+## Score 1
+Same — no anchored example.

tests/evaluation/fixtures/rubrics_invalid_no_frontmatter.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# No frontmatter at all
+## Score 0
+example
+## Score 1
+example

tests/evaluation/fixtures/rubrics_invalid_scale.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+dimension: groundedness
+scale: five_point
+reference_based: true
+abstain_allowed: true
+---
+# Bad scale
+## Score 0
+example

tests/evaluation/fixtures/rubrics_valid_binary.md ADDED Viewed

	@@ -0,0 +1,39 @@

+---
+dimension: groundedness
+scale: binary
+reference_based: true
+abstain_allowed: true
+---
+# Groundedness (binary)
+Score whether every claim in the answer is supported by the gold source snippets.
+## Score 0
+Answer contains at least one claim not supported by the snippets.
+### Example A — answer cites unsupported fact
+Question: "What's the default port?"
+Snippets: ["The default is 8080."]
+Answer: "The default is 8080 and supports TLS."
+Score=0 because the TLS claim has no support in the snippet. The
+unsupported claim is sufficient to fail groundedness regardless of
+how many other claims are correctly grounded — this is the binary
+rubric's strict-conjunction definition.
+## Score 1
+Every claim in the answer is supported by at least one snippet.
+### Example B — fully grounded one-sentence answer
+Question: "What's the default port?"
+Snippets: ["The default is 8080."]
+Answer: "The default port is 8080."
+Score=1 because the only claim ("default port is 8080") is directly
+supported by the snippet. Paraphrase is allowed; what matters is
+factual entailment.

tests/evaluation/fixtures/rubrics_valid_three_point.md ADDED Viewed

	@@ -0,0 +1,43 @@

+---
+dimension: relevance
+scale: three_point
+reference_based: false
+abstain_allowed: true
+---
+# Relevance (three-point)
+Does the answer address the user's question?
+## Score 0
+Off-topic. Answer addresses a different question or is unintelligible.
+### Example A — wrong topic
+Question: "How do I deploy to Kubernetes?"
+Answer: "Python virtual environments isolate dependencies."
+Score=0 because the answer is about Python venvs, not deployment.
+## Score 1
+Partially relevant. Answer touches the question but misses the core ask.
+### Example B — adjacent but off-target
+Question: "How do I deploy to Kubernetes?"
+Answer: "Kubernetes runs containerized workloads on a cluster of nodes."
+Score=1 because it's about Kubernetes but doesn't say how to deploy.
+## Score 2
+Directly addresses the question.
+### Example C — on-target
+Question: "How do I deploy to Kubernetes?"
+Answer: "Apply a Deployment manifest with kubectl apply -f deployment.yaml."
+Score=2 because it gives a concrete deployment action.

tests/evaluation/test_rubric_loading.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Tests for Rubric markdown loader: construction validation, hash, permutation."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from agent_bench.evaluation.judges.base import Rubric
+# Directory of rubric markdown fixtures, resolved relative to this test file.
+FIXTURES = Path(__file__).parent / "fixtures"
+class TestRubricLoading:
+    def test_load_valid_binary(self):
+        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
+        assert r.dimension == "groundedness"
+        assert r.scale == "binary"
+        assert r.reference_based is True
+        assert r.abstain_allowed is True
+        assert len(r.levels) == 2
+    def test_load_valid_three_point(self):
+        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
+        assert r.dimension == "relevance"
+        assert r.scale == "three_point"
+        assert len(r.levels) == 3
+class TestRubricValidationErrors:
+    @pytest.mark.parametrize(
+        "fixture_name,error_substring",
+        [
+            ("rubrics_invalid_scale.md", "scale"),
+            ("rubrics_invalid_arity.md", "arity"),
+            ("rubrics_invalid_no_examples.md", "anchored example"),
+            ("rubrics_invalid_no_frontmatter.md", "frontmatter"),
+        ],
+    )
+    def test_construction_raises_with_path_and_field(
+        self, fixture_name: str, error_substring: str
+    ):
+        path = FIXTURES / fixture_name
+        with pytest.raises(ValueError) as exc_info:
+            Rubric.from_markdown_file(path)
+        msg = str(exc_info.value)
+        # Error must mention the file path and the field-level reason
+        assert fixture_name in msg, f"Path missing from error: {msg}"
+        assert error_substring in msg.lower(), (
+            f"Expected '{error_substring}' in error message: {msg}"
+        )
+class TestRubricSourceHash:
+    def test_source_hash_deterministic(self):
+        r1 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
+        r2 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
+        assert r1.source_hash == r2.source_hash
+        # SHA-256 hex is 64 chars
+        assert len(r1.source_hash) == 64
+    def test_source_hash_changes_with_content(self):
+        r1 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
+        r2 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
+        assert r1.source_hash != r2.source_hash
+class TestRubricPermutation:
+    def test_render_prompt_seed_0_unchanged(self):
+        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
+        prompt = r.render_prompt(level_permutation_seed=0)
+        # Default: levels in original 0, 1, 2 order
+        idx0 = prompt.index("Score 0")
+        idx1 = prompt.index("Score 1")
+        idx2 = prompt.index("Score 2")
+        assert idx0 < idx1 < idx2
+    def test_render_prompt_seed_reproducibility(self):
+        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
+        p1 = r.render_prompt(level_permutation_seed=42)
+        p2 = r.render_prompt(level_permutation_seed=42)
+        assert p1 == p2
+    def test_render_prompt_different_seed_different_order(self):
+        r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
+        # Try several seeds; at least one should produce a non-default order
+        # (with 3! = 6 permutations, the chance all 5 seeds produce identity
+        # is (1/6)^5 ≈ 1e-4, negligible)
+        default = r.render_prompt(level_permutation_seed=0)
+        differs = any(
+            r.render_prompt(level_permutation_seed=s) != default
+            for s in (1, 2, 3, 7, 13)
+        )
+        assert differs, "No seed produced a permutation different from default"