# agentbench/tests/evaluation/test_rubric_loading.py
# Nomearod's picture
# fix(judges): four review-blocking bugs (review items 1–4 + 8)
# 9255fb5
"""Tests for Rubric markdown loader: construction validation, hash, permutation."""
from __future__ import annotations
from pathlib import Path
import pytest
from agent_bench.evaluation.judges.base import Rubric
# Directory holding the rubric markdown fixtures, resolved relative to this file.
FIXTURES = Path(__file__).parent.joinpath("fixtures")
class TestRubricLoading:
    """Loading of well-formed rubric markdown fixtures."""

    def test_load_valid_binary(self):
        """The binary fixture loads with the expected metadata and two levels."""
        fixture_path = FIXTURES / "rubrics_valid_binary.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "groundedness"
        assert rubric.scale == "binary"
        assert rubric.reference_based is True
        assert rubric.abstain_allowed is True
        assert len(rubric.levels) == 2

    def test_load_valid_three_point(self):
        """The three-point fixture loads with the expected metadata and three levels."""
        fixture_path = FIXTURES / "rubrics_valid_three_point.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "relevance"
        assert rubric.scale == "three_point"
        assert len(rubric.levels) == 3

    def test_fenced_code_examples_do_not_break_level_count(self):
        """Regression: ``## Score N`` text inside a code fence is not a level.

        The fixture is a binary rubric whose Example A embeds a code-fenced
        ``## Score 7`` literal. It must still load as a 2-level binary rubric
        rather than being rejected with an arity mismatch.
        """
        fixture_path = FIXTURES / "rubrics_valid_with_fenced_examples.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "groundedness"
        assert rubric.scale == "binary"
        assert len(rubric.levels) == 2, (
            f"fenced ## Score 7 leaked into level count; got {len(rubric.levels)} levels"
        )
class TestRubricValidationErrors:
    """Malformed rubric fixtures must fail to load with a descriptive ValueError."""

    @pytest.mark.parametrize(
        "fixture_name,error_substring",
        [
            ("rubrics_invalid_scale.md", "scale"),
            ("rubrics_invalid_arity.md", "arity"),
            ("rubrics_invalid_no_examples.md", "anchored example"),
            ("rubrics_invalid_no_frontmatter.md", "frontmatter"),
        ],
    )
    def test_construction_raises_with_path_and_field(
        self, fixture_name: str, error_substring: str
    ):
        """Each invalid fixture raises ValueError naming the file and the reason.

        ``fixture_name`` is the file under the fixtures directory to load;
        ``error_substring`` is the lowercase fragment expected in the message.
        """
        with pytest.raises(ValueError) as caught:
            Rubric.from_markdown_file(FIXTURES / fixture_name)

        message = str(caught.value)
        # The message has to identify the offending file...
        assert fixture_name in message, f"Path missing from error: {message}"
        # ...and the field-level reason (matched case-insensitively).
        assert error_substring in message.lower(), (
            f"Expected '{error_substring}' in error message: {message}"
        )
class TestRubricSourceHash:
    """Checks on the ``source_hash`` attribute of loaded rubrics."""

    def test_source_hash_deterministic(self):
        """Loading the same fixture twice yields the same 64-character hash."""
        fixture_path = FIXTURES / "rubrics_valid_binary.md"
        first = Rubric.from_markdown_file(fixture_path)
        second = Rubric.from_markdown_file(fixture_path)

        assert first.source_hash == second.source_hash
        # A SHA-256 digest rendered as hex is 64 characters long.
        assert len(first.source_hash) == 64

    def test_source_hash_changes_with_content(self):
        """Two different fixture files produce different hashes."""
        binary = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md")
        three_point = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )

        assert binary.source_hash != three_point.source_hash
class TestRubricPermutation:
    """Checks on ``render_prompt`` and its ``level_permutation_seed`` argument."""

    def test_render_prompt_seed_0_unchanged(self):
        """With seed 0 the levels appear in their original 0, 1, 2 order."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        rendered = rubric.render_prompt(level_permutation_seed=0)

        # Position of the first occurrence of each level heading in the prompt.
        first, second, third = [
            rendered.index(marker) for marker in ("Score 0", "Score 1", "Score 2")
        ]
        assert first < second < third

    def test_render_prompt_seed_reproducibility(self):
        """Rendering twice with the same seed gives identical prompts."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        once = rubric.render_prompt(level_permutation_seed=42)
        again = rubric.render_prompt(level_permutation_seed=42)

        assert once == again

    def test_render_prompt_different_seed_different_order(self):
        """At least one non-zero seed renders a prompt unlike the seed-0 one."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        baseline = rubric.render_prompt(level_permutation_seed=0)

        # Several seeds are probed because any single one might map to the
        # default order: with 3! = 6 permutations, all five doing so has a
        # chance of about (1/6)^5 ≈ 1e-4, which is negligible.
        found_different = False
        for seed in (1, 2, 3, 7, 13):
            if rubric.render_prompt(level_permutation_seed=seed) != baseline:
                found_different = True
                break

        assert found_different, "No seed produced a permutation different from default"