"""Tests for Rubric markdown loader: construction validation, hash, permutation.""" from __future__ import annotations from pathlib import Path import pytest from agent_bench.evaluation.judges.base import Rubric FIXTURES = Path(__file__).parent / "fixtures" class TestRubricLoading: def test_load_valid_binary(self): r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md") assert r.dimension == "groundedness" assert r.scale == "binary" assert r.reference_based is True assert r.abstain_allowed is True assert len(r.levels) == 2 def test_load_valid_three_point(self): r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md") assert r.dimension == "relevance" assert r.scale == "three_point" assert len(r.levels) == 3 def test_fenced_code_examples_do_not_break_level_count(self): """Regression: the level-pattern regex must skip ``## Score N`` strings that appear inside fenced code blocks. A binary rubric whose Example A contains a code-fenced ``## Score 7`` literal should still load as a 2-level binary rubric, not be rejected with arity mismatch. """ r = Rubric.from_markdown_file( FIXTURES / "rubrics_valid_with_fenced_examples.md" ) assert r.dimension == "groundedness" assert r.scale == "binary" assert len(r.levels) == 2, ( f"fenced ## Score 7 leaked into level count; got {len(r.levels)} levels" ) class TestRubricValidationErrors: @pytest.mark.parametrize( "fixture_name,error_substring", [ ("rubrics_invalid_scale.md", "scale"), ("rubrics_invalid_arity.md", "arity"), ("rubrics_invalid_no_examples.md", "anchored example"), ("rubrics_invalid_no_frontmatter.md", "frontmatter"), ], ) def test_construction_raises_with_path_and_field( self, fixture_name: str, error_substring: str ): path = FIXTURES / fixture_name with pytest.raises(ValueError) as exc_info: Rubric.from_markdown_file(path) msg = str(exc_info.value) # Error must mention the file path and the field-level reason assert fixture_name in msg, f"Path missing from error: {msg}" assert error_substring in msg.lower(), ( f"Expected '{error_substring}' in error message: {msg}" ) class TestRubricSourceHash: def test_source_hash_deterministic(self): r1 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md") r2 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md") assert r1.source_hash == r2.source_hash # SHA-256 hex is 64 chars assert len(r1.source_hash) == 64 def test_source_hash_changes_with_content(self): r1 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_binary.md") r2 = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md") assert r1.source_hash != r2.source_hash class TestRubricPermutation: def test_render_prompt_seed_0_unchanged(self): r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md") prompt = r.render_prompt(level_permutation_seed=0) # Default: levels in original 0, 1, 2 order idx0 = prompt.index("Score 0") idx1 = prompt.index("Score 1") idx2 = prompt.index("Score 2") assert idx0 < idx1 < idx2 def test_render_prompt_seed_reproducibility(self): r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md") p1 = r.render_prompt(level_permutation_seed=42) p2 = r.render_prompt(level_permutation_seed=42) assert p1 == p2 def test_render_prompt_different_seed_different_order(self): r = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md") # Try several seeds; at least one should produce a non-default order # (with 3! = 6 permutations, the chance all 5 seeds produce identity # is (1/6)^5 ≈ 1e-4, negligible) default = r.render_prompt(level_permutation_seed=0) differs = any( r.render_prompt(level_permutation_seed=s) != default for s in (1, 2, 3, 7, 13) ) assert differs, "No seed produced a permutation different from default"