# Spaces: Nomearod/agentbench (Sleeping) - File size: 4,400 Bytes

"""Tests for Rubric markdown loader: construction validation, hash, permutation."""

from __future__ import annotations

from pathlib import Path

import pytest

from agent_bench.evaluation.judges.base import Rubric

# Directory of rubric markdown fixtures, located next to this test module.
FIXTURES = Path(__file__).with_name("fixtures")


class TestRubricLoading:
    """Loading of well-formed rubric markdown fixtures."""

    def test_load_valid_binary(self) -> None:
        """The binary fixture loads with the expected fields and two levels."""
        fixture_path = FIXTURES / "rubrics_valid_binary.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "groundedness"
        assert rubric.scale == "binary"
        # Identity checks: the flags must be real booleans, not merely truthy.
        assert rubric.reference_based is True
        assert rubric.abstain_allowed is True
        assert len(rubric.levels) == 2

    def test_load_valid_three_point(self) -> None:
        """The three-point fixture loads with the expected fields and three levels."""
        fixture_path = FIXTURES / "rubrics_valid_three_point.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "relevance"
        assert rubric.scale == "three_point"
        assert len(rubric.levels) == 3

    def test_fenced_code_examples_do_not_break_level_count(self) -> None:
        """Regression: ``## Score N`` inside a fenced code block is not a level.

        The fixture is a binary rubric whose Example A contains a code-fenced
        ``## Score 7`` literal. It must still load as a 2-level binary rubric
        rather than being rejected with an arity mismatch.
        """
        fixture_path = FIXTURES / "rubrics_valid_with_fenced_examples.md"
        rubric = Rubric.from_markdown_file(fixture_path)

        assert rubric.dimension == "groundedness"
        assert rubric.scale == "binary"

        level_count = len(rubric.levels)
        assert level_count == 2, (
            f"fenced ## Score 7 leaked into level count; got {level_count} levels"
        )


class TestRubricValidationErrors:
    """Malformed rubric fixtures must be rejected with an informative error."""

    @pytest.mark.parametrize(
        "fixture_name,error_substring",
        [
            ("rubrics_invalid_scale.md", "scale"),
            ("rubrics_invalid_arity.md", "arity"),
            ("rubrics_invalid_no_examples.md", "anchored example"),
            ("rubrics_invalid_no_frontmatter.md", "frontmatter"),
        ],
    )
    def test_construction_raises_with_path_and_field(
        self, fixture_name: str, error_substring: str
    ) -> None:
        """Loading an invalid fixture raises ``ValueError`` naming file and cause.

        Args:
            fixture_name: File name of the invalid fixture under ``FIXTURES``.
            error_substring: Lower-case text expected in the error message.
        """
        with pytest.raises(ValueError) as exc_info:
            Rubric.from_markdown_file(FIXTURES / fixture_name)

        message = str(exc_info.value)

        # The message must identify the offending file ...
        assert fixture_name in message, f"Path missing from error: {message}"

        # ... and the field-level reason (matched case-insensitively).
        lowered = message.lower()
        assert error_substring in lowered, (
            f"Expected '{error_substring}' in error message: {message}"
        )


class TestRubricSourceHash:
    """Checks on the ``source_hash`` attribute of loaded rubrics."""

    def test_source_hash_deterministic(self) -> None:
        """Loading the same file twice yields the same 64-character hash."""
        fixture_path = FIXTURES / "rubrics_valid_binary.md"
        first = Rubric.from_markdown_file(fixture_path)
        second = Rubric.from_markdown_file(fixture_path)

        assert first.source_hash == second.source_hash
        # Expected to be a SHA-256 hex digest, which is 64 characters long.
        assert len(first.source_hash) == 64

    def test_source_hash_changes_with_content(self) -> None:
        """Two different rubric files must not share a hash."""
        binary_rubric = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_binary.md"
        )
        three_point_rubric = Rubric.from_markdown_file(
            FIXTURES / "rubrics_valid_three_point.md"
        )

        assert binary_rubric.source_hash != three_point_rubric.source_hash


class TestRubricPermutation:
    """Checks on ``render_prompt`` and its ``level_permutation_seed`` argument."""

    def test_render_prompt_seed_0_unchanged(self) -> None:
        """Seed 0 renders the levels in their original 0, 1, 2 order."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        rendered = rubric.render_prompt(level_permutation_seed=0)

        # Position of the first mention of each score label, looked up in order.
        pos0, pos1, pos2 = (rendered.index(f"Score {n}") for n in (0, 1, 2))
        assert pos0 < pos1 < pos2

    def test_render_prompt_seed_reproducibility(self) -> None:
        """Rendering twice with the same seed gives identical prompts."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        first_render = rubric.render_prompt(level_permutation_seed=42)
        second_render = rubric.render_prompt(level_permutation_seed=42)
        assert first_render == second_render

    def test_render_prompt_different_seed_different_order(self) -> None:
        """At least one non-zero seed renders differently from seed 0."""
        rubric = Rubric.from_markdown_file(FIXTURES / "rubrics_valid_three_point.md")
        baseline = rubric.render_prompt(level_permutation_seed=0)

        # Five seeds are tried. With 3! = 6 possible orders, the chance that
        # every one of them lands on the identity order is (1/6)^5, about
        # 1e-4, which is negligible.
        differs = False
        for seed in (1, 2, 3, 7, 13):
            if rubric.render_prompt(level_permutation_seed=seed) != baseline:
                differs = True
                break  # one differing render is enough

        assert differs, "No seed produced a permutation different from default"