Spaces:

Nomearod
/

agentbench

Sleeping

Nomearod Claude Opus 4.7 (1M context) committed on May 4

Commit

aa70e89

1 Parent(s): 2192305

feat(judges): MockJudge with LookupError on missing keys

MockJudge raises LookupError (not a default) on missing item.id keys,
so test fixtures are self-checking against rename drift. A separate
fixture-validation test in Phase 8 walks item.id across all goldens
and asserts coverage; the LookupError is the second layer of defense.

__init__.py re-exports the public surface for ergonomic imports
(from agent_bench.evaluation.judges import Judge, ScoreResult, ...).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

agent_bench/evaluation/judges/__init__.py +24 -0
agent_bench/evaluation/judges/base.py +38 -0
tests/evaluation/test_judges.py +77 -0

agent_bench/evaluation/judges/__init__.py CHANGED Viewed

	@@ -1 +1,25 @@
1	"""Discrete-scale per-dimension LLM judges with anchored rubrics."""

 """Discrete-scale per-dimension LLM judges with anchored rubrics."""
+from agent_bench.evaluation.judges.base import (
+    ABSTAIN_REASON_GENUINE,
+    ABSTAIN_REASON_OUT_OF_RANGE,
+    ABSTAIN_REASON_PROVIDER_EXHAUSTED,
+    ABSTAIN_REASON_SCHEMA_PARSE,
+    Judge,
+    MockJudge,
+    Rubric,
+    RubricLevel,
+    ScoreResult,
+)
+# Explicit public surface of the judges package: every name listed here is
+# imported from agent_bench.evaluation.judges.base above and re-exported so
+# callers can write `from agent_bench.evaluation.judges import Judge, ...`.
+__all__ = [
+    "ABSTAIN_REASON_GENUINE",
+    "ABSTAIN_REASON_OUT_OF_RANGE",
+    "ABSTAIN_REASON_PROVIDER_EXHAUSTED",
+    "ABSTAIN_REASON_SCHEMA_PARSE",
+    "Judge",
+    "MockJudge",
+    "Rubric",
+    "RubricLevel",
+    "ScoreResult",
+]

agent_bench/evaluation/judges/base.py CHANGED Viewed

@@ -240,3 +240,41 @@ class Judge(ABC):
         retryable errors raise (caller bug, not noise).
         """
         ...

         retryable errors raise (caller bug, not noise).
         """
         ...
+class MockJudge(Judge):
+    """Pre-baked-verdict judge for deterministic tests. No API calls.
+    Constructor takes verdicts: dict[item_id, ScoreResult]. score()
+    raises LookupError on missing keys — never returns a default —
+    so test fixtures are self-checking. A separate fixture-validation
+    test (test_mockjudge_coverage.py) walks item.id across all goldens
+    and asserts every MockJudge instance has coverage for the items
+    its tests reference.
+    Mirrors the MockProvider pattern at agent_bench/core/provider.py.
+    """
+    def __init__(self, verdicts: dict[str, ScoreResult]) -> None:
+        # MockJudge does not need provider/rubric/model_id; supply
+        # placeholder values so the ABC's __init__ doesn't matter.
+        self.judge_provider = None  # type: ignore[assignment]
+        self.rubric = None  # type: ignore[assignment]
+        self.model_id = "mock"
+        self.judge_id = "mock_judge"
+        self._verdicts = verdicts
+    async def score(
+        self,
+        item: "GoldenQuestion",
+        output: "AgentResponse",
+        *,
+        prompt_seed: int = 0,
+    ) -> ScoreResult:
+        if item.id not in self._verdicts:
+            raise LookupError(
+                f"MockJudge has no pre-baked verdict for item_id {item.id!r}; "
+                f"available: {sorted(self._verdicts.keys())[:5]}"
+                + (" ..." if len(self._verdicts) > 5 else "")
+            )
+        return self._verdicts[item.id]

tests/evaluation/test_judges.py CHANGED Viewed

@@ -97,3 +97,80 @@ class TestJudgeABC:
         )
         j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5")  # type: ignore[arg-type]
         assert j.judge_id == "claude-haiku-4-5_groundedness"

         )
         j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5")  # type: ignore[arg-type]
         assert j.judge_id == "claude-haiku-4-5_groundedness"
+from agent_bench.evaluation.judges.base import MockJudge
+class TestMockJudge:
+    def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
+        return ScoreResult(
+            reasoning=f"prebaked for {item_id}",
+            evidence_quotes=[],
+            score=score,
+            judge_id="mock_groundedness",
+            rubric_version="abc",
+            system_output_hash="def",
+            cost_usd=0.0,
+            latency_ms=0.0,
+        )
+    @pytest.mark.asyncio
+    async def test_returns_prebaked_verdict(self, monkeypatch):
+        from agent_bench.agents.orchestrator import AgentResponse, SourceReference
+        from agent_bench.core.types import TokenUsage
+        from agent_bench.evaluation.harness import GoldenQuestion
+        verdict = self._verdict("item_001", score=1)
+        mj = MockJudge(verdicts={"item_001": verdict})
+        item = GoldenQuestion(
+            id="item_001",
+            question="?",
+            expected_answer_keywords=[],
+            expected_sources=[],
+            category="retrieval",
+            difficulty="easy",
+            requires_calculator=False,
+        )
+        output = AgentResponse(
+            answer="x",
+            sources=[SourceReference(source="a.md")],
+            iterations=1,
+            usage=TokenUsage(
+                input_tokens=0, output_tokens=0, estimated_cost_usd=0
+            ),
+            latency_ms=0,
+        )
+        result = await mj.score(item, output)
+        assert result.score == 1
+        assert result.reasoning == "prebaked for item_001"
+    @pytest.mark.asyncio
+    async def test_raises_lookuperror_on_missing_key(self):
+        from agent_bench.agents.orchestrator import AgentResponse
+        from agent_bench.core.types import TokenUsage
+        from agent_bench.evaluation.harness import GoldenQuestion
+        mj = MockJudge(verdicts={"item_001": self._verdict("item_001")})
+        item = GoldenQuestion(
+            id="item_999_NOT_PRESENT",
+            question="?",
+            expected_answer_keywords=[],
+            expected_sources=[],
+            category="retrieval",
+            difficulty="easy",
+            requires_calculator=False,
+        )
+        output = AgentResponse(
+            answer="x",
+            sources=[],
+            iterations=1,
+            usage=TokenUsage(
+                input_tokens=0, output_tokens=0, estimated_cost_usd=0
+            ),
+            latency_ms=0,
+        )
+        with pytest.raises(LookupError, match="item_999_NOT_PRESENT"):
+            await mj.score(item, output)