Nomearod Claude Opus 4.7 (1M context) committed on
Commit
aa70e89
·
1 Parent(s): 2192305

feat(judges): MockJudge with LookupError on missing keys

Browse files

MockJudge raises LookupError (not a default) on missing item.id keys,
so test fixtures are self-checking against rename drift. A separate
fixture-validation test in Phase 8 walks item.id across all goldens
and asserts coverage; the LookupError is the second layer of defense.

__init__.py re-exports the public surface for ergonomic imports
(from agent_bench.evaluation.judges import Judge, ScoreResult, ...).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/judges/__init__.py CHANGED
@@ -1 +1,25 @@
1
  """Discrete-scale per-dimension LLM judges with anchored rubrics."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Discrete-scale per-dimension LLM judges with anchored rubrics."""
2
+
3
+ from agent_bench.evaluation.judges.base import (
4
+ ABSTAIN_REASON_GENUINE,
5
+ ABSTAIN_REASON_OUT_OF_RANGE,
6
+ ABSTAIN_REASON_PROVIDER_EXHAUSTED,
7
+ ABSTAIN_REASON_SCHEMA_PARSE,
8
+ Judge,
9
+ MockJudge,
10
+ Rubric,
11
+ RubricLevel,
12
+ ScoreResult,
13
+ )
14
+
15
+ __all__ = [
16
+ "ABSTAIN_REASON_GENUINE",
17
+ "ABSTAIN_REASON_OUT_OF_RANGE",
18
+ "ABSTAIN_REASON_PROVIDER_EXHAUSTED",
19
+ "ABSTAIN_REASON_SCHEMA_PARSE",
20
+ "Judge",
21
+ "MockJudge",
22
+ "Rubric",
23
+ "RubricLevel",
24
+ "ScoreResult",
25
+ ]
agent_bench/evaluation/judges/base.py CHANGED
@@ -240,3 +240,41 @@ class Judge(ABC):
240
  retryable errors raise (caller bug, not noise).
241
  """
242
  ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  retryable errors raise (caller bug, not noise).
241
  """
242
  ...
243
+
244
+
245
class MockJudge(Judge):
    """Test double that replays pre-baked verdicts. Makes no API calls.

    The constructor receives ``verdicts``, a mapping from item id to the
    ``ScoreResult`` that ``score()`` should hand back for that item.

    An item id that is absent from the mapping is treated as a fixture
    bug: ``score()`` raises ``LookupError`` instead of falling back to a
    default, so fixtures fail loudly when ids drift. A separate
    fixture-validation test (test_mockjudge_coverage.py) walks item.id
    across all goldens and asserts every MockJudge instance has coverage
    for the items its tests reference.

    Follows the MockProvider pattern at agent_bench/core/provider.py.
    """

    def __init__(self, verdicts: dict[str, ScoreResult]) -> None:
        # Judge.__init__ is deliberately not called: a mock has no real
        # provider, rubric or model, so the attributes are filled in
        # directly with placeholder values.
        self.judge_provider = None  # type: ignore[assignment]
        self.rubric = None  # type: ignore[assignment]
        self.model_id = "mock"
        self.judge_id = "mock_judge"
        self._verdicts = verdicts

    async def score(
        self,
        item: "GoldenQuestion",
        output: "AgentResponse",
        *,
        prompt_seed: int = 0,
    ) -> ScoreResult:
        """Return the stored verdict for ``item.id``.

        ``output`` and ``prompt_seed`` are accepted for signature
        compatibility and are not consulted.

        Raises:
            LookupError: if no verdict was registered for ``item.id``.
                The message lists up to five of the known ids (sorted).
        """
        item_id = item.id
        if item_id in self._verdicts:
            return self._verdicts[item_id]

        # Keep the message short: show at most five known ids and mark
        # truncation with a trailing " ...".
        known_ids = sorted(self._verdicts)
        truncation_marker = " ..." if len(known_ids) > 5 else ""
        raise LookupError(
            f"MockJudge has no pre-baked verdict for item_id {item_id!r}; "
            f"available: {known_ids[:5]}{truncation_marker}"
        )
tests/evaluation/test_judges.py CHANGED
@@ -97,3 +97,80 @@ class TestJudgeABC:
97
  )
98
  j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5") # type: ignore[arg-type]
99
  assert j.judge_id == "claude-haiku-4-5_groundedness"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  )
98
  j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5") # type: ignore[arg-type]
99
  assert j.judge_id == "claude-haiku-4-5_groundedness"
100
+
101
+
102
+ from agent_bench.evaluation.judges.base import MockJudge
103
+
104
+
105
class TestMockJudge:
    """Tests for MockJudge: verdict replay and the missing-key failure."""

    def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
        """Build a zero-cost ScoreResult whose reasoning names ``item_id``."""
        return ScoreResult(
            reasoning=f"prebaked for {item_id}",
            evidence_quotes=[],
            score=score,
            judge_id="mock_groundedness",
            rubric_version="abc",
            system_output_hash="def",
            cost_usd=0.0,
            latency_ms=0.0,
        )

    def _item(self, item_id: str):
        """Build a minimal GoldenQuestion carrying the given id."""
        # Imported locally, as in the rest of this test class.
        from agent_bench.evaluation.harness import GoldenQuestion

        return GoldenQuestion(
            id=item_id,
            question="?",
            expected_answer_keywords=[],
            expected_sources=[],
            category="retrieval",
            difficulty="easy",
            requires_calculator=False,
        )

    def _output(self, sources=None):
        """Build a minimal AgentResponse; ``sources`` defaults to an empty list."""
        from agent_bench.agents.orchestrator import AgentResponse
        from agent_bench.core.types import TokenUsage

        return AgentResponse(
            answer="x",
            sources=[] if sources is None else sources,
            iterations=1,
            usage=TokenUsage(
                input_tokens=0, output_tokens=0, estimated_cost_usd=0
            ),
            latency_ms=0,
        )

    @pytest.mark.asyncio
    async def test_returns_prebaked_verdict(self):
        from agent_bench.agents.orchestrator import SourceReference

        verdict = self._verdict("item_001", score=1)
        mj = MockJudge(verdicts={"item_001": verdict})

        item = self._item("item_001")
        output = self._output(sources=[SourceReference(source="a.md")])

        result = await mj.score(item, output)
        # The mock hands back the stored object itself, not a copy.
        assert result is verdict
        assert result.score == 1
        assert result.reasoning == "prebaked for item_001"

    @pytest.mark.asyncio
    async def test_raises_lookuperror_on_missing_key(self):
        mj = MockJudge(verdicts={"item_001": self._verdict("item_001")})

        item = self._item("item_999_NOT_PRESENT")
        output = self._output()

        # The missing id must appear in the error message.
        with pytest.raises(LookupError, match="item_999_NOT_PRESENT"):
            await mj.score(item, output)