Spaces:
Sleeping
Sleeping
feat(judges): MockJudge with LookupError on missing keys
Browse files
MockJudge raises LookupError (not a default) on missing item.id keys,
so test fixtures are self-checking against rename drift. A separate
fixture-validation test in Phase 8 walks item.id across all goldens
and asserts coverage; the LookupError is the second layer of defense.
__init__.py re-exports the public surface for ergonomic imports
(from agent_bench.evaluation.judges import Judge, ScoreResult, ...).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
agent_bench/evaluation/judges/__init__.py
CHANGED
|
@@ -1 +1,25 @@
|
|
| 1 |
"""Discrete-scale per-dimension LLM judges with anchored rubrics."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Discrete-scale per-dimension LLM judges with anchored rubrics."""
|
| 2 |
+
|
| 3 |
+
from agent_bench.evaluation.judges.base import (
|
| 4 |
+
ABSTAIN_REASON_GENUINE,
|
| 5 |
+
ABSTAIN_REASON_OUT_OF_RANGE,
|
| 6 |
+
ABSTAIN_REASON_PROVIDER_EXHAUSTED,
|
| 7 |
+
ABSTAIN_REASON_SCHEMA_PARSE,
|
| 8 |
+
Judge,
|
| 9 |
+
MockJudge,
|
| 10 |
+
Rubric,
|
| 11 |
+
RubricLevel,
|
| 12 |
+
ScoreResult,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"ABSTAIN_REASON_GENUINE",
|
| 17 |
+
"ABSTAIN_REASON_OUT_OF_RANGE",
|
| 18 |
+
"ABSTAIN_REASON_PROVIDER_EXHAUSTED",
|
| 19 |
+
"ABSTAIN_REASON_SCHEMA_PARSE",
|
| 20 |
+
"Judge",
|
| 21 |
+
"MockJudge",
|
| 22 |
+
"Rubric",
|
| 23 |
+
"RubricLevel",
|
| 24 |
+
"ScoreResult",
|
| 25 |
+
]
|
agent_bench/evaluation/judges/base.py
CHANGED
|
@@ -240,3 +240,41 @@ class Judge(ABC):
|
|
| 240 |
retryable errors raise (caller bug, not noise).
|
| 241 |
"""
|
| 242 |
...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
retryable errors raise (caller bug, not noise).
|
| 241 |
"""
|
| 242 |
...
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
class MockJudge(Judge):
    """Judge test double that replays canned verdicts; makes no API calls.

    The constructor receives ``verdicts``, a mapping from item id to the
    ScoreResult that ``score()`` hands back for that item. An id with no
    entry makes ``score()`` raise LookupError; a fallback verdict is never
    substituted, so fixtures that drift out of sync with the golden ids
    fail loudly. A separate fixture-validation test
    (test_mockjudge_coverage.py) walks item.id across all goldens and
    asserts every MockJudge instance has coverage for the items its
    tests reference.

    Follows the MockProvider pattern at agent_bench/core/provider.py.
    """

    def __init__(self, verdicts: dict[str, ScoreResult]) -> None:
        """Store ``verdicts`` and fill the Judge attributes with placeholders."""
        # The base-class initializer is deliberately not invoked: a mock
        # has no real provider, rubric or model, so the attributes are set
        # directly to inert values.
        self.judge_provider = None  # type: ignore[assignment]
        self.rubric = None  # type: ignore[assignment]
        self.model_id = "mock"
        self.judge_id = "mock_judge"
        self._verdicts = verdicts

    async def score(
        self,
        item: "GoldenQuestion",
        output: "AgentResponse",
        *,
        prompt_seed: int = 0,
    ) -> ScoreResult:
        """Return the verdict registered for ``item.id``.

        ``output`` and ``prompt_seed`` are accepted but not consulted.

        Raises:
            LookupError: no verdict was registered for ``item.id``. The
                message names the missing id and previews up to five of
                the registered ids (sorted).
        """
        if item.id in self._verdicts:
            return self._verdicts[item.id]

        # Cap the preview at five ids so the message stays readable for
        # large fixtures; the trailing marker signals truncation.
        preview = sorted(self._verdicts.keys())[:5]
        more = " ..." if len(self._verdicts) > 5 else ""
        raise LookupError(
            f"MockJudge has no pre-baked verdict for item_id {item.id!r}; "
            f"available: {preview}" + more
        )
|
tests/evaluation/test_judges.py
CHANGED
|
@@ -97,3 +97,80 @@ class TestJudgeABC:
|
|
| 97 |
)
|
| 98 |
j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5") # type: ignore[arg-type]
|
| 99 |
assert j.judge_id == "claude-haiku-4-5_groundedness"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
)
|
| 98 |
j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5") # type: ignore[arg-type]
|
| 99 |
assert j.judge_id == "claude-haiku-4-5_groundedness"
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
from agent_bench.evaluation.judges.base import MockJudge
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class TestMockJudge:
    """MockJudge returns pre-baked verdicts and raises on unknown item ids."""

    def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
        """Build a ScoreResult whose reasoning text names ``item_id``."""
        return ScoreResult(
            reasoning=f"prebaked for {item_id}",
            evidence_quotes=[],
            score=score,
            judge_id="mock_groundedness",
            rubric_version="abc",
            system_output_hash="def",
            cost_usd=0.0,
            latency_ms=0.0,
        )

    @staticmethod
    def _item(item_id: str):
        """Build a minimal GoldenQuestion whose id is ``item_id``."""
        # Imported inside the helper, matching the function-local imports
        # the tests in this class already use.
        from agent_bench.evaluation.harness import GoldenQuestion

        return GoldenQuestion(
            id=item_id,
            question="?",
            expected_answer_keywords=[],
            expected_sources=[],
            category="retrieval",
            difficulty="easy",
            requires_calculator=False,
        )

    @staticmethod
    def _output(sources=None):
        """Build a minimal AgentResponse; ``sources`` defaults to an empty list."""
        from agent_bench.agents.orchestrator import AgentResponse
        from agent_bench.core.types import TokenUsage

        return AgentResponse(
            answer="x",
            sources=[] if sources is None else sources,
            iterations=1,
            usage=TokenUsage(
                input_tokens=0, output_tokens=0, estimated_cost_usd=0
            ),
            latency_ms=0,
        )

    @pytest.mark.asyncio
    async def test_returns_prebaked_verdict(self):
        """score() hands back the verdict registered under the item's id."""
        from agent_bench.agents.orchestrator import SourceReference

        verdict = self._verdict("item_001", score=1)
        mj = MockJudge(verdicts={"item_001": verdict})

        item = self._item("item_001")
        output = self._output(sources=[SourceReference(source="a.md")])

        result = await mj.score(item, output)
        assert result.score == 1
        assert result.reasoning == "prebaked for item_001"

    @pytest.mark.asyncio
    async def test_raises_lookuperror_on_missing_key(self):
        """score() raises LookupError naming the id when no verdict exists."""
        mj = MockJudge(verdicts={"item_001": self._verdict("item_001")})

        # Build the inputs outside the raises-block so only score() can
        # satisfy the expectation.
        item = self._item("item_999_NOT_PRESENT")
        output = self._output()

        with pytest.raises(LookupError, match="item_999_NOT_PRESENT"):
            await mj.score(item, output)
|