Spaces:
Running
Running
feat(judges): Judge ABC with judge_id derived from model + dimension
Browse files
Judge is abstract — concrete subclasses (groundedness, relevance,
completeness, citation_faithfulness) land in Phase 2 as thin
~30-line classes per the no-shared-base-method discipline.
judge_id format: '{model_id}_{rubric.dimension}', e.g.
'claude-haiku-4-5_groundedness'. The format is load-bearing for
the calibration report's per-judge κ breakdown — the report
groups by judge_id when computing per-judge agreement against
the human labels.
Cross-package types (GoldenQuestion, AgentResponse, LLMProvider)
are imported under TYPE_CHECKING to keep judges/base.py loadable
before harness.py is migrated in Phase 8 (avoiding a circular
import once harness imports back from judges).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
agent_bench/evaluation/judges/base.py
CHANGED
|
@@ -11,12 +11,18 @@ from __future__ import annotations
|
|
| 11 |
import hashlib
|
| 12 |
import random
|
| 13 |
import re
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
-
from typing import Literal, Self
|
| 16 |
|
| 17 |
import yaml
|
| 18 |
from pydantic import BaseModel, Field
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# --- Abstain-reason constants ---
|
| 21 |
#
|
| 22 |
# Failure-as-abstain ScoreResults carry a reasoning string with one of
|
|
@@ -199,3 +205,38 @@ class Rubric(BaseModel):
|
|
| 199 |
for lvl in permuted_levels
|
| 200 |
)
|
| 201 |
return permuted_body
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import hashlib
|
| 12 |
import random
|
| 13 |
import re
|
| 14 |
+
from abc import ABC, abstractmethod
|
| 15 |
from pathlib import Path
|
| 16 |
+
from typing import TYPE_CHECKING, Literal, Self
|
| 17 |
|
| 18 |
import yaml
|
| 19 |
from pydantic import BaseModel, Field
|
| 20 |
|
| 21 |
+
if TYPE_CHECKING:
|
| 22 |
+
from agent_bench.agents.orchestrator import AgentResponse
|
| 23 |
+
from agent_bench.core.provider import LLMProvider
|
| 24 |
+
from agent_bench.evaluation.harness import GoldenQuestion
|
| 25 |
+
|
| 26 |
# --- Abstain-reason constants ---
|
| 27 |
#
|
| 28 |
# Failure-as-abstain ScoreResults carry a reasoning string with one of
|
|
|
|
| 205 |
for lvl in permuted_levels
|
| 206 |
)
|
| 207 |
return permuted_body
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class Judge(ABC):
    """Abstract LLM judge bound to a single rubric dimension.

    Every concrete subclass supplies its own ``score()`` for one rubric
    dimension. Subclasses are meant to stay small (~30 lines) and are
    deliberately not built on a shared base-class scoring method (the
    design doc explains why).

    Attributes:
        judge_provider: Provider object handed in by the caller and kept
            for subclasses to use.
        rubric: Rubric this judge scores against.
        model_id: Identifier of the judging model.
        judge_id: ``"{model_id}_{rubric.dimension}"``, for example
            ``"claude-haiku-4-5_groundedness"``.
    """

    def __init__(
        self,
        judge_provider: "LLMProvider",
        rubric: Rubric,
        model_id: str,
    ) -> None:
        """Store the collaborators and derive ``judge_id`` from them."""
        self.model_id = model_id
        self.rubric = rubric
        self.judge_provider = judge_provider
        # Identifier shape is "<model_id>_<dimension>"; keep it stable, since
        # consumers are described as grouping results by this value.
        self.judge_id = f"{self.model_id}_{self.rubric.dimension}"

    @abstractmethod
    async def score(
        self,
        item: "GoldenQuestion",
        output: "AgentResponse",
        *,
        prompt_seed: int = 0,
    ) -> ScoreResult:
        """Judge a single (item, output) pair against this judge's rubric.

        Args:
            item: The golden question being evaluated.
            output: The agent response to judge.
            prompt_seed: Keyword-only seed, defaulting to 0.

        Returns:
            A ScoreResult whose system_output_hash is computed from
            (item.id, output.answer, sorted(output.sources)).

        Failures are reported as abstains using the abstain-reason
        constants. Non-retryable provider errors are raised instead,
        because they indicate a caller bug rather than noise.
        """
        ...
|
tests/evaluation/test_judges.py
CHANGED
|
@@ -69,3 +69,31 @@ class TestScoreResult:
|
|
| 69 |
def test_score_rejects_other_strings(self):
    """Constructing a ScoreResult with score="maybe" must raise ValueError."""
    # Build the shared kwargs outside the raises block so that a ValueError
    # coming from the helper cannot make this test pass by accident.
    base_kwargs = self._base_kwargs()
    with pytest.raises(ValueError):
        ScoreResult(score="maybe", **base_kwargs)  # type: ignore[arg-type]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def test_score_rejects_other_strings(self):
    """Constructing a ScoreResult with score="maybe" must raise ValueError."""
    # Build the shared kwargs outside the raises block so that a ValueError
    # coming from the helper cannot make this test pass by accident.
    base_kwargs = self._base_kwargs()
    with pytest.raises(ValueError):
        ScoreResult(score="maybe", **base_kwargs)  # type: ignore[arg-type]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
from abc import ABC
|
| 75 |
+
from pathlib import Path
|
| 76 |
+
|
| 77 |
+
from agent_bench.evaluation.judges.base import Judge
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class TestJudgeABC:
    """Contract checks for the abstract ``Judge`` base class."""

    def test_judge_is_abstract(self):
        """Judge derives from ABC and cannot be constructed directly."""
        assert issubclass(Judge, ABC)
        # score() is abstract, so building the base class itself must fail.
        with pytest.raises(TypeError, match="abstract"):
            Judge(judge_provider=None, rubric=None, model_id="test")  # type: ignore[abstract,arg-type]

    def test_judge_id_built_from_model_and_dimension(self):
        """judge_id joins the model id and the rubric dimension with '_'."""
        from agent_bench.evaluation.judges.base import Rubric

        class _StubJudge(Judge):
            # Overriding score() is all that is needed to become concrete.
            async def score(self, item, output, *, prompt_seed=0):
                raise NotImplementedError

        fixture_path = Path(__file__).parent / "fixtures" / "rubrics_valid_binary.md"
        loaded_rubric = Rubric.from_markdown_file(fixture_path)
        judge = _StubJudge(judge_provider=None, rubric=loaded_rubric, model_id="claude-haiku-4-5")  # type: ignore[arg-type]
        assert judge.judge_id == "claude-haiku-4-5_groundedness"
|