Nomearod Claude Opus 4.7 (1M context) committed on
Commit
2192305
·
1 Parent(s): 7b72b2c

feat(judges): Judge ABC with judge_id derived from model + dimension

Browse files

Judge is abstract — concrete subclasses (groundedness, relevance,
completeness, citation_faithfulness) land in Phase 2 as thin
~30-line classes per the no-shared-base-method discipline.

judge_id format: '{model_id}_{rubric.dimension}', e.g.
'claude-haiku-4-5_groundedness'. The format is load-bearing for
the calibration report's per-judge κ breakdown — the report
groups by judge_id when computing per-judge agreement against
the human labels.

Cross-package types (GoldenQuestion, AgentResponse, LLMProvider)
are imported under TYPE_CHECKING to keep judges/base.py loadable
before harness.py is migrated in Phase 8 (avoiding a circular
import once harness imports back from judges).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/judges/base.py CHANGED
@@ -11,12 +11,18 @@ from __future__ import annotations
11
  import hashlib
12
  import random
13
  import re
 
14
  from pathlib import Path
15
- from typing import Literal, Self
16
 
17
  import yaml
18
  from pydantic import BaseModel, Field
19
 
 
 
 
 
 
20
  # --- Abstain-reason constants ---
21
  #
22
  # Failure-as-abstain ScoreResults carry a reasoning string with one of
@@ -199,3 +205,38 @@ class Rubric(BaseModel):
199
  for lvl in permuted_levels
200
  )
201
  return permuted_body
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import hashlib
12
  import random
13
  import re
14
+ from abc import ABC, abstractmethod
15
  from pathlib import Path
16
+ from typing import TYPE_CHECKING, Literal, Self
17
 
18
  import yaml
19
  from pydantic import BaseModel, Field
20
 
21
+ if TYPE_CHECKING:
22
+ from agent_bench.agents.orchestrator import AgentResponse
23
+ from agent_bench.core.provider import LLMProvider
24
+ from agent_bench.evaluation.harness import GoldenQuestion
25
+
26
  # --- Abstain-reason constants ---
27
  #
28
  # Failure-as-abstain ScoreResults carry a reasoning string with one of
 
205
  for lvl in permuted_levels
206
  )
207
  return permuted_body
208
+
209
+
210
+ class Judge(ABC):
211
+ """Per-dimension LLM judge. Concrete subclasses implement score()
212
+ for one rubric dimension; they are thin (~30 lines) and not
213
+ factored against a shared base method (see design doc for why).
214
+ """
215
+
216
+ def __init__(
217
+ self,
218
+ judge_provider: "LLMProvider",
219
+ rubric: Rubric,
220
+ model_id: str,
221
+ ) -> None:
222
+ self.judge_provider = judge_provider
223
+ self.rubric = rubric
224
+ self.model_id = model_id
225
+ self.judge_id = f"{model_id}_{rubric.dimension}"
226
+
227
+ @abstractmethod
228
+ async def score(
229
+ self,
230
+ item: "GoldenQuestion",
231
+ output: "AgentResponse",
232
+ *,
233
+ prompt_seed: int = 0,
234
+ ) -> ScoreResult:
235
+ """Score one (item, output) pair against this judge's rubric.
236
+
237
+ Returns a ScoreResult whose system_output_hash is computed from
238
+ (item.id, output.answer, sorted(output.sources)). Failures map
239
+ to abstain via the abstain-reason constants; provider non-
240
+ retryable errors raise (caller bug, not noise).
241
+ """
242
+ ...
tests/evaluation/test_judges.py CHANGED
@@ -69,3 +69,31 @@ class TestScoreResult:
69
  def test_score_rejects_other_strings(self):
70
  with pytest.raises(ValueError):
71
  ScoreResult(score="maybe", **self._base_kwargs()) # type: ignore[arg-type]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def test_score_rejects_other_strings(self):
70
  with pytest.raises(ValueError):
71
  ScoreResult(score="maybe", **self._base_kwargs()) # type: ignore[arg-type]
72
+
73
+
74
+ from abc import ABC
75
+ from pathlib import Path
76
+
77
+ from agent_bench.evaluation.judges.base import Judge
78
+
79
+
80
+ class TestJudgeABC:
81
+ def test_judge_is_abstract(self):
82
+ assert issubclass(Judge, ABC)
83
+ # Cannot instantiate directly — score is abstract
84
+ with pytest.raises(TypeError, match="abstract"):
85
+ Judge(judge_provider=None, rubric=None, model_id="test") # type: ignore[abstract,arg-type]
86
+
87
+ def test_judge_id_built_from_model_and_dimension(self):
88
+ # Concrete subclass that satisfies the abstract method
89
+ class _ConcreteJudge(Judge):
90
+ async def score(self, item, output, *, prompt_seed=0):
91
+ raise NotImplementedError
92
+
93
+ from agent_bench.evaluation.judges.base import Rubric
94
+
95
+ rubric = Rubric.from_markdown_file(
96
+ Path(__file__).parent / "fixtures" / "rubrics_valid_binary.md"
97
+ )
98
+ j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5") # type: ignore[arg-type]
99
+ assert j.judge_id == "claude-haiku-4-5_groundedness"