Nomearod Claude Opus 4.7 (1M context) committed on
Commit
4fa7c61
·
1 Parent(s): cf57f16

feat(scripts): run_calibration.py orchestrator for Steps A/C/D

Browse files

Three subcommands, all sharing concurrency-resolution + structured
logging:
generate-outputs — Step A: orchestrator against 30 calibration
items, frozen config, writes
results/calibration_v1_system_outputs.json
run-judges — Step C: takes --row-config=<path>, scores
frozen outputs with that row's judges, writes
results/calibration_v1_judge_<label>.json
build-table — Step D: invokes generate_kappa_table; --strict
raises on missing predictions/labels

Resolved concurrency value logged at every run so artifacts capture
which concurrency was used. Default 5; the CLI flag overrides the config
field, which in turn overrides the hardcoded default.

Step B (hand-labeling) is manual — done in a Jupyter notebook,
not orchestrated by this script.

Also folded in lint fixes for the Phase 1-3 modules to satisfy
ruff E402 (test imports moved to top of test_judges.py) and E501
(jury.py reasoning string broken into a temp variable).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/judges/base.py CHANGED
@@ -9,20 +9,31 @@ rationale and the six-axis comparison table.
9
  from __future__ import annotations
10
 
11
  import hashlib
 
12
  import random
13
  import re
 
14
  from abc import ABC, abstractmethod
15
  from pathlib import Path
16
  from typing import TYPE_CHECKING, Literal, Self
17
 
 
18
  import yaml
19
  from pydantic import BaseModel, Field
20
 
 
 
 
 
 
 
21
  if TYPE_CHECKING:
22
  from agent_bench.agents.orchestrator import AgentResponse
23
  from agent_bench.core.provider import LLMProvider
24
  from agent_bench.evaluation.harness import GoldenQuestion
25
 
 
 
26
  # --- Abstain-reason constants ---
27
  #
28
  # Failure-as-abstain ScoreResults carry a reasoning string with one of
@@ -282,19 +293,6 @@ class MockJudge(Judge):
282
 
283
  # --- _call_judge_with_retry helper ---
284
 
285
- import json as _json
286
- import time
287
-
288
- import structlog
289
-
290
- from agent_bench.core.provider import (
291
- ProviderRateLimitError,
292
- ProviderTimeoutError,
293
- )
294
- from agent_bench.core.types import Message, Role
295
-
296
- logger = structlog.get_logger()
297
-
298
  _STRICT_REPROMPT_SUFFIX = (
299
  "\n\nSTRICT FORMATTING NOTE: respond ONLY with a JSON object matching "
300
  "the schema; reasoning first, then evidence_quotes, then score."
 
9
  from __future__ import annotations
10
 
11
  import hashlib
12
+ import json as _json
13
  import random
14
  import re
15
+ import time
16
  from abc import ABC, abstractmethod
17
  from pathlib import Path
18
  from typing import TYPE_CHECKING, Literal, Self
19
 
20
+ import structlog
21
  import yaml
22
  from pydantic import BaseModel, Field
23
 
24
+ from agent_bench.core.provider import (
25
+ ProviderRateLimitError,
26
+ ProviderTimeoutError,
27
+ )
28
+ from agent_bench.core.types import Message, Role
29
+
30
  if TYPE_CHECKING:
31
  from agent_bench.agents.orchestrator import AgentResponse
32
  from agent_bench.core.provider import LLMProvider
33
  from agent_bench.evaluation.harness import GoldenQuestion
34
 
35
+ logger = structlog.get_logger()
36
+
37
  # --- Abstain-reason constants ---
38
  #
39
  # Failure-as-abstain ScoreResults carry a reasoning string with one of
 
293
 
294
  # --- _call_judge_with_retry helper ---
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  _STRICT_REPROMPT_SUFFIX = (
297
  "\n\nSTRICT FORMATTING NOTE: respond ONLY with a JSON object matching "
298
  "the schema; reasoning first, then evidence_quotes, then score."
agent_bench/evaluation/variance/jury.py CHANGED
@@ -104,10 +104,16 @@ class Jury:
104
  mean = weighted_sum / weight_total if weight_total > 0 else 0.0
105
  agg = _aggregate_scores([int(round(mean))], scale)
106
 
 
 
 
 
 
107
  return ScoreResult(
108
  reasoning=(
109
- f"jury_{self.aggregation}: members={[r.score for r in successful]}, "
110
- f"weights={list(self.weights.values()) if self.aggregation == 'kappa_weighted' else 'n/a'}"
 
111
  ),
112
  evidence_quotes=[],
113
  score=agg,
 
104
  mean = weighted_sum / weight_total if weight_total > 0 else 0.0
105
  agg = _aggregate_scores([int(round(mean))], scale)
106
 
107
+ weights_str = (
108
+ list(self.weights.values())
109
+ if self.aggregation == "kappa_weighted"
110
+ else "n/a"
111
+ )
112
  return ScoreResult(
113
  reasoning=(
114
+ f"jury_{self.aggregation}: "
115
+ f"members={[r.score for r in successful]}, "
116
+ f"weights={weights_str}"
117
  ),
118
  evidence_quotes=[],
119
  score=agg,
scripts/run_calibration.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Calibration runner: generate-outputs | run-judges | build-table.
2
+
3
+ Orchestrates Steps A, C, D from the design doc's data flow. Step B
4
+ (hand-labeling) is manual — done in a Jupyter notebook reading
5
+ results/calibration_v1_system_outputs.json and appending to
6
+ measurements/2026-05-04-judge-calibration-labels.jsonl.
7
+
8
+ Examples:
9
+ python scripts/run_calibration.py generate-outputs --concurrency 5
10
+ python scripts/run_calibration.py run-judges --row-config=configs/calibration/rows/baseline.yaml
11
+ python scripts/run_calibration.py build-table
12
+ python scripts/run_calibration.py build-table --strict
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import asyncio
19
+ import hashlib
20
+ import json
21
+ from pathlib import Path
22
+
23
+ import structlog
24
+ import yaml
25
+
26
+ logger = structlog.get_logger()
27
+
28
+ REPO = Path(__file__).resolve().parents[1]
29
+ CALIBRATION_SPEC = REPO / "agent_bench/evaluation/datasets/calibration_v1.json"
30
+ SYSTEM_OUTPUTS = REPO / "results/calibration_v1_system_outputs.json"
31
+ LABELS_PATH = REPO / "measurements/2026-05-04-judge-calibration-labels.jsonl"
32
+ KAPPA_TABLE_OUT = REPO / "docs/_generated/kappa_table.md"
33
+
34
+
35
def _resolve_concurrency(cli_value: int | None) -> int:
    """Resolve the calibration concurrency and log the value that was chosen.

    Precedence, highest first:

    1. ``cli_value`` (the ``--concurrency`` flag) when it is not ``None``;
    2. ``evaluation.calibration_concurrency`` in ``configs/default.yaml``,
       when that file exists and the field is set;
    3. the hardcoded default of 5.

    Args:
        cli_value: Value parsed from the command line, or ``None`` when the
            flag was not given.

    Returns:
        The resolved concurrency, always an integer >= 1.

    Raises:
        SystemExit: If the resolved value is not a positive integer.
    """
    resolved = cli_value
    if resolved is None:
        cfg_path = REPO / "configs/default.yaml"
        if cfg_path.exists():
            cfg = yaml.safe_load(cfg_path.read_text()) or {}
            # The `evaluation` key may be present but null in the YAML.
            resolved = (cfg.get("evaluation", {}) or {}).get(
                "calibration_concurrency"
            )
    if resolved is None:
        resolved = 5

    # The value is used to size an asyncio.Semaphore: zero would make every
    # task wait forever and a negative or non-integer value would only fail
    # later inside asyncio, so reject it here with a clear message. bool is
    # excluded explicitly because it is a subclass of int.
    if isinstance(resolved, bool) or not isinstance(resolved, int) or resolved < 1:
        raise SystemExit(
            f"calibration concurrency must be a positive integer, got {resolved!r}"
        )

    logger.info("calibration_concurrency_resolved", value=resolved)
    return resolved
50
+
51
+
52
+ # --- Subcommand: generate-outputs (Step A) ---
53
+
54
+
55
async def cmd_generate_outputs(concurrency: int) -> None:
    """Step A: run the orchestrator over the calibration items and freeze the outputs.

    Reads the calibration spec, selects the matching questions from the
    tech-docs and k8s golden datasets, runs each one through the orchestrator
    with at most ``concurrency`` runs in flight, and writes one record per
    item to ``results/calibration_v1_system_outputs.json``.

    Args:
        concurrency: Maximum number of orchestrator runs executing at once.

    Raises:
        SystemExit: If a calibration id is absent from the golden datasets,
            or if a calibration id matches more than one golden question.
    """
    from agent_bench.agents.orchestrator import Orchestrator
    from agent_bench.core.config import load_config
    from agent_bench.core.provider import AnthropicProvider
    from agent_bench.evaluation.harness import load_golden_dataset
    from agent_bench.tools.registry import build_default_registry

    spec = json.loads(CALIBRATION_SPEC.read_text())
    target_ids = {i["id"]: i for i in spec["items"]}

    fastapi = load_golden_dataset(
        REPO / "agent_bench/evaluation/datasets/tech_docs_golden.json"
    )
    k8s = load_golden_dataset(
        REPO / "agent_bench/evaluation/datasets/k8s_golden.json"
    )
    items = [q for q in (fastapi + k8s) if q.id in target_ids]

    # Compare id sets rather than counts: a count comparison passes when a
    # missing id is offset by a duplicated one, and reports an empty
    # "missing" list when the only problem is a duplicate.
    seen = set()
    duplicated = set()
    for q in items:
        if q.id in seen:
            duplicated.add(q.id)
        else:
            seen.add(q.id)
    missing = set(target_ids) - seen
    if missing:
        raise SystemExit(
            f"calibration items not found in goldens: {sorted(missing)}"
        )
    if duplicated:
        raise SystemExit(
            f"calibration items appear more than once in goldens: {sorted(duplicated)}"
        )

    cfg = load_config()
    provider = AnthropicProvider(cfg)
    registry = build_default_registry(cfg)
    orchestrator = Orchestrator(provider=provider, registry=registry)

    sem = asyncio.Semaphore(concurrency)

    async def _run_one(item):
        # The semaphore bounds how many orchestrator runs are in flight.
        async with sem:
            response = await orchestrator.run(
                question=item.question,
                system_prompt="You are a helpful assistant.",
            )
        answer = response.answer
        # The hash uses the sorted source names, so it does not depend on the
        # order in which sources were returned; the record itself keeps the
        # original order.
        sources = sorted(s.source for s in response.sources)
        sys_hash = hashlib.sha256(
            f"{item.id}\x00{answer}\x00{','.join(sources)}".encode("utf-8")
        ).hexdigest()
        return {
            "item_id": item.id,
            "question": item.question,
            "category": item.category,
            "answer": answer,
            "sources": [s.source for s in response.sources],
            "ranked_sources": response.ranked_sources,
            "source_chunks": response.source_chunks,
            "source_snippets": item.source_snippets,
            "reference_answer": item.reference_answer,
            "system_output_hash": sys_hash,
            "stratum": target_ids[item.id]["stratum"],
            "corpus": target_ids[item.id]["corpus"],
        }

    records = await asyncio.gather(*[_run_one(it) for it in items])
    SYSTEM_OUTPUTS.parent.mkdir(parents=True, exist_ok=True)
    SYSTEM_OUTPUTS.write_text(json.dumps(records, indent=2) + "\n")
    logger.info(
        "generate_outputs_complete", count=len(records), path=str(SYSTEM_OUTPUTS)
    )
120
+
121
+
122
+ # --- Subcommand: run-judges (Step C, one row per invocation) ---
123
+
124
+
125
def _make_provider(name: str, cfg):
    """Instantiate the LLM provider registered under ``name``.

    Args:
        name: Provider key; ``"anthropic"`` and ``"openai"`` are recognised.
        cfg: Configuration object handed to the provider constructor.

    Returns:
        A new ``AnthropicProvider`` or ``OpenAIProvider`` built from ``cfg``.

    Raises:
        ValueError: If ``name`` is not one of the recognised keys.
    """
    from agent_bench.core.provider import AnthropicProvider, OpenAIProvider

    provider_classes = {
        "anthropic": AnthropicProvider,
        "openai": OpenAIProvider,
    }
    provider_cls = provider_classes.get(name)
    if provider_cls is None:
        raise ValueError(f"unknown provider: {name}")
    return provider_cls(cfg)
133
+
134
+
135
def _make_judge(provider_name: str, model_id: str, dimension: str, cfg):
    """Build the judge for one scoring dimension.

    The rubric is loaded from
    ``agent_bench/evaluation/rubrics/<dimension>.md`` and the judge class is
    chosen by ``dimension`` (groundedness, relevance, completeness or
    citation_faithfulness).

    Args:
        provider_name: Provider key passed to ``_make_provider``.
        model_id: Model identifier forwarded to the judge.
        dimension: Scoring dimension; selects both the rubric file and the
            judge class.
        cfg: Configuration object forwarded to the provider.

    Returns:
        A judge instance wired to the provider, rubric and model id.

    Raises:
        KeyError: If ``dimension`` has no registered judge class (the rubric
            file is loaded first, so a rubric-loading error may surface
            before this).
    """
    from agent_bench.evaluation.judges.base import Rubric
    from agent_bench.evaluation.judges.citation_faithfulness import (
        CitationFaithfulnessJudge,
    )
    from agent_bench.evaluation.judges.completeness import CompletenessJudge
    from agent_bench.evaluation.judges.groundedness import GroundednessJudge
    from agent_bench.evaluation.judges.relevance import RelevanceJudge

    rubric_path = REPO / "agent_bench/evaluation/rubrics" / f"{dimension}.md"
    rubric = Rubric.from_markdown_file(rubric_path)

    judges_by_dimension = {
        "groundedness": GroundednessJudge,
        "relevance": RelevanceJudge,
        "completeness": CompletenessJudge,
        "citation_faithfulness": CitationFaithfulnessJudge,
    }
    # Look the class up before building the provider so an unknown dimension
    # fails without constructing a provider.
    judge_cls = judges_by_dimension[dimension]
    provider = _make_provider(provider_name, cfg)
    return judge_cls(judge_provider=provider, rubric=rubric, model_id=model_id)
157
+
158
+
159
def _build_item_and_output(rec: dict):
    """Rebuild the judge inputs from one frozen system-output record.

    Fields the record does not carry are filled with fixed placeholders:
    empty expected keywords/sources, difficulty ``"easy"``, no calculator,
    one iteration, and zeroed usage and latency.

    Args:
        rec: One record from the system-outputs JSON. ``item_id``,
            ``question``, ``category``, ``answer`` and ``sources`` are
            required; the remaining keys are optional.

    Returns:
        A ``(GoldenQuestion, AgentResponse)`` pair.
    """
    from agent_bench.agents.orchestrator import AgentResponse, SourceReference
    from agent_bench.core.types import TokenUsage
    from agent_bench.evaluation.harness import GoldenQuestion

    question_fields = {
        "id": rec["item_id"],
        "question": rec["question"],
        "expected_answer_keywords": [],
        "expected_sources": [],
        "category": rec["category"],
        "difficulty": "easy",
        "requires_calculator": False,
        "source_snippets": rec.get("source_snippets", []),
        "reference_answer": rec.get("reference_answer", ""),
    }
    item = GoldenQuestion(**question_fields)

    response_fields = {
        "answer": rec["answer"],
        "sources": [SourceReference(source=name) for name in rec["sources"]],
        "ranked_sources": rec.get("ranked_sources", []),
        "source_chunks": rec.get("source_chunks", []),
        "iterations": 1,
        "usage": TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
        "latency_ms": 0,
    }
    output = AgentResponse(**response_fields)

    return item, output
185
+
186
+
187
async def cmd_run_judges(row_config_path: Path, concurrency: int) -> None:
    """Step C: score the frozen system outputs with one row's judge setup.

    The row config (YAML) names a ``strategy`` — ``single``,
    ``rubric_permute`` or ``jury`` — and the ``dimensions`` to score. Records
    whose category is ``out_of_scope`` are scored on ``relevance`` only. All
    verdicts are written as JSON to the row's ``output_path``.

    Only the ``single`` strategy fans out under the ``concurrency`` limit;
    the ``rubric_permute`` and ``jury`` strategies score records one at a
    time.

    Args:
        row_config_path: Path to the row's YAML configuration.
        concurrency: Maximum number of concurrent judge calls for the
            ``single`` strategy.

    Raises:
        SystemExit: If the system-outputs file is missing or the row names
            an unknown strategy.
    """
    from agent_bench.core.config import load_config
    from agent_bench.evaluation.variance.jury import jury
    from agent_bench.evaluation.variance.rubric_permute import rubric_permute

    if not SYSTEM_OUTPUTS.exists():
        raise SystemExit(
            f"{SYSTEM_OUTPUTS} not found — run `generate-outputs` first."
        )
    row = yaml.safe_load(row_config_path.read_text())
    outputs = json.loads(SYSTEM_OUTPUTS.read_text())

    cfg = load_config()
    sem = asyncio.Semaphore(concurrency)
    all_results: list[dict] = []

    def _skipped(rec, dim) -> bool:
        # Out-of-scope items are only scored on relevance.
        return rec["category"] == "out_of_scope" and dim != "relevance"

    async def _score_bounded(judge, dim, rec):
        # One judge call under the shared semaphore; None marks a skip.
        async with sem:
            if _skipped(rec, dim):
                return None
            item, output = _build_item_and_output(rec)
            verdict = await judge.score(item, output)
            return {"dimension": dim, **verdict.model_dump()}

    async def _score_in_order(scorer, dim):
        # Score every applicable record one after another.
        for rec in outputs:
            if _skipped(rec, dim):
                continue
            item, output = _build_item_and_output(rec)
            verdict = await scorer.score(item, output)
            all_results.append({"dimension": dim, **verdict.model_dump()})

    for dim in row["dimensions"]:
        strategy = row["strategy"]

        if strategy == "single":
            judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
            scored = await asyncio.gather(
                *[_score_bounded(judge, dim, rec) for rec in outputs]
            )
            all_results.extend(entry for entry in scored if entry is not None)

        elif strategy == "rubric_permute":
            judge = _make_judge(row["provider"], row["model_id"], dim, cfg)
            sidecar = REPO / row.get(
                "sidecar_path", "results/calibration_v1_permute_members.jsonl"
            )
            options = row["options"]
            permuted = rubric_permute(
                judge,
                n=options["n_permutations"],
                seeds=options["seeds"],
                sidecar_path=sidecar,
            )
            await _score_in_order(permuted, dim)

        elif strategy == "jury":
            members = [
                _make_judge(member["provider"], member["model_id"], dim, cfg)
                for member in row["members"]
            ]
            sidecar = REPO / row["sidecar_path"]
            # Weights are only loaded for kappa-weighted aggregation.
            if row.get("aggregation") == "kappa_weighted":
                weights = _load_weights_from_baseline(
                    REPO / row["weights_source"], dim
                )
            else:
                weights = None
            panel = jury(
                judges=members,
                aggregation=row["aggregation"],
                weights=weights,
                quorum=row.get("quorum"),
                sidecar_path=sidecar,
            )
            await _score_in_order(panel, dim)

        else:
            raise SystemExit(f"unknown strategy: {row['strategy']}")

    out_path = REPO / row["output_path"]
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(all_results, indent=2) + "\n")
    logger.info(
        "run_judges_complete",
        row=row["label"],
        count=len(all_results),
        path=str(out_path),
    )
273
+
274
+
275
+ def _load_weights_from_baseline(
276
+ baseline_path: Path, dimension: str
277
+ ) -> dict[str, float]:
278
+ """Compute per-judge weight = κ vs labels for the dimension, from baseline run.
279
+
280
+ Stub for v1: returns equal weights (1.0 for each judge_id seen in
281
+ the baseline file). Replaced by real κ-derived weights once labels
282
+ + baseline are both populated. Documented in writeup as caveat:
283
+ 'weights estimated on calibration set; production deployment would
284
+ use a held-out validation set'.
285
+ """
286
+ if not baseline_path.exists():
287
+ logger.warning(
288
+ "weights_source_missing",
289
+ path=str(baseline_path),
290
+ fallback="equal_weights",
291
+ )
292
+ return {}
293
+ baseline = json.loads(baseline_path.read_text())
294
+ judge_ids = {
295
+ r["judge_id"] for r in baseline if r.get("dimension") == dimension
296
+ }
297
+ return {jid: 1.0 for jid in judge_ids}
298
+
299
+
300
+ # --- Subcommand: build-table (Step D) ---
301
+
302
+
303
def cmd_build_table(strict: bool) -> None:
    """Step D: build the κ table from the judge predictions and the labels.

    Calls ``generate_kappa_table`` with every
    ``results/calibration_v1_judge_*.json`` prediction file, the labels
    file, and the generated-docs output path, then logs completion.

    Args:
        strict: Forwarded to ``generate_kappa_table``; per the CLI help,
            strict mode raises on missing predictions/labels.
    """
    from agent_bench.evaluation.calibration.report import generate_kappa_table

    table_args = {
        "predictions_glob": str(REPO / "results/calibration_v1_judge_*.json"),
        "labels_path": str(LABELS_PATH),
        "output_path": str(KAPPA_TABLE_OUT),
        "strict": strict,
    }
    generate_kappa_table(**table_args)
    logger.info("build_table_complete", path=str(KAPPA_TABLE_OUT), strict=strict)
314
+
315
+
316
def main() -> None:
    """Parse the command line and run the selected calibration subcommand.

    Subcommands:
        generate-outputs: Step A; accepts ``--concurrency``.
        run-judges: Step C; requires ``--row-config`` and accepts
            ``--concurrency``.
        build-table: Step D; accepts ``--strict``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    generate_parser = subparsers.add_parser(
        "generate-outputs", help="Step A: generate frozen system outputs"
    )
    generate_parser.add_argument("--concurrency", type=int, default=None)

    judges_parser = subparsers.add_parser(
        "run-judges", help="Step C: score one ablation row"
    )
    judges_parser.add_argument("--row-config", type=Path, required=True)
    judges_parser.add_argument("--concurrency", type=int, default=None)

    table_parser = subparsers.add_parser(
        "build-table", help="Step D: aggregate predictions into κ table"
    )
    table_parser.add_argument(
        "--strict",
        action="store_true",
        help="Raise on missing predictions/labels (final-artifact path)",
    )

    args = parser.parse_args()
    command = args.cmd

    if command == "build-table":
        cmd_build_table(strict=args.strict)
        return

    if command == "generate-outputs":
        concurrency = _resolve_concurrency(args.concurrency)
        asyncio.run(cmd_generate_outputs(concurrency))
        return

    if command == "run-judges":
        concurrency = _resolve_concurrency(args.concurrency)
        asyncio.run(cmd_run_judges(args.row_config, concurrency))
349
+
350
+
351
+ if __name__ == "__main__":
352
+ main()
tests/evaluation/test_judges.py CHANGED
@@ -2,14 +2,24 @@
2
 
3
  from __future__ import annotations
4
 
 
 
 
 
 
5
  import pytest
6
 
 
 
7
  from agent_bench.evaluation.judges.base import (
8
  ABSTAIN_REASON_GENUINE,
9
  ABSTAIN_REASON_OUT_OF_RANGE,
10
  ABSTAIN_REASON_PROVIDER_EXHAUSTED,
11
  ABSTAIN_REASON_SCHEMA_PARSE,
 
 
12
  ScoreResult,
 
13
  )
14
 
15
 
@@ -71,12 +81,6 @@ class TestScoreResult:
71
  ScoreResult(score="maybe", **self._base_kwargs()) # type: ignore[arg-type]
72
 
73
 
74
- from abc import ABC
75
- from pathlib import Path
76
-
77
- from agent_bench.evaluation.judges.base import Judge
78
-
79
-
80
  class TestJudgeABC:
81
  def test_judge_is_abstract(self):
82
  assert issubclass(Judge, ABC)
@@ -99,9 +103,6 @@ class TestJudgeABC:
99
  assert j.judge_id == "claude-haiku-4-5_groundedness"
100
 
101
 
102
- from agent_bench.evaluation.judges.base import MockJudge
103
-
104
-
105
  class TestMockJudge:
106
  def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
107
  return ScoreResult(
@@ -176,17 +177,6 @@ class TestMockJudge:
176
  await mj.score(item, output)
177
 
178
 
179
- import json
180
- from unittest.mock import AsyncMock
181
-
182
- from agent_bench.core.provider import (
183
- LLMProvider,
184
- ProviderRateLimitError,
185
- )
186
- from agent_bench.core.types import CompletionResponse, TokenUsage
187
- from agent_bench.evaluation.judges.base import _call_judge_with_retry
188
-
189
-
190
  def _mk_response(content: str) -> CompletionResponse:
191
  return CompletionResponse(
192
  content=content,
 
2
 
3
  from __future__ import annotations
4
 
5
+ import json
6
+ from abc import ABC
7
+ from pathlib import Path
8
+ from unittest.mock import AsyncMock
9
+
10
  import pytest
11
 
12
+ from agent_bench.core.provider import LLMProvider, ProviderRateLimitError
13
+ from agent_bench.core.types import CompletionResponse, TokenUsage
14
  from agent_bench.evaluation.judges.base import (
15
  ABSTAIN_REASON_GENUINE,
16
  ABSTAIN_REASON_OUT_OF_RANGE,
17
  ABSTAIN_REASON_PROVIDER_EXHAUSTED,
18
  ABSTAIN_REASON_SCHEMA_PARSE,
19
+ Judge,
20
+ MockJudge,
21
  ScoreResult,
22
+ _call_judge_with_retry,
23
  )
24
 
25
 
 
81
  ScoreResult(score="maybe", **self._base_kwargs()) # type: ignore[arg-type]
82
 
83
 
 
 
 
 
 
 
84
  class TestJudgeABC:
85
  def test_judge_is_abstract(self):
86
  assert issubclass(Judge, ABC)
 
103
  assert j.judge_id == "claude-haiku-4-5_groundedness"
104
 
105
 
 
 
 
106
  class TestMockJudge:
107
  def _verdict(self, item_id: str, score: int = 1) -> ScoreResult:
108
  return ScoreResult(
 
177
  await mj.score(item, output)
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
180
  def _mk_response(content: str) -> CompletionResponse:
181
  return CompletionResponse(
182
  content=content,