Nomearod Claude Opus 4.7 (1M context) committed on
Commit
80be2d8
·
1 Parent(s): b170eb6

feat(judges): CompletenessJudge + three-point reference-based rubric

Browse files

Three-point rubric (none / partial / full) scored against the gold
reference_answer. Coverage-of-facts framing: score only on what
fraction of the reference's points are present, not on additional
correct facts. Two anchored examples per level.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/judges/completeness.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CompletenessJudge — three-point, reference-based on item.reference_answer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from agent_bench.evaluation.judges.base import (
8
+ Judge,
9
+ ScoreResult,
10
+ _call_judge_with_retry,
11
+ )
12
+ from agent_bench.evaluation.judges.groundedness import _system_output_hash
13
+
14
+ if TYPE_CHECKING:
15
+ from agent_bench.agents.orchestrator import AgentResponse
16
+ from agent_bench.evaluation.harness import GoldenQuestion
17
+
18
+
19
class CompletenessJudge(Judge):
    """Judge that scores an agent answer against the item's gold reference.

    The prompt sent to the judge provider contains the rendered rubric, the
    item's ``reference_answer`` and the agent's ``answer``. ``{0, 1, 2}`` is
    passed as the set of valid scores.
    """

    async def score(
        self,
        item: "GoldenQuestion",
        output: "AgentResponse",
        *,
        prompt_seed: int = 0,
    ) -> ScoreResult:
        """Build the completeness prompt and delegate to the shared judge call.

        Args:
            item: Golden question; its ``id`` and ``reference_answer`` are read.
            output: Agent response; its ``answer`` and ``sources`` are read.
            prompt_seed: Forwarded to the rubric renderer as
                ``level_permutation_seed`` and to the judge call.

        Returns:
            The ``ScoreResult`` produced by ``_call_judge_with_retry``.
        """
        # Instruction that closes the prompt; describes the expected JSON reply.
        reply_format = (
            "Score this answer against the rubric above. Respond with ONLY a "
            'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
            '"score": 0 or 1 or 2 or "Unknown"}.'
        )
        # Sections are separated by a blank line in the final prompt.
        sections = [
            f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}",
            "---",
            f"## Reference answer (gold)\n{item.reference_answer}",
            f"## Answer to score\n{output.answer}",
            reply_format,
        ]
        prompt = "\n\n".join(sections)

        # Hash inputs: item id, answer text, and the `source` of each cited source.
        source_names = [src.source for src in output.sources]
        output_hash = _system_output_hash(item.id, output.answer, source_names)

        return await _call_judge_with_retry(
            provider=self.judge_provider,
            prompt=prompt,
            valid_scores={0, 1, 2},
            judge_id=self.judge_id,
            rubric_version=self.rubric.source_hash,
            prompt_seed=prompt_seed,
            system_output_hash=output_hash,
            item_id=item.id,
            abstain_allowed=self.rubric.abstain_allowed,
        )
agent_bench/evaluation/rubrics/completeness.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ dimension: completeness
3
+ scale: three_point
4
+ reference_based: true
5
+ abstain_allowed: true
6
+ ---
7
+
8
+ # Completeness (three-point)
9
+
10
+ Score how much of the gold reference answer is covered by the agent's
11
+ answer. This is reference-based — the judge sees the gold reference
12
+ and the agent's answer; score on **coverage of facts** in the
13
+ reference, not on additional facts the agent may have included.
14
+
15
+ The judge does not penalize the agent for adding correct extra detail
16
+ (that's a separate concern). Score only on what fraction of the
17
+ reference's points are present.
18
+
19
+ ## Score 0
20
+
21
+ None of the reference's key points are present in the answer.
22
+
23
+ ### Example A — answer addresses different facts
24
+
25
+ Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
26
+ Answer: "Kubernetes uses YAML manifests to declare resources."
27
+
28
+ Score=0 — none of the three reference points (ordinal, hostname, storage) appear.
29
+
30
+ ### Example B — refusal that covers nothing
31
+
32
+ Reference: "The default port is 8080."
33
+ Answer: "I cannot find that information."
34
+
35
+ Score=0 — the reference's single point (port=8080) is not in the answer.
36
+
37
+ ## Score 1
38
+
39
+ Some but not all of the reference's points are present.
40
+
41
+ ### Example C — partial coverage
42
+
43
+ Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
44
+ Answer: "StatefulSet pods get ordinal indices."
45
+
46
+ Score=1 — one of three points covered.
47
+
48
+ ### Example D — half a comparison
49
+
50
+ Reference: "Deployments manage stateless replicas; StatefulSets manage stateful pods with stable identities."
51
+ Answer: "Deployments manage stateless replicas with rolling updates."
52
+
53
+ Score=1 — Deployment side covered, StatefulSet side missing.
54
+
55
+ ## Score 2
56
+
57
+ All of the reference's key points are present (paraphrase allowed).
58
+
59
+ ### Example E — full coverage with paraphrase
60
+
61
+ Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
62
+ Answer: "Each pod gets an ordinal number, a stable DNS name, and storage that survives restarts."
63
+
64
+ Score=2 — all three points covered with paraphrase.
65
+
66
+ ### Example F — full coverage of single-fact reference
67
+
68
+ Reference: "The default port is 8080."
69
+ Answer: "Port 8080."
70
+
71
+ Score=2 — the only reference point is covered.
tests/evaluation/test_judges.py CHANGED
@@ -449,3 +449,45 @@ class TestRelevanceJudge:
449
  assert result.score == 2
450
  sent_prompt = provider.complete.await_args.args[0][0].content
451
  assert "What's the default kubelet port?" in sent_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  assert result.score == 2
450
  sent_prompt = provider.complete.await_args.args[0][0].content
451
  assert "What's the default kubelet port?" in sent_prompt
452
+
453
+
454
class TestCompletenessJudge:
    """Checks on the prompt and result produced by CompletenessJudge."""

    @pytest.mark.asyncio
    async def test_reference_answer_in_prompt(self):
        """The gold reference answer appears in the prompt sent to the provider."""
        from agent_bench.agents.orchestrator import AgentResponse
        from agent_bench.core.types import TokenUsage
        from agent_bench.evaluation.harness import GoldenQuestion
        from agent_bench.evaluation.judges.base import Rubric
        from agent_bench.evaluation.judges.completeness import CompletenessJudge

        # Judge under test, wired to a mocked provider that replies with score 2.
        loaded_rubric = Rubric.from_markdown_file(
            "agent_bench/evaluation/rubrics/completeness.md"
        )
        mock_provider = AsyncMock(spec=LLMProvider)
        mock_provider.complete.return_value = _mk_response(_valid_json(2))
        judge = CompletenessJudge(
            judge_provider=mock_provider,
            rubric=loaded_rubric,
            model_id="m",
        )

        # Minimal golden item whose reference answer is a single fact.
        golden = GoldenQuestion(
            id="k8s_003",
            question="?",
            expected_answer_keywords=[],
            expected_sources=[],
            category="retrieval",
            difficulty="easy",
            requires_calculator=False,
            reference_answer="The default port is 8080.",
        )
        zero_usage = TokenUsage(
            input_tokens=0, output_tokens=0, estimated_cost_usd=0
        )
        response = AgentResponse(
            answer="Port 8080.",
            sources=[],
            iterations=1,
            usage=zero_usage,
            latency_ms=0,
        )

        result = await judge.score(golden, response)

        assert result.score == 2
        assert result.judge_id == "m_completeness"
        # First positional argument of `complete` is indexed at [0] to get the
        # object whose `.content` holds the prompt text.
        sent_prompt = mock_provider.complete.await_args.args[0][0].content
        assert "The default port is 8080." in sent_prompt