Spaces:
Running
Running
feat(judges): CompletenessJudge + three-point reference-based rubric
Browse files
Three-point rubric (none / partial / full) scored against the gold
reference_answer. Coverage-of-facts framing: score only on what
fraction of the reference's points are present, not on additional
correct facts. Two anchored examples per level.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
agent_bench/evaluation/judges/completeness.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CompletenessJudge — three-point, reference-based on item.reference_answer."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import TYPE_CHECKING
|
| 6 |
+
|
| 7 |
+
from agent_bench.evaluation.judges.base import (
|
| 8 |
+
Judge,
|
| 9 |
+
ScoreResult,
|
| 10 |
+
_call_judge_with_retry,
|
| 11 |
+
)
|
| 12 |
+
from agent_bench.evaluation.judges.groundedness import _system_output_hash
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from agent_bench.agents.orchestrator import AgentResponse
|
| 16 |
+
from agent_bench.evaluation.harness import GoldenQuestion
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class CompletenessJudge(Judge):
    """Judge that scores an agent answer against the item's gold reference answer.

    Accepted numeric scores are 0, 1 and 2; the prompt also offers "Unknown".
    """

    async def score(
        self,
        item: "GoldenQuestion",
        output: "AgentResponse",
        *,
        prompt_seed: int = 0,
    ) -> ScoreResult:
        """Assemble the completeness prompt and hand it to the judge provider.

        Args:
            item: Golden question; its ``id`` and ``reference_answer`` are read.
            output: Agent response; its ``answer`` and ``sources`` are read.
            prompt_seed: Given to the rubric as ``level_permutation_seed`` and
                forwarded unchanged to the judge call.

        Returns:
            Whatever ``_call_judge_with_retry`` produces for this prompt.
        """
        rubric_text = self.rubric.render_prompt(level_permutation_seed=prompt_seed)

        # Prompt layout: rubric, separator, gold reference, answer, instructions.
        header = f"{rubric_text}\n\n---\n\n"
        reference_section = f"## Reference answer (gold)\n{item.reference_answer}\n\n"
        answer_section = f"## Answer to score\n{output.answer}\n\n"
        instructions = (
            "Score this answer against the rubric above. Respond with ONLY a "
            'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
            '"score": 0 or 1 or 2 or "Unknown"}.'
        )
        prompt = header + reference_section + answer_section + instructions

        # The hash is computed from the item id, the answer text and source names.
        source_names = [src.source for src in output.sources]
        output_hash = _system_output_hash(item.id, output.answer, source_names)

        return await _call_judge_with_retry(
            provider=self.judge_provider,
            prompt=prompt,
            valid_scores={0, 1, 2},
            judge_id=self.judge_id,
            rubric_version=self.rubric.source_hash,
            prompt_seed=prompt_seed,
            system_output_hash=output_hash,
            item_id=item.id,
            abstain_allowed=self.rubric.abstain_allowed,
        )
|
agent_bench/evaluation/rubrics/completeness.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
dimension: completeness
|
| 3 |
+
scale: three_point
|
| 4 |
+
reference_based: true
|
| 5 |
+
abstain_allowed: true
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Completeness (three-point)
|
| 9 |
+
|
| 10 |
+
Score how much of the gold reference answer is covered by the agent's
|
| 11 |
+
answer. This is reference-based — the judge sees the gold reference
|
| 12 |
+
and the agent's answer; score on **coverage of facts** in the
|
| 13 |
+
reference, not on additional facts the agent may have included.
|
| 14 |
+
|
| 15 |
+
The judge does not penalize the agent for adding correct extra detail
|
| 16 |
+
(that's a separate concern). Score only on what fraction of the
|
| 17 |
+
reference's points are present.
|
| 18 |
+
|
| 19 |
+
## Score 0
|
| 20 |
+
|
| 21 |
+
None of the reference's key points are present in the answer.
|
| 22 |
+
|
| 23 |
+
### Example A — answer addresses different facts
|
| 24 |
+
|
| 25 |
+
Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
|
| 26 |
+
Answer: "Kubernetes uses YAML manifests to declare resources."
|
| 27 |
+
|
| 28 |
+
Score=0 — none of the three reference points (ordinal, hostname, storage) appear.
|
| 29 |
+
|
| 30 |
+
### Example B — refusal that covers nothing
|
| 31 |
+
|
| 32 |
+
Reference: "The default port is 8080."
|
| 33 |
+
Answer: "I cannot find that information."
|
| 34 |
+
|
| 35 |
+
Score=0 — the reference's single point (port=8080) is not in the answer.
|
| 36 |
+
|
| 37 |
+
## Score 1
|
| 38 |
+
|
| 39 |
+
Some but not all of the reference's points are present.
|
| 40 |
+
|
| 41 |
+
### Example C — partial coverage
|
| 42 |
+
|
| 43 |
+
Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
|
| 44 |
+
Answer: "StatefulSet pods get ordinal indices."
|
| 45 |
+
|
| 46 |
+
Score=1 — one of three points covered.
|
| 47 |
+
|
| 48 |
+
### Example D — half a comparison
|
| 49 |
+
|
| 50 |
+
Reference: "Deployments manage stateless replicas; StatefulSets manage stateful pods with stable identities."
|
| 51 |
+
Answer: "Deployments manage stateless replicas with rolling updates."
|
| 52 |
+
|
| 53 |
+
Score=1 — Deployment side covered, StatefulSet side missing.
|
| 54 |
+
|
| 55 |
+
## Score 2
|
| 56 |
+
|
| 57 |
+
All of the reference's key points are present (paraphrase allowed).
|
| 58 |
+
|
| 59 |
+
### Example E — full coverage with paraphrase
|
| 60 |
+
|
| 61 |
+
Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
|
| 62 |
+
Answer: "Each pod gets an ordinal number, a stable DNS name, and storage that survives restarts."
|
| 63 |
+
|
| 64 |
+
Score=2 — all three points covered with paraphrase.
|
| 65 |
+
|
| 66 |
+
### Example F — full coverage of single-fact reference
|
| 67 |
+
|
| 68 |
+
Reference: "The default port is 8080."
|
| 69 |
+
Answer: "Port 8080."
|
| 70 |
+
|
| 71 |
+
Score=2 — the only reference point is covered.
|
tests/evaluation/test_judges.py
CHANGED
|
@@ -449,3 +449,45 @@ class TestRelevanceJudge:
|
|
| 449 |
assert result.score == 2
|
| 450 |
sent_prompt = provider.complete.await_args.args[0][0].content
|
| 451 |
assert "What's the default kubelet port?" in sent_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
assert result.score == 2
|
| 450 |
sent_prompt = provider.complete.await_args.args[0][0].content
|
| 451 |
assert "What's the default kubelet port?" in sent_prompt
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
class TestCompletenessJudge:
    """Checks the prompt and result metadata produced by CompletenessJudge."""

    @pytest.mark.asyncio
    async def test_reference_answer_in_prompt(self):
        """The gold reference answer is included verbatim in the sent prompt."""
        from agent_bench.agents.orchestrator import AgentResponse
        from agent_bench.core.types import TokenUsage
        from agent_bench.evaluation.harness import GoldenQuestion
        from agent_bench.evaluation.judges.base import Rubric
        from agent_bench.evaluation.judges.completeness import CompletenessJudge

        reference_text = "The default port is 8080."

        # Inputs under evaluation: one golden question and one agent response.
        question = GoldenQuestion(
            id="k8s_003",
            question="?",
            expected_answer_keywords=[],
            expected_sources=[],
            category="retrieval",
            difficulty="easy",
            requires_calculator=False,
            reference_answer=reference_text,
        )
        zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
        response = AgentResponse(
            answer="Port 8080.",
            sources=[],
            iterations=1,
            usage=zero_usage,
            latency_ms=0,
        )

        # Mocked provider that replies with a valid JSON payload carrying score 2.
        mock_provider = AsyncMock(spec=LLMProvider)
        mock_provider.complete.return_value = _mk_response(_valid_json(2))
        completeness_rubric = Rubric.from_markdown_file(
            "agent_bench/evaluation/rubrics/completeness.md"
        )
        judge = CompletenessJudge(
            judge_provider=mock_provider,
            rubric=completeness_rubric,
            model_id="m",
        )

        score_result = await judge.score(question, response)

        assert score_result.score == 2
        assert score_result.judge_id == "m_completeness"
        # First positional argument of complete() is the message list.
        prompt_text = mock_provider.complete.await_args.args[0][0].content
        assert reference_text in prompt_text
|