# agentbench/tests/test_evaluation.py
"""Tests for evaluation metrics, harness, and report generation."""
from __future__ import annotations
import pytest
from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.types import TokenUsage
from agent_bench.evaluation.harness import EvalResult, load_golden_dataset
from agent_bench.evaluation.metrics import (
calculator_used_when_expected,
citation_accuracy,
grounded_refusal,
keyword_hit_rate,
retrieval_precision_at_k,
retrieval_recall_at_k,
source_presence,
)
from agent_bench.evaluation.report import generate_report
# --- Metrics tests ---
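# Contract pinned by the retrieval tests below (a restatement for readability,
# not the implementation): precision@k counts how many of the first k retrieved
# sources (duplicates included) appear in the expected list, divided by the
# number of retrieved sources considered; recall@k counts how many expected
# sources were retrieved, divided by the number expected. Both return 0.0 when
# their denominator list is empty.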
class TestRetrievalMetrics:
def test_precision_at_k_perfect(self):
assert retrieval_precision_at_k(["a.md", "b.md"], ["a.md", "b.md"]) == 1.0
def test_precision_at_k_partial(self):
assert retrieval_precision_at_k(["a.md", "b.md", "c.md"], ["a.md"]) == pytest.approx(1 / 3)
def test_precision_at_k_empty_retrieved(self):
assert retrieval_precision_at_k([], ["a.md"]) == 0.0
def test_recall_at_k_perfect(self):
assert retrieval_recall_at_k(["a.md", "b.md", "c.md"], ["a.md", "b.md"]) == 1.0
def test_recall_at_k_partial(self):
assert retrieval_recall_at_k(["a.md"], ["a.md", "b.md"]) == 0.5
def test_recall_at_k_empty_expected(self):
assert retrieval_recall_at_k(["a.md"], []) == 0.0
def test_precision_uses_ranked_sources_with_duplicates(self):
"""Ranked sources may have duplicates β€” precision should count correctly."""
retrieved = ["a.md", "a.md", "b.md", "c.md", "d.md"]
expected = ["a.md"]
# 2 out of 5 retrieved are "a.md"
assert retrieval_precision_at_k(retrieved, expected, k=5) == pytest.approx(2 / 5)
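# keyword_hit_rate, as exercised below, is the fraction of expected keywords
# found in the answer via case-insensitive (sub)string matching; this comment
# only restates what the assertions require, not how metrics.py implements it.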
class TestKeywordMetrics:
def test_keyword_hit_rate_all_match(self):
assert keyword_hit_rate("curly braces in path", ["curly braces", "path"]) == 1.0
def test_keyword_hit_rate_none_match(self):
assert keyword_hit_rate("something else", ["curly", "braces"]) == 0.0
def test_keyword_hit_rate_case_insensitive(self):
assert keyword_hit_rate("CORSMiddleware", ["corsmiddleware"]) == 1.0
class TestSourcePresence:
def test_has_sources(self):
resp = AgentResponse(
answer="test",
sources=[SourceReference(source="a.md")],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert source_presence(resp) is True
def test_no_sources(self):
resp = AgentResponse(
answer="test",
sources=[],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert source_presence(resp) is False
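# Contract pinned by the tests below (summary only; see
# agent_bench/evaluation/metrics.py for the real implementation):
#   grounded_refusal(answer, category) -> bool
#   - For any category other than "out_of_scope" the metric is vacuously True.
#   - For "out_of_scope" it must (a) detect a refusal, including the canonical
#     "not in the ... documentation" phrasing taught by core/prompts.py, and
#     (b) find no "[source: ...]" citation in the answer text itself.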
class TestGroundedRefusal:
def test_out_of_scope_with_refusal_no_citations(self):
"""Refusal phrase + no [source:] citations in answer text = passes."""
assert (
grounded_refusal("The documentation does not contain this info.", "out_of_scope")
is True
)
def test_out_of_scope_without_refusal(self):
assert grounded_refusal("Here is how you do it...", "out_of_scope") is False
def test_out_of_scope_refusal_with_citation_in_answer_fails(self):
"""Refusal phrase + [source:] citation in answer text = NOT a grounded refusal.
The metric inspects the answer text for citations rather than the
retrieved-sources list; a correct flavor-A refusal retrieves
candidates, inspects them, and explicitly declines to cite any of
them, which is the behavior the metric is designed to measure.
"""
answer = (
"The documentation does not contain this info. "
"[source: some_doc.md]"
)
assert grounded_refusal(answer, "out_of_scope") is False
def test_out_of_scope_refusal_no_citation_passes_even_with_retrieval(self):
"""Flavor-A refusal: agent retrieved candidates but answer cites none."""
answer = (
"The retrieved context does not contain information about Jaeger "
"sidecar injection. I cannot provide an answer."
)
# Under the old signature this test would have failed because the
# retrieved-sources list was non-empty. The fix moves the check to
# the answer text where the actual citations live.
assert grounded_refusal(answer, "out_of_scope") is True
def test_canonical_refusal_phrasing_recognized(self):
"""System-prompt-taught shape "not in the {label} documentation" passes.
core/prompts.py:17-18 instructs the agent to say "the answer is not
in the {corpus_label} documentation" and stop on out-of-scope queries.
The metric must recognize that canonical form.
"""
answer = "The answer is not in the Kubernetes documentation."
assert grounded_refusal(answer, "out_of_scope") is True
def test_not_in_the_is_not_substring_refusal(self):
"""Bare "not in the" fragment must NOT count as refusal.
Pins the design choice to match the canonical shape via a narrow
regex anchored on "documentation" rather than a loose substring.
A future refactor that widens the matcher to substring "not in the"
will break this test; that is the point.
"""
answer = "The rate limit is not in the same scope as the request timeout."
assert grounded_refusal(answer, "out_of_scope") is False
def test_in_scope_always_true(self):
assert grounded_refusal("any answer", "retrieval") is True
class TestCitationAccuracy:
def test_all_citations_valid(self):
answer = "Info from [source: a.md] and [source: b.md]."
assert citation_accuracy(answer, ["a.md", "b.md"]) == 1.0
def test_hallucinated_citation(self):
answer = "Info from [source: fake.md]."
assert citation_accuracy(answer, ["a.md"]) == 0.0
def test_no_citations(self):
assert citation_accuracy("No citations here.", ["a.md"]) == 1.0
class TestCalculatorMetric:
def test_calculator_used_when_required(self):
resp = AgentResponse(
answer="9",
tools_used=["search_documents", "calculator"],
iterations=2,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=True) is True
def test_calculator_not_used_when_required(self):
resp = AgentResponse(
answer="9",
tools_used=["search_documents"],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=True) is False
def test_not_required_always_true(self):
resp = AgentResponse(
answer="test",
tools_used=[],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=False) is True
# --- Golden dataset loading ---
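# The golden dataset is a JSON file of question records. The assertions below
# only require id, question, category, and expected_answer_keywords, so an
# illustrative entry shaped purely by what these tests check might look like:
#   {"id": "q001", "question": "...", "category": "retrieval",
#    "expected_answer_keywords": ["..."]}
# Any additional fields in the real file are not pinned here.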
class TestGoldenDataset:
def test_load_golden_dataset(self):
questions = load_golden_dataset("agent_bench/evaluation/datasets/tech_docs_golden.json")
assert len(questions) == 27
# Check distribution
categories = [q.category for q in questions]
assert categories.count("out_of_scope") == 5
assert categories.count("calculation") == 3
# All have required fields
for q in questions:
assert q.id
assert q.question
assert q.expected_answer_keywords
# --- Report generation ---
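# generate_report, as exercised below, takes a list of EvalResult plus a
# provider_name and returns a Markdown string containing the "## ..." section
# headings and metric labels asserted in the two tests that follow.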
class TestReportGeneration:
def _make_results(self) -> list[EvalResult]:
usage = TokenUsage(input_tokens=100, output_tokens=50, estimated_cost_usd=0.001)
return [
EvalResult(
question_id="q001",
question="Test question?",
category="retrieval",
difficulty="easy",
retrieval_precision=0.8,
retrieval_recall=1.0,
keyword_hit_rate=0.75,
has_source_citation=True,
grounded_refusal=True,
citation_accuracy=1.0,
calculator_used_correctly=True,
tool_calls_made=2,
latency_ms=100.0,
tokens_used=usage,
answer="Test answer",
retrieved_sources=["a.md"],
),
EvalResult(
question_id="q002",
question="Out of scope?",
category="out_of_scope",
difficulty="easy",
retrieval_precision=0.0,
retrieval_recall=0.0,
keyword_hit_rate=0.5,
has_source_citation=False,
grounded_refusal=True,
citation_accuracy=1.0,
calculator_used_correctly=True,
tool_calls_made=1,
latency_ms=50.0,
tokens_used=usage,
answer="Does not contain",
retrieved_sources=[],
),
]
def test_report_contains_required_sections(self):
report = generate_report(self._make_results(), provider_name="test")
assert "## Aggregate Metrics" in report
assert "## By Category" in report
assert "## By Difficulty" in report
assert "## Chunking Strategy Comparison" in report
assert "## Failure Analysis" in report
assert "## Per-Question Results" in report
def test_report_contains_metrics(self):
report = generate_report(self._make_results(), provider_name="test")
assert "Retrieval P@5" in report
assert "Grounded Refusal Rate" in report
assert "Citation Accuracy" in report