# agentbench/tests/test_evaluation.py
"""Tests for evaluation metrics, harness, and report generation."""
from __future__ import annotations
import pytest
from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.types import TokenUsage
from agent_bench.evaluation.harness import EvalResult, load_golden_dataset
from agent_bench.evaluation.metrics import (
calculator_used_when_expected,
citation_accuracy,
grounded_refusal,
keyword_hit_rate,
retrieval_precision_at_k,
retrieval_recall_at_k,
source_presence,
)
from agent_bench.evaluation.report import generate_report
# --- Metrics tests ---
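# Contract pinned by the retrieval tests below (a restatement for readability,
# not the implementation): precision@k counts how many of the first k retrieved
# sources (duplicates included) appear in the expected list, divided by the
# number of retrieved sources considered; recall@k counts how many expected
# sources were retrieved, divided by the number expected. Both return 0.0 when
# their denominator list is empty.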
class TestRetrievalMetrics:
def test_precision_at_k_perfect(self):
assert retrieval_precision_at_k(["a.md", "b.md"], ["a.md", "b.md"]) == 1.0
def test_precision_at_k_partial(self):
assert retrieval_precision_at_k(["a.md", "b.md", "c.md"], ["a.md"]) == pytest.approx(1 / 3)
def test_precision_at_k_empty_retrieved(self):
assert retrieval_precision_at_k([], ["a.md"]) == 0.0
def test_recall_at_k_perfect(self):
assert retrieval_recall_at_k(["a.md", "b.md", "c.md"], ["a.md", "b.md"]) == 1.0
def test_recall_at_k_partial(self):
assert retrieval_recall_at_k(["a.md"], ["a.md", "b.md"]) == 0.5
def test_recall_at_k_empty_expected(self):
assert retrieval_recall_at_k(["a.md"], []) == 0.0
def test_precision_uses_ranked_sources_with_duplicates(self):
"""Ranked sources may have duplicates β€” precision should count correctly."""
retrieved = ["a.md", "a.md", "b.md", "c.md", "d.md"]
expected = ["a.md"]
# 2 out of 5 retrieved are "a.md"
assert retrieval_precision_at_k(retrieved, expected, k=5) == pytest.approx(2 / 5)
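# keyword_hit_rate, as exercised below, is the fraction of expected keywords
# found in the answer via case-insensitive (sub)string matching; this comment
# only restates what the assertions require, not how metrics.py implements it.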
class TestKeywordMetrics:
def test_keyword_hit_rate_all_match(self):
assert keyword_hit_rate("curly braces in path", ["curly braces", "path"]) == 1.0
def test_keyword_hit_rate_none_match(self):
assert keyword_hit_rate("something else", ["curly", "braces"]) == 0.0
def test_keyword_hit_rate_case_insensitive(self):
assert keyword_hit_rate("CORSMiddleware", ["corsmiddleware"]) == 1.0
class TestSourcePresence:
def test_has_sources(self):
resp = AgentResponse(
answer="test",
sources=[SourceReference(source="a.md")],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert source_presence(resp) is True
def test_no_sources(self):
resp = AgentResponse(
answer="test",
sources=[],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert source_presence(resp) is False
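# Contract pinned by the tests below (summary only; see
# agent_bench/evaluation/metrics.py for the real implementation):
#   grounded_refusal(answer, category) -> bool
#   - For any category other than "out_of_scope" the metric is vacuously True.
#   - For "out_of_scope" it must (a) detect a refusal, including the canonical
#     "not in the ... documentation" phrasing taught by core/prompts.py, and
#     (b) find no "[source: ...]" citation in the answer text itself.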
class TestGroundedRefusal:
def test_out_of_scope_with_refusal_no_citations(self):
"""Refusal phrase + no [source:] citations in answer text = passes."""
assert (
grounded_refusal("The documentation does not contain this info.", "out_of_scope")
is True
)
def test_out_of_scope_without_refusal(self):
assert grounded_refusal("Here is how you do it...", "out_of_scope") is False
def test_out_of_scope_refusal_with_citation_in_answer_fails(self):
"""Refusal phrase + [source:] citation in answer text = NOT a grounded refusal.
The metric inspects the answer text for citations rather than the
retrieved-sources list; a correct flavor-A refusal retrieves
candidates, inspects them, and explicitly declines to cite any of
them, which is the behavior the metric is designed to measure.
"""
answer = (
"The documentation does not contain this info. "
"[source: some_doc.md]"
)
assert grounded_refusal(answer, "out_of_scope") is False
def test_out_of_scope_refusal_no_citation_passes_even_with_retrieval(self):
"""Flavor-A refusal: agent retrieved candidates but answer cites none."""
answer = (
"The retrieved context does not contain information about Jaeger "
"sidecar injection. I cannot provide an answer."
)
# Under the old signature this test would have failed because the
# retrieved-sources list was non-empty. The fix moves the check to
# the answer text where the actual citations live.
assert grounded_refusal(answer, "out_of_scope") is True
def test_canonical_refusal_phrasing_recognized(self):
"""System-prompt-taught shape "not in the {label} documentation" passes.
core/prompts.py:17-18 instructs the agent to say "the answer is not
in the {corpus_label} documentation" and stop on out-of-scope queries.
The metric must recognize that canonical form.
"""
answer = "The answer is not in the Kubernetes documentation."
assert grounded_refusal(answer, "out_of_scope") is True
def test_not_in_the_is_not_substring_refusal(self):
"""Bare "not in the" fragment must NOT count as refusal.
Pins the design choice to match the canonical shape via a narrow
regex anchored on "documentation" rather than a loose substring.
A future refactor that widens the matcher to substring "not in the"
will break this test; that is the point.
"""
answer = "The rate limit is not in the same scope as the request timeout."
assert grounded_refusal(answer, "out_of_scope") is False
def test_in_scope_always_true(self):
assert grounded_refusal("any answer", "retrieval") is True
class TestCitationAccuracy:
def test_all_citations_valid(self):
answer = "Info from [source: a.md] and [source: b.md]."
assert citation_accuracy(answer, ["a.md", "b.md"]) == 1.0
def test_hallucinated_citation(self):
answer = "Info from [source: fake.md]."
assert citation_accuracy(answer, ["a.md"]) == 0.0
def test_no_citations(self):
assert citation_accuracy("No citations here.", ["a.md"]) == 1.0
class TestCalculatorMetric:
def test_calculator_used_when_required(self):
resp = AgentResponse(
answer="9",
tools_used=["search_documents", "calculator"],
iterations=2,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=True) is True
def test_calculator_not_used_when_required(self):
resp = AgentResponse(
answer="9",
tools_used=["search_documents"],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=True) is False
def test_not_required_always_true(self):
resp = AgentResponse(
answer="test",
tools_used=[],
iterations=1,
usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
latency_ms=1.0,
)
assert calculator_used_when_expected(resp, requires_calculator=False) is True
# --- Golden dataset loading ---
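# The golden dataset is a JSON file of question records. The assertions below
# only require id, question, category, and expected_answer_keywords, so an
# illustrative entry shaped purely by what these tests check might look like:
#   {"id": "q001", "question": "...", "category": "retrieval",
#    "expected_answer_keywords": ["..."]}
# Any additional fields in the real file are not pinned here.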
class TestGoldenDataset:
def test_load_golden_dataset(self):
questions = load_golden_dataset("agent_bench/evaluation/datasets/tech_docs_golden.json")
assert len(questions) == 27
# Check distribution
categories = [q.category for q in questions]
assert categories.count("out_of_scope") == 5
assert categories.count("calculation") == 3
# All have required fields
for q in questions:
assert q.id
assert q.question
assert q.expected_answer_keywords
# --- Report generation ---
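# generate_report, as exercised below, takes a list of EvalResult plus a
# provider_name and returns a Markdown string containing the "## ..." section
# headings and metric labels asserted in the two tests that follow.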
class TestReportGeneration:
def _make_results(self) -> list[EvalResult]:
usage = TokenUsage(input_tokens=100, output_tokens=50, estimated_cost_usd=0.001)
return [
EvalResult(
question_id="q001",
question="Test question?",
category="retrieval",
difficulty="easy",
retrieval_precision=0.8,
retrieval_recall=1.0,
keyword_hit_rate=0.75,
has_source_citation=True,
grounded_refusal=True,
citation_accuracy=1.0,
calculator_used_correctly=True,
tool_calls_made=2,
latency_ms=100.0,
tokens_used=usage,
answer="Test answer",
retrieved_sources=["a.md"],
),
EvalResult(
question_id="q002",
question="Out of scope?",
category="out_of_scope",
difficulty="easy",
retrieval_precision=0.0,
retrieval_recall=0.0,
keyword_hit_rate=0.5,
has_source_citation=False,
grounded_refusal=True,
citation_accuracy=1.0,
calculator_used_correctly=True,
tool_calls_made=1,
latency_ms=50.0,
tokens_used=usage,
answer="Does not contain",
retrieved_sources=[],
),
]
def test_report_contains_required_sections(self):
report = generate_report(self._make_results(), provider_name="test")
assert "## Aggregate Metrics" in report
assert "## By Category" in report
assert "## By Difficulty" in report
assert "## Chunking Strategy Comparison" in report
assert "## Failure Analysis" in report
assert "## Per-Question Results" in report
def test_report_contains_metrics(self):
report = generate_report(self._make_results(), provider_name="test")
assert "Retrieval P@5" in report
assert "Grounded Refusal Rate" in report
assert "Citation Accuracy" in report