Spaces:

Nomearod
/

agentbench

Sleeping

App Files Files Community

agentbench / tests /test_langchain_baseline /test_runner.py

Nomearod

fix: deferred imports, match iteration budget, token cost tracking

2c64504 2 months ago

raw

history blame contribute delete

3.27 kB

	"""Tests for LangChain evaluation runner."""

	from unittest.mock import AsyncMock, MagicMock

	from agent_bench.langchain_baseline.runner import (
	extract_tools_used,
	run_langchain_evaluation,
	)
	from agent_bench.langchain_baseline.tools import LangChainSearchTool

	# --- Unit tests for helper functions ---


	def test_extract_tools_used_from_intermediate_steps():
	    """Tool names come back in the same order as the recorded steps."""
	    # (tool name, observation) pairs standing in for the agent's
	    # intermediate steps.
	    recorded = [("search_documents", "result1"), ("calculator", "result2")]

	    steps = [(MagicMock(tool=name), observation) for name, observation in recorded]

	    assert extract_tools_used(steps) == [name for name, _ in recorded]


	def test_extract_tools_used_empty_steps():
	    """With no intermediate steps, no tool names are reported."""
	    no_steps = []
	    tools = extract_tools_used(no_steps)
	    assert tools == []


	# --- Integration test with mock agent executor ---


	async def test_runner_produces_eval_results():
	    """Run two golden questions through a mocked executor and check the first result.

	    The runner calls reset() before each question, so pre-populating the
	    search tool's state won't work. Instead, the mocked ``ainvoke`` side
	    effect populates the metadata, simulating what happens when the agent
	    calls the search tool during execution.
	    """
	    mock_lc_retriever = MagicMock()
	    search_tool = LangChainSearchTool(mock_lc_retriever)

	    # A mock's side_effect is called with the same arguments as the mock
	    # itself, so accept any positional/keyword arguments the runner passes
	    # to ainvoke(); a narrower signature would raise TypeError in the mock.
	    def _populate_search_state(*args, **kwargs):
	        """Simulate the search tool populating metadata during agent execution."""
	        search_tool.last_ranked_sources.append("fastapi_path_params.md")
	        search_tool.last_source_chunks.append("Path params use curly braces.")
	        search_tool.last_sources.append("fastapi_path_params.md")
	        return {
	            "output": "Path params use curly braces. [source: fastapi_path_params.md]",
	            "intermediate_steps": [
	                (MagicMock(tool="search_documents"), "tool output"),
	            ],
	        }

	    agent_executor = MagicMock()
	    agent_executor.ainvoke = AsyncMock(side_effect=_populate_search_state)

	    golden_path = "agent_bench/evaluation/datasets/tech_docs_golden.json"

	    results = await run_langchain_evaluation(
	        agent_executor=agent_executor,
	        search_tool_state=search_tool,
	        golden_path=golden_path,
	        provider_name="openai",
	        max_questions=2,  # only run first 2 for speed
	    )

	    assert len(results) == 2
	    r = results[0]
	    assert r.question_id == "q001"
	    assert r.question == "How do you define a path parameter in FastAPI?"
	    assert r.category == "retrieval"
	    assert r.answer != ""
	    # Verify metadata actually propagated (not zeroed by reset)
	    assert r.retrieval_precision > 0.0
	    assert r.retrieval_recall > 0.0
	    assert r.retrieved_sources == ["fastapi_path_params.md"]


	async def test_runner_handles_agent_error():
	    """An executor that raises yields one result flagged ERROR with no tool calls."""
	    search_state = LangChainSearchTool(MagicMock())

	    # Every ainvoke() call raises, standing in for a failing provider API.
	    failing_executor = MagicMock()
	    failing_executor.ainvoke = AsyncMock(side_effect=RuntimeError("API error"))

	    results = await run_langchain_evaluation(
	        agent_executor=failing_executor,
	        search_tool_state=search_state,
	        golden_path="agent_bench/evaluation/datasets/tech_docs_golden.json",
	        provider_name="openai",
	        max_questions=1,
	    )

	    assert len(results) == 1
	    failed = results[0]
	    assert "ERROR" in failed.answer
	    assert failed.tool_calls_made == 0