Spaces:
Sleeping
Sleeping
| """Tests for LangChain evaluation runner.""" | |
| from unittest.mock import AsyncMock, MagicMock | |
| from agent_bench.langchain_baseline.runner import ( | |
| extract_tools_used, | |
| run_langchain_evaluation, | |
| ) | |
| from agent_bench.langchain_baseline.tools import LangChainSearchTool | |
| # --- Unit tests for helper functions --- | |
def test_extract_tools_used_from_intermediate_steps():
    """Tool names come back in the same order as the intermediate steps."""
    # Each step is an (action, observation) pair; the mocks only have their
    # ``tool`` attribute configured.
    search_action = MagicMock(tool="search_documents")
    calc_action = MagicMock(tool="calculator")
    intermediate_steps = [
        (search_action, "result1"),
        (calc_action, "result2"),
    ]

    tool_names = extract_tools_used(intermediate_steps)

    assert tool_names == ["search_documents", "calculator"]
def test_extract_tools_used_empty_steps():
    """An empty list of intermediate steps yields an empty list of tools."""
    no_steps = []
    assert extract_tools_used(no_steps) == []
| # --- Integration test with mock agent executor --- | |
async def test_runner_produces_eval_results():
    """Run two golden questions through a mocked executor and inspect the first.

    The runner calls reset() before each question, so seeding the search tool
    up front would not work. The fake ``ainvoke`` below instead fills in the
    tool's metadata at call time, mimicking the agent using the search tool
    while it executes.
    """
    retriever_stub = MagicMock()
    search_tool = LangChainSearchTool(retriever_stub)

    def _fake_ainvoke(*_args, **_kwargs):
        """Record one retrieved source on the search tool, then answer."""
        source_name = "fastapi_path_params.md"
        search_tool.last_ranked_sources.append(source_name)
        search_tool.last_source_chunks.append("Path params use curly braces.")
        search_tool.last_sources.append(source_name)
        steps = [(MagicMock(tool="search_documents"), "tool output")]
        return {
            "output": "Path params use curly braces. [source: fastapi_path_params.md]",
            "intermediate_steps": steps,
        }

    agent_executor = MagicMock(ainvoke=AsyncMock(side_effect=_fake_ainvoke))

    results = await run_langchain_evaluation(
        agent_executor=agent_executor,
        search_tool_state=search_tool,
        golden_path="agent_bench/evaluation/datasets/tech_docs_golden.json",
        provider_name="openai",
        max_questions=2,  # keep the run short: first two questions only
    )

    assert len(results) == 2

    first = results[0]
    assert first.question_id == "q001"
    assert first.question == "How do you define a path parameter in FastAPI?"
    assert first.category == "retrieval"
    assert first.answer != ""

    # The retrieval metadata must survive the per-question reset.
    assert first.retrieval_precision > 0.0
    assert first.retrieval_recall > 0.0
    assert first.retrieved_sources == ["fastapi_path_params.md"]
async def test_runner_handles_agent_error():
    """A raising executor still yields one result, marked as an error."""
    search_tool = LangChainSearchTool(MagicMock())

    # Every ainvoke call raises, standing in for a failing LLM/API call.
    failing_executor = MagicMock(
        ainvoke=AsyncMock(side_effect=RuntimeError("API error")),
    )

    results = await run_langchain_evaluation(
        agent_executor=failing_executor,
        search_tool_state=search_tool,
        golden_path="agent_bench/evaluation/datasets/tech_docs_golden.json",
        provider_name="openai",
        max_questions=1,
    )

    assert len(results) == 1

    failed = results[0]
    assert "ERROR" in failed.answer
    assert failed.tool_calls_made == 0