# Source: agentbee — test/test_llm_integration.py
# Commit e7b4937: "fix: correct author name formatting in multiple files"
"""
LLM Integration Tests - Stage 3 Validation
Author: @mangubee
Date: 2026-01-02
Tests for Stage 3 LLM integration:
- Planning with LLM
- Tool selection via function calling
- Answer synthesis from evidence
- Full workflow with mocked LLM responses
"""
import pytest
from unittest.mock import patch, MagicMock
from src.agent.llm_client import (
plan_question,
select_tools_with_function_calling,
synthesize_answer
)
from src.tools import TOOLS
class TestPlanningFunction:
    """Unit tests for the LLM-backed planning step (no real API calls)."""

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """Test planning with simple question."""
        # Stub the Anthropic client so plan_question never hits the network.
        fake_client = MagicMock()
        fake_response = MagicMock()
        fake_response.content = [MagicMock(text="1. Search for information\n2. Analyze results")]
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS,
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Test planning with file context."""
        # Canned three-step plan for a file-based question.
        fake_client = MagicMock()
        fake_response = MagicMock()
        fake_response.content = [MagicMock(text="1. Parse file\n2. Extract data\n3. Calculate answer")]
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"],
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan with files: {plan[:50]}...")
class TestToolSelection:
    """Test LLM function calling for tool selection."""

    @staticmethod
    def _make_tool_use(tool_name, tool_input, call_id):
        """Build a fake Anthropic ``tool_use`` content block.

        ``.name`` must be assigned after construction: passing ``name=`` to
        ``MagicMock()`` would configure the mock's own repr-name instead of
        creating a ``name`` attribute.
        """
        block = MagicMock()
        block.type = "tool_use"
        block.name = tool_name
        block.input = tool_input
        block.id = call_id
        return block

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """Test selecting single tool with parameters."""
        client = MagicMock()
        response = MagicMock()
        response.content = [
            self._make_tool_use("search", {"query": "capital of France"}, "call_001"),
        ]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client

        calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS,
        )

        assert isinstance(calls, list)
        assert len(calls) == 1
        assert calls[0]["tool"] == "search"
        assert "query" in calls[0]["params"]
        print(f"✓ Selected tool: {calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Test selecting multiple tools in sequence."""
        client = MagicMock()
        response = MagicMock()
        # Two tool_use blocks returned in one response: parse, then compute.
        response.content = [
            self._make_tool_use("parse_file", {"file_path": "data.xlsx"}, "call_001"),
            self._make_tool_use("safe_eval", {"expression": "sum(values)"}, "call_002"),
        ]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client

        calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS,
        )

        assert isinstance(calls, list)
        assert len(calls) == 2
        assert calls[0]["tool"] == "parse_file"
        assert calls[1]["tool"] == "safe_eval"
        print(f"✓ Selected {len(calls)} tools")
class TestAnswerSynthesis:
    """Test LLM-based answer synthesis.

    Every test stubs the Anthropic client so ``synthesize_answer`` receives a
    canned completion; the shared ``_install_text_response`` helper replaces
    the identical five-line mock setup that was duplicated in all three tests.
    """

    @staticmethod
    def _install_text_response(mock_anthropic, text):
        """Wire *mock_anthropic* so the mocked client returns one text block.

        Args:
            mock_anthropic: the patched ``Anthropic`` class mock.
            text: completion text the fake LLM should return.

        Returns:
            The mocked client, should a test want to inspect its call args.
        """
        client = MagicMock()
        response = MagicMock()
        response.content = [MagicMock(text=text)]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client
        return client

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"],
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"✓ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        self._install_text_response(mock_anthropic, "42")

        evidence_items = [
            "[search] The answer to life is 42",
            "[safe_eval] 6 * 7 = 42",
            "[parse_file] Result: 42",
        ]
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=evidence_items,
        )

        assert isinstance(answer, str)
        assert answer == "42"
        # len(evidence_items) == 3, matching the original hard-coded count.
        print(f"✓ Synthesized answer from {len(evidence_items)} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        # The stubbed LLM "resolves" the conflict in favor of Paris.
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)",
            ],
        )

        assert isinstance(answer, str)
        assert answer == "Paris"  # Should pick more recent/credible source
        print(f"✓ Resolved conflict, answer: {answer}")
class TestEndToEndWorkflow:
    """Test full agent workflow with mocked LLM."""

    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Test complete workflow: plan → search → answer."""
        from src.agent import GAIAAgent

        # The search tool returns canned text instead of hitting the web.
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        # Three sequential LLM calls: planning, tool selection, synthesis.
        plan_resp = MagicMock()
        plan_resp.content = [MagicMock(text="1. Search for capital of France")]

        tool_use = MagicMock()
        tool_use.type = "tool_use"
        tool_use.name = "web_search"
        tool_use.input = {"query": "capital of France"}
        tool_use.id = "call_001"
        tool_resp = MagicMock()
        tool_resp.content = [tool_use]

        answer_resp = MagicMock()
        answer_resp.content = [MagicMock(text="Paris")]

        # side_effect hands back one response per create() call, in order.
        client = MagicMock()
        client.messages.create.side_effect = [
            plan_resp,
            tool_resp,
            answer_resp,
        ]
        mock_anthropic.return_value = client

        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Full workflow completed, answer: {answer}")
if __name__ == "__main__":
    # Quick manual smoke run outside of pytest; the @patch decorators on each
    # method supply the mock arguments when the methods are called directly.
    divider = "=" * 70
    print("\n" + divider)
    print("GAIA Agent - Stage 3 LLM Integration Tests")
    print(divider + "\n")

    planning = TestPlanningFunction()
    planning.test_plan_question_basic()
    planning.test_plan_with_files()

    selection = TestToolSelection()
    selection.test_select_single_tool()
    selection.test_select_multiple_tools()

    synthesis = TestAnswerSynthesis()
    synthesis.test_synthesize_simple_answer()
    synthesis.test_synthesize_from_multiple_evidence()
    synthesis.test_synthesize_with_conflicts()

    workflow = TestEndToEndWorkflow()
    workflow.test_full_search_workflow()

    print("\n" + divider)
    print("✓ All Stage 3 LLM integration tests passed!")
    print(divider + "\n")