|
|
""" |
|
|
LLM Integration Tests - Stage 3 Validation |
|
|
Author: @mangubee |
|
|
Date: 2026-01-02 |
|
|
|
|
|
Tests for Stage 3 LLM integration: |
|
|
- Planning with LLM |
|
|
- Tool selection via function calling |
|
|
- Answer synthesis from evidence |
|
|
- Full workflow with mocked LLM responses |
|
|
""" |
|
|
|
|
|
import pytest |
|
|
from unittest.mock import patch, MagicMock |
|
|
from src.agent.llm_client import ( |
|
|
plan_question, |
|
|
select_tools_with_function_calling, |
|
|
synthesize_answer |
|
|
) |
|
|
from src.tools import TOOLS |
|
|
|
|
|
|
|
|
class TestPlanningFunction:
    """Test LLM-based planning function."""

    @staticmethod
    def _stub_plan_text(mock_anthropic, plan_text):
        """Wire the patched Anthropic constructor to a fake client whose
        messages.create() always returns a response carrying *plan_text*."""
        fake_response = MagicMock()
        fake_response.content = [MagicMock(text=plan_text)]
        fake_client = MagicMock()
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """Test planning with simple question."""
        self._stub_plan_text(
            mock_anthropic, "1. Search for information\n2. Analyze results"
        )

        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS
        )

        # The planner must hand back a non-empty textual plan.
        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Test planning with file context."""
        self._stub_plan_text(
            mock_anthropic, "1. Parse file\n2. Extract data\n3. Calculate answer"
        )

        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"]
        )

        # Same contract as the basic case: a non-empty plan string.
        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan with files: {plan[:50]}...")
|
|
|
|
|
|
|
|
class TestToolSelection:
    """Test LLM function calling for tool selection."""

    @staticmethod
    def _tool_use_block(tool_name, tool_input, call_id):
        """Build a mock content block shaped like an Anthropic tool_use entry.

        Attributes are assigned after construction because ``name`` is a
        reserved keyword argument of the Mock constructor.
        """
        block = MagicMock()
        block.type = "tool_use"
        block.name = tool_name
        block.input = tool_input
        block.id = call_id
        return block

    @staticmethod
    def _stub_response(mock_anthropic, content_blocks):
        """Point the patched Anthropic constructor at a client whose
        messages.create() returns a response with *content_blocks*."""
        fake_response = MagicMock()
        fake_response.content = content_blocks
        fake_client = MagicMock()
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """Test selecting single tool with parameters."""
        search_call = self._tool_use_block(
            "search", {"query": "capital of France"}, "call_001"
        )
        self._stub_response(mock_anthropic, [search_call])

        tool_calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS
        )

        # Exactly one call, mapped into the {"tool": ..., "params": ...} shape.
        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 1
        assert tool_calls[0]["tool"] == "search"
        assert "query" in tool_calls[0]["params"]
        print(f"✓ Selected tool: {tool_calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Test selecting multiple tools in sequence."""
        parse_call = self._tool_use_block(
            "parse_file", {"file_path": "data.xlsx"}, "call_001"
        )
        eval_call = self._tool_use_block(
            "safe_eval", {"expression": "sum(values)"}, "call_002"
        )
        self._stub_response(mock_anthropic, [parse_call, eval_call])

        tool_calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS
        )

        # Both tool_use blocks must survive, in their original order.
        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 2
        assert tool_calls[0]["tool"] == "parse_file"
        assert tool_calls[1]["tool"] == "safe_eval"
        print(f"✓ Selected {len(tool_calls)} tools")
|
|
|
|
|
|
|
|
class TestAnswerSynthesis:
    """Test LLM-based answer synthesis.

    Each test patches the Anthropic client so ``synthesize_answer`` sees a
    canned completion, then checks the synthesized string round-trips intact.
    """

    @staticmethod
    def _install_text_response(mock_anthropic, text):
        """Wire *mock_anthropic* so any messages.create() call returns *text*.

        Centralizes the four-line mock setup that was previously duplicated
        in every test of this class.
        """
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text=text)]
        mock_client = MagicMock()
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"]
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"✓ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        self._install_text_response(mock_anthropic, "42")

        evidence = [
            "[search] The answer to life is 42",
            "[safe_eval] 6 * 7 = 42",
            "[parse_file] Result: 42"
        ]
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=evidence
        )

        assert isinstance(answer, str)
        assert answer == "42"
        # Fix: report the actual evidence count instead of the hard-coded
        # f-string literal {3}, so the message stays truthful if the
        # fixture list changes.
        print(f"✓ Synthesized answer from {len(evidence)} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)"
            ]
        )

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Resolved conflict, answer: {answer}")
|
|
|
|
|
|
|
|
class TestEndToEndWorkflow:
    """Test full agent workflow with mocked LLM."""

    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Test complete workflow: plan → search → answer."""
        from src.agent import GAIAAgent

        # Stub the actual web search tool.
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        # First LLM call: planning response.
        plan_msg = MagicMock()
        plan_msg.content = [MagicMock(text="1. Search for capital of France")]

        # Second LLM call: one tool_use block selecting web_search.
        # (Attributes set post-construction; ``name`` is reserved in Mock's ctor.)
        tool_block = MagicMock()
        tool_block.type = "tool_use"
        tool_block.name = "web_search"
        tool_block.input = {"query": "capital of France"}
        tool_block.id = "call_001"
        tool_msg = MagicMock()
        tool_msg.content = [tool_block]

        # Third LLM call: final synthesized answer.
        answer_msg = MagicMock()
        answer_msg.content = [MagicMock(text="Paris")]

        # Serve the three responses in workflow order via side_effect.
        client = MagicMock()
        client.messages.create.side_effect = [plan_msg, tool_msg, answer_msg]
        mock_anthropic.return_value = client

        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Full workflow completed, answer: {answer}")
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
print("\n" + "="*70) |
|
|
print("GAIA Agent - Stage 3 LLM Integration Tests") |
|
|
print("="*70 + "\n") |
|
|
|
|
|
|
|
|
test_plan = TestPlanningFunction() |
|
|
test_plan.test_plan_question_basic() |
|
|
test_plan.test_plan_with_files() |
|
|
|
|
|
test_tools = TestToolSelection() |
|
|
test_tools.test_select_single_tool() |
|
|
test_tools.test_select_multiple_tools() |
|
|
|
|
|
test_answer = TestAnswerSynthesis() |
|
|
test_answer.test_synthesize_simple_answer() |
|
|
test_answer.test_synthesize_from_multiple_evidence() |
|
|
test_answer.test_synthesize_with_conflicts() |
|
|
|
|
|
test_e2e = TestEndToEndWorkflow() |
|
|
test_e2e.test_full_search_workflow() |
|
|
|
|
|
print("\n" + "="*70) |
|
|
print("✓ All Stage 3 LLM integration tests passed!") |
|
|
print("="*70 + "\n") |
|
|
|