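# NOTE: the next three lines are hosting-page residue (apparently the author
# avatar text, a commit message, and a commit hash). They are kept as comments
# so that the module parses.
# agercas's picture
# refactor code
# b6088cd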
"""
Multi-Agent System for GAIA Benchmark using smolagents
Architecture: Coordinator -> Specialized Agents
"""
from typing import Any
from smolagents import CodeAgent, HfApiModel
from src.tools import all_tools
class GAIAMultiAgentSystem:
"""
Multi-agent system designed for GAIA benchmark tasks.
Uses a coordinator agent that delegates to specialized agents.
"""
def __init__(self, model_config: dict[str, Any] | None = None):
"""
Initialize the multi-agent system.
Args:
model_config: Configuration for the language model
e.g., {"model_id": "Qwen/Qwen2.5-Coder-32B-Instruct", "provider": "together"}
"""
model_config = model_config or {}
self.model = HfApiModel(**model_config)
# self.model = InferenceClientModel(**model_config)
self.agents = {}
self._setup_agents()
self._setup_coordinator()
def _setup_agents(self):
"""Set up all specialized agents with their respective tools."""
# Search Agent - Information retrieval
search_tools = [
# Assuming these are your actual tool instances
# Replace with actual tool references from all_tools
"wikipedia_search",
"wikipedia_search_tool",
"duckduckgo_search",
"web_search_duckduckgo",
"arxiv_search",
"fetch_webpage_content",
]
self.agents["search_agent"] = CodeAgent(
model=self.model,
tools=[tool for tool in all_tools if tool.name in search_tools],
name="search_agent",
description="Retrieves factual information and background data from various sources including Wikipedia, web search, and academic papers",
verbosity_level=1,
max_steps=10,
)
# Document Agent - Document processing
document_tools = ["load_csv_file", "load_excel_file", "read_text_file", "transcribe_audio_file"]
self.agents["document_agent"] = CodeAgent(
model=self.model,
tools=[tool for tool in all_tools if tool.name in document_tools],
name="document_agent",
description="Loads and processes structured and unstructured documents including CSV, Excel, text files, and audio transcriptions",
verbosity_level=1,
max_steps=8,
)
# Vision Agent - Image processing
vision_tools = ["ocr_tool", "image_captioning_tool", "visual_qa_tool"]
self.agents["vision_agent"] = CodeAgent(
model=self.model,
tools=[tool for tool in all_tools if tool.name in vision_tools],
name="vision_agent",
description="Extracts text and meaning from images using OCR, captioning, and visual question answering",
verbosity_level=1,
max_steps=6,
)
# Reasoning Agent - Logic and analysis
reasoning_tools = ["analyze_chess_position", "analyze_table_commutativity", "count_items_in_list"]
self.agents["reasoning_agent"] = CodeAgent(
model=self.model,
tools=[tool for tool in all_tools if tool.name in reasoning_tools],
name="reasoning_agent",
description="Performs symbolic reasoning, logical pattern recognition, and analytical tasks",
verbosity_level=1,
max_steps=8,
)
# Language Agent - Text processing
language_tools = ["reverse_string", "reverse_words_in_string"]
# Note: Language agent might need additional string manipulation tools
self.agents["language_agent"] = CodeAgent(
model=self.model,
tools=[tool for tool in all_tools if tool.name in language_tools],
name="language_agent",
description="Handles low-level text transformations and string manipulations",
verbosity_level=1,
max_steps=5,
)
# Coding Agent - Python execution and logic
self.agents["coding_agent"] = CodeAgent(
model=self.model,
tools=[], # Uses implicit code execution capabilities
name="coding_agent",
description="Executes Python code and performs computational logic through code interpretation",
additional_authorized_imports=[
"pandas",
"numpy",
"matplotlib",
"json",
"re",
"datetime",
"math",
"statistics",
"itertools",
],
verbosity_level=1,
max_steps=10,
)
def _setup_coordinator(self):
"""Set up the coordinator agent that manages other agents."""
# Collect all managed agents
managed_agents = list(self.agents.values())
self.coordinator = CodeAgent(
model=self.model,
tools=[], # Coordinator has no direct tools
managed_agents=managed_agents,
name="coordinator",
description="Coordinates and delegates tasks to specialized agents based on task requirements",
planning_interval=3, # Plan every 3 steps
verbosity_level=2,
max_steps=20,
)
def analyze_task(self, task: str) -> dict[str, Any]:
"""
Analyze a GAIA task to determine which agents might be needed.
Args:
task: The task description
Returns:
Dictionary with task analysis
"""
analysis_prompt = f"""
Analyze this GAIA benchmark task and determine which types of agents would be most useful:
Task: {task}
Available agent types:
- search_agent: For finding factual information online
- document_agent: For processing files (CSV, Excel, text, audio)
- vision_agent: For analyzing images
- reasoning_agent: For logical analysis and pattern recognition
- language_agent: For text transformations
- coding_agent: For computational tasks and data processing
Provide a brief analysis of what agents would be needed and why.
"""
# Use the coordinator's model for analysis
response = self.model([{"role": "user", "content": analysis_prompt}])
return {"analysis": response.content, "task": task}
def solve_task(self, task: str, context: str | None = None) -> Any:
"""
Solve a GAIA benchmark task using the multi-agent system.
Args:
task: The task to solve
context: Optional additional context
Returns:
The result from the coordinator agent
"""
# Prepare the enhanced prompt for the coordinator
enhanced_task = f"""
You are coordinating a team of specialized agents to solve this GAIA benchmark task.
TASK: {task}
{f"CONTEXT: {context}" if context else ""}
Available agents and their capabilities:
- search_agent: Retrieves information from Wikipedia, web search, ArXiv
- document_agent: Processes CSV, Excel, text files, and audio transcriptions
- vision_agent: Analyzes images with OCR, captioning, and visual QA
- reasoning_agent: Performs logical analysis and pattern recognition
- language_agent: Handles text transformations and string operations
- coding_agent: Executes Python code for computational tasks
Strategy:
1. Analyze what type of information or processing is needed
2. Delegate to appropriate specialized agents
3. Combine results from multiple agents if needed
4. Provide a final comprehensive answer
Be systematic and thorough. Use multiple agents when the task requires different types of expertise.
"""
return self.coordinator.run(enhanced_task)
def get_agent_info(self) -> dict[str, dict]:
"""Get information about all agents in the system."""
info = {}
for name, agent in self.agents.items():
info[name] = {
"description": agent.description,
"tools": [tool.name for tool in agent.tools] if hasattr(agent, "tools") else [],
"max_steps": agent.max_steps,
}
info["coordinator"] = {
"description": self.coordinator.description,
"managed_agents": [agent.name for agent in self.coordinator.managed_agents],
"max_steps": self.coordinator.max_steps,
}
return info
def visualize_system(self):
"""Visualize the multi-agent system structure."""
if hasattr(self.coordinator, "visualize"):
return self.coordinator.visualize()
else:
print("System Structure:")
print("Coordinator")
for agent_name in self.agents.keys():
print(f" └── {agent_name}")