File size: 10,607 Bytes
225a75e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
#!/usr/bin/env python3
"""
Real GAIA Questions Test for GAIA Agent System
Tests the system with actual GAIA benchmark questions
"""
import json
import os
import sys
import time
from pathlib import Path
from typing import Dict, List
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from agents.state import GAIAAgentState, QuestionType, AgentRole
from agents.router import RouterAgent
from agents.web_researcher import WebResearchAgent
from agents.file_processor_agent import FileProcessorAgent
from agents.reasoning_agent import ReasoningAgent
from models.qwen_client import QwenClient
def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
    """Load GAIA questions from a JSON file.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The parsed list of question dicts. An empty list is returned (after
        printing a message) when the file does not exist, when it is not
        valid JSON, or when its top-level value is not a list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            questions = json.load(f)
    except FileNotFoundError:
        print(f"β Questions file not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"β Invalid JSON in questions file: {e}")
        return []

    # Callers slice and iterate the result, so anything other than a list
    # (e.g. a top-level JSON object) is treated like an unusable file.
    if not isinstance(questions, list):
        print(
            "Questions file must contain a JSON list, "
            f"got {type(questions).__name__}: {file_path}"
        )
        return []
    return questions
def classify_question_manually(question: str, file_name: str) -> Dict:
    """Classify a GAIA question with fixed keyword and file-extension rules.

    This is a hand-written baseline to compare with the router. Rules are
    checked in a fixed order and the first match wins: Wikipedia keywords,
    YouTube links, attached-file extension, math keywords, text-manipulation
    keywords, sports keywords, and finally a general-research fallback.

    Args:
        question: The question text.
        file_name: Name of the attached file; an empty or ``None`` value
            means no file-extension rule can match.

    Returns:
        A dict with the keys ``"type"`` (human-readable category) and
        ``"expected_agent"`` (``"web_researcher"``, ``"file_processor"`` or
        ``"reasoning_agent"``).
    """
    question_lower = question.lower()
    # Extensions and host names are matched case-insensitively so that
    # "DATA.XLSX" or "YouTube.com" land in the same category as their
    # lowercase spellings.
    file_lower = file_name.lower() if file_name else ""

    if "wikipedia" in question_lower or "featured article" in question_lower:
        return {"type": "Wikipedia Research", "expected_agent": "web_researcher"}
    if "youtube.com" in question_lower or "youtu.be" in question_lower:
        return {"type": "YouTube Analysis", "expected_agent": "web_researcher"}

    # Ordered (extensions, category) pairs; every file type goes to the
    # file processor.
    extension_rules = (
        (('.xlsx', '.csv'), "Excel/CSV Processing"),
        (('.py',), "Python Code Analysis"),
        (('.mp3', '.wav'), "Audio Processing"),
        (('.png', '.jpg', '.jpeg'), "Image Analysis"),
    )
    for extensions, category in extension_rules:
        if file_lower.endswith(extensions):
            return {"type": category, "expected_agent": "file_processor"}

    # Keyword checks are plain substring tests on the lowercased question.
    if any(word in question_lower for word in ['calculate', 'total', 'average', 'sum']):
        return {"type": "Mathematical Reasoning", "expected_agent": "reasoning_agent"}
    if "reverse" in question_lower or "encode" in question_lower:
        return {"type": "Text Manipulation", "expected_agent": "reasoning_agent"}
    if any(word in question_lower for word in ['athletes', 'competition', 'olympics']):
        return {"type": "Sports/Statistics Research", "expected_agent": "web_researcher"}
    return {"type": "General Research", "expected_agent": "web_researcher"}
def test_real_gaia_questions():
    """Test system with real GAIA questions.

    Loads questions from ``../questions.json`` (relative to the current
    working directory), builds the router and the agents around one
    ``QwenClient``, and runs the first eight questions. For every question
    it prints the manual classification and the router's decision, then
    hands the routed state to the web researcher or the reasoning agent if
    the router selected one of them. A summary with success rate, cost,
    timing, routing accuracy and budget status is printed at the end.

    Returns:
        bool: ``True`` when at least 60% of the recorded results are marked
        successful. ``False`` otherwise, and also when no questions could be
        loaded or initializing the system raised an exception.
    """
    print("π§ͺ Real GAIA Questions Test")
    print("=" * 50)

    # Load questions
    questions = load_gaia_questions("../questions.json")
    if not questions:
        print("β No questions loaded. Exiting.")
        return False
    print(f"π Loaded {len(questions)} GAIA questions")

    # Initialize system
    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        # Constructed with the others but never invoked below; a failure in
        # its constructor still aborts the run through the handler here.
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"β Failed to initialize system: {e}")
        return False

    # Test subset of questions (to manage cost)
    test_questions = questions[:8]  # Test first 8 questions
    results = []
    total_cost = 0.0
    start_time = time.time()

    # Question type distribution tracking
    question_types = {}
    routing_accuracy = {"correct": 0, "total": 0}

    for i, q in enumerate(test_questions, 1):
        print(f"\nπ Question {i}/{len(test_questions)}")
        try:
            # Field access happens inside the try so that one malformed
            # question entry is recorded as a failure instead of aborting
            # the whole run.
            print(f" ID: {q['task_id']}")
            print(f" Level: {q['Level']}")
            print(f" File: {q['file_name'] or 'None'}")
            print(f" Question: {q['question'][:100]}...")

            # Manual classification for comparison
            manual_class = classify_question_manually(q['question'], q['file_name'])
            print(f" Expected Type: {manual_class['type']}")

            # Initialize state
            state = GAIAAgentState()
            state.task_id = q['task_id']
            state.question = q['question']
            state.difficulty_level = int(q['Level'])
            state.file_name = q['file_name'] or None
            if state.file_name:
                state.file_path = f"/tmp/{state.file_name}"  # Placeholder path

            # Route question
            routed_state = router.route_question(state)
            print(f" π§­ Router: {routed_state.question_type.value} -> {[a.value for a in routed_state.selected_agents]}")
            print(f" π Complexity: {routed_state.complexity_assessment}")
            print(f" π° Est. Cost: ${routed_state.estimated_cost:.4f}")

            # Track question types
            q_type = routed_state.question_type.value
            question_types[q_type] = question_types.get(q_type, 0) + 1

            # Check routing accuracy (simplified): the route counts as
            # correct when the manually expected agent is among those selected.
            expected_agent = manual_class["expected_agent"]
            actual_agents = [a.value for a in routed_state.selected_agents]
            if expected_agent in actual_agents:
                routing_accuracy["correct"] += 1
            routing_accuracy["total"] += 1

            # Only process if we have the required agent implemented
            processed = False
            if AgentRole.WEB_RESEARCHER in routed_state.selected_agents:
                try:
                    processed_state = web_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    print(f" β οΈ Web researcher failed: {e}")
            elif AgentRole.REASONING_AGENT in routed_state.selected_agents:
                try:
                    processed_state = reasoning_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    print(f" β οΈ Reasoning agent failed: {e}")
            elif AgentRole.FILE_PROCESSOR in routed_state.selected_agents and not state.file_name:
                print(f" β οΈ File processor selected but no file provided")

            if processed:
                # The last entry of agent_results is reported as this
                # question's outcome.
                agent_result = list(processed_state.agent_results.values())[-1]
                cost = processed_state.total_cost
                processing_time = processed_state.total_processing_time
                # NOTE(review): this literal was split across two lines right
                # after the status glyph; it is rejoined here with the glyph
                # restored as a check mark - confirm against the original file.
                print(f" ✅ Processed by: {agent_result.agent_role.value}")
                print(f" π Result: {agent_result.result[:150]}...")
                print(f" π Confidence: {agent_result.confidence:.2f}")
                print(f" π° Actual Cost: ${cost:.4f}")
                print(f" β±οΈ Time: {processing_time:.2f}s")
                total_cost += cost
                results.append({
                    "success": agent_result.success,
                    "confidence": agent_result.confidence,
                    "cost": cost,
                    "time": processing_time
                })
            else:
                print(f" π Routing only (no processing)")
                results.append({
                    "success": True,  # Routing succeeded
                    "confidence": 0.5,  # Neutral
                    "cost": 0.0,
                    "time": 0.0
                })
        except Exception as e:
            print(f" β Failed: {e}")
            results.append({
                "success": False,
                "confidence": 0.0,
                "cost": 0.0,
                "time": 0.0
            })

    # Summary
    total_time = time.time() - start_time
    successful_results = [r for r in results if r["success"]]
    print("\n" + "=" * 50)
    print("π REAL GAIA TEST RESULTS")
    print("=" * 50)

    # Basic stats
    print(f"π― Questions Processed: {len(results)}")
    # Rejoined split literal (see the note on the "Processed by" line).
    print(f"✅ Successful Processing: {len(successful_results)}/{len(results)} ({len(successful_results)/len(results)*100:.1f}%)")
    print(f"π° Total Cost: ${total_cost:.4f}")
    print(f"β±οΈ Total Time: {total_time:.2f} seconds")

    if successful_results:
        # Averages are taken over successful results only.
        avg_confidence = sum(r["confidence"] for r in successful_results) / len(successful_results)
        avg_cost = sum(r["cost"] for r in successful_results) / len(successful_results)
        avg_time = sum(r["time"] for r in successful_results) / len(successful_results)
        print(f"π Average Confidence: {avg_confidence:.2f}")
        print(f"π° Average Cost: ${avg_cost:.4f}")
        print(f"β‘ Average Time: {avg_time:.2f}s")

    # Question type distribution
    print(f"\nπ Question Type Distribution:")
    for q_type, count in question_types.items():
        print(f" {q_type}: {count}")

    # Routing accuracy
    routing_rate = routing_accuracy["correct"] / routing_accuracy["total"] * 100 if routing_accuracy["total"] > 0 else 0
    print(f"\nπ§­ Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

    # Budget analysis
    monthly_budget = 0.10
    if total_cost <= monthly_budget:
        remaining = monthly_budget - total_cost
        # Extrapolate from the average cost per question in this run; with
        # zero spend a fixed placeholder of 1000 is reported instead.
        estimated_questions = int(remaining / (total_cost / len(results))) if total_cost > 0 else 1000
        # Rejoined split literal (see the note on the "Processed by" line).
        print(f"π° Budget Status: ✅ ${remaining:.4f} remaining (~{estimated_questions} more questions)")
    else:
        print(f"π° Budget Status: β οΈ Over budget by ${total_cost - monthly_budget:.4f}")

    # Success assessment
    success_rate = len(successful_results) / len(results) * 100
    if success_rate >= 80:
        print(f"\nπ EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
        return True
    elif success_rate >= 60:
        # Rejoined split literal (see the note on the "Processed by" line).
        print(f"\n✅ GOOD! System shows promise ({success_rate:.1f}% success)")
        return True
    else:
        print(f"\nβ οΈ NEEDS WORK! Low success rate ({success_rate:.1f}%)")
        return False
if __name__ == "__main__":
    # The process exit status mirrors the test outcome: 0 when
    # test_real_gaia_questions() returns a truthy value, 1 otherwise.
    success = test_real_gaia_questions()
    sys.exit(0 if success else 1)