| |
| """ |
| Real GAIA Questions Test for GAIA Agent System |
| Tests the system with actual GAIA benchmark questions |
| """ |
|
|
| import json |
| import os |
| import sys |
| import time |
| from pathlib import Path |
| from typing import Dict, List |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent)) |
|
|
| from agents.state import GAIAAgentState, QuestionType, AgentRole |
| from agents.router import RouterAgent |
| from agents.web_researcher import WebResearchAgent |
| from agents.file_processor_agent import FileProcessorAgent |
| from agents.reasoning_agent import ReasoningAgent |
| from models.qwen_client import QwenClient |
|
|
def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
    """Read the GAIA question set stored as JSON at ``file_path``.

    Args:
        file_path: Location of the JSON file, opened as UTF-8 text.

    Returns:
        The decoded JSON content. When the file does not exist or does not
        contain valid JSON, the problem is printed and an empty list is
        returned instead of raising.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"β Questions file not found: {file_path}")
    except json.JSONDecodeError as err:
        print(f"β Invalid JSON in questions file: {err}")
    # Both error paths fall through to the same "nothing loaded" result.
    return []
|
|
def classify_question_manually(question: str, file_name: str) -> Dict:
    """Classify a GAIA question with fixed heuristics, as a baseline for the router.

    Rules are tried in order and the first match wins: Wikipedia mentions,
    YouTube links, the attachment's extension, then keyword groups. Anything
    left unmatched is treated as general web research. All matching is
    case-insensitive.

    Args:
        question: The question text.
        file_name: Name of the attached file; ``None`` or ``""`` when the
            question has no attachment.

    Returns:
        A dict with the keys ``"type"`` (human-readable category) and
        ``"expected_agent"`` (``"web_researcher"``, ``"file_processor"`` or
        ``"reasoning_agent"``).
    """
    question_lower = question.lower()
    # Extensions are compared in lower case so "DATA.XLSX" or "photo.JPG"
    # are recognised; a missing file name becomes "" and matches nothing.
    file_lower = (file_name or "").lower()

    def verdict(kind: str, agent: str) -> Dict:
        return {"type": kind, "expected_agent": agent}

    if "wikipedia" in question_lower or "featured article" in question_lower:
        return verdict("Wikipedia Research", "web_researcher")
    # Host names are case-insensitive, so look in the lowered text as well.
    if "youtube.com" in question_lower or "youtu.be" in question_lower:
        return verdict("YouTube Analysis", "web_researcher")

    # Attachment-based rules, in priority order.
    extension_rules = (
        (('.xlsx', '.csv'), "Excel/CSV Processing"),
        (('.py',), "Python Code Analysis"),
        (('.mp3', '.wav'), "Audio Processing"),
        (('.png', '.jpg', '.jpeg'), "Image Analysis"),
    )
    for suffixes, kind in extension_rules:
        if file_lower.endswith(suffixes):
            return verdict(kind, "file_processor")

    # Keyword rules are plain substring checks, so e.g. 'sum' also matches
    # "summary"; that is how the baseline has always behaved.
    if any(word in question_lower for word in ('calculate', 'total', 'average', 'sum')):
        return verdict("Mathematical Reasoning", "reasoning_agent")
    if "reverse" in question_lower or "encode" in question_lower:
        return verdict("Text Manipulation", "reasoning_agent")
    if any(word in question_lower for word in ('athletes', 'competition', 'olympics')):
        return verdict("Sports/Statistics Research", "web_researcher")
    return verdict("General Research", "web_researcher")
|
|
def test_real_gaia_questions() -> bool:
    """Run the first GAIA questions through the router and, where possible, an agent.

    For each of the first eight loaded questions the function builds a
    ``GAIAAgentState``, asks the router to classify it, compares the selected
    agents with ``classify_question_manually`` and then hands the state to the
    web researcher or the reasoning agent when one of them was selected.
    Progress and a final summary (success rate, cost, time, question-type
    distribution, routing accuracy, budget) are printed to stdout.

    A question counts as unsuccessful when routing raises, when the chosen
    agent raises, or when the agent reports ``success`` as false. Questions
    that are only routed (no agent run) count as successful with a nominal
    confidence of 0.5.

    Returns:
        True when at least 60% of the attempted questions were successful,
        False otherwise, including when no questions could be loaded or the
        system could not be initialised.
    """
    print("π§ͺ Real GAIA Questions Test")
    print("=" * 50)

    # NOTE: this path is resolved against the current working directory.
    questions = load_gaia_questions("../questions.json")
    if not questions:
        print("β No questions loaded. Exiting.")
        return False

    print(f"π Loaded {len(questions)} GAIA questions")

    # Build the whole system up front so configuration problems surface early.
    # file_agent is constructed for that reason only; it is not invoked below.
    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"β Failed to initialize system: {e}")
        return False

    # Only a small prefix is exercised to keep the run cheap.
    test_questions = questions[:8]

    failure_record = {"success": False, "confidence": 0.0, "cost": 0.0, "time": 0.0}
    results = []
    total_cost = 0.0
    start_time = time.time()

    question_types = {}
    routing_accuracy = {"correct": 0, "total": 0}

    for i, q in enumerate(test_questions, 1):
        # Tolerant reads for display and manual classification: an incomplete
        # record must not abort the whole run. The strict reads inside the
        # try block below turn a missing key into a failure of that question.
        file_name = q.get('file_name') or None
        question_text = q.get('question') or ""

        print(f"\nπ Question {i}/{len(test_questions)}")
        print(f" ID: {q.get('task_id')}")
        print(f" Level: {q.get('Level')}")
        print(f" File: {file_name or 'None'}")
        print(f" Question: {question_text[:100]}...")

        manual_class = classify_question_manually(question_text, file_name)
        print(f" Expected Type: {manual_class['type']}")

        try:
            state = GAIAAgentState()
            state.task_id = q['task_id']
            state.question = q['question']
            state.difficulty_level = int(q['Level'])
            state.file_name = file_name
            if state.file_name:
                state.file_path = f"/tmp/{state.file_name}"

            routed_state = router.route_question(state)
            actual_agents = [a.value for a in routed_state.selected_agents]
            print(f" π§ Router: {routed_state.question_type.value} -> {actual_agents}")
            print(f" π Complexity: {routed_state.complexity_assessment}")
            print(f" π° Est. Cost: ${routed_state.estimated_cost:.4f}")

            q_type = routed_state.question_type.value
            question_types[q_type] = question_types.get(q_type, 0) + 1

            # Routing is "correct" when the manually expected agent is among
            # the agents the router selected.
            if manual_class["expected_agent"] in actual_agents:
                routing_accuracy["correct"] += 1
            routing_accuracy["total"] += 1

            # Run at most one agent; the web researcher takes precedence.
            processed = False
            agent_failed = False
            if AgentRole.WEB_RESEARCHER in routed_state.selected_agents:
                try:
                    processed_state = web_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    agent_failed = True
                    print(f" β οΈ Web researcher failed: {e}")
            elif AgentRole.REASONING_AGENT in routed_state.selected_agents:
                try:
                    processed_state = reasoning_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    agent_failed = True
                    print(f" β οΈ Reasoning agent failed: {e}")
            elif AgentRole.FILE_PROCESSOR in routed_state.selected_agents and not state.file_name:
                print(" β οΈ File processor selected but no file provided")

            if processed:
                # The most recently added entry is the agent that just ran.
                agent_result = list(processed_state.agent_results.values())[-1]
                cost = processed_state.total_cost
                processing_time = processed_state.total_processing_time

                print(f" β Processed by: {agent_result.agent_role.value}")
                print(f" π Result: {agent_result.result[:150]}...")
                print(f" π Confidence: {agent_result.confidence:.2f}")
                print(f" π° Actual Cost: ${cost:.4f}")
                print(f" β±οΈ Time: {processing_time:.2f}s")

                total_cost += cost
                results.append({
                    "success": agent_result.success,
                    "confidence": agent_result.confidence,
                    "cost": cost,
                    "time": processing_time,
                })
            elif agent_failed:
                # An agent that raised must not be reported as a success
                # (the warning has already been printed above).
                results.append(dict(failure_record))
            else:
                print(" π Routing only (no processing)")
                results.append({
                    "success": True,
                    "confidence": 0.5,
                    "cost": 0.0,
                    "time": 0.0,
                })

        except Exception as e:
            print(f" β Failed: {e}")
            results.append(dict(failure_record))

    total_time = time.time() - start_time
    successful_results = [r for r in results if r["success"]]
    success_rate = len(successful_results) / len(results) * 100

    print("\n" + "=" * 50)
    print("π REAL GAIA TEST RESULTS")
    print("=" * 50)

    print(f"π― Questions Processed: {len(results)}")
    print(f"β Successful Processing: {len(successful_results)}/{len(results)} ({success_rate:.1f}%)")
    print(f"π° Total Cost: ${total_cost:.4f}")
    print(f"β±οΈ Total Time: {total_time:.2f} seconds")

    if successful_results:
        n_ok = len(successful_results)
        avg_confidence = sum(r["confidence"] for r in successful_results) / n_ok
        avg_cost = sum(r["cost"] for r in successful_results) / n_ok
        avg_time = sum(r["time"] for r in successful_results) / n_ok

        print(f"π Average Confidence: {avg_confidence:.2f}")
        print(f"π° Average Cost: ${avg_cost:.4f}")
        print(f"β‘ Average Time: {avg_time:.2f}s")

    print("\nπ Question Type Distribution:")
    for q_type, count in question_types.items():
        print(f" {q_type}: {count}")

    routing_rate = (
        routing_accuracy["correct"] / routing_accuracy["total"] * 100
        if routing_accuracy["total"] > 0
        else 0
    )
    print(f"\nπ§ Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

    monthly_budget = 0.10
    if total_cost <= monthly_budget:
        remaining = monthly_budget - total_cost
        # Extrapolate from the average cost per attempted question; with no
        # spend at all there is nothing to extrapolate from, so report 1000.
        estimated_questions = int(remaining / (total_cost / len(results))) if total_cost > 0 else 1000
        print(f"π° Budget Status: β ${remaining:.4f} remaining (~{estimated_questions} more questions)")
    else:
        print(f"π° Budget Status: β οΈ Over budget by ${total_cost - monthly_budget:.4f}")

    if success_rate >= 80:
        print(f"\nπ EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
        return True
    if success_rate >= 60:
        print(f"\nβ GOOD! System shows promise ({success_rate:.1f}% success)")
        return True
    print(f"\nβ οΈ NEEDS WORK! Low success rate ({success_rate:.1f}%)")
    return False
|
|
if __name__ == "__main__":
    # Report the outcome to the shell: exit status 0 on success, 1 otherwise.
    exit_status = 0 if test_real_gaia_questions() else 1
    sys.exit(exit_status)