#!/usr/bin/env python3
"""
Real GAIA Questions Test for GAIA Agent System
Tests the system with actual GAIA benchmark questions
"""

# NOTE(review): the symbol prefixes in the printed messages below look
# mis-encoded (UTF-8 emoji decoded through a legacy code page). They are
# reproduced unchanged here; restore them from a correctly encoded copy.

import json
import os
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional

# Make this script's own directory importable so the imports below resolve.
sys.path.insert(0, str(Path(__file__).parent))

from agents.state import GAIAAgentState, QuestionType, AgentRole
from agents.router import RouterAgent
from agents.web_researcher import WebResearchAgent
from agents.file_processor_agent import FileProcessorAgent
from agents.reasoning_agent import ReasoningAgent
from models.qwen_client import QwenClient


def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
    """Load GAIA questions from a JSON file.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list of question records. An empty list is returned (and
        a message printed) when the file is missing, is not valid JSON, or
        its top-level value is not a JSON array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            questions = json.load(f)
    except FileNotFoundError:
        print(f"โŒ Questions file not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"โŒ Invalid JSON in questions file: {e}")
        return []

    # Callers slice and iterate the result, so anything but a list is unusable.
    if not isinstance(questions, list):
        print(
            "โŒ Invalid JSON in questions file: expected a list of questions, "
            f"got {type(questions).__name__}"
        )
        return []
    return questions


def classify_question_manually(question: str, file_name: Optional[str]) -> Dict:
    """Classify a question with keyword heuristics, for comparison with the router.

    The checks run in a fixed priority order and the first match wins:
    Wikipedia keywords, YouTube URLs, attached-file extension, arithmetic
    keywords, text-manipulation keywords, sports keywords, then a default.
    All matching is case-insensitive.

    Args:
        question: The question text.
        file_name: Name of the attached file; may be empty or None.

    Returns:
        A dict with a human-readable "type" and the "expected_agent" name.
    """
    question_lower = question.lower()
    # Lower-case once so upper-case extensions such as ".XLSX" still match.
    file_lower = (file_name or "").lower()

    if "wikipedia" in question_lower or "featured article" in question_lower:
        return {"type": "Wikipedia Research", "expected_agent": "web_researcher"}
    elif "youtube.com" in question_lower or "youtu.be" in question_lower:
        return {"type": "YouTube Analysis", "expected_agent": "web_researcher"}
    elif file_lower.endswith(('.xlsx', '.csv')):
        return {"type": "Excel/CSV Processing", "expected_agent": "file_processor"}
    elif file_lower.endswith('.py'):
        return {"type": "Python Code Analysis", "expected_agent": "file_processor"}
    elif file_lower.endswith(('.mp3', '.wav')):
        return {"type": "Audio Processing", "expected_agent": "file_processor"}
    elif file_lower.endswith(('.png', '.jpg', '.jpeg')):
        return {"type": "Image Analysis", "expected_agent": "file_processor"}
    # NOTE(review): these are substring matches, so e.g. "sum" also matches
    # "summary"; kept as in the original heuristic.
    elif any(word in question_lower for word in ['calculate', 'total', 'average', 'sum']):
        return {"type": "Mathematical Reasoning", "expected_agent": "reasoning_agent"}
    elif "reverse" in question_lower or "encode" in question_lower:
        return {"type": "Text Manipulation", "expected_agent": "reasoning_agent"}
    elif any(word in question_lower for word in ['athletes', 'competition', 'olympics']):
        return {"type": "Sports/Statistics Research", "expected_agent": "web_researcher"}
    else:
        return {"type": "General Research", "expected_agent": "web_researcher"}


def _run_selected_agent(routed_state, web_agent, reasoning_agent):
    """Run the web or reasoning agent if the router selected one.

    The web researcher takes precedence over the reasoning agent. Returns the
    processed state, or None when no runnable agent was selected or the agent
    raised (the error is printed, not propagated).
    """
    selected = routed_state.selected_agents
    if AgentRole.WEB_RESEARCHER in selected:
        agent, label = web_agent, "Web researcher"
    elif AgentRole.REASONING_AGENT in selected:
        agent, label = reasoning_agent, "Reasoning agent"
    else:
        if AgentRole.FILE_PROCESSOR in selected and not routed_state.file_name:
            print(" โš ๏ธ File processor selected but no file provided")
        return None

    try:
        return agent.process(routed_state)
    except Exception as e:
        # Best-effort: an agent failure downgrades the question to
        # "routing only" instead of failing it.
        print(f" โš ๏ธ {label} failed: {e}")
        return None


def _print_summary(results, total_cost, total_time, question_types,
                   routing_accuracy, budget):
    """Print the aggregate report and return True when success rate >= 60%."""
    successful_results = [r for r in results if r["success"]]
    success_rate = len(successful_results) / len(results) * 100

    print("\n" + "=" * 50)
    print("๐Ÿ“Š REAL GAIA TEST RESULTS")
    print("=" * 50)

    # Basic stats
    print(f"๐ŸŽฏ Questions Processed: {len(results)}")
    print(f"โœ… Successful Processing: {len(successful_results)}/{len(results)} ({success_rate:.1f}%)")
    print(f"๐Ÿ’ฐ Total Cost: ${total_cost:.4f}")
    print(f"โฑ๏ธ Total Time: {total_time:.2f} seconds")

    if successful_results:
        count = len(successful_results)
        avg_confidence = sum(r["confidence"] for r in successful_results) / count
        avg_cost = sum(r["cost"] for r in successful_results) / count
        avg_time = sum(r["time"] for r in successful_results) / count
        print(f"๐Ÿ“ˆ Average Confidence: {avg_confidence:.2f}")
        print(f"๐Ÿ’ฐ Average Cost: ${avg_cost:.4f}")
        print(f"โšก Average Time: {avg_time:.2f}s")

    # Question type distribution
    print("\n๐Ÿ“‹ Question Type Distribution:")
    for q_type, count in question_types.items():
        print(f" {q_type}: {count}")

    # Routing accuracy
    if routing_accuracy["total"] > 0:
        routing_rate = routing_accuracy["correct"] / routing_accuracy["total"] * 100
    else:
        routing_rate = 0
    print(f"\n๐Ÿงญ Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

    # Budget analysis
    if total_cost <= budget:
        remaining = budget - total_cost
        # With zero spend there is no per-question cost to extrapolate from,
        # so a fixed placeholder of 1000 is reported.
        if total_cost > 0:
            estimated_questions = int(remaining / (total_cost / len(results)))
        else:
            estimated_questions = 1000
        print(f"๐Ÿ’ฐ Budget Status: โœ… ${remaining:.4f} remaining (~{estimated_questions} more questions)")
    else:
        print(f"๐Ÿ’ฐ Budget Status: โš ๏ธ Over budget by ${total_cost - budget:.4f}")

    # Success assessment
    if success_rate >= 80:
        print(f"\n๐Ÿš€ EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
        return True
    elif success_rate >= 60:
        print(f"\nโœ… GOOD! System shows promise ({success_rate:.1f}% success)")
        return True
    else:
        print(f"\nโš ๏ธ NEEDS WORK! Low success rate ({success_rate:.1f}%)")
        return False


def test_real_gaia_questions(questions_path: str = "../questions.json",
                             max_questions: int = 8,
                             budget: float = 0.10) -> bool:
    """Test the system with real GAIA questions.

    Loads the questions, routes the first ``max_questions`` of them, runs the
    web or reasoning agent when the router selects one, and prints a report.

    Args:
        questions_path: JSON file holding the question records.
        max_questions: How many leading questions to run (limits cost).
        budget: Spending limit, in dollars, used for the budget report.

    Returns:
        True when at least 60% of the questions were recorded as successful;
        False otherwise, or when loading/initialization fails.
    """
    print("๐Ÿงช Real GAIA Questions Test")
    print("=" * 50)

    # Load questions
    questions = load_gaia_questions(questions_path)
    if not questions:
        print("โŒ No questions loaded. Exiting.")
        return False
    print(f"๐Ÿ“‹ Loaded {len(questions)} GAIA questions")

    # Initialize system
    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        # Constructed so an initialization error surfaces here; this script
        # never calls it.
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"โŒ Failed to initialize system: {e}")
        return False

    # Test subset of questions (to manage cost)
    test_questions = questions[:max_questions]
    if not test_questions:
        # Avoids dividing by zero in the summary when max_questions <= 0.
        print("โŒ No questions selected for testing. Exiting.")
        return False

    results = []
    total_cost = 0.0
    start_time = time.time()

    # Question type distribution tracking
    question_types = {}
    routing_accuracy = {"correct": 0, "total": 0}

    for i, q in enumerate(test_questions, 1):
        print(f"\n๐Ÿ” Question {i}/{len(test_questions)}")
        try:
            # Read the record inside the try so one malformed question is
            # recorded as a failure instead of aborting the whole run.
            task_id = q['task_id']
            level = q['Level']
            question = q['question']
            file_name = q.get('file_name') or None

            print(f" ID: {task_id}")
            print(f" Level: {level}")
            print(f" File: {file_name if file_name else 'None'}")
            print(f" Question: {question[:100]}...")

            # Manual classification for comparison
            manual_class = classify_question_manually(question, file_name)
            print(f" Expected Type: {manual_class['type']}")

            # Initialize state
            state = GAIAAgentState()
            state.task_id = task_id
            state.question = question
            state.difficulty_level = int(level)
            state.file_name = file_name
            if state.file_name:
                state.file_path = f"/tmp/{state.file_name}"  # Placeholder path

            # Route question
            routed_state = router.route_question(state)
            actual_agents = [a.value for a in routed_state.selected_agents]
            print(f" ๐Ÿงญ Router: {routed_state.question_type.value} -> {actual_agents}")
            print(f" ๐Ÿ“Š Complexity: {routed_state.complexity_assessment}")
            print(f" ๐Ÿ’ฐ Est. Cost: ${routed_state.estimated_cost:.4f}")

            # Track question types
            q_type = routed_state.question_type.value
            question_types[q_type] = question_types.get(q_type, 0) + 1

            # Check routing accuracy (simplified)
            if manual_class["expected_agent"] in actual_agents:
                routing_accuracy["correct"] += 1
            routing_accuracy["total"] += 1

            # Only process if we have the required agent implemented
            processed_state = _run_selected_agent(routed_state, web_agent, reasoning_agent)

            if processed_state is not None:
                agent_result = list(processed_state.agent_results.values())[-1]
                cost = processed_state.total_cost
                processing_time = processed_state.total_processing_time

                print(f" โœ… Processed by: {agent_result.agent_role.value}")
                print(f" ๐Ÿ“ Result: {agent_result.result[:150]}...")
                print(f" ๐Ÿ“Š Confidence: {agent_result.confidence:.2f}")
                print(f" ๐Ÿ’ฐ Actual Cost: ${cost:.4f}")
                print(f" โฑ๏ธ Time: {processing_time:.2f}s")

                total_cost += cost
                results.append({
                    "success": agent_result.success,
                    "confidence": agent_result.confidence,
                    "cost": cost,
                    "time": processing_time
                })
            else:
                # NOTE(review): this branch is also reached when an agent
                # raised, so such questions count as successes - confirm
                # that is the intended metric.
                print(" ๐Ÿ”„ Routing only (no processing)")
                results.append({
                    "success": True,     # Routing succeeded
                    "confidence": 0.5,   # Neutral
                    "cost": 0.0,
                    "time": 0.0
                })

        except Exception as e:
            print(f" โŒ Failed: {e}")
            results.append({
                "success": False,
                "confidence": 0.0,
                "cost": 0.0,
                "time": 0.0
            })

    # Summary
    total_time = time.time() - start_time
    return _print_summary(results, total_cost, total_time, question_types,
                          routing_accuracy, budget)


if __name__ == "__main__":
    success = test_real_gaia_questions()
    sys.exit(0 if success else 1)