Agent_Course_Final_Assignment

Sleeping

Agent_Course_Final_Assignment / src /test_real_gaia.py

Chris

Final 4

225a75e 11 months ago

10.6 kB

	#!/usr/bin/env python3
	"""
	Real GAIA Questions Test for GAIA Agent System
	Tests the system with actual GAIA benchmark questions
	"""

	import json
	import os
	import sys
	import time
	from pathlib import Path
	from typing import Dict, List

	# Add src to path for imports
	sys.path.insert(0, str(Path(__file__).parent))

	from agents.state import GAIAAgentState, QuestionType, AgentRole
	from agents.router import RouterAgent
	from agents.web_researcher import WebResearchAgent
	from agents.file_processor_agent import FileProcessorAgent
	from agents.reasoning_agent import ReasoningAgent
	from models.qwen_client import QwenClient

	def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
	"""Load GAIA questions from JSON file"""
	try:
	with open(file_path, 'r', encoding='utf-8') as f:
	questions = json.load(f)
	return questions
	except FileNotFoundError:
	print(f"❌ Questions file not found: {file_path}")
	return []
	except json.JSONDecodeError as e:
	print(f"❌ Invalid JSON in questions file: {e}")
	return []

	def classify_question_manually(question: str, file_name: str) -> Dict:
	"""Manually classify GAIA questions to compare with router"""

	question_lower = question.lower()

	# Manual classification based on question content
	if "wikipedia" in question_lower or "featured article" in question_lower:
	return {"type": "Wikipedia Research", "expected_agent": "web_researcher"}
	elif "youtube.com" in question or "youtu.be" in question:
	return {"type": "YouTube Analysis", "expected_agent": "web_researcher"}
	elif file_name and file_name.endswith(('.xlsx', '.csv')):
	return {"type": "Excel/CSV Processing", "expected_agent": "file_processor"}
	elif file_name and file_name.endswith('.py'):
	return {"type": "Python Code Analysis", "expected_agent": "file_processor"}
	elif file_name and file_name.endswith(('.mp3', '.wav')):
	return {"type": "Audio Processing", "expected_agent": "file_processor"}
	elif file_name and file_name.endswith(('.png', '.jpg', '.jpeg')):
	return {"type": "Image Analysis", "expected_agent": "file_processor"}
	elif any(word in question_lower for word in ['calculate', 'total', 'average', 'sum']):
	return {"type": "Mathematical Reasoning", "expected_agent": "reasoning_agent"}
	elif "reverse" in question_lower or "encode" in question_lower:
	return {"type": "Text Manipulation", "expected_agent": "reasoning_agent"}
	elif any(word in question_lower for word in ['athletes', 'competition', 'olympics']):
	return {"type": "Sports/Statistics Research", "expected_agent": "web_researcher"}
	else:
	return {"type": "General Research", "expected_agent": "web_researcher"}

	def test_real_gaia_questions():
	"""Test system with real GAIA questions"""

	print("🧪 Real GAIA Questions Test")
	print("=" * 50)

	# Load questions
	questions = load_gaia_questions("../questions.json")
	if not questions:
	print("❌ No questions loaded. Exiting.")
	return False

	print(f"📋 Loaded {len(questions)} GAIA questions")

	# Initialize system
	try:
	llm_client = QwenClient()
	router = RouterAgent(llm_client)
	web_agent = WebResearchAgent(llm_client)
	file_agent = FileProcessorAgent(llm_client)
	reasoning_agent = ReasoningAgent(llm_client)
	except Exception as e:
	print(f"❌ Failed to initialize system: {e}")
	return False

	# Test subset of questions (to manage cost)
	test_questions = questions[:8] # Test first 8 questions

	results = []
	total_cost = 0.0
	start_time = time.time()

	# Question type distribution tracking
	question_types = {}
	routing_accuracy = {"correct": 0, "total": 0}

	for i, q in enumerate(test_questions, 1):
	print(f"\n🔍 Question {i}/{len(test_questions)}")
	print(f" ID: {q['task_id']}")
	print(f" Level: {q['Level']}")
	print(f" File: {q['file_name'] if q['file_name'] else 'None'}")
	print(f" Question: {q['question'][:100]}...")

	# Manual classification for comparison
	manual_class = classify_question_manually(q['question'], q['file_name'])
	print(f" Expected Type: {manual_class['type']}")

	try:
	# Initialize state
	state = GAIAAgentState()
	state.task_id = q['task_id']
	state.question = q['question']
	state.difficulty_level = int(q['Level'])
	state.file_name = q['file_name'] if q['file_name'] else None
	if state.file_name:
	state.file_path = f"/tmp/{state.file_name}" # Placeholder path

	# Route question
	routed_state = router.route_question(state)
	print(f" 🧭 Router: {routed_state.question_type.value} -> {[a.value for a in routed_state.selected_agents]}")
	print(f" 📊 Complexity: {routed_state.complexity_assessment}")
	print(f" 💰 Est. Cost: ${routed_state.estimated_cost:.4f}")

	# Track question types
	q_type = routed_state.question_type.value
	question_types[q_type] = question_types.get(q_type, 0) + 1

	# Check routing accuracy (simplified)
	expected_agent = manual_class["expected_agent"]
	actual_agents = [a.value for a in routed_state.selected_agents]
	if expected_agent in actual_agents:
	routing_accuracy["correct"] += 1
	routing_accuracy["total"] += 1

	# Only process if we have the required agent implemented
	processed = False
	if AgentRole.WEB_RESEARCHER in routed_state.selected_agents:
	try:
	processed_state = web_agent.process(routed_state)
	processed = True
	except Exception as e:
	print(f" ⚠️ Web researcher failed: {e}")

	elif AgentRole.REASONING_AGENT in routed_state.selected_agents:
	try:
	processed_state = reasoning_agent.process(routed_state)
	processed = True
	except Exception as e:
	print(f" ⚠️ Reasoning agent failed: {e}")

	elif AgentRole.FILE_PROCESSOR in routed_state.selected_agents and not state.file_name:
	print(f" ⚠️ File processor selected but no file provided")

	if processed:
	agent_result = list(processed_state.agent_results.values())[-1]
	cost = processed_state.total_cost
	processing_time = processed_state.total_processing_time

	print(f" ✅ Processed by: {agent_result.agent_role.value}")
	print(f" 📝 Result: {agent_result.result[:150]}...")
	print(f" 📊 Confidence: {agent_result.confidence:.2f}")
	print(f" 💰 Actual Cost: ${cost:.4f}")
	print(f" ⏱️ Time: {processing_time:.2f}s")

	total_cost += cost
	results.append({
	"success": agent_result.success,
	"confidence": agent_result.confidence,
	"cost": cost,
	"time": processing_time
	})
	else:
	print(f" 🔄 Routing only (no processing)")
	results.append({
	"success": True, # Routing succeeded
	"confidence": 0.5, # Neutral
	"cost": 0.0,
	"time": 0.0
	})

	except Exception as e:
	print(f" ❌ Failed: {e}")
	results.append({
	"success": False,
	"confidence": 0.0,
	"cost": 0.0,
	"time": 0.0
	})

	# Summary
	total_time = time.time() - start_time
	successful_results = [r for r in results if r["success"]]

	print("\n" + "=" * 50)
	print("📊 REAL GAIA TEST RESULTS")
	print("=" * 50)

	# Basic stats
	print(f"🎯 Questions Processed: {len(results)}")
	print(f"✅ Successful Processing: {len(successful_results)}/{len(results)} ({len(successful_results)/len(results)*100:.1f}%)")
	print(f"💰 Total Cost: ${total_cost:.4f}")
	print(f"⏱️ Total Time: {total_time:.2f} seconds")

	if successful_results:
	avg_confidence = sum(r["confidence"] for r in successful_results) / len(successful_results)
	avg_cost = sum(r["cost"] for r in successful_results) / len(successful_results)
	avg_time = sum(r["time"] for r in successful_results) / len(successful_results)

	print(f"📈 Average Confidence: {avg_confidence:.2f}")
	print(f"💰 Average Cost: ${avg_cost:.4f}")
	print(f"⚡ Average Time: {avg_time:.2f}s")

	# Question type distribution
	print(f"\n📋 Question Type Distribution:")
	for q_type, count in question_types.items():
	print(f" {q_type}: {count}")

	# Routing accuracy
	routing_rate = routing_accuracy["correct"] / routing_accuracy["total"] * 100 if routing_accuracy["total"] > 0 else 0
	print(f"\n🧭 Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

	# Budget analysis
	monthly_budget = 0.10
	if total_cost <= monthly_budget:
	remaining = monthly_budget - total_cost
	estimated_questions = int(remaining / (total_cost / len(results))) if total_cost > 0 else 1000
	print(f"💰 Budget Status: ✅ ${remaining:.4f} remaining (~{estimated_questions} more questions)")
	else:
	print(f"💰 Budget Status: ⚠️ Over budget by ${total_cost - monthly_budget:.4f}")

	# Success assessment
	success_rate = len(successful_results) / len(results) * 100
	if success_rate >= 80:
	print(f"\n🚀 EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
	return True
	elif success_rate >= 60:
	print(f"\n✅ GOOD! System shows promise ({success_rate:.1f}% success)")
	return True
	else:
	print(f"\n⚠️ NEEDS WORK! Low success rate ({success_rate:.1f}%)")
	return False

	if __name__ == "__main__":
	success = test_real_gaia_questions()
	sys.exit(0 if success else 1)