File size: 10,607 Bytes
225a75e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/env python3
"""
Real GAIA Questions Test for GAIA Agent System
Tests the system with actual GAIA benchmark questions
"""

import json
import os
import sys
import time
from pathlib import Path
from typing import Dict, List

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from agents.state import GAIAAgentState, QuestionType, AgentRole
from agents.router import RouterAgent
from agents.web_researcher import WebResearchAgent
from agents.file_processor_agent import FileProcessorAgent
from agents.reasoning_agent import ReasoningAgent
from models.qwen_client import QwenClient

def load_gaia_questions(file_path: str = "questions.json") -> List[Dict]:
    """Load GAIA questions from a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            question objects. A relative path is resolved against the
            current working directory.

    Returns:
        The parsed list of questions, or an empty list when the file is
        missing, cannot be read, is not valid JSON, or does not hold a
        top-level list. A diagnostic is printed in every failure case.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            questions = json.load(f)
    except FileNotFoundError:
        print(f"❌ Questions file not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"❌ Invalid JSON in questions file: {e}")
        return []
    except (OSError, UnicodeDecodeError) as e:
        # Directories, permission problems and non-UTF-8 content follow the
        # same "report and return nothing" convention as the cases above.
        print(f"❌ Could not read questions file: {e}")
        return []

    # Callers slice and iterate the result, so anything but a list (e.g. a
    # top-level JSON object) is rejected here rather than failing later.
    if not isinstance(questions, list):
        print(f"❌ Expected a JSON list of questions, got {type(questions).__name__}")
        return []
    return questions

def classify_question_manually(question: str, file_name: str) -> Dict:
    """Classify a GAIA question with fixed heuristics, for comparison with the router.

    Rules are checked in order and the first match wins: Wikipedia keywords,
    YouTube links, attached-file extension, arithmetic keywords, text
    manipulation keywords, sports keywords, then a general-research fallback.
    All matching is case-insensitive.

    Args:
        question: The question text.
        file_name: Name of the attached file; may be empty or None.

    Returns:
        A dict with keys ``"type"`` (human-readable category) and
        ``"expected_agent"`` (``"web_researcher"``, ``"file_processor"`` or
        ``"reasoning_agent"``).
    """
    question_lower = question.lower()
    # Lowercase once so that "DATA.XLSX" and "data.xlsx" are treated alike;
    # an absent file name becomes "" and matches no extension.
    file_lower = file_name.lower() if file_name else ""

    def _label(kind: str, agent: str) -> Dict:
        return {"type": kind, "expected_agent": agent}

    # Manual classification based on question content
    if "wikipedia" in question_lower or "featured article" in question_lower:
        return _label("Wikipedia Research", "web_researcher")
    if "youtube.com" in question_lower or "youtu.be" in question_lower:
        return _label("YouTube Analysis", "web_researcher")
    if file_lower.endswith(('.xlsx', '.csv')):
        return _label("Excel/CSV Processing", "file_processor")
    if file_lower.endswith('.py'):
        return _label("Python Code Analysis", "file_processor")
    if file_lower.endswith(('.mp3', '.wav')):
        return _label("Audio Processing", "file_processor")
    if file_lower.endswith(('.png', '.jpg', '.jpeg')):
        return _label("Image Analysis", "file_processor")
    # Keyword checks are substring matches on the lowercased question.
    if any(word in question_lower for word in ['calculate', 'total', 'average', 'sum']):
        return _label("Mathematical Reasoning", "reasoning_agent")
    if "reverse" in question_lower or "encode" in question_lower:
        return _label("Text Manipulation", "reasoning_agent")
    if any(word in question_lower for word in ['athletes', 'competition', 'olympics']):
        return _label("Sports/Statistics Research", "web_researcher")
    return _label("General Research", "web_researcher")

def _print_gaia_summary(results, total_cost, total_time, question_types, routing_accuracy):
    """Print aggregate statistics for a run and return the pass/fail verdict.

    Args:
        results: One dict per question with keys ``"success"``,
            ``"confidence"``, ``"cost"`` and ``"time"``.
        total_cost: Sum of the costs reported by the agents that ran.
        total_time: Wall-clock duration of the whole run, in seconds.
        question_types: Mapping of router question type to occurrence count.
        routing_accuracy: Dict with ``"correct"`` and ``"total"`` counters.

    Returns:
        True when at least 60% of ``results`` are marked successful.
    """
    successful_results = [r for r in results if r["success"]]
    # Guarded so an empty run reports 0% instead of dividing by zero.
    success_rate = len(successful_results) / len(results) * 100 if results else 0.0

    print("\n" + "=" * 50)
    print("📊 REAL GAIA TEST RESULTS")
    print("=" * 50)

    # Basic stats
    print(f"🎯 Questions Processed: {len(results)}")
    print(f"✅ Successful Processing: {len(successful_results)}/{len(results)} ({success_rate:.1f}%)")
    print(f"💰 Total Cost: ${total_cost:.4f}")
    print(f"⏱️  Total Time: {total_time:.2f} seconds")

    if successful_results:
        count = len(successful_results)
        avg_confidence = sum(r["confidence"] for r in successful_results) / count
        avg_cost = sum(r["cost"] for r in successful_results) / count
        avg_time = sum(r["time"] for r in successful_results) / count

        print(f"📈 Average Confidence: {avg_confidence:.2f}")
        print(f"💰 Average Cost: ${avg_cost:.4f}")
        print(f"⚡ Average Time: {avg_time:.2f}s")

    # Question type distribution
    print(f"\n📋 Question Type Distribution:")
    for q_type, count in question_types.items():
        print(f"   {q_type}: {count}")

    # Routing accuracy
    routing_rate = routing_accuracy["correct"] / routing_accuracy["total"] * 100 if routing_accuracy["total"] > 0 else 0
    print(f"\n🧭 Routing Accuracy: {routing_accuracy['correct']}/{routing_accuracy['total']} ({routing_rate:.1f}%)")

    # Budget analysis
    monthly_budget = 0.10
    if total_cost <= monthly_budget:
        remaining = monthly_budget - total_cost
        # A positive total cost implies at least one result, so the
        # per-question average below cannot divide by zero.
        estimated_questions = int(remaining / (total_cost / len(results))) if total_cost > 0 else 1000
        print(f"💰 Budget Status: ✅ ${remaining:.4f} remaining (~{estimated_questions} more questions)")
    else:
        print(f"💰 Budget Status: ⚠️  Over budget by ${total_cost - monthly_budget:.4f}")

    # Success assessment
    if success_rate >= 80:
        print(f"\n🚀 EXCELLENT! System handles real GAIA questions well ({success_rate:.1f}% success)")
        return True
    if success_rate >= 60:
        print(f"\n✅ GOOD! System shows promise ({success_rate:.1f}% success)")
        return True
    print(f"\n⚠️  NEEDS WORK! Low success rate ({success_rate:.1f}%)")
    return False


def test_real_gaia_questions():
    """Run a small batch of real GAIA questions through the router and agents.

    Loads ``../questions.json`` (resolved against the current working
    directory), routes the first eight questions, runs the web-research or
    reasoning agent when the router selected one, and prints per-question
    details followed by an aggregate summary.

    A question counts as failed when routing raises, when its metadata is
    malformed, or when the agent that was run raises. Questions for which no
    agent is run are recorded as successful "routing only" entries.

    Returns:
        True when at least 60% of the questions are recorded as successful;
        False otherwise, or when no questions could be loaded or the agents
        could not be initialised.
    """
    print("🧪 Real GAIA Questions Test")
    print("=" * 50)

    # Load questions
    questions = load_gaia_questions("../questions.json")
    # The batch is sliced below, so a non-list payload is treated as "nothing loaded".
    if not isinstance(questions, list) or not questions:
        print("❌ No questions loaded. Exiting.")
        return False

    print(f"📋 Loaded {len(questions)} GAIA questions")

    # Initialize system
    try:
        llm_client = QwenClient()
        router = RouterAgent(llm_client)
        web_agent = WebResearchAgent(llm_client)
        # Constructed so initialisation problems surface here; the file
        # processor is not invoked anywhere below.
        file_agent = FileProcessorAgent(llm_client)
        reasoning_agent = ReasoningAgent(llm_client)
    except Exception as e:
        print(f"❌ Failed to initialize system: {e}")
        return False

    # Test subset of questions (to manage cost)
    test_questions = questions[:8]  # Test first 8 questions

    results = []
    total_cost = 0.0
    start_time = time.time()

    # Question type distribution tracking
    question_types = {}
    routing_accuracy = {"correct": 0, "total": 0}

    for i, q in enumerate(test_questions, 1):
        print(f"\n🔍 Question {i}/{len(test_questions)}")

        # Everything that touches the question lives inside the try so that a
        # single malformed entry is recorded as a failure instead of aborting
        # the run. Exactly one entry is appended to `results` per question.
        try:
            file_name = q['file_name'] or None
            print(f"   ID: {q['task_id']}")
            print(f"   Level: {q['Level']}")
            print(f"   File: {file_name or 'None'}")
            print(f"   Question: {q['question'][:100]}...")

            # Manual classification for comparison
            manual_class = classify_question_manually(q['question'], file_name)
            print(f"   Expected Type: {manual_class['type']}")

            # Initialize state
            state = GAIAAgentState()
            state.task_id = q['task_id']
            state.question = q['question']
            state.difficulty_level = int(q['Level'])
            state.file_name = file_name
            if state.file_name:
                state.file_path = f"/tmp/{state.file_name}"  # Placeholder path

            # Route question
            routed_state = router.route_question(state)
            selected_agents = routed_state.selected_agents
            actual_agents = [a.value for a in selected_agents]
            print(f"   🧭 Router: {routed_state.question_type.value} -> {actual_agents}")
            print(f"   📊 Complexity: {routed_state.complexity_assessment}")
            print(f"   💰 Est. Cost: ${routed_state.estimated_cost:.4f}")

            # Track question types
            q_type = routed_state.question_type.value
            question_types[q_type] = question_types.get(q_type, 0) + 1

            # Check routing accuracy (simplified): the manually expected
            # agent only has to be among the selected ones.
            if manual_class["expected_agent"] in actual_agents:
                routing_accuracy["correct"] += 1
            routing_accuracy["total"] += 1

            # Run at most one agent; the web researcher takes precedence.
            processed = False
            agent_failed = False
            if AgentRole.WEB_RESEARCHER in selected_agents:
                try:
                    processed_state = web_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    print(f"   ⚠️  Web researcher failed: {e}")
                    agent_failed = True

            elif AgentRole.REASONING_AGENT in selected_agents:
                try:
                    processed_state = reasoning_agent.process(routed_state)
                    processed = True
                except Exception as e:
                    print(f"   ⚠️  Reasoning agent failed: {e}")
                    agent_failed = True

            elif AgentRole.FILE_PROCESSOR in selected_agents and not state.file_name:
                print(f"   ⚠️  File processor selected but no file provided")

            if processed:
                # The most recently stored agent result is reported.
                agent_result = list(processed_state.agent_results.values())[-1]
                cost = processed_state.total_cost
                processing_time = processed_state.total_processing_time

                print(f"   ✅ Processed by: {agent_result.agent_role.value}")
                print(f"   📝 Result: {agent_result.result[:150]}...")
                print(f"   📊 Confidence: {agent_result.confidence:.2f}")
                print(f"   💰 Actual Cost: ${cost:.4f}")
                print(f"   ⏱️  Time: {processing_time:.2f}s")

                total_cost += cost
                results.append({
                    "success": agent_result.success,
                    "confidence": agent_result.confidence,
                    "cost": cost,
                    "time": processing_time
                })
            elif agent_failed:
                # An agent that raised must not be counted as a successful
                # "routing only" run; that would inflate the success rate.
                results.append({
                    "success": False,
                    "confidence": 0.0,
                    "cost": 0.0,
                    "time": 0.0
                })
            else:
                print(f"   🔄 Routing only (no processing)")
                results.append({
                    "success": True,  # Routing succeeded
                    "confidence": 0.5,  # Neutral
                    "cost": 0.0,
                    "time": 0.0
                })

        except Exception as e:
            print(f"   ❌ Failed: {e}")
            results.append({
                "success": False,
                "confidence": 0.0,
                "cost": 0.0,
                "time": 0.0
            })

    # Summary
    total_time = time.time() - start_time
    return _print_gaia_summary(results, total_cost, total_time, question_types, routing_accuracy)

if __name__ == "__main__":
    # Exit status mirrors the test verdict: 0 when the run passed, 1 otherwise.
    passed = test_real_gaia_questions()
    exit_code = 1
    if passed:
        exit_code = 0
    sys.exit(exit_code)