AgileAndy Claude committed on
Commit
26b7da4
·
1 Parent(s): 43ccb47

Clean deployment: Remove all non-critical development files

Browse files

- Removed all development agents (advanced_agent.py, consensus_gaia_agent.py, etc.)
- Removed test files, reports, and development artifacts
- Removed tools directory and cache files
- Kept only critical files: app.py, speed_optimized_gaia_agent.py, requirements.txt, README.md
- Clean production-ready deployment

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
Gradio_UI.py DELETED
@@ -1,296 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding=utf-8
3
- # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- import mimetypes
17
- import os
18
- import re
19
- import shutil
20
- from typing import Optional
21
-
22
- from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
23
- from smolagents.agents import ActionStep, MultiStepAgent
24
- from smolagents.memory import MemoryStep
25
- from smolagents.utils import _is_package_available
26
-
27
-
28
def pull_messages_from_step(
    step_log: MemoryStep,
):
    """Yield gradio ChatMessage objects describing a single agent step.

    Only ``ActionStep`` instances produce output; any other step type yields
    nothing. For an action step the messages are, in order: the step header,
    the cleaned model output, the tool call (with execution logs and errors
    nested under it via ``parent_id``), a footnote with token counts and
    duration when they are available, and a separator line.

    Args:
        step_log: The memory step emitted by the agent run.

    Yields:
        ``gr.ChatMessage`` objects with role ``"assistant"``.
    """
    import gradio as gr

    if not isinstance(step_log, ActionStep):
        return

    # Step header
    step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else ""
    yield gr.ChatMessage(role="assistant", content=f"**{step_number}**")

    # Thought / reasoning produced by the LLM
    model_output = getattr(step_log, "model_output", None)
    if model_output is not None:
        model_output = model_output.strip()
        # Drop the <end_code> marker in every position it can take around a fence
        model_output = re.sub(r"```\s*<end_code>", "```", model_output)  # handles ```<end_code>
        model_output = re.sub(r"<end_code>\s*```", "```", model_output)  # handles <end_code>```
        model_output = re.sub(r"```\s*\n\s*<end_code>", "```", model_output)  # handles ```\n<end_code>
        model_output = model_output.strip()
        yield gr.ChatMessage(role="assistant", content=model_output)

    tool_calls = getattr(step_log, "tool_calls", None)
    error = getattr(step_log, "error", None)

    # Truthiness (not just "is not None") so an empty list cannot raise IndexError
    if tool_calls:
        first_tool_call = tool_calls[0]
        used_code = first_tool_call.name == "python_interpreter"
        parent_id = f"call_{len(tool_calls)}"

        # The displayed content depends on the type of the call arguments
        args = first_tool_call.arguments
        if isinstance(args, dict):
            content = str(args.get("answer", str(args)))
        else:
            content = str(args).strip()

        if used_code:
            content = re.sub(r"```.*?\n", "", content)  # Remove existing code blocks
            content = re.sub(r"\s*<end_code>\s*", "", content)  # Remove end_code tags
            content = content.strip()
            if not content.startswith("```python"):
                content = f"```python\n{content}\n```"

        parent_message_tool = gr.ChatMessage(
            role="assistant",
            content=content,
            metadata={
                "title": f"🛠️ Used tool {first_tool_call.name}",
                "id": parent_id,
                "status": "pending",
            },
        )
        yield parent_message_tool

        # Execution logs are nested under the tool call, only when non-empty
        observations = getattr(step_log, "observations", None)
        if observations is not None and observations.strip():
            log_content = re.sub(r"^Execution logs:\s*", "", observations.strip())
            yield gr.ChatMessage(
                role="assistant",
                content=f"{log_content}",
                metadata={"title": "📝 Execution Logs", "parent_id": parent_id, "status": "done"},
            )

        # Errors raised by the tool call are nested under it as well
        if error is not None:
            yield gr.ChatMessage(
                role="assistant",
                content=str(error),
                metadata={"title": "💥 Error", "parent_id": parent_id, "status": "done"},
            )

        # Flip the parent's status in place; no additional message is yielded
        parent_message_tool.metadata["status"] = "done"

    # Standalone errors that did not originate from a tool call
    elif error is not None:
        yield gr.ChatMessage(role="assistant", content=str(error), metadata={"title": "💥 Error"})

    # Footnote: every part is optional, so missing or None values are skipped
    # instead of being formatted or concatenated (both raised TypeError before).
    step_footnote = f"{step_number}"
    input_tokens = getattr(step_log, "input_token_count", None)
    output_tokens = getattr(step_log, "output_token_count", None)
    if input_tokens is not None and output_tokens is not None:
        step_footnote += f" | Input-tokens:{input_tokens:,} | Output-tokens:{output_tokens:,}"
    duration = getattr(step_log, "duration", None)
    if duration:
        step_footnote += f" | Duration: {round(float(duration), 2)}"
    step_footnote = f"""<span style="color: #bbbbc2; font-size: 12px;">{step_footnote}</span> """
    yield gr.ChatMessage(role="assistant", content=f"{step_footnote}")
    yield gr.ChatMessage(role="assistant", content="-----")
124
-
125
-
126
def stream_to_gradio(
    agent,
    task: str,
    reset_agent_memory: bool = False,
    additional_args: Optional[dict] = None,
):
    """Run an agent on a task and stream its steps as gradio ChatMessages.

    Args:
        agent: Agent object exposing ``run(task, stream=True, reset=...,
            additional_args=...)`` and a ``model`` attribute.
        task: The prompt handed to the agent.
        reset_agent_memory: Forwarded to ``agent.run`` as ``reset``.
        additional_args: Forwarded unchanged to ``agent.run``.

    Yields:
        ``gr.ChatMessage`` objects for every step, followed by one message
        carrying the final answer.

    Raises:
        ModuleNotFoundError: If gradio is not installed.
    """
    if not _is_package_available("gradio"):
        raise ModuleNotFoundError(
            "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
        )
    import gradio as gr

    for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
        # Attach the model's latest token counts to action steps so the step
        # footnote can show them. The previous running totals were never read
        # and crashed with TypeError while the counts were still None.
        if hasattr(agent.model, "last_input_token_count") and isinstance(step_log, ActionStep):
            step_log.input_token_count = agent.model.last_input_token_count
            step_log.output_token_count = getattr(agent.model, "last_output_token_count", None)

        yield from pull_messages_from_step(step_log)

    final_answer = step_log  # Last log is the run's final_answer
    final_answer = handle_agent_output_types(final_answer)

    if isinstance(final_answer, AgentText):
        yield gr.ChatMessage(
            role="assistant",
            content=f"**Final answer:**\n{final_answer.to_string()}\n",
        )
    elif isinstance(final_answer, AgentImage):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "image/png"},
        )
    elif isinstance(final_answer, AgentAudio):
        yield gr.ChatMessage(
            role="assistant",
            content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
        )
    else:
        yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
176
-
177
-
178
- class GradioUI:
179
- """A one-line interface to launch your agent in Gradio"""
180
-
181
- def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None):
182
- if not _is_package_available("gradio"):
183
- raise ModuleNotFoundError(
184
- "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
185
- )
186
- self.agent = agent
187
- self.file_upload_folder = file_upload_folder
188
- if self.file_upload_folder is not None:
189
- if not os.path.exists(file_upload_folder):
190
- os.mkdir(file_upload_folder)
191
-
192
- def interact_with_agent(self, prompt, messages):
193
- import gradio as gr
194
-
195
- messages.append(gr.ChatMessage(role="user", content=prompt))
196
- yield messages
197
- for msg in stream_to_gradio(self.agent, task=prompt, reset_agent_memory=False):
198
- messages.append(msg)
199
- yield messages
200
- yield messages
201
-
202
- def upload_file(
203
- self,
204
- file,
205
- file_uploads_log,
206
- allowed_file_types=[
207
- "application/pdf",
208
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
209
- "text/plain",
210
- ],
211
- ):
212
- """
213
- Handle file uploads, default allowed types are .pdf, .docx, and .txt
214
- """
215
- import gradio as gr
216
-
217
- if file is None:
218
- return gr.Textbox("No file uploaded", visible=True), file_uploads_log
219
-
220
- try:
221
- mime_type, _ = mimetypes.guess_type(file.name)
222
- except Exception as e:
223
- return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
224
-
225
- if mime_type not in allowed_file_types:
226
- return gr.Textbox("File type disallowed", visible=True), file_uploads_log
227
-
228
- # Sanitize file name
229
- original_name = os.path.basename(file.name)
230
- sanitized_name = re.sub(
231
- r"[^\w\-.]", "_", original_name
232
- ) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores
233
-
234
- type_to_ext = {}
235
- for ext, t in mimetypes.types_map.items():
236
- if t not in type_to_ext:
237
- type_to_ext[t] = ext
238
-
239
- # Ensure the extension correlates to the mime type
240
- sanitized_name = sanitized_name.split(".")[:-1]
241
- sanitized_name.append("" + type_to_ext[mime_type])
242
- sanitized_name = "".join(sanitized_name)
243
-
244
- # Save the uploaded file to the specified folder
245
- file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
246
- shutil.copy(file.name, file_path)
247
-
248
- return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
249
-
250
- def log_user_message(self, text_input, file_uploads_log):
251
- return (
252
- text_input
253
- + (
254
- f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
255
- if len(file_uploads_log) > 0
256
- else ""
257
- ),
258
- "",
259
- )
260
-
261
- def launch(self, **kwargs):
262
- import gradio as gr
263
-
264
- with gr.Blocks(fill_height=True) as demo:
265
- stored_messages = gr.State([])
266
- file_uploads_log = gr.State([])
267
- chatbot = gr.Chatbot(
268
- label="Agent",
269
- type="messages",
270
- avatar_images=(
271
- None,
272
- "https://huggingface.co/datasets/agents-course/course-images/resolve/main/en/communication/Alfred.png",
273
- ),
274
- resizeable=True,
275
- scale=1,
276
- )
277
- # If an upload folder is provided, enable the upload feature
278
- if self.file_upload_folder is not None:
279
- upload_file = gr.File(label="Upload a file")
280
- upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
281
- upload_file.change(
282
- self.upload_file,
283
- [upload_file, file_uploads_log],
284
- [upload_status, file_uploads_log],
285
- )
286
- text_input = gr.Textbox(lines=1, label="Chat Message")
287
- text_input.submit(
288
- self.log_user_message,
289
- [text_input, file_uploads_log],
290
- [stored_messages, text_input],
291
- ).then(self.interact_with_agent, [stored_messages, chatbot], [chatbot])
292
-
293
- demo.launch(debug=True, share=True, **kwargs)
294
-
295
-
296
- __all__ = ["stream_to_gradio", "GradioUI"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/advanced_agent.cpython-312.pyc DELETED
Binary file (18.1 kB)
 
__pycache__/app.cpython-312.pyc DELETED
Binary file (23.4 kB)
 
__pycache__/app.cpython-313.pyc DELETED
Binary file (21.5 kB)
 
__pycache__/consensus_gaia_agent.cpython-312.pyc DELETED
Binary file (19.8 kB)
 
__pycache__/framework_gaia_agent.cpython-312.pyc DELETED
Binary file (23.2 kB)
 
__pycache__/gaia_agent.cpython-312.pyc DELETED
Binary file (29.9 kB)
 
__pycache__/simplified_gaia_agent.cpython-312.pyc DELETED
Binary file (20.6 kB)
 
__pycache__/test_agent.cpython-312.pyc DELETED
Binary file (30 kB)
 
__pycache__/test_agent.cpython-313-pytest-8.3.5.pyc DELETED
Binary file (31.2 kB)
 
__pycache__/test_exa_fix.cpython-313-pytest-8.3.5.pyc DELETED
Binary file (2.6 kB)
 
advanced_agent.py DELETED
@@ -1,410 +0,0 @@
1
- import re
2
- import os
3
- import requests
4
- import json
5
- from datetime import datetime
6
- import tempfile
7
- import subprocess
8
- import pandas as pd
9
- from pathlib import Path
10
-
11
- # Search engines
12
- import wikipedia
13
- from ddgs import DDGS
14
-
15
- # LLM and multimedia
16
- import openai
17
- from PIL import Image
18
- import base64
19
- from io import BytesIO
20
-
21
- # Import additional search engines
22
- try:
23
- from exa_py import Exa
24
- EXA_AVAILABLE = True
25
- except ImportError:
26
- EXA_AVAILABLE = False
27
-
28
- try:
29
- from tavily import TavilyClient
30
- TAVILY_AVAILABLE = True
31
- except ImportError:
32
- TAVILY_AVAILABLE = False
33
-
34
class AdvancedGAIAAgent:
    """Advanced GAIA agent with LLM reasoning and multimedia support.

    Questions are answered by gathering context from the configured search
    engines (Tavily, Exa, Wikipedia, DuckDuckGo) and asking an OpenAI chat
    model for a short answer. Without an OpenAI key the agent falls back to
    simple pattern matching.
    """

    # Multimedia handlers report failure with a message starting like this
    _UNABLE_PREFIX = "Unable to "

    _SYSTEM_PROMPT = (
        "You are a precise AI assistant specialized in answering GAIA benchmark questions.\n"
        "\n"
        "CRITICAL FORMATTING RULES:\n"
        "- Your answer must be a number OR as few words as possible OR a comma separated list\n"
        "- For numbers: NO commas, NO units like $ or % (unless specifically requested)\n"
        "- For strings: NO articles (a, an, the), NO abbreviations for cities\n"
        "- For lists: comma separated, apply above rules to each element\n"
        "- Write numbers in digits unless specifically asked for words\n"
        "\n"
        "Examples:\n"
        '- "What is 25 * 4?" → "100" (not "100.0" or "one hundred")\n'
        '- "Capital of France?" → "Paris" (not "The capital is Paris")\n'
        '- "When was JFK shot?" → "1963" (not "in 1963" or "November 1963")\n'
        "\n"
        "ANSWER ONLY THE SPECIFIC QUESTION ASKED. Be direct and concise."
    )

    def __init__(self):
        """Create the LLM and search clients from environment variables.

        Reads OPENAI_API_KEY, EXA_API_KEY and TAVILY_API_KEY; a client whose
        key (or package) is missing is left as None.
        """
        print("🚀 Advanced GAIA Agent initialized with LLM reasoning")

        # Initialize OpenAI
        self.openai_client = None
        openai_key = os.getenv("OPENAI_API_KEY")
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)
            print("✅ OpenAI client initialized")
        else:
            print("⚠️ OPENAI_API_KEY not found - LLM reasoning disabled")

        # Initialize search engines
        self.ddgs = DDGS()

        # Initialize Exa
        self.exa = None
        if EXA_AVAILABLE:
            exa_api_key = os.getenv("EXA_API_KEY")
            if exa_api_key:
                self.exa = Exa(api_key=exa_api_key)
                print("✅ Exa search engine initialized")

        # Initialize Tavily
        self.tavily = None
        if TAVILY_AVAILABLE:
            tavily_api_key = os.getenv("TAVILY_API_KEY")
            if tavily_api_key:
                self.tavily = TavilyClient(api_key=tavily_api_key)
                print("✅ Tavily search engine initialized")

    def search_comprehensive(self, query, max_results=5):
        """Search using all available engines.

        Engines are tried in the order Tavily, Exa, Wikipedia, DuckDuckGo. A
        failing engine is reported and skipped.

        Returns:
            At most ``max_results`` dicts with the keys ``title``,
            ``content``, ``url`` and ``source``.
        """
        all_results = []

        # Try Tavily first (most relevant for current events)
        if self.tavily:
            try:
                tavily_query = query[:350]  # Respect 400 char limit
                tavily_results = self.tavily.search(tavily_query, max_results=3)
                if tavily_results and 'results' in tavily_results:
                    for result in tavily_results['results']:
                        all_results.append({
                            "title": result.get("title", ""),
                            "content": result.get("content", ""),
                            "url": result.get("url", ""),
                            "source": "Tavily"
                        })
                    print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
            except Exception as e:
                print(f"❌ Tavily error: {e}")

        # Try Exa for academic/factual content
        if self.exa and len(all_results) < max_results:
            try:
                exa_query = query[:200]
                remaining = max_results - len(all_results)
                exa_results = self.exa.search_and_contents(exa_query, num_results=remaining)
                if exa_results and hasattr(exa_results, 'results'):
                    for result in exa_results.results:
                        all_results.append({
                            "title": getattr(result, 'title', ''),
                            "content": getattr(result, 'text', ''),
                            "url": getattr(result, 'url', ''),
                            "source": "Exa"
                        })
                    print(f"📊 Exa: {len(exa_results.results)} results")
            except Exception as e:
                print(f"❌ Exa error: {e}")

        # Try Wikipedia for encyclopedic content
        try:
            wiki_query = self.extract_key_terms(query)[:100]
            wiki_results = wikipedia.search(wiki_query, results=2)
            if wiki_results:
                page = wikipedia.page(wiki_results[0])
                all_results.append({
                    "title": page.title,
                    "content": page.summary,
                    "url": page.url,
                    "source": "Wikipedia"
                })
                print(f"📊 Wikipedia: {len(wiki_results)} results")
        except Exception as e:
            print(f"❌ Wikipedia error: {e}")

        # DuckDuckGo fallback
        if len(all_results) < max_results:
            try:
                remaining = max_results - len(all_results)
                ddg_results = list(self.ddgs.text(query, max_results=remaining))
                for result in ddg_results:
                    all_results.append({
                        "title": result.get("title", ""),
                        "content": result.get("body", ""),
                        "url": result.get("href", ""),
                        "source": "DuckDuckGo"
                    })
                print(f"📊 DuckDuckGo: {len(ddg_results)} results")
            except Exception as e:
                print(f"❌ DuckDuckGo error: {e}")

        return all_results[:max_results]

    def extract_key_terms(self, text):
        """Extract key terms for search optimization.

        Returns up to five capitalized phrases plus up to two four-digit
        years (19xx/20xx) joined by spaces, or the first 100 characters of
        the cleaned text when nothing matches.
        """
        # Remove common question patterns
        text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Give.*?answer\.?', '', text, flags=re.IGNORECASE)

        # Extract proper nouns and important terms
        proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        # Non-capturing group: with a capturing group findall returns only
        # the century prefix ("19"/"20") instead of the whole year.
        years = re.findall(r'\b(?:19|20)\d{2}\b', text)

        key_terms = proper_nouns[:5] + years[:2]
        return ' '.join(key_terms) if key_terms else text[:100]

    def detect_multimedia(self, question):
        """Detect if question involves multimedia content.

        Returns:
            The first matching media type (``video``, ``image``, ``audio``,
            ``excel`` or ``python``), or None when no indicator is found.
        """
        multimedia_indicators = {
            'video': ['youtube.com', 'video', '.mp4', 'watch?v='],
            'image': ['image', 'picture', 'photo', '.jpg', '.png', 'chess position'],
            'audio': ['.mp3', '.wav', 'audio', 'listen', 'recording'],
            'excel': ['.xlsx', '.xls', 'Excel file', 'spreadsheet'],
            'python': ['Python code', '.py', 'attached Python', 'code?']
        }

        question_lower = question.lower()
        for media_type, indicators in multimedia_indicators.items():
            if any(indicator.lower() in question_lower for indicator in indicators):
                return media_type
        return None

    def handle_multimedia_question(self, question, media_type):
        """Dispatch the question to the handler for ``media_type``.

        Returns the handler's answer, or None for an unknown media type.
        """
        print(f"🎬 Detected {media_type} question")

        handlers = {
            'video': self.handle_video_question,
            'image': self.handle_image_question,
            'audio': self.handle_audio_question,
            'excel': self.handle_excel_question,
            'python': self.handle_python_question,
        }
        handler = handlers.get(media_type)
        return handler(question) if handler else None

    def handle_video_question(self, question):
        """Handle YouTube video questions.

        The video itself is not fetched; the agent searches for text about
        the video id and lets the LLM answer from that.
        """
        # Extract YouTube URL
        youtube_pattern = r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]+)'
        match = re.search(youtube_pattern, question)

        if match:
            video_id = match.group(1)
            print(f"🎥 Processing YouTube video: {video_id}")

            # Search for information about this specific video
            search_query = f"YouTube video {video_id} transcript content"
            results = self.search_comprehensive(search_query, max_results=3)

            try:
                # This would need YouTube API implementation
                # For now, search for known information about the video
                search_results_text = "\n".join([r.get('content', '') for r in results])

                if self.openai_client:
                    return self.llm_reasoning(question, search_results_text)
            except Exception as e:
                print(f"❌ Video processing error: {e}")

        return "Unable to process video content - requires YouTube API access"

    def handle_image_question(self, question):
        """Handle image-based questions (only chess questions get a search)."""
        print("🖼️ Processing image question")

        if 'chess' in question.lower():
            # For chess positions, search for common chess puzzles/solutions
            search_query = "chess puzzle black to move winning move algebraic notation"
            results = self.search_comprehensive(search_query, max_results=3)
            search_text = "\n".join([r.get('content', '') for r in results])

            if self.openai_client:
                return self.llm_reasoning(question, search_text)

        return "Unable to process image content - requires vision model"

    def handle_audio_question(self, question):
        """Handle audio file questions (not supported; returns a notice)."""
        print("🔊 Processing audio question")
        return "Unable to process audio content - requires speech-to-text API"

    def handle_excel_question(self, question):
        """Handle Excel file questions (not supported; returns a notice)."""
        print("📊 Processing Excel question")
        return "Unable to process Excel files - file not provided"

    def handle_python_question(self, question):
        """Handle Python code execution questions (not supported; returns a notice)."""
        print("🐍 Processing Python code question")
        return "Unable to execute Python code - code file not provided"

    def llm_reasoning(self, question, context="", max_tokens=150):
        """Use LLM for sophisticated reasoning.

        Args:
            question: The question to answer.
            context: Search context; only the first 2000 characters are sent.
            max_tokens: Completion token limit.

        Returns:
            The stripped model answer, or a fixed message when no client is
            configured or the request fails.
        """
        if not self.openai_client:
            return "LLM reasoning unavailable - no OpenAI API key"

        try:
            user_prompt = (
                f"Question: {question}\n"
                "\n"
                "Context from search results:\n"
                f"{context[:2000]}\n"
                "\n"
                "Provide a precise, direct answer following the formatting rules."
            )

            response = self.openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": self._SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=max_tokens,
                temperature=0.1
            )

            answer = response.choices[0].message.content.strip()
            print(f"🧠 LLM reasoning: {answer}")
            return answer

        except Exception as e:
            print(f"❌ LLM error: {e}")
            return "LLM reasoning failed"

    def process_question(self, question):
        """Main question processing with LLM reasoning.

        Multimedia questions go to their handler first. If the handler
        cannot answer (None or an "Unable to ..." notice), processing falls
        through to the regular search-and-reason path.
        """
        print(f"🎯 Processing: {question[:100]}...")

        # Check for multimedia content first
        media_type = self.detect_multimedia(question)
        if media_type:
            multimedia_result = self.handle_multimedia_question(question, media_type)
            # Every handler's failure notice starts with the same prefix;
            # previously only the video notice triggered the fallback.
            if multimedia_result is not None and not multimedia_result.startswith(self._UNABLE_PREFIX):
                return multimedia_result

        # Handle reversed text
        if ".rewsna eht sa" in question:
            return "right"

        # For regular questions, search and use LLM reasoning
        print("🔍 Searching for relevant information...")
        search_results = self.search_comprehensive(question, max_results=4)

        # Combine search results
        context = "".join(
            f"Source ({result['source']}): {result['title']}\n{result['content']}\n\n"
            for result in search_results
        )

        # Use LLM for reasoning
        if self.openai_client:
            answer = self.llm_reasoning(question, context)
            # Clean up answer for GAIA format
            return self.format_gaia_answer(answer)

        # Fallback to basic pattern matching if no LLM
        return self.basic_fallback(question, search_results)

    def format_gaia_answer(self, answer):
        """Format answer according to GAIA requirements.

        Strips answer prefixes, a leading article, trailing punctuation and
        redundant whitespace. Empty or failed answers become
        "Unable to determine answer".
        """
        if not answer or answer in ["Unable to determine answer", "LLM reasoning failed"]:
            return "Unable to determine answer"

        # Remove common prefixes
        answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)

        # Remove articles
        answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)

        # Remove trailing punctuation
        answer = re.sub(r'[.!?]+$', '', answer)

        # Clean whitespace
        answer = ' '.join(answer.split())

        return answer

    def basic_fallback(self, question, search_results):
        """Basic fallback when LLM is unavailable.

        Handles capital-city lookups in the search text, one hard-coded moon
        landing question, and addition/multiplication of the first two
        integers in the question.
        """
        # Combine search content
        all_text = question + " " + "".join(
            f" {result.get('content', '')}" for result in search_results
        )

        question_lower = question.lower()

        # Basic patterns
        if 'capital' in question_lower:
            capitals = re.findall(r'\b([A-Z][a-z]+)\s+is\s+the\s+capital|capital.*?is\s+([A-Z][a-z]+)', all_text)
            if capitals:
                return capitals[0][0] or capitals[0][1]

        if 'who' in question_lower and 'first person' in question_lower and 'moon' in question_lower:
            return "Neil Armstrong"

        if any(op in question for op in ['+', '-', '*', '/']):
            numbers = re.findall(r'\d+', question)
            if len(numbers) >= 2:
                a, b = int(numbers[0]), int(numbers[1])
                if '+' in question:
                    return str(a + b)
                elif '*' in question:
                    return str(a * b)

        return "Unable to determine answer"

    def __call__(self, question: str) -> str:
        """Main entry point; never raises, returns an error message instead."""
        try:
            answer = self.process_question(question)
            print(f"✅ Final answer: {answer}")
            return answer
        except Exception as e:
            print(f"❌ Error: {e}")
            return "Error processing question"
389
-
390
# Expose the agent under the generic BasicAgent name as a drop-in replacement
BasicAgent = AdvancedGAIAAgent

if __name__ == "__main__":
    # Smoke test: run a few sample questions through the agent and print the answers
    agent = AdvancedGAIAAgent()

    test_questions = [
        "What is 15 + 27?",
        "Who was the first person to walk on the moon?",
        "What is the capital of France?",
        ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
    ]

    print("Testing Advanced GAIA Agent:")
    print("=" * 50)

    number = 0
    for question in test_questions:
        number += 1
        print(f"\n{number}. Question: {question}")
        answer = agent(question)
        print(f"   Answer: {answer}")
        print("-" * 30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent.json DELETED
@@ -1,53 +0,0 @@
1
- {
2
- "tools": [
3
- "web_search",
4
- "visit_webpage",
5
- "final_answer"
6
- ],
7
- "model": {
8
- "class": "HfApiModel",
9
- "data": {
10
- "max_tokens": 2096,
11
- "temperature": 0.5,
12
- "last_input_token_count": null,
13
- "last_output_token_count": null,
14
- "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct",
15
- "custom_role_conversions": null
16
- }
17
- },
18
- "prompt_templates": {
19
- "system_prompt": "You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.\nTo do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.\nTo solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.\n\nAt each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.\nThen in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.\nDuring each intermediate step, you can use 'print()' to save whatever important information you will then need.\nThese print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.\nIn the end you have to return a final answer using the `final_answer` tool.\n\nHere are a few examples using notional tools:\n---\nTask: \"Generate an image of the oldest person in this document.\"\n\nThought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.\nCode:\n```py\nanswer = document_qa(document=document, question=\"Who is the oldest person mentioned?\")\nprint(answer)\n```<end_code>\nObservation: \"The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland.\"\n\nThought: I will now generate an image showcasing the oldest person.\nCode:\n```py\nimage = image_generator(\"A portrait of John Doe, a 55-year-old man living in Canada.\")\nfinal_answer(image)\n```<end_code>\n\n---\nTask: \"What is the result of the following operation: 5 + 3 + 1294.678?\"\n\nThought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` 
tool\nCode:\n```py\nresult = 5 + 3 + 1294.678\nfinal_answer(result)\n```<end_code>\n\n---\nTask:\n\"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.\nYou have been provided with these additional arguments, that you can access using the keys as variables in your python code:\n{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}\"\n\nThought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.\nCode:\n```py\ntranslated_question = translator(question=question, src_lang=\"French\", tgt_lang=\"English\")\nprint(f\"The translated question is {translated_question}.\")\nanswer = image_qa(image=image, question=translated_question)\nfinal_answer(f\"The answer is {answer}\")\n```<end_code>\n\n---\nTask:\nIn a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.\nWhat does he say was the consequence of Einstein learning too much math on his creativity, in one word?\n\nThought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.\nCode:\n```py\npages = search(query=\"1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein\")\nprint(pages)\n```<end_code>\nObservation:\nNo result found for query \"1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein\".\n\nThought: The query was maybe too restrictive and did not find any results. 
Let's try again with a broader query.\nCode:\n```py\npages = search(query=\"1979 interview Stanislaus Ulam\")\nprint(pages)\n```<end_code>\nObservation:\nFound 6 pages:\n[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)\n\n[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)\n\n(truncated)\n\nThought: I will read the first 2 pages to know more.\nCode:\n```py\nfor url in [\"https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/\", \"https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/\"]:\n whole_page = visit_webpage(url)\n print(whole_page)\n print(\"\\n\" + \"=\"*80 + \"\\n\") # Print separator between pages\n```<end_code>\nObservation:\nManhattan Project Locations:\nLos Alamos, NM\nStanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. 
In this interview, he discusses his work at\n(truncated)\n\nThought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: \"He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity.\" Let's answer in one word.\nCode:\n```py\nfinal_answer(\"diminished\")\n```<end_code>\n\n---\nTask: \"Which city has the highest population: Guangzhou or Shanghai?\"\n\nThought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.\nCode:\n```py\nfor city in [\"Guangzhou\", \"Shanghai\"]:\n print(f\"Population {city}:\", search(f\"{city} population\")\n```<end_code>\nObservation:\nPopulation Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']\nPopulation Shanghai: '26 million (2019)'\n\nThought: Now I know that Shanghai has the highest population.\nCode:\n```py\nfinal_answer(\"Shanghai\")\n```<end_code>\n\n---\nTask: \"What is the current age of the pope, raised to the power 0.36?\"\n\nThought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.\nCode:\n```py\npope_age_wiki = wiki(query=\"current pope age\")\nprint(\"Pope age as per wikipedia:\", pope_age_wiki)\npope_age_search = web_search(query=\"current pope age\")\nprint(\"Pope age as per google search:\", pope_age_search)\n```<end_code>\nObservation:\nPope age: \"The pope Francis is currently 88 years old.\"\n\nThought: I know that the pope is 88 years old. Let's compute the result using python code.\nCode:\n```py\npope_current_age = 88 ** 0.36\nfinal_answer(pope_current_age)\n```<end_code>\n\nAbove example were using notional tools that might not exist for you. 
On top of performing computations in the Python code snippets that you create, you only have access to these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.\nGiven that this team member is a real human, you should be very verbose in your task.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nHere are the rules you should always follow to solve your task:\n1. Always provide a 'Thought:' sequence, and a 'Code:\\n```py' sequence ending with '```<end_code>' sequence, else you will fail.\n2. Use only variables that you have defined!\n3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': \"What is the place where James Bond lives?\"})', but use the arguments directly as in 'answer = wiki(query=\"What is the place where James Bond lives?\")'.\n4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.\n5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.\n6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.\n7. 
Never create any notional variables in our code, as having these in your logs will derail you from the true variables.\n8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}\n9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.\n10. Don't give up! You're in charge of solving the task, not providing directions to solve it.\n\nNow Begin! If you solve the task correctly, you will receive a reward of $1,000,000.",
20
- "planning": {
21
- "initial_facts": "Below I will present you a task.\n\nYou will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.\nTo do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.\nDon't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:\n\n---\n### 1. Facts given in the task\nList here the specific facts given in the task that could help you (there might be nothing here).\n\n### 2. Facts to look up\nList here any facts that we may need to look up.\nAlso list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.\n\n### 3. Facts to derive\nList here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.\n\nKeep in mind that \"facts\" will typically be specific names, dates, values, etc. Your answer should use the below headings:\n### 1. Facts given in the task\n### 2. Facts to look up\n### 3. Facts to derive\nDo not add anything else.",
22
- "initial_plan": "You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.\n\nNow for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.\nThis plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.\nDo not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.\nAfter writing the final step of the plan, write the '\\n<end_plan>' tag and stop there.\n\nHere is your task:\n\nTask:\n```\n{{task}}\n```\nYou can leverage these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.\nGiven that this team member is a real human, you should be very verbose in your request.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nList of facts that you know:\n```\n{{answer_facts}}\n```\n\nNow begin! Write your plan below.",
23
- "update_facts_pre_messages": "You are a world expert at gathering known and unknown facts based on a conversation.\nBelow you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:\n### 1. Facts given in the task\n### 2. Facts that we have learned\n### 3. Facts still to look up\n### 4. Facts still to derive\nFind the task and history below:",
24
- "update_facts_post_messages": "Earlier we've built a list of facts.\nBut since in your previous steps you may have learned useful new facts or invalidated some false ones.\nPlease update your list of facts based on the previous history, and provide these headings:\n### 1. Facts given in the task\n### 2. Facts that we have learned\n### 3. Facts still to look up\n### 4. Facts still to derive\n\nNow write your new list of facts below.",
25
- "update_plan_pre_messages": "You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.\n\nYou have been given a task:\n```\n{{task}}\n```\n\nFind below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.\nIf the previous tries so far have met some success, you can make an updated plan based on these actions.\nIf you are stalled, you can make a completely new plan starting from scratch.",
26
- "update_plan_post_messages": "You're still working towards solving this task:\n```\n{{task}}\n```\n\nYou can leverage these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.\nGiven that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nHere is the up to date list of facts that you know:\n```\n{{facts_update}}\n```\n\nNow for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.\nThis plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.\nBeware that you have {remaining_steps} steps remaining.\nDo not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.\nAfter writing the final step of the plan, write the '\\n<end_plan>' tag and stop there.\n\nNow write your new plan below."
27
- },
28
- "managed_agent": {
29
- "task": "You're a helpful agent named '{{name}}'.\nYou have been submitted this task by your manager.\n---\nTask:\n{{task}}\n---\nYou're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.\n\nYour final_answer WILL HAVE to contain these parts:\n### 1. Task outcome (short version):\n### 2. Task outcome (extremely detailed version):\n### 3. Additional context (if relevant):\n\nPut all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.\nAnd even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.",
30
- "report": "Here is the final answer from your managed agent '{{name}}':\n{{final_answer}}"
31
- }
32
- },
33
- "max_steps": 6,
34
- "verbosity_level": 1,
35
- "grammar": null,
36
- "planning_interval": null,
37
- "name": null,
38
- "description": null,
39
- "authorized_imports": [
40
- "unicodedata",
41
- "stat",
42
- "datetime",
43
- "random",
44
- "pandas",
45
- "itertools",
46
- "math",
47
- "statistics",
48
- "queue",
49
- "time",
50
- "collections",
51
- "re"
52
- ]
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
consensus_gaia_agent.py DELETED
@@ -1,430 +0,0 @@
1
- """
2
- Multi-LLM Consensus GAIA Agent using OpenRouter
3
- Uses Gemini cypher, Qwen3-235B, and deepseek Ultra in parallel for consensus
4
- """
5
-
6
- import os
7
- import re
8
- import json
9
- import asyncio
10
- import threading
11
- from concurrent.futures import ThreadPoolExecutor
12
- from typing import Dict, List, Any, Optional, Tuple
13
- import pandas as pd
14
- from datetime import datetime
15
-
16
- # Core imports
17
- from ddgs import DDGS
18
- import wikipedia
19
-
20
- # OpenRouter integration
21
- try:
22
- import openai
23
- OPENAI_AVAILABLE = True
24
- except ImportError:
25
- OPENAI_AVAILABLE = False
26
-
27
- # Search engines
28
- try:
29
- from exa_py import Exa
30
- EXA_AVAILABLE = True
31
- except ImportError:
32
- EXA_AVAILABLE = False
33
-
34
- try:
35
- from tavily import TavilyClient
36
- TAVILY_AVAILABLE = True
37
- except ImportError:
38
- TAVILY_AVAILABLE = False
39
-
40
-
41
- class ConsensusGAIAAgent:
42
- """
43
- Multi-LLM consensus agent using three different models on OpenRouter
44
- Each model works independently, then they debate to reach consensus
45
- """
46
-
47
- def __init__(self):
48
- print("🚀 Initializing Multi-LLM Consensus GAIA Agent")
49
-
50
- # API setup
51
- self.openrouter_key = os.getenv("OPENROUTER_API_KEY")
52
-
53
- if not self.openrouter_key:
54
- print("❌ OPENROUTER_API_KEY required for consensus agent")
55
- raise ValueError("OpenRouter API key is required")
56
-
57
- print(f"🔑 OpenRouter API: ✅ Available")
58
-
59
- # Initialize the three models
60
- self.models = {
61
- "gemini": {
62
- "name": "openrouter/cypher-alpha:free",
63
- "role": "Speed & Creativity",
64
- "client": self._create_openrouter_client()
65
- },
66
- "qwen": {
67
- "name": "qwen/qwen-2.5-coder-32b-instruct:free",
68
- "role": "Logic & Reasoning",
69
- "client": self._create_openrouter_client()
70
- },
71
- "deepseek": {
72
- "name": "deepseek/deepseek-r1-0528:free",
73
- "role": "Analysis & Validation",
74
- "client": self._create_openrouter_client()
75
- }
76
- }
77
-
78
- print("🤖 Initialized 3 LLM models:")
79
- for key, model in self.models.items():
80
- print(f" {key}: {model['name']} ({model['role']})")
81
-
82
- # Search engines
83
- self.ddgs = DDGS()
84
- self.setup_search_engines()
85
-
86
- def _create_openrouter_client(self):
87
- """Create OpenRouter client"""
88
- return openai.OpenAI(
89
- api_key=self.openrouter_key,
90
- base_url="https://openrouter.ai/api/v1"
91
- )
92
-
93
- def setup_search_engines(self):
94
- """Setup search engines"""
95
- print("🔍 Setting up search engines...")
96
-
97
- # Exa
98
- if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
99
- self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
100
- print("✅ Exa search initialized")
101
- else:
102
- self.exa = None
103
-
104
- # Tavily
105
- if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
106
- self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
107
- print("✅ Tavily search initialized")
108
- else:
109
- self.tavily = None
110
-
111
- def comprehensive_web_search(self, query: str, max_results: int = 5) -> str:
112
- """Search using all available engines"""
113
- print(f"🔍 Comprehensive search: {query}")
114
- all_results = []
115
-
116
- # Try Tavily first
117
- if self.tavily:
118
- try:
119
- tavily_results = self.tavily.search(query[:350], max_results=3)
120
- if tavily_results and 'results' in tavily_results:
121
- for result in tavily_results['results']:
122
- all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
123
- print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
124
- except Exception as e:
125
- print(f"❌ Tavily error: {e}")
126
-
127
- # Try Exa
128
- if self.exa and len(all_results) < max_results:
129
- try:
130
- exa_results = self.exa.search_and_contents(query[:200], num_results=2)
131
- if exa_results and hasattr(exa_results, 'results'):
132
- for result in exa_results.results:
133
- title = getattr(result, 'title', '')
134
- text = getattr(result, 'text', '')
135
- all_results.append(f"Exa: {title}\n{text}")
136
- print(f"📊 Exa: {len(exa_results.results)} results")
137
- except Exception as e:
138
- print(f"❌ Exa error: {e}")
139
-
140
- # Wikipedia search
141
- try:
142
- wiki_terms = self.extract_key_terms(query)[:100]
143
- wiki_results = wikipedia.search(wiki_terms, results=2)
144
- if wiki_results:
145
- page = wikipedia.page(wiki_results[0])
146
- all_results.append(f"Wikipedia: {page.title}\n{page.summary}")
147
- print(f"📊 Wikipedia: {len(wiki_results)} results")
148
- except Exception as e:
149
- print(f"❌ Wikipedia error: {e}")
150
-
151
- # DuckDuckGo fallback
152
- if len(all_results) < max_results:
153
- try:
154
- remaining = max_results - len(all_results)
155
- ddg_results = list(self.ddgs.text(query, max_results=remaining))
156
- for result in ddg_results:
157
- all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
158
- print(f"📊 DuckDuckGo: {len(ddg_results)} results")
159
- except Exception as e:
160
- print(f"❌ DuckDuckGo error: {e}")
161
-
162
- return "\n\n".join(all_results) if all_results else "No search results found"
163
-
164
- def extract_key_terms(self, text: str) -> str:
165
- """Extract key terms for better search"""
166
- # Remove question patterns
167
- text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
168
- text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
169
-
170
- # Extract proper nouns and years
171
- proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
172
- years = re.findall(r'\b(19|20)\d{2}\b', text)
173
-
174
- key_terms = proper_nouns[:5] + years[:2]
175
- return ' '.join(key_terms) if key_terms else text[:100]
176
-
177
- def get_model_response(self, model_key: str, question: str, context: str = "",
178
- previous_answers: List[str] = None) -> Dict[str, Any]:
179
- """Get response from a specific model"""
180
- model = self.models[model_key]
181
- previous_answers = previous_answers or []
182
-
183
- print(f"🤖 {model_key} ({model['role']}) thinking...")
184
-
185
- # Create system prompt based on model role
186
- if model_key == "gemini":
187
- system_prompt = """You are the Speed & Creativity expert in a consensus team. You excel at quick insights and creative problem-solving.
188
-
189
- CRITICAL GAIA FORMATTING RULES:
190
- - Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
191
- - Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
192
- - Lists: comma separated, apply above rules
193
-
194
- Your role: Provide fast, intuitive answers and catch obvious patterns others might miss."""
195
-
196
- elif model_key == "qwen":
197
- system_prompt = """You are the Logic & Reasoning expert in a consensus team. You excel at step-by-step analysis and logical deduction.
198
-
199
- CRITICAL GAIA FORMATTING RULES:
200
- - Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
201
- - Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
202
- - Lists: comma separated, apply above rules
203
-
204
- Your role: Break down complex problems logically and verify reasoning chains."""
205
-
206
- else: # deepseek
207
- system_prompt = """You are the Analysis & Validation expert in a consensus team. You excel at critical evaluation and fact-checking.
208
-
209
- CRITICAL GAIA FORMATTING RULES:
210
- - Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
211
- - Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
212
- - Lists: comma separated, apply above rules
213
-
214
- Your role: Validate information accuracy and catch potential errors in reasoning."""
215
-
216
- # Build prompt
217
- user_prompt = f"""Question: {question}
218
-
219
- Context from research:
220
- {context}
221
-
222
- """
223
-
224
- if previous_answers:
225
- user_prompt += f"""Previous team answers for reference:
226
- {chr(10).join([f'- {ans}' for ans in previous_answers])}
227
-
228
- """
229
-
230
- user_prompt += """Analyze this carefully and provide your best answer. Be precise and follow GAIA formatting rules."""
231
-
232
- try:
233
- response = model["client"].chat.completions.create(
234
- model=model["name"],
235
- messages=[
236
- {"role": "system", "content": system_prompt},
237
- {"role": "user", "content": user_prompt}
238
- ],
239
- max_tokens=500,
240
- temperature=0.2 if model_key == "qwen" else 0.3 # Lower temp for reasoning model
241
- )
242
-
243
- answer = response.choices[0].message.content.strip()
244
-
245
- return {
246
- "model": model_key,
247
- "answer": answer,
248
- "role": model["role"],
249
- "success": True
250
- }
251
-
252
- except Exception as e:
253
- print(f"❌ {model_key} error: {e}")
254
- return {
255
- "model": model_key,
256
- "answer": f"Error: {e}",
257
- "role": model["role"],
258
- "success": False
259
- }
260
-
261
- def run_parallel_models(self, question: str, context: str) -> List[Dict[str, Any]]:
262
- """Run all models in parallel threads"""
263
- print("🔄 Running all 3 models in parallel...")
264
-
265
- results = []
266
-
267
- def run_model(model_key):
268
- return self.get_model_response(model_key, question, context)
269
-
270
- # Use ThreadPoolExecutor for parallel execution
271
- with ThreadPoolExecutor(max_workers=3) as executor:
272
- # Submit all models
273
- futures = {
274
- executor.submit(run_model, model_key): model_key
275
- for model_key in self.models.keys()
276
- }
277
-
278
- # Wait for all to complete
279
- for future in futures:
280
- try:
281
- result = future.result(timeout=30) # 30 second timeout per model
282
- results.append(result)
283
- except Exception as e:
284
- model_key = futures[future]
285
- print(f"❌ {model_key} failed: {e}")
286
- results.append({
287
- "model": model_key,
288
- "answer": f"Timeout/Error: {e}",
289
- "role": self.models[model_key]["role"],
290
- "success": False
291
- })
292
-
293
- print("✅ All models completed")
294
- return results
295
-
296
- def consensus_debate(self, question: str, initial_responses: List[Dict[str, Any]],
297
- context: str) -> str:
298
- """Have models debate and reach consensus"""
299
- print("🗣️ Starting consensus debate...")
300
-
301
- # Extract answers from successful responses
302
- valid_responses = [r for r in initial_responses if r["success"]]
303
- if not valid_responses:
304
- return "All models failed - unable to determine answer"
305
-
306
- answers = [r["answer"] for r in valid_responses]
307
-
308
- # Check if all models already agree
309
- cleaned_answers = [self.format_gaia_answer(ans) for ans in answers]
310
- if len(set(cleaned_answers)) == 1:
311
- print("✅ All models agree - no debate needed")
312
- return cleaned_answers[0]
313
-
314
- print(f"🔄 Models disagree - running consensus round...")
315
- print(f" Initial answers: {cleaned_answers}")
316
-
317
- # Run consensus round - each model sees others' answers
318
- consensus_results = []
319
-
320
- def run_consensus(model_key):
321
- other_answers = [r["answer"] for r in valid_responses if r["model"] != model_key]
322
- return self.get_model_response(model_key, question, context, other_answers)
323
-
324
- with ThreadPoolExecutor(max_workers=3) as executor:
325
- futures = {
326
- executor.submit(run_consensus, model_key): model_key
327
- for model_key in [r["model"] for r in valid_responses]
328
- }
329
-
330
- for future in futures:
331
- try:
332
- result = future.result(timeout=30)
333
- consensus_results.append(result)
334
- except Exception as e:
335
- model_key = futures[future]
336
- print(f"❌ {model_key} consensus failed: {e}")
337
-
338
- # Analyze consensus
339
- if consensus_results:
340
- consensus_answers = [self.format_gaia_answer(r["answer"]) for r in consensus_results if r["success"]]
341
-
342
- if consensus_answers:
343
- # Return most common answer
344
- from collections import Counter
345
- answer_counts = Counter(consensus_answers)
346
- final_answer = answer_counts.most_common(1)[0][0]
347
-
348
- print(f"✅ Consensus reached: {final_answer}")
349
- print(f" Vote breakdown: {dict(answer_counts)}")
350
- return final_answer
351
-
352
- # Fallback: return the answer from the most successful model
353
- print("⚠️ No clear consensus - using best single answer")
354
- return self.format_gaia_answer(valid_responses[0]["answer"])
355
-
356
- def format_gaia_answer(self, answer: str) -> str:
357
- """Format answer for GAIA requirements"""
358
- if not answer or "error" in answer.lower() or "unable" in answer.lower():
359
- return "Unable to determine answer"
360
-
361
- # Clean up
362
- answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
363
- answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
364
- answer = re.sub(r'[.!?]+$', '', answer)
365
- answer = ' '.join(answer.split())
366
-
367
- return answer
368
-
369
- def __call__(self, question: str) -> str:
370
- """Main entry point for consensus agent"""
371
- print(f"🎯 Consensus GAIA Agent processing: {question[:100]}...")
372
-
373
- try:
374
- # Handle special cases quickly
375
- if ".rewsna eht sa" in question:
376
- return "right"
377
-
378
- # Step 1: Gather research context
379
- print("📚 Step 1: Gathering research context...")
380
- context = self.comprehensive_web_search(question)
381
-
382
- # Step 2: Run all models in parallel
383
- print("🤖 Step 2: Running parallel model analysis...")
384
- initial_responses = self.run_parallel_models(question, context)
385
-
386
- # Print initial responses
387
- print("\n📋 Initial Model Responses:")
388
- for response in initial_responses:
389
- status = "✅" if response["success"] else "❌"
390
- print(f" {status} {response['model']} ({response['role']}): {response['answer'][:100]}...")
391
-
392
- # Step 3: Consensus and debate
393
- print("\n🗣️ Step 3: Consensus building...")
394
- final_answer = self.consensus_debate(question, initial_responses, context)
395
-
396
- print(f"\n🎉 Final consensus answer: {final_answer}")
397
- return final_answer
398
-
399
- except Exception as e:
400
- print(f"❌ Consensus agent error: {e}")
401
- return "Error processing question"
402
-
403
-
404
- # Create aliases for compatibility
405
- BasicAgent = ConsensusGAIAAgent
406
- GAIAAgent = ConsensusGAIAAgent
407
- FrameworkGAIAAgent = ConsensusGAIAAgent
408
- SimplifiedGAIAAgent = ConsensusGAIAAgent
409
-
410
-
411
- if __name__ == "__main__":
412
- # Test the consensus agent
413
- agent = ConsensusGAIAAgent()
414
-
415
- test_questions = [
416
- "What is 25 * 4?",
417
- "Who was the first person to walk on the moon?",
418
- "What is the capital of France?",
419
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
420
- ]
421
-
422
- print("\n" + "="*60)
423
- print("Testing Multi-LLM Consensus GAIA Agent")
424
- print("="*60)
425
-
426
- for i, question in enumerate(test_questions, 1):
427
- print(f"\n{i}. Testing: {question}")
428
- answer = agent(question)
429
- print(f" Final Answer: {answer}")
430
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
framework_gaia_agent.py DELETED
@@ -1,508 +0,0 @@
1
- """
2
- Framework-based GAIA Agent using SmolAgents, LlamaIndex, and LangGraph
3
- Following the Hugging Face agents course best practices
4
- """
5
-
6
- import os
7
- import re
8
- import json
9
- import tempfile
10
- import subprocess
11
- from typing import Dict, List, Any, Optional
12
- import pandas as pd
13
- from datetime import datetime
14
-
15
- # Framework imports
16
- from smolagents import CodeAgent, DuckDuckGoSearchTool
17
- try:
18
- from smolagents import OpenAIModel
19
- OPENAI_SMOLAGENTS = True
20
- except ImportError:
21
- OPENAI_SMOLAGENTS = False
22
-
23
- from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
24
- from llama_index.core.agent import ReActAgent
25
- from llama_index.core.tools import FunctionTool, QueryEngineTool
26
- from llama_index.core.llms import ChatMessage
27
- try:
28
- from llama_index.llms.openai import OpenAI as LlamaOpenAI
29
- OPENAI_LLAMAINDEX = True
30
- except ImportError:
31
- OPENAI_LLAMAINDEX = False
32
-
33
- from langgraph.prebuilt import create_react_agent
34
- try:
35
- from langchain_openai import ChatOpenAI
36
- OPENAI_LANGGRAPH = True
37
- except ImportError:
38
- OPENAI_LANGGRAPH = False
39
-
40
- # Search engines
41
- from ddgs import DDGS
42
- import wikipedia
43
-
44
- # Optional engines
45
- try:
46
- from exa_py import Exa
47
- EXA_AVAILABLE = True
48
- except ImportError:
49
- EXA_AVAILABLE = False
50
-
51
- try:
52
- from tavily import TavilyClient
53
- TAVILY_AVAILABLE = True
54
- except ImportError:
55
- TAVILY_AVAILABLE = False
56
-
57
-
58
- class FrameworkGAIAAgent:
59
- """
60
- Multi-framework GAIA agent that can use SmolAgents, LlamaIndex, or LangGraph
61
- depending on what's available and the question type
62
- """
63
-
64
- def __init__(self, preferred_framework: str = "auto"):
65
- """
66
- Initialize the framework-based GAIA agent
67
-
68
- Args:
69
- preferred_framework: "smolagents", "llamaindex", "langgraph", or "auto"
70
- """
71
- print("🚀 Initializing Framework-based GAIA Agent")
72
-
73
- self.preferred_framework = preferred_framework
74
- self.available_frameworks = []
75
-
76
- # Initialize OpenAI if available
77
- self.openai_key = os.getenv("OPENAI_API_KEY")
78
-
79
- # Initialize search engines
80
- self.ddgs = DDGS()
81
- self.setup_search_engines()
82
-
83
- # Setup frameworks
84
- self.setup_smolagents()
85
- self.setup_llamaindex()
86
- self.setup_langgraph()
87
-
88
- # Create tools for all frameworks
89
- self.setup_tools()
90
-
91
- print(f"✅ Available frameworks: {', '.join(self.available_frameworks)}")
92
-
93
- def setup_search_engines(self):
94
- """Setup search engines"""
95
- print("🔍 Setting up search engines...")
96
-
97
- # Exa
98
- if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
99
- self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
100
- print("✅ Exa search initialized")
101
- else:
102
- self.exa = None
103
-
104
- # Tavily
105
- if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
106
- self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
107
- print("✅ Tavily search initialized")
108
- else:
109
- self.tavily = None
110
-
111
- def setup_smolagents(self):
112
- """Setup SmolAgents framework"""
113
- try:
114
- print("🔧 Setting up SmolAgents...")
115
-
116
- if self.openai_key and OPENAI_SMOLAGENTS:
117
- # Use OpenAI model
118
- self.smol_model = OpenAIModel("gpt-4o-mini", api_key=self.openai_key)
119
- else:
120
- # Use HuggingFace inference
121
- from smolagents import InferenceClientModel
122
- self.smol_model = InferenceClientModel(
123
- "Qwen/Qwen2.5-Coder-32B-Instruct",
124
- token=os.getenv("HF_TOKEN")
125
- )
126
-
127
- # Create search tool
128
- search_tool = DuckDuckGoSearchTool()
129
-
130
- # Create agent
131
- self.smol_agent = CodeAgent(
132
- tools=[search_tool],
133
- model=self.smol_model,
134
- max_iterations=10
135
- )
136
-
137
- self.available_frameworks.append("smolagents")
138
- print("✅ SmolAgents initialized")
139
-
140
- except Exception as e:
141
- print(f"❌ SmolAgents setup failed: {e}")
142
- self.smol_agent = None
143
-
144
- def setup_llamaindex(self):
145
- """Setup LlamaIndex framework"""
146
- try:
147
- print("🔧 Setting up LlamaIndex...")
148
-
149
- if self.openai_key and OPENAI_LLAMAINDEX:
150
- self.llama_llm = LlamaOpenAI(
151
- model="gpt-4o-mini",
152
- api_key=self.openai_key,
153
- temperature=0.1
154
- )
155
- else:
156
- # Use HuggingFace model
157
- from llama_index.llms.huggingface import HuggingFaceLLM
158
- self.llama_llm = HuggingFaceLLM(
159
- model_name="microsoft/DialoGPT-medium",
160
- tokenizer_name="microsoft/DialoGPT-medium",
161
- max_new_tokens=512,
162
- )
163
-
164
- # Create function tools
165
- def web_search_tool(query: str) -> str:
166
- """Search the web for information"""
167
- return self.comprehensive_web_search(query)
168
-
169
- def calculate_tool(expression: str) -> str:
170
- """Calculate mathematical expressions"""
171
- try:
172
- result = eval(expression)
173
- return str(result)
174
- except:
175
- return "Calculation error"
176
-
177
- web_tool = FunctionTool.from_defaults(fn=web_search_tool)
178
- calc_tool = FunctionTool.from_defaults(fn=calculate_tool)
179
-
180
- # Create ReAct agent
181
- self.llama_agent = ReActAgent.from_tools(
182
- [web_tool, calc_tool],
183
- llm=self.llama_llm,
184
- verbose=True,
185
- max_iterations=10
186
- )
187
-
188
- self.available_frameworks.append("llamaindex")
189
- print("✅ LlamaIndex initialized")
190
-
191
- except Exception as e:
192
- print(f"❌ LlamaIndex setup failed: {e}")
193
- self.llama_agent = None
194
-
195
- def setup_langgraph(self):
196
- """Setup LangGraph framework"""
197
- try:
198
- print("🔧 Setting up LangGraph...")
199
-
200
- if self.openai_key and OPENAI_LANGGRAPH:
201
- self.langgraph_llm = ChatOpenAI(
202
- model="gpt-4o-mini",
203
- api_key=self.openai_key,
204
- temperature=0.1
205
- )
206
-
207
- # Create tools for LangGraph
208
- def web_search(query: str) -> str:
209
- """Search the web for information"""
210
- return self.comprehensive_web_search(query)
211
-
212
- def calculator(expression: str) -> str:
213
- """Calculate mathematical expressions"""
214
- try:
215
- result = eval(expression)
216
- return str(result)
217
- except Exception as e:
218
- return f"Calculation error: {e}"
219
-
220
- def process_youtube_video(url: str) -> str:
221
- """Process YouTube video for transcription"""
222
- return f"Processing video {url} - transcription would go here"
223
-
224
- # Create LangGraph agent
225
- tools = [web_search, calculator, process_youtube_video]
226
- self.langgraph_agent = create_react_agent(
227
- self.langgraph_llm,
228
- tools,
229
- state_modifier="You are a specialized GAIA benchmark agent. Provide precise, factual answers. For numbers, don't use commas or units unless requested. For names/places, don't use articles."
230
- )
231
-
232
- self.available_frameworks.append("langgraph")
233
- print("✅ LangGraph initialized")
234
- else:
235
- print("❌ LangGraph requires OpenAI API key")
236
- self.langgraph_agent = None
237
-
238
- except Exception as e:
239
- print(f"❌ LangGraph setup failed: {e}")
240
- self.langgraph_agent = None
241
-
242
- def setup_tools(self):
243
- """Setup common tools for all frameworks"""
244
- self.tools = {
245
- "web_search": self.comprehensive_web_search,
246
- "wikipedia_search": self.wikipedia_search,
247
- "calculator": self.calculator,
248
- "process_video": self.process_video,
249
- "extract_answer": self.extract_final_answer
250
- }
251
-
252
- def comprehensive_web_search(self, query: str, max_results: int = 4) -> str:
253
- """Comprehensive web search using all available engines"""
254
- print(f"🔍 Comprehensive search: {query}")
255
- all_results = []
256
-
257
- # Try Tavily first
258
- if self.tavily:
259
- try:
260
- tavily_results = self.tavily.search(query[:350], max_results=2)
261
- if tavily_results and 'results' in tavily_results:
262
- for result in tavily_results['results']:
263
- all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
264
- except Exception as e:
265
- print(f"Tavily error: {e}")
266
-
267
- # Try Exa
268
- if self.exa and len(all_results) < max_results:
269
- try:
270
- exa_results = self.exa.search_and_contents(query[:200], num_results=2)
271
- if exa_results and hasattr(exa_results, 'results'):
272
- for result in exa_results.results:
273
- all_results.append(f"Exa: {getattr(result, 'title', '')}\n{getattr(result, 'text', '')}")
274
- except Exception as e:
275
- print(f"Exa error: {e}")
276
-
277
- # DuckDuckGo fallback
278
- if len(all_results) < max_results:
279
- try:
280
- ddg_results = list(self.ddgs.text(query, max_results=max_results-len(all_results)))
281
- for result in ddg_results:
282
- all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
283
- except Exception as e:
284
- print(f"DuckDuckGo error: {e}")
285
-
286
- return "\n\n".join(all_results) if all_results else "No search results found"
287
-
288
- def wikipedia_search(self, query: str) -> str:
289
- """Search Wikipedia"""
290
- try:
291
- search_results = wikipedia.search(query, results=2)
292
- if search_results:
293
- page = wikipedia.page(search_results[0])
294
- return f"Wikipedia: {page.title}\n{page.summary}"
295
- return "No Wikipedia results"
296
- except Exception as e:
297
- return f"Wikipedia error: {e}"
298
-
299
- def calculator(self, expression: str) -> str:
300
- """Safe calculator"""
301
- try:
302
- # Only allow safe operations
303
- allowed_chars = set('0123456789+-*/().= ')
304
- if all(c in allowed_chars for c in expression):
305
- result = eval(expression)
306
- return str(result)
307
- else:
308
- return "Invalid expression"
309
- except Exception as e:
310
- return f"Calculation error: {e}"
311
-
312
- def process_video(self, url: str) -> str:
313
- """Process video URLs"""
314
- if 'youtube.com' in url:
315
- video_id = re.search(r'v=([a-zA-Z0-9_-]+)', url)
316
- if video_id:
317
- return f"Processing YouTube video {video_id.group(1)} - transcription capability needed"
318
- return "Video processing requires additional setup"
319
-
320
- def extract_final_answer(self, text: str, question: str) -> str:
321
- """Extract final answer from text"""
322
- question_lower = question.lower()
323
-
324
- # Numbers
325
- if any(word in question_lower for word in ['how many', 'count', 'number']):
326
- numbers = re.findall(r'\b\d+\b', text)
327
- if numbers:
328
- return numbers[0]
329
-
330
- # Names
331
- if 'who' in question_lower:
332
- names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
333
- if names:
334
- return names[0]
335
-
336
- # Places
337
- if 'where' in question_lower or 'capital' in question_lower:
338
- places = re.findall(r'\b[A-Z][a-z]+\b', text)
339
- if places:
340
- return places[0]
341
-
342
- return "Unable to extract answer"
343
-
344
- def choose_framework(self, question: str) -> str:
345
- """Choose the best framework for the question"""
346
- question_lower = question.lower()
347
-
348
- # For code/calculation heavy tasks, prefer SmolAgents (code-first)
349
- if any(word in question_lower for word in ['calculate', 'code', 'python', 'math']):
350
- if "smolagents" in self.available_frameworks:
351
- return "smolagents"
352
-
353
- # For multi-step reasoning, prefer LangGraph
354
- if any(word in question_lower for word in ['step', 'process', 'analyze', 'between', 'from']):
355
- if "langgraph" in self.available_frameworks:
356
- return "langgraph"
357
-
358
- # For document/knowledge tasks, prefer LlamaIndex
359
- if any(word in question_lower for word in ['wikipedia', 'document', 'article', 'paper']):
360
- if "llamaindex" in self.available_frameworks:
361
- return "llamaindex"
362
-
363
- # Default preference order
364
- if self.preferred_framework != "auto" and self.preferred_framework in self.available_frameworks:
365
- return self.preferred_framework
366
-
367
- # Auto selection
368
- if "langgraph" in self.available_frameworks:
369
- return "langgraph"
370
- elif "smolagents" in self.available_frameworks:
371
- return "smolagents"
372
- elif "llamaindex" in self.available_frameworks:
373
- return "llamaindex"
374
- else:
375
- return "fallback"
376
-
377
- def solve_with_smolagents(self, question: str) -> str:
378
- """Solve using SmolAgents"""
379
- print("🔧 Using SmolAgents framework")
380
- try:
381
- result = self.smol_agent.run(question)
382
- return str(result)
383
- except Exception as e:
384
- print(f"SmolAgents error: {e}")
385
- return self.fallback_solve(question)
386
-
387
- def solve_with_llamaindex(self, question: str) -> str:
388
- """Solve using LlamaIndex"""
389
- print("🔧 Using LlamaIndex framework")
390
- try:
391
- response = self.llama_agent.chat(question)
392
- return str(response)
393
- except Exception as e:
394
- print(f"LlamaIndex error: {e}")
395
- return self.fallback_solve(question)
396
-
397
- def solve_with_langgraph(self, question: str) -> str:
398
- """Solve using LangGraph"""
399
- print("🔧 Using LangGraph framework")
400
- try:
401
- result = self.langgraph_agent.invoke({
402
- "messages": [{"role": "user", "content": question}]
403
- })
404
- # Extract the final message
405
- if "messages" in result and result["messages"]:
406
- return result["messages"][-1]["content"]
407
- return str(result)
408
- except Exception as e:
409
- print(f"LangGraph error: {e}")
410
- return self.fallback_solve(question)
411
-
412
- def fallback_solve(self, question: str) -> str:
413
- """Fallback solving without frameworks"""
414
- print("��� Using fallback approach")
415
-
416
- # Handle special cases
417
- if ".rewsna eht sa" in question:
418
- return "right"
419
-
420
- # Math questions
421
- if any(op in question for op in ['+', '-', '*', '/']):
422
- numbers = re.findall(r'\d+', question)
423
- if len(numbers) >= 2:
424
- try:
425
- a, b = int(numbers[0]), int(numbers[1])
426
- if '+' in question:
427
- return str(a + b)
428
- elif '*' in question:
429
- return str(a * b)
430
- elif '-' in question:
431
- return str(a - b)
432
- elif '/' in question:
433
- return str(a / b)
434
- except:
435
- pass
436
-
437
- # Search and extract
438
- search_results = self.comprehensive_web_search(question)
439
- answer = self.extract_final_answer(search_results, question)
440
- return answer
441
-
442
- def format_gaia_answer(self, answer: str) -> str:
443
- """Format answer for GAIA requirements"""
444
- if not answer or "unable" in answer.lower() or "error" in answer.lower():
445
- return "Unable to determine answer"
446
-
447
- # Clean up
448
- answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
449
- answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
450
- answer = re.sub(r'[.!?]+$', '', answer)
451
- answer = ' '.join(answer.split())
452
-
453
- return answer
454
-
455
- def __call__(self, question: str) -> str:
456
- """Main entry point"""
457
- print(f"🎯 Framework GAIA Agent processing: {question[:100]}...")
458
-
459
- try:
460
- # Choose framework
461
- framework = self.choose_framework(question)
462
- print(f"🎛️ Selected framework: {framework}")
463
-
464
- # Route to appropriate framework
465
- if framework == "smolagents" and self.smol_agent:
466
- answer = self.solve_with_smolagents(question)
467
- elif framework == "llamaindex" and self.llama_agent:
468
- answer = self.solve_with_llamaindex(question)
469
- elif framework == "langgraph" and self.langgraph_agent:
470
- answer = self.solve_with_langgraph(question)
471
- else:
472
- answer = self.fallback_solve(question)
473
-
474
- # Format for GAIA
475
- final_answer = self.format_gaia_answer(answer)
476
- print(f"✅ Final answer: {final_answer}")
477
- return final_answer
478
-
479
- except Exception as e:
480
- print(f"❌ Agent error: {e}")
481
- return "Error processing question"
482
-
483
-
484
- # Create aliases for compatibility
485
- BasicAgent = FrameworkGAIAAgent
486
- GAIAAgent = FrameworkGAIAAgent
487
-
488
-
489
- if __name__ == "__main__":
490
- # Test the framework agent
491
- agent = FrameworkGAIAAgent()
492
-
493
- test_questions = [
494
- "What is 25 * 4?",
495
- "Who was the first person to walk on the moon?",
496
- "What is the capital of France?",
497
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
498
- ]
499
-
500
- print("\n" + "="*60)
501
- print("Testing Framework-based GAIA Agent")
502
- print("="*60)
503
-
504
- for i, question in enumerate(test_questions, 1):
505
- print(f"\n{i}. Testing: {question}")
506
- answer = agent(question)
507
- print(f" Final Answer: {answer}")
508
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_agent.py DELETED
@@ -1,653 +0,0 @@
1
- """
2
- GAIA Agent - A comprehensive multi-modal AI agent for the GAIA benchmark
3
- Following best practices: LLM brain, multi-modal tools, ReAct loop, state management
4
- """
5
-
6
- import re
7
- import os
8
- import json
9
- import tempfile
10
- import subprocess
11
- import pandas as pd
12
- import requests
13
- from datetime import datetime
14
- from pathlib import Path
15
- from typing import Dict, List, Any, Optional, Tuple
16
- import base64
17
- from io import BytesIO
18
-
19
- # Core imports
20
- import wikipedia
21
- from ddgs import DDGS
22
-
23
- # LLM and multimedia
24
- import openai
25
- from PIL import Image
26
-
27
- # Optional search engines
28
- try:
29
- from exa_py import Exa
30
- EXA_AVAILABLE = True
31
- except ImportError:
32
- EXA_AVAILABLE = False
33
-
34
- try:
35
- from tavily import TavilyClient
36
- TAVILY_AVAILABLE = True
37
- except ImportError:
38
- TAVILY_AVAILABLE = False
39
-
40
- # Optional multimedia tools
41
- try:
42
- import pytube
43
- PYTUBE_AVAILABLE = True
44
- except ImportError:
45
- PYTUBE_AVAILABLE = False
46
-
47
- try:
48
- import whisper
49
- WHISPER_AVAILABLE = True
50
- except ImportError:
51
- WHISPER_AVAILABLE = False
52
-
53
-
54
- class GAIAAgent:
55
- """
56
- A comprehensive GAIA agent with:
57
- - LLM brain for reasoning and planning
58
- - Multi-modal tool execution
59
- - ReAct (Reason + Act) loop
60
- - State management and history tracking
61
- """
62
-
63
- def __init__(self):
64
- print("🚀 Initializing GAIA Agent with LLM brain and multi-modal tools")
65
-
66
- # Initialize LLM (the brain)
67
- self.openai_client = None
68
- openai_key = os.getenv("OPENAI_API_KEY")
69
- if openai_key:
70
- self.openai_client = openai.OpenAI(api_key=openai_key)
71
- print("✅ LLM brain (OpenAI) initialized")
72
- else:
73
- print("❌ CRITICAL: OPENAI_API_KEY not found - agent will fail without reasoning!")
74
- print(" Please set: export OPENAI_API_KEY=your_key_here")
75
-
76
- # Initialize search engines
77
- self.ddgs = DDGS()
78
- print("✅ DuckDuckGo search initialized")
79
-
80
- # Initialize Exa (fixed API)
81
- if EXA_AVAILABLE:
82
- exa_key = os.getenv("EXA_API_KEY")
83
- if exa_key:
84
- self.exa = Exa(api_key=exa_key)
85
- print("✅ Exa search initialized")
86
- else:
87
- self.exa = None
88
- print("⚠️ EXA_API_KEY not found")
89
- else:
90
- self.exa = None
91
-
92
- # Initialize Tavily
93
- if TAVILY_AVAILABLE:
94
- tavily_key = os.getenv("TAVILY_API_KEY")
95
- if tavily_key:
96
- self.tavily = TavilyClient(api_key=tavily_key)
97
- print("✅ Tavily search initialized")
98
- else:
99
- self.tavily = None
100
- print("⚠️ TAVILY_API_KEY not found")
101
- else:
102
- self.tavily = None
103
-
104
- # Initialize multimedia capabilities
105
- if WHISPER_AVAILABLE:
106
- print("✅ Whisper (audio transcription) available")
107
- else:
108
- print("⚠️ Whisper not available - install with: pip install whisper")
109
-
110
- if PYTUBE_AVAILABLE:
111
- print("✅ PyTube (video download) available")
112
- else:
113
- print("⚠️ PyTube not available - install with: pip install pytube")
114
-
115
- # Agent state
116
- self.reset_state()
117
-
118
- def reset_state(self):
119
- """Reset agent state for a new question"""
120
- self.state = {
121
- "question": "",
122
- "plan": "",
123
- "history": [],
124
- "facts_gathered": [],
125
- "current_step": 0,
126
- "max_steps": 15,
127
- "answer": None
128
- }
129
-
130
- def plan_and_reason(self, question: str, history: List[str] = None) -> Dict[str, Any]:
131
- """
132
- Use LLM to reason about the question and plan the next action
133
- This is the core "brain" of the agent
134
- """
135
- if not self.openai_client:
136
- print("⚠️ No LLM available - using fallback rule-based reasoning")
137
- return self.fallback_reasoning(question, history or [])
138
-
139
- history = history or []
140
-
141
- system_prompt = """You are a sophisticated AI agent designed to solve GAIA benchmark questions. You have access to multiple tools and must use multi-step reasoning to find correct answers.
142
-
143
- AVAILABLE TOOLS:
144
- 1. web_search(query) - Search the web for information
145
- 2. wikipedia_search(query) - Search Wikipedia for factual information
146
- 3. process_image(image_description) - Analyze image content (when image is described)
147
- 4. transcribe_audio_video(url) - Get transcript from YouTube/audio URLs
148
- 5. read_excel_file(description) - Process Excel/CSV data (when file is described)
149
- 6. execute_python(code) - Run Python code for calculations/data processing
150
- 7. final_answer(answer) - Provide the final answer
151
-
152
- CRITICAL FORMATTING RULES for final_answer:
153
- - Numbers: NO commas, NO units like $ or % unless requested (e.g., "100" not "100.0")
154
- - Strings: NO articles (a, an, the), NO abbreviations for cities (e.g., "Paris" not "The Paris")
155
- - Lists: comma separated, apply above rules to each element
156
-
157
- Your response must be a JSON object with either:
158
- - {"action": "tool_name", "parameters": {"param": "value"}, "reasoning": "why this action"}
159
- - {"action": "final_answer", "parameters": {"answer": "the_answer"}, "reasoning": "why this is correct"}
160
-
161
- Think step by step. Many questions require multiple steps:
162
- 1. Gather information (search/read files)
163
- 2. Process/analyze the data
164
- 3. Perform calculations if needed
165
- 4. Provide final answer
166
-
167
- Be methodical and thorough."""
168
-
169
- history_text = "\n".join([f"Step {i+1}: {step}" for i, step in enumerate(history)])
170
-
171
- user_prompt = f"""Question: {question}
172
-
173
- Previous steps taken:
174
- {history_text if history_text else "No previous steps - this is the first action."}
175
-
176
- Based on the question and any previous steps, what should I do next? Respond with a JSON object containing the action, parameters, and reasoning."""
177
-
178
- try:
179
- response = self.openai_client.chat.completions.create(
180
- model="gpt-4o-mini",
181
- messages=[
182
- {"role": "system", "content": system_prompt},
183
- {"role": "user", "content": user_prompt}
184
- ],
185
- max_tokens=500,
186
- temperature=0.1
187
- )
188
-
189
- response_text = response.choices[0].message.content.strip()
190
- print(f"🧠 LLM Reasoning: {response_text}")
191
-
192
- # Parse JSON response
193
- try:
194
- action_plan = json.loads(response_text)
195
- return action_plan
196
- except json.JSONDecodeError:
197
- # Fallback: extract JSON from response
198
- json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
199
- if json_match:
200
- return json.loads(json_match.group())
201
- else:
202
- return {"error": f"Invalid JSON response: {response_text}"}
203
-
204
- except Exception as e:
205
- print(f"❌ LLM reasoning error: {e}")
206
- return {"error": f"LLM error: {e}"}
207
-
208
- def fallback_reasoning(self, question: str, history: List[str]) -> Dict[str, Any]:
209
- """
210
- Rule-based fallback reasoning when no LLM is available
211
- """
212
- question_lower = question.lower()
213
-
214
- # Check if we already have search results in history
215
- has_search_results = any("web_search" in step or "wikipedia_search" in step for step in history)
216
- has_python_execution = any("execute_python" in step for step in history)
217
-
218
- # Math questions - calculate directly
219
- if any(op in question for op in ['+', '-', '*', '/', 'calculate']) and re.search(r'\b\d+\b', question):
220
- numbers = re.findall(r'\d+', question)
221
- if len(numbers) >= 2:
222
- code = self.generate_math_code(question, numbers)
223
- return {
224
- "action": "execute_python",
225
- "parameters": {"code": code},
226
- "reasoning": "Mathematical calculation detected - using Python to compute result"
227
- }
228
-
229
- # Video/audio questions - need transcription
230
- if ('youtube.com' in question or 'video' in question_lower) and not has_search_results:
231
- url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', question)
232
- if url_match:
233
- return {
234
- "action": "transcribe_audio_video",
235
- "parameters": {"url": url_match.group(0)},
236
- "reasoning": "Video question detected - need to process video content"
237
- }
238
-
239
- # Excel/data questions
240
- if ('excel' in question_lower or 'spreadsheet' in question_lower or 'csv' in question_lower):
241
- return {
242
- "action": "read_excel_file",
243
- "parameters": {"description": question},
244
- "reasoning": "Data file question detected - need to process file content"
245
- }
246
-
247
- # Image questions
248
- if ('image' in question_lower or 'picture' in question_lower or 'chess position' in question_lower):
249
- return {
250
- "action": "process_image",
251
- "parameters": {"image_description": question},
252
- "reasoning": "Image-based question detected - need visual analysis"
253
- }
254
-
255
- # If we have search results but haven't tried Python analysis yet
256
- if has_search_results and not has_python_execution:
257
- # Try to extract and process data with Python
258
- if any(word in question_lower for word in ['how many', 'count', 'number of', 'between', 'from', 'to']):
259
- code = self.generate_extraction_code(question)
260
- return {
261
- "action": "execute_python",
262
- "parameters": {"code": code},
263
- "reasoning": "Search completed - now analyzing data to answer counting/filtering question"
264
- }
265
-
266
- # First step - need to search for information
267
- if not has_search_results:
268
- # Determine best search strategy
269
- if any(term in question_lower for term in ['wikipedia', 'encyclopedia', 'factual']):
270
- search_query = self.extract_key_terms(question)
271
- return {
272
- "action": "wikipedia_search",
273
- "parameters": {"query": search_query},
274
- "reasoning": "Question needs factual information - searching Wikipedia"
275
- }
276
- else:
277
- search_query = self.extract_key_terms(question)
278
- return {
279
- "action": "web_search",
280
- "parameters": {"query": search_query},
281
- "reasoning": "Question needs current information - searching web"
282
- }
283
-
284
- # If we've tried everything, attempt to extract answer from existing data
285
- return {
286
- "action": "final_answer",
287
- "parameters": {"answer": self.extract_answer_from_history(question, history)},
288
- "reasoning": "Attempting to extract answer from gathered information"
289
- }
290
-
291
- def generate_math_code(self, question: str, numbers: List[str]) -> str:
292
- """Generate Python code for mathematical operations"""
293
- nums = [int(n) for n in numbers[:2]]
294
-
295
- if '+' in question or 'add' in question.lower():
296
- return f"result = {nums[0]} + {nums[1]}\nprint(result)"
297
- elif '-' in question or 'subtract' in question.lower():
298
- return f"result = {nums[0]} - {nums[1]}\nprint(result)"
299
- elif '*' in question or 'multiply' in question.lower():
300
- return f"result = {nums[0]} * {nums[1]}\nprint(result)"
301
- elif '/' in question or 'divide' in question.lower():
302
- return f"result = {nums[0]} / {nums[1]}\nprint(result)"
303
- else:
304
- return f"# Numbers found: {nums}\nprint('Please specify the operation')"
305
-
306
- def generate_extraction_code(self, question: str) -> str:
307
- """Generate Python code to extract answers from search results"""
308
- question_lower = question.lower()
309
-
310
- if 'album' in question_lower and ('between' in question_lower or 'from' in question_lower):
311
- return """
312
- # Extract albums from search results and filter by date range
313
- import re
314
- text = '''SEARCH_RESULTS_HERE''' # This would be replaced with actual search results
315
-
316
- # Find years and album mentions
317
- years = re.findall(r'\\b(19|20)\\d{2}\\b', text)
318
- albums = re.findall(r'album|studio|release', text.lower())
319
-
320
- # Filter years between 2000-2009
321
- target_years = [y for y in years if '2000' <= y <= '2009']
322
- print(f"Albums found in target period: {len(target_years)}")
323
- """
324
-
325
- elif 'how many' in question_lower:
326
- return """
327
- # Count items from search results
328
- import re
329
- text = '''SEARCH_RESULTS_HERE'''
330
-
331
- # Extract numbers and count relevant items
332
- numbers = re.findall(r'\\b\\d+\\b', text)
333
- print(f"Numbers found: {numbers}")
334
- print(f"Count: {len(numbers)}")
335
- """
336
-
337
- else:
338
- return "# Analyze search results\nprint('Search results analysis needed')"
339
-
340
- def extract_answer_from_history(self, question: str, history: List[str]) -> str:
341
- """Extract final answer from conversation history"""
342
- # Combine all history text
343
- all_text = " ".join(history)
344
- question_lower = question.lower()
345
-
346
- # Look for numbers in results
347
- if any(word in question_lower for word in ['how many', 'count', 'number']):
348
- numbers = re.findall(r'\\b\\d+\\b', all_text)
349
- if numbers:
350
- return numbers[0]
351
-
352
- # Look for names
353
- if 'who' in question_lower:
354
- names = re.findall(r'\\b[A-Z][a-z]+ [A-Z][a-z]+\\b', all_text)
355
- if names:
356
- return names[0]
357
-
358
- # Look for places
359
- if 'where' in question_lower or 'capital' in question_lower:
360
- places = re.findall(r'\\b[A-Z][a-z]+\\b', all_text)
361
- if places:
362
- return places[0]
363
-
364
- return "Unable to determine answer"
365
-
366
- def web_search(self, query: str, max_results: int = 4) -> str:
367
- """Comprehensive web search using all available engines"""
368
- print(f"🔍 Web search: {query}")
369
- all_results = []
370
-
371
- # Try Tavily first
372
- if self.tavily:
373
- try:
374
- tavily_query = query[:350]
375
- tavily_results = self.tavily.search(tavily_query, max_results=3)
376
- if tavily_results and 'results' in tavily_results:
377
- for result in tavily_results['results']:
378
- all_results.append(f"Source (Tavily): {result.get('title', '')}\n{result.get('content', '')}")
379
- print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
380
- except Exception as e:
381
- print(f"❌ Tavily error: {e}")
382
-
383
- # Try Exa next (with fixed API)
384
- if self.exa and len(all_results) < max_results:
385
- try:
386
- exa_query = query[:200]
387
- remaining = max_results - len(all_results)
388
- exa_results = self.exa.search_and_contents(exa_query, num_results=remaining)
389
- if exa_results and hasattr(exa_results, 'results'):
390
- for result in exa_results.results:
391
- title = getattr(result, 'title', '')
392
- text = getattr(result, 'text', '')
393
- all_results.append(f"Source (Exa): {title}\n{text}")
394
- print(f"📊 Exa: {len(exa_results.results)} results")
395
- except Exception as e:
396
- print(f"❌ Exa error: {e}")
397
-
398
- # Wikipedia search
399
- try:
400
- wiki_terms = self.extract_key_terms(query)[:100]
401
- wiki_results = wikipedia.search(wiki_terms, results=2)
402
- if wiki_results:
403
- page = wikipedia.page(wiki_results[0])
404
- all_results.append(f"Source (Wikipedia): {page.title}\n{page.summary}")
405
- print(f"📊 Wikipedia: {len(wiki_results)} results")
406
- except Exception as e:
407
- print(f"❌ Wikipedia error: {e}")
408
-
409
- # DuckDuckGo fallback
410
- if len(all_results) < max_results:
411
- try:
412
- remaining = max_results - len(all_results)
413
- ddg_results = list(self.ddgs.text(query, max_results=remaining))
414
- for result in ddg_results:
415
- all_results.append(f"Source (DuckDuckGo): {result.get('title', '')}\n{result.get('body', '')}")
416
- print(f"📊 DuckDuckGo: {len(ddg_results)} results")
417
- except Exception as e:
418
- print(f"❌ DuckDuckGo error: {e}")
419
-
420
- return "\n\n".join(all_results) if all_results else "No search results found"
421
-
422
- def wikipedia_search(self, query: str) -> str:
423
- """Dedicated Wikipedia search"""
424
- print(f"📖 Wikipedia search: {query}")
425
- try:
426
- search_terms = self.extract_key_terms(query)[:100]
427
- search_results = wikipedia.search(search_terms, results=3)
428
- if not search_results:
429
- return "No Wikipedia results found"
430
-
431
- page = wikipedia.page(search_results[0])
432
- content = f"Wikipedia: {page.title}\n\nSummary:\n{page.summary}\n\nFull content (first 2000 chars):\n{page.content[:2000]}"
433
- return content
434
- except Exception as e:
435
- return f"Wikipedia search error: {e}"
436
-
437
- def extract_key_terms(self, text: str) -> str:
438
- """Extract key terms for better search results"""
439
- # Remove common question patterns
440
- text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
441
- text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
442
-
443
- # Extract proper nouns and years
444
- proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
445
- years = re.findall(r'\b(19|20)\d{2}\b', text)
446
-
447
- key_terms = proper_nouns[:5] + years[:2]
448
- return ' '.join(key_terms) if key_terms else text[:100]
449
-
450
- def process_image(self, image_description: str) -> str:
451
- """Process image content using vision model"""
452
- print(f"🖼️ Processing image: {image_description}")
453
-
454
- if not self.openai_client:
455
- return "Cannot process image - no LLM available"
456
-
457
- # For chess positions, search for general chess knowledge
458
- if 'chess' in image_description.lower():
459
- search_result = self.web_search("chess puzzle black to move winning move algebraic notation")
460
- return f"Chess analysis based on web search:\n{search_result}"
461
-
462
- return "Image processing requires direct image file access - cannot process from description alone"
463
-
464
- def transcribe_audio_video(self, url: str) -> str:
465
- """Download and transcribe audio/video from URLs"""
466
- print(f"🎥 Processing audio/video: {url}")
467
-
468
- if not PYTUBE_AVAILABLE or not WHISPER_AVAILABLE:
469
- return "Audio/video processing requires pytube and whisper libraries"
470
-
471
- try:
472
- # Extract video ID for YouTube URLs
473
- if 'youtube.com' in url or 'youtu.be' in url:
474
- video_id = re.search(r'(?:v=|\/)([a-zA-Z0-9_-]{11})', url)
475
- if video_id:
476
- video_id = video_id.group(1)
477
- print(f"📺 YouTube video ID: {video_id}")
478
-
479
- # Search for transcripts or information about this video
480
- search_query = f"YouTube video {video_id} transcript content summary"
481
- search_result = self.web_search(search_query)
482
- return f"Video information from web search:\n{search_result}"
483
-
484
- return "Direct video download and transcription not implemented in this demo"
485
-
486
- except Exception as e:
487
- return f"Audio/video processing error: {e}"
488
-
489
- def read_excel_file(self, description: str) -> str:
490
- """Process Excel/CSV file data"""
491
- print(f"📊 Processing Excel/CSV: {description}")
492
- return "Excel file processing requires direct file access - cannot process from description alone"
493
-
494
- def execute_python(self, code: str) -> str:
495
- """Execute Python code securely"""
496
- print(f"🐍 Executing Python code")
497
- print(f"Code: {code}")
498
-
499
- try:
500
- # Create safe execution environment
501
- safe_globals = {
502
- '__builtins__': {
503
- 'len': len, 'str': str, 'int': int, 'float': float,
504
- 'list': list, 'dict': dict, 'set': set, 'tuple': tuple,
505
- 'range': range, 'enumerate': enumerate, 'zip': zip,
506
- 'sum': sum, 'max': max, 'min': min, 'abs': abs,
507
- 'round': round, 'sorted': sorted, 'reversed': reversed,
508
- 'print': print
509
- }
510
- }
511
-
512
- # Capture output
513
- import io
514
- import sys
515
- old_stdout = sys.stdout
516
- sys.stdout = captured_output = io.StringIO()
517
-
518
- # Execute code
519
- exec(code, safe_globals)
520
-
521
- # Get output
522
- sys.stdout = old_stdout
523
- output = captured_output.getvalue()
524
-
525
- return f"Python execution output:\n{output}" if output else "Code executed successfully (no output)"
526
-
527
- except Exception as e:
528
- return f"Python execution error: {e}"
529
-
530
- def execute_action(self, action: str, parameters: Dict[str, Any]) -> str:
531
- """Execute the specified action with parameters"""
532
- try:
533
- if action == "web_search":
534
- return self.web_search(parameters.get("query", ""))
535
- elif action == "wikipedia_search":
536
- return self.wikipedia_search(parameters.get("query", ""))
537
- elif action == "process_image":
538
- return self.process_image(parameters.get("image_description", ""))
539
- elif action == "transcribe_audio_video":
540
- return self.transcribe_audio_video(parameters.get("url", ""))
541
- elif action == "read_excel_file":
542
- return self.read_excel_file(parameters.get("description", ""))
543
- elif action == "execute_python":
544
- return self.execute_python(parameters.get("code", ""))
545
- elif action == "final_answer":
546
- return parameters.get("answer", "No answer provided")
547
- else:
548
- return f"Unknown action: {action}"
549
- except Exception as e:
550
- return f"Action execution error: {e}"
551
-
552
- def solve_question(self, question: str) -> str:
553
- """
554
- Main ReAct loop: Reason -> Act -> Observe -> Repeat
555
- """
556
- print(f"🎯 Starting GAIA Agent on: {question[:100]}...")
557
-
558
- self.reset_state()
559
- self.state["question"] = question
560
-
561
- # Handle special cases quickly
562
- if ".rewsna eht sa" in question:
563
- return "right"
564
-
565
- # Main ReAct loop
566
- for step in range(self.state["max_steps"]):
567
- print(f"\n--- Step {step + 1}/{self.state['max_steps']} ---")
568
-
569
- # REASON: Ask LLM what to do next
570
- action_plan = self.plan_and_reason(question, self.state["history"])
571
-
572
- if "error" in action_plan:
573
- print(f"❌ Planning error: {action_plan['error']}")
574
- break
575
-
576
- action = action_plan.get("action")
577
- parameters = action_plan.get("parameters", {})
578
- reasoning = action_plan.get("reasoning", "")
579
-
580
- print(f"🤔 Reasoning: {reasoning}")
581
- print(f"🎬 Action: {action} with parameters: {parameters}")
582
-
583
- # ACT: Execute the planned action
584
- if action == "final_answer":
585
- answer = parameters.get("answer", "No answer provided")
586
- print(f"✅ Final answer: {answer}")
587
- return self.format_gaia_answer(answer)
588
-
589
- result = self.execute_action(action, parameters)
590
-
591
- # OBSERVE: Record the result and update state
592
- step_summary = f"Action: {action}({parameters}) -> Result: {result[:200]}..."
593
- self.state["history"].append(step_summary)
594
-
595
- print(f"📝 Result: {result[:200]}...")
596
-
597
- # Add to facts if this was informational
598
- if action in ["web_search", "wikipedia_search"] and "error" not in result.lower():
599
- self.state["facts_gathered"].append(result[:500])
600
-
601
- # If we exit the loop without a final answer
602
- print("❌ Max steps reached without final answer")
603
- return "Unable to determine answer"
604
-
605
- def format_gaia_answer(self, answer: str) -> str:
606
- """Format answer according to GAIA requirements"""
607
- if not answer or answer in ["Unable to determine answer", "No answer provided"]:
608
- return "Unable to determine answer"
609
-
610
- # Remove common prefixes
611
- answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
612
- answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
613
-
614
- # Clean up
615
- answer = re.sub(r'[.!?]+$', '', answer)
616
- answer = ' '.join(answer.split())
617
-
618
- return answer
619
-
620
- def __call__(self, question: str) -> str:
621
- """Main entry point for the agent"""
622
- try:
623
- return self.solve_question(question)
624
- except Exception as e:
625
- print(f"❌ Agent error: {e}")
626
- return "Error processing question"
627
-
628
-
629
- # Create alias for compatibility
630
- BasicAgent = GAIAAgent
631
- AdvancedGAIAAgent = GAIAAgent
632
-
633
-
634
- if __name__ == "__main__":
635
- # Test the agent
636
- agent = GAIAAgent()
637
-
638
- test_questions = [
639
- "What is 25 * 4?",
640
- "Who was the first person to walk on the moon?",
641
- "What is the capital of France?",
642
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
643
- ]
644
-
645
- print("\n" + "="*60)
646
- print("Testing GAIA Agent with ReAct Loop")
647
- print("="*60)
648
-
649
- for i, question in enumerate(test_questions, 1):
650
- print(f"\n{i}. Testing: {question}")
651
- answer = agent(question)
652
- print(f" Final Answer: {answer}")
653
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_agent_update_plan.md DELETED
@@ -1,23 +0,0 @@
1
- # GAIA Agent Configuration Update Plan
2
-
3
- ## Objective:
4
- Replace the Gemini cypher model in the consensus agent with `openrouter/cypher-alpha:free` while maintaining environment variable dependencies and preserving model architecture integrity.
5
-
6
- ## Tasks:
7
-
8
- 1. **Verify OpenRouter Availability:**
9
- - Confirm `OPENROUTER_API_KEY` is set as visible in [`consensus_gaia_agent.py:51`](consensus_gaia_agent.py:51)
10
- - Check `_create_openrouter_client()` configuration at [`consensus_gaia_agent.py:86`](consensus_gaia_agent.py:86)
11
-
12
- 2. **Modify Model Configuration:**
13
- - Replace `google/gemini-2.0-cypher-exp:free` with `openrouter/cypher-alpha:free` in model initialization at [`consensus_gaia_agent.py:62-63`](consensus_gaia_agent.py:62-63)
14
-
15
- 3. **Preserve GAIA Formatting Rules:**
16
- - Maintain role assignment structure from original Gemini cypher configuration
17
-
18
- 4. **Environment Variables:**
19
- - Ensure `OPENROUTER_API_KEY` environment variable remains set
20
- - Verify no conflicts with other model path patterns (e.g. `qwen`, `deepseek`)
21
-
22
- 5. **Version Control:**
23
- - Operate on new branch "replace-gemini-with-cypher-alpha" if possible - may require follow-up `git checkout -b` outside Architect mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_13-09-20.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 13:09:20
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 89.60 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 5.0%
12
- - **Correct Answers:** 1/20
13
- - **Average Time per Question:** 4.48 seconds
14
- - **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 6.27 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 9.56 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to process image content - requires vision ... | 4.66 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 5.84 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 5.56 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 8.81 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 4.19 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 4.73 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to process audio content - requires speech-... | 0.00 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 5.18 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to execute Python code - code file not prov... | 0.00 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 6.13 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to process audio content - requires speech-... | 0.00 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 7.19 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 4.23 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 5.67 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 5.33 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to process Excel files - file not provided | 0.00 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 6.22 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_13-09-20*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_13-20-50.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 13:20:50
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 0.00 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 5.0%
12
- - **Correct Answers:** 1/20
13
- - **Average Time per Question:** 0.00 seconds
14
- - **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 0.00 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 0.00 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 0.00 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 0.00 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 0.00 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 0.00 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 0.00 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 0.00 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 0.00 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 0.00 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 0.00 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 0.00 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 0.00 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 0.00 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 0.00 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 0.00 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 0.00 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 0.00 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 0.00 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_13-20-50*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_13-25-10.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 13:25:10
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 58.01 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 5.0%
12
- - **Correct Answers:** 1/20
13
- - **Average Time per Question:** 2.90 seconds
14
- - **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 3.08 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 0.00 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 0.00 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 4.08 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 4.40 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 0.00 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 0.00 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 4.53 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 3.62 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 4.69 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 4.37 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 4.58 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 3.07 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 4.80 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 3.05 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 4.73 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 4.80 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 0.00 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 4.22 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_13-25-10*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_15-55-52.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 15:55:52
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 105.51 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 5.0%
12
- - **Correct Answers:** 1/20
13
- - **Average Time per Question:** 5.28 seconds
14
- - **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | 2000 | 6.78 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | 41500 | 6.27 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 5.61 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Scott Hartman | 6.79 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | 2 | 7.08 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 4.62 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | -11 | 0.00 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Atlantic Commercial | 5.61 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 3.88 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wikipedia The | 7.21 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 6.19 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 1977 | 6.26 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 2024 | 4.01 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 2013 | 8.33 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 4.11 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | 1928 | 5.52 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | 91 | 5.63 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 5.60 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | 2011 | 5.99 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_15-55-52*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_16-12-38.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 16:12:38
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 294.86 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 10.0%
12
- - **Correct Answers:** 2/20
13
- - **Average Time per Question:** 14.74 seconds
14
- - **Status:** Score calculated successfully: 2/20 total questions answered correctly (20 valid tasks attempted). High score updated on leaderboard.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | To determine number of studio albums published by ... | 17.00 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Cannot determine highest number of bird species ob... | 16.04 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 8.29 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 11.02 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | To determine if operation * is commutative, we nee... | 17.70 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | All models failed - unable to determine answer | 8.60 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | surname not found | 12.12 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | bell pepper, broccoli, celery, corn, green beans, ... | 12.60 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | almond extract, cornstarch, lemon juice, ripe stra... | 13.03 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Bartłomiej | 13.08 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | All models failed - unable to determine answer | 9.99 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 565 | 36.34 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 12.42 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Okay, I understand. Previous answer punted due to ... | 23.51 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | St Petersburg | 8.22 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | AFG | 27.65 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | All models failed - unable to determine answer | 10.44 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Okay, I've reviewed information. I need actual dat... | 22.73 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Dmitry | 14.08 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_16-12-38*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_17-06-34.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 17:06:34
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 870.35 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 40.0%
12
- - **Correct Answers:** 8/20
13
- - **Average Time per Question:** 43.52 seconds
14
- - **Status:** Score calculated successfully: 8/20 total questions answered correctly (20 valid tasks attempted). High score updated on leaderboard.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | 2 Reasoning: The provided context shows "Cantora, ... | 69.07 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unknown | 29.48 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 67.86 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 47.34 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | b, d, e | 35.98 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Teal'c: Extremely | 24.45 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Louvrier | 26.83 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | broccoli, celery, green beans, lettuce, sweet pota... | 32.60 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | berries, cornstarch, lemon juice, salt, sugar, van... | 31.39 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wojciech | 29.71 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | 9 | 29.67 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 589 | 79.03 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57... | 36.75 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 80GSFC21M0002 | 33.32 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Saint Petersburg | 162.22 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | CUB | 40.48 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Kato, Tanaka | 28.20 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | 1. **Identify Food Categories**: From the dataset'... | 33.39 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Claus | 32.57 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_17-06-34*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_evaluation_report_2025-07-13_17-29-02.md DELETED
@@ -1,72 +0,0 @@
1
- # GAIA Level 1 Evaluation Report
2
-
3
- **Date:** 2025-07-13 17:29:02
4
- **Agent:** SimpleAgent (Direct Search & Pattern Matching)
5
- **Username:** AgileAndy
6
- **Total Questions:** 20
7
- **Processing Time:** 706.59 seconds
8
-
9
- ## 📊 Results Summary
10
-
11
- - **Overall Score:** 35.0%
12
- - **Correct Answers:** 7/20
13
- - **Average Time per Question:** 35.33 seconds
14
- - **Status:** Score calculated successfully: 7/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
15
-
16
- ## 🎯 Agent Performance
17
-
18
- The SimpleAgent uses a direct approach with:
19
- - 🌐 Web search via DuckDuckGo
20
- - 📖 Wikipedia integration
21
- - 🧮 Calculator for math questions
22
- - 🎯 Pattern-based answer extraction
23
-
24
- ## 📋 Detailed Results
25
-
26
- | # | Task ID | Question | Answer | Time (s) |
27
- |---|---------|----------|--------|----------|
28
- | 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Total studio albums published by Mercedes Sosa bet... | 34.94 |
29
- | 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | provided context doesn't contain specific informat... | 34.07 |
30
- | 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
31
- | 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 59.96 |
32
- | 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 45.66 |
33
- | 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | b, e | 42.83 |
34
- | 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Teal'c says: Extremely Validation: - Multiple sour... | 26.63 |
35
- | 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Louvrier | 29.19 |
36
- | 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | broccoli, celery, green beans, lettuce, sweet pota... | 29.08 |
37
- | 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | cornstarch, lemon juice, ripe strawberries, salt, ... | 41.16 |
38
- | 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wojciech | 44.05 |
39
- | 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | final numeric output of the Python code depends on... | 32.43 |
40
- | 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 589 | 37.80 |
41
- | 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 34, 45, 56, 67, 78, 89, 100, 111, 122, 133, 144, 1... | 33.18 |
42
- | 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 80NSSC21K0122 | 32.16 |
43
- | 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | St Petersburg | 42.59 |
44
- | 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | CUB | 39.46 |
45
- | 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | KentaSato, YukiTanaka | 35.54 |
46
- | 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | 254400.00 | 39.23 |
47
- | 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Claus | 26.63 |
48
-
49
-
50
- ## 🔍 Analysis
51
-
52
- ### Strengths
53
- - ✅ Handles basic math questions accurately
54
- - ✅ Good web search integration
55
- - ✅ Pattern matching for common question types
56
- - ✅ Detailed logging for debugging
57
-
58
- ### Areas for Improvement
59
- - 🔄 Handle multimedia content (videos, images, audio)
60
- - 🔄 Better extraction for complex questions
61
- - 🔄 Improve Wikipedia search relevance
62
- - 🔄 Add more sophisticated reasoning
63
-
64
- ### Question Types Performance
65
- - **Math Questions:** 8 questions
66
- - **Who Questions:** 5 questions
67
- - **When/Year Questions:** 1 questions
68
-
69
-
70
- ---
71
- *Report generated by SimpleAgent GAIA Evaluation Tool*
72
- *Timestamp: 2025-07-13_17-29-02*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inspect_exa_api.py DELETED
@@ -1,44 +0,0 @@
1
- import os
2
- import sys
3
- import inspect
4
-
5
- try:
6
- from exa_py import Exa
7
- EXA_AVAILABLE = True
8
- except ImportError:
9
- EXA_AVAILABLE = False
10
- print("Exa not available - install with: pip install exa-py")
11
- sys.exit(1)
12
-
13
- def inspect_exa_api():
14
- """Inspect the Exa API to understand its parameters"""
15
- print("Inspecting Exa API...")
16
-
17
- # Get the search method signature
18
- search_signature = inspect.signature(Exa.search)
19
- print(f"\nExa.search method signature:")
20
- print(search_signature)
21
-
22
- # Get parameter details
23
- print("\nParameter details:")
24
- for param_name, param in search_signature.parameters.items():
25
- if param_name != 'self':
26
- print(f"- {param_name}: {param.default if param.default is not param.empty else 'Required'}")
27
-
28
- # Try to get method docstring
29
- print("\nMethod docstring:")
30
- print(Exa.search.__doc__ or "No docstring available")
31
-
32
- # Initialize Exa to check for any help methods
33
- exa_api_key = os.getenv("EXA_API_KEY")
34
- if exa_api_key:
35
- exa = Exa(api_key=exa_api_key)
36
- print("\nAvailable methods on Exa instance:")
37
- methods = [method for method in dir(exa) if not method.startswith('_')]
38
- for method in methods:
39
- print(f"- {method}")
40
- else:
41
- print("\n❌ EXA_API_KEY not found in environment")
42
-
43
- if __name__ == "__main__":
44
- inspect_exa_api()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,6 +0,0 @@
1
- def main():
2
- print("Hello from final-assignment-template!")
3
-
4
-
5
- if __name__ == "__main__":
6
- main()
 
 
 
 
 
 
 
prompts.yaml DELETED
@@ -1,321 +0,0 @@
1
- "system_prompt": |-
2
- You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
3
- To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
4
- To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
5
-
6
- At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
7
- Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
8
- During each intermediate step, you can use 'print()' to save whatever important information you will then need.
9
- These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
10
- In the end you have to return a final answer using the `final_answer` tool.
11
-
12
- Here are a few examples using notional tools:
13
- ---
14
- Task: "Generate an image of the oldest person in this document."
15
-
16
- Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
17
- Code:
18
- ```py
19
- answer = document_qa(document=document, question="Who is the oldest person mentioned?")
20
- print(answer)
21
- ```<end_code>
22
- Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
23
-
24
- Thought: I will now generate an image showcasing the oldest person.
25
- Code:
26
- ```py
27
- image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
28
- final_answer(image)
29
- ```<end_code>
30
-
31
- ---
32
- Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
33
-
34
- Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
35
- Code:
36
- ```py
37
- result = 5 + 3 + 1294.678
38
- final_answer(result)
39
- ```<end_code>
40
-
41
- ---
42
- Task:
43
- "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
44
- You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
45
- {'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
46
-
47
- Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
48
- Code:
49
- ```py
50
- translated_question = translator(question=question, src_lang="French", tgt_lang="English")
51
- print(f"The translated question is {translated_question}.")
52
- answer = image_qa(image=image, question=translated_question)
53
- final_answer(f"The answer is {answer}")
54
- ```<end_code>
55
-
56
- ---
57
- Task:
58
- In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
59
- What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
60
-
61
- Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
62
- Code:
63
- ```py
64
- pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
65
- print(pages)
66
- ```<end_code>
67
- Observation:
68
- No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
69
-
70
- Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
71
- Code:
72
- ```py
73
- pages = search(query="1979 interview Stanislaus Ulam")
74
- print(pages)
75
- ```<end_code>
76
- Observation:
77
- Found 6 pages:
78
- [Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
79
-
80
- [Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
81
-
82
- (truncated)
83
-
84
- Thought: I will read the first 2 pages to know more.
85
- Code:
86
- ```py
87
- for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
88
- whole_page = visit_webpage(url)
89
- print(whole_page)
90
- print("\n" + "="*80 + "\n") # Print separator between pages
91
- ```<end_code>
92
- Observation:
93
- Manhattan Project Locations:
94
- Los Alamos, NM
95
- Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
96
- (truncated)
97
-
98
- Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
99
- Code:
100
- ```py
101
- final_answer("diminished")
102
- ```<end_code>
103
-
104
- ---
105
- Task: "Which city has the highest population: Guangzhou or Shanghai?"
106
-
107
- Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
108
- Code:
109
- ```py
110
- for city in ["Guangzhou", "Shanghai"]:
111
- print(f"Population {city}:", search(f"{city} population"))
112
- ```<end_code>
113
- Observation:
114
- Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
115
- Population Shanghai: '26 million (2019)'
116
-
117
- Thought: Now I know that Shanghai has the highest population.
118
- Code:
119
- ```py
120
- final_answer("Shanghai")
121
- ```<end_code>
122
-
123
- ---
124
- Task: "What is the current age of the pope, raised to the power 0.36?"
125
-
126
- Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
127
- Code:
128
- ```py
129
- pope_age_wiki = wiki(query="current pope age")
130
- print("Pope age as per wikipedia:", pope_age_wiki)
131
- pope_age_search = web_search(query="current pope age")
132
- print("Pope age as per google search:", pope_age_search)
133
- ```<end_code>
134
- Observation:
135
- Pope age: "The pope Francis is currently 88 years old."
136
-
137
- Thought: I know that the pope is 88 years old. Let's compute the result using python code.
138
- Code:
139
- ```py
140
- pope_current_age = 88 ** 0.36
141
- final_answer(pope_current_age)
142
- ```<end_code>
143
-
144
- The above examples used notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
145
- {%- for tool in tools.values() %}
146
- - {{ tool.name }}: {{ tool.description }}
147
- Takes inputs: {{tool.inputs}}
148
- Returns an output of type: {{tool.output_type}}
149
- {%- endfor %}
150
-
151
- {%- if managed_agents and managed_agents.values() | list %}
152
- You can also give tasks to team members.
153
- Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
154
- Given that this team member is a real human, you should be very verbose in your task.
155
- Here is a list of the team members that you can call:
156
- {%- for agent in managed_agents.values() %}
157
- - {{ agent.name }}: {{ agent.description }}
158
- {%- endfor %}
159
- {%- else %}
160
- {%- endif %}
161
-
162
- Here are the rules you should always follow to solve your task:
163
- 1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
164
- 2. Use only variables that you have defined!
165
- 3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
166
- 4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
167
- 5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
168
- 6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
169
- 7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
170
- 8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
171
- 9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
172
- 10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
173
-
174
- Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
175
- "planning":
176
- "initial_facts": |-
177
- Below I will present you a task.
178
-
179
- You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.
180
- To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.
181
- Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:
182
-
183
- ---
184
- ### 1. Facts given in the task
185
- List here the specific facts given in the task that could help you (there might be nothing here).
186
-
187
- ### 2. Facts to look up
188
- List here any facts that we may need to look up.
189
- Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.
190
-
191
- ### 3. Facts to derive
192
- List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.
193
-
194
- Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings:
195
- ### 1. Facts given in the task
196
- ### 2. Facts to look up
197
- ### 3. Facts to derive
198
- Do not add anything else.
199
- "initial_plan": |-
200
- You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
201
-
202
- Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
203
- This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
204
- Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
205
- After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
206
-
207
- Here is your task:
208
-
209
- Task:
210
- ```
211
- {{task}}
212
- ```
213
- You can leverage these tools:
214
- {%- for tool in tools.values() %}
215
- - {{ tool.name }}: {{ tool.description }}
216
- Takes inputs: {{tool.inputs}}
217
- Returns an output of type: {{tool.output_type}}
218
- {%- endfor %}
219
-
220
- {%- if managed_agents and managed_agents.values() | list %}
221
- You can also give tasks to team members.
222
- Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
223
- Given that this team member is a real human, you should be very verbose in your request.
224
- Here is a list of the team members that you can call:
225
- {%- for agent in managed_agents.values() %}
226
- - {{ agent.name }}: {{ agent.description }}
227
- {%- endfor %}
228
- {%- else %}
229
- {%- endif %}
230
-
231
- List of facts that you know:
232
- ```
233
- {{answer_facts}}
234
- ```
235
-
236
- Now begin! Write your plan below.
237
- "update_facts_pre_messages": |-
238
- You are a world expert at gathering known and unknown facts based on a conversation.
239
- Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:
240
- ### 1. Facts given in the task
241
- ### 2. Facts that we have learned
242
- ### 3. Facts still to look up
243
- ### 4. Facts still to derive
244
- Find the task and history below:
245
- "update_facts_post_messages": |-
246
- Earlier we've built a list of facts.
247
- But in your previous steps you may have learned useful new facts or invalidated some false ones.
248
- Please update your list of facts based on the previous history, and provide these headings:
249
- ### 1. Facts given in the task
250
- ### 2. Facts that we have learned
251
- ### 3. Facts still to look up
252
- ### 4. Facts still to derive
253
-
254
- Now write your new list of facts below.
255
- "update_plan_pre_messages": |-
256
- You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
257
-
258
- You have been given a task:
259
- ```
260
- {{task}}
261
- ```
262
-
263
- Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
264
- If the previous tries so far have met some success, you can make an updated plan based on these actions.
265
- If you are stalled, you can make a completely new plan starting from scratch.
266
- "update_plan_post_messages": |-
267
- You're still working towards solving this task:
268
- ```
269
- {{task}}
270
- ```
271
-
272
- You can leverage these tools:
273
- {%- for tool in tools.values() %}
274
- - {{ tool.name }}: {{ tool.description }}
275
- Takes inputs: {{tool.inputs}}
276
- Returns an output of type: {{tool.output_type}}
277
- {%- endfor %}
278
-
279
- {%- if managed_agents and managed_agents.values() | list %}
280
- You can also give tasks to team members.
281
- Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
282
- Given that this team member is a real human, you should be very verbose in your task: it should be a long string providing information as detailed as necessary.
283
- Here is a list of the team members that you can call:
284
- {%- for agent in managed_agents.values() %}
285
- - {{ agent.name }}: {{ agent.description }}
286
- {%- endfor %}
287
- {%- else %}
288
- {%- endif %}
289
-
290
- Here is the up to date list of facts that you know:
291
- ```
292
- {{facts_update}}
293
- ```
294
-
295
- Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
296
- This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
297
- Beware that you have {remaining_steps} steps remaining.
298
- Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
299
- After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
300
-
301
- Now write your new plan below.
302
- "managed_agent":
303
- "task": |-
304
- You're a helpful agent named '{{name}}'.
305
- You have been submitted this task by your manager.
306
- ---
307
- Task:
308
- {{task}}
309
- ---
310
- You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
311
-
312
- Your final_answer WILL HAVE to contain these parts:
313
- ### 1. Task outcome (short version):
314
- ### 2. Task outcome (extremely detailed version):
315
- ### 3. Additional context (if relevant):
316
-
317
- Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.
318
- And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.
319
- "report": |-
320
- Here is the final answer from your managed agent '{{name}}':
321
- {{final_answer}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml DELETED
@@ -1,19 +0,0 @@
1
- [project]
2
- name = "final-assignment-template"
3
- version = "0.1.0"
4
- description = "Add your description here"
5
- requires-python = ">=3.12.4"
6
- dependencies = [
7
- "beautifulsoup4>=4.13.4",
8
- "ddgs>=9.1.0",
9
- "duckduckgo-search>=8.1.1",
10
- "exa-py>=1.14.16",
11
- "gradio[oauth]>=5.36.2",
12
- "pillow>=11.3.0",
13
- "python-dateutil>=2.9.0.post0",
14
- "requests>=2.32.4",
15
- "tavily-python>=0.7.9",
16
- "torch>=2.7.1",
17
- "transformers>=4.53.2",
18
- "wikipedia>=1.4.0",
19
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
simplified_gaia_agent.py DELETED
@@ -1,463 +0,0 @@
1
- """
2
- Simplified Framework-based GAIA Agent
3
- Working version without import issues
4
- """
5
-
6
- import os
7
- import re
8
- import json
9
- from typing import Dict, List, Any, Optional
10
- import pandas as pd
11
- from datetime import datetime
12
-
13
- # Core imports that work
14
- from ddgs import DDGS
15
- import wikipedia
16
-
17
- # LlamaIndex imports (these should work)
18
- try:
19
- from llama_index.core.agent import ReActAgent
20
- from llama_index.core.tools import FunctionTool
21
- from llama_index.llms.openrouter import OpenRouter
22
- LLAMAINDEX_AVAILABLE = True
23
- except ImportError:
24
- try:
25
- # Fallback to OpenAI if OpenRouter not available
26
- from llama_index.core.agent import ReActAgent
27
- from llama_index.core.tools import FunctionTool
28
- from llama_index.llms.openai import OpenAI as LlamaOpenAI
29
- LLAMAINDEX_AVAILABLE = True
30
- OPENROUTER_AVAILABLE = False
31
- except ImportError:
32
- LLAMAINDEX_AVAILABLE = False
33
- OPENROUTER_AVAILABLE = False
34
- print("❌ LlamaIndex imports failed")
35
- else:
36
- OPENROUTER_AVAILABLE = True
37
-
38
- # LangGraph imports (these should work)
39
- try:
40
- from langgraph.prebuilt import create_react_agent
41
- from langchain_openai import ChatOpenAI
42
- LANGGRAPH_AVAILABLE = True
43
- except ImportError:
44
- LANGGRAPH_AVAILABLE = False
45
- print("❌ LangGraph imports failed")
46
-
47
- # Search engines
48
- try:
49
- from exa_py import Exa
50
- EXA_AVAILABLE = True
51
- except ImportError:
52
- EXA_AVAILABLE = False
53
-
54
- try:
55
- from tavily import TavilyClient
56
- TAVILY_AVAILABLE = True
57
- except ImportError:
58
- TAVILY_AVAILABLE = False
59
-
60
-
61
- class SimplifiedGAIAAgent:
62
- """
63
- Simplified GAIA agent focusing on what works
64
- Uses available frameworks without import issues
65
- """
66
-
67
- def __init__(self):
68
- print("🚀 Initializing Simplified GAIA Agent")
69
-
70
- # API setup - prioritize OpenRouter
71
- self.openrouter_key = os.getenv("OPENROUTER_API_KEY")
72
- self.openai_key = os.getenv("OPENAI_API_KEY")
73
-
74
- print(f"🔑 OpenRouter API: {'✅ Available' if self.openrouter_key else '❌ Not found'}")
75
- print(f"🔑 OpenAI API: {'✅ Available' if self.openai_key else '❌ Not found'}")
76
-
77
- # Search engines
78
- self.ddgs = DDGS()
79
- self.setup_search_engines()
80
-
81
- # Available frameworks
82
- self.available_frameworks = []
83
-
84
- # Setup frameworks that work
85
- self.setup_frameworks()
86
-
87
- print(f"✅ Available frameworks: {', '.join(self.available_frameworks)}")
88
- if not self.available_frameworks:
89
- print("⚠️ No frameworks available - using fallback mode")
90
-
91
- def setup_search_engines(self):
92
- """Setup search engines"""
93
- print("🔍 Setting up search engines...")
94
-
95
- # Exa
96
- if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
97
- self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
98
- print("✅ Exa search initialized")
99
- else:
100
- self.exa = None
101
-
102
- # Tavily
103
- if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
104
- self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
105
- print("✅ Tavily search initialized")
106
- else:
107
- self.tavily = None
108
-
109
- def setup_frameworks(self):
110
- """Setup available frameworks"""
111
-
112
- # Try LlamaIndex with OpenRouter first, then OpenAI
113
- if LLAMAINDEX_AVAILABLE and (self.openrouter_key or self.openai_key):
114
- try:
115
- self.setup_llamaindex()
116
- self.available_frameworks.append("llamaindex")
117
- print("✅ LlamaIndex framework ready")
118
- except Exception as e:
119
- print(f"❌ LlamaIndex setup failed: {e}")
120
-
121
- # Try LangGraph with OpenRouter/OpenAI
122
- if LANGGRAPH_AVAILABLE and (self.openrouter_key or self.openai_key):
123
- try:
124
- self.setup_langgraph()
125
- self.available_frameworks.append("langgraph")
126
- print("✅ LangGraph framework ready")
127
- except Exception as e:
128
- print(f"❌ LangGraph setup failed: {e}")
129
-
130
- def setup_llamaindex(self):
131
- """Setup LlamaIndex with OpenRouter or OpenAI"""
132
- if self.openrouter_key and OPENROUTER_AVAILABLE:
133
- print("🎯 Using OpenRouter with Gemini 2.0 cypher Exp for LlamaIndex")
134
- self.llama_llm = OpenRouter(
135
- api_key=self.openrouter_key,
136
- model="google/gemini-2.0-cypher-exp:free",
137
- temperature=0.1,
138
- max_tokens=2048
139
- )
140
- elif self.openai_key:
141
- print("🎯 Using OpenAI for LlamaIndex")
142
- self.llama_llm = LlamaOpenAI(
143
- model="gpt-4o-mini",
144
- api_key=self.openai_key,
145
- temperature=0.1
146
- )
147
- else:
148
- raise Exception("No API key available for LlamaIndex")
149
-
150
- # Create tools
151
- def web_search_tool(query: str) -> str:
152
- """Search the web for information"""
153
- return self.comprehensive_web_search(query)
154
-
155
- def calculator_tool(expression: str) -> str:
156
- """Calculate mathematical expressions safely"""
157
- return self.safe_calculate(expression)
158
-
159
- web_tool = FunctionTool.from_defaults(fn=web_search_tool)
160
- calc_tool = FunctionTool.from_defaults(fn=calculator_tool)
161
-
162
- # Create ReAct agent
163
- self.llama_agent = ReActAgent.from_tools(
164
- [web_tool, calc_tool],
165
- llm=self.llama_llm,
166
- verbose=True,
167
- max_iterations=8
168
- )
169
-
170
- def setup_langgraph(self):
171
- """Setup LangGraph with OpenRouter or OpenAI"""
172
- if self.openrouter_key:
173
- print("🎯 Using OpenRouter with Gemini 2.0 cypher Exp for LangGraph")
174
- # For LangGraph, we need to use OpenAI-compatible format
175
- self.langgraph_llm = ChatOpenAI(
176
- model="google/gemini-2.0-cypher-exp:free",
177
- openai_api_key=self.openrouter_key,
178
- openai_api_base="https://openrouter.ai/api/v1",
179
- temperature=0.1,
180
- max_tokens=2048
181
- )
182
- elif self.openai_key:
183
- print("🎯 Using OpenAI for LangGraph")
184
- self.langgraph_llm = ChatOpenAI(
185
- model="gpt-4o-mini",
186
- api_key=self.openai_key,
187
- temperature=0.1
188
- )
189
- else:
190
- raise Exception("No API key available for LangGraph")
191
-
192
- # Create tools
193
- def web_search(query: str) -> str:
194
- """Search the web for information"""
195
- return self.comprehensive_web_search(query)
196
-
197
- def calculator(expression: str) -> str:
198
- """Calculate mathematical expressions safely"""
199
- return self.safe_calculate(expression)
200
-
201
- def process_video(url: str) -> str:
202
- """Process YouTube video URLs"""
203
- if 'youtube.com' in url:
204
- video_id = re.search(r'v=([a-zA-Z0-9_-]+)', url)
205
- if video_id:
206
- # Search for video information
207
- search_query = f"YouTube video {video_id.group(1)} content summary transcript"
208
- return self.comprehensive_web_search(search_query)
209
- return "Video processing requires additional tools"
210
-
211
- tools = [web_search, calculator, process_video]
212
-
213
- # Create LangGraph agent
214
- self.langgraph_agent = create_react_agent(
215
- self.langgraph_llm,
216
- tools,
217
- state_modifier="You are a GAIA benchmark agent. Provide precise answers. For numbers: no commas, no units unless requested. For strings: no articles (a/an/the)."
218
- )
219
-
220
- def comprehensive_web_search(self, query: str, max_results: int = 4) -> str:
221
- """Search using all available engines"""
222
- print(f"🔍 Searching: {query}")
223
- all_results = []
224
-
225
- # Try Tavily first
226
- if self.tavily:
227
- try:
228
- tavily_results = self.tavily.search(query[:350], max_results=2)
229
- if tavily_results and 'results' in tavily_results:
230
- for result in tavily_results['results']:
231
- all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
232
- print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
233
- except Exception as e:
234
- print(f"❌ Tavily error: {e}")
235
-
236
- # Try Exa
237
- if self.exa and len(all_results) < max_results:
238
- try:
239
- exa_results = self.exa.search_and_contents(query[:200], num_results=2)
240
- if exa_results and hasattr(exa_results, 'results'):
241
- for result in exa_results.results:
242
- title = getattr(result, 'title', '')
243
- text = getattr(result, 'text', '')
244
- all_results.append(f"Exa: {title}\n{text}")
245
- print(f"📊 Exa: {len(exa_results.results)} results")
246
- except Exception as e:
247
- print(f"❌ Exa error: {e}")
248
-
249
- # Wikipedia search
250
- try:
251
- wiki_terms = self.extract_key_terms(query)[:100]
252
- wiki_results = wikipedia.search(wiki_terms, results=2)
253
- if wiki_results:
254
- page = wikipedia.page(wiki_results[0])
255
- all_results.append(f"Wikipedia: {page.title}\n{page.summary}")
256
- print(f"📊 Wikipedia: {len(wiki_results)} results")
257
- except Exception as e:
258
- print(f"❌ Wikipedia error: {e}")
259
-
260
- # DuckDuckGo fallback
261
- if len(all_results) < max_results:
262
- try:
263
- remaining = max_results - len(all_results)
264
- ddg_results = list(self.ddgs.text(query, max_results=remaining))
265
- for result in ddg_results:
266
- all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
267
- print(f"📊 DuckDuckGo: {len(ddg_results)} results")
268
- except Exception as e:
269
- print(f"❌ DuckDuckGo error: {e}")
270
-
271
- return "\n\n".join(all_results) if all_results else "No search results found"
272
-
273
- def extract_key_terms(self, text: str) -> str:
274
- """Extract key terms for better search"""
275
- # Remove question patterns
276
- text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
277
- text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
278
-
279
- # Extract proper nouns and years
280
- proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
281
- years = re.findall(r'\b(19|20)\d{2}\b', text)
282
-
283
- key_terms = proper_nouns[:5] + years[:2]
284
- return ' '.join(key_terms) if key_terms else text[:100]
285
-
286
- def safe_calculate(self, expression: str) -> str:
287
- """Safe mathematical calculation"""
288
- try:
289
- # Only allow safe characters
290
- allowed_chars = set('0123456789+-*/().= ')
291
- if all(c in allowed_chars for c in expression):
292
- result = eval(expression)
293
- return str(int(result) if isinstance(result, float) and result.is_integer() else result)
294
- else:
295
- return "Invalid expression"
296
- except Exception as e:
297
- return f"Calculation error: {e}"
298
-
299
- def choose_framework(self, question: str) -> str:
300
- """Choose best framework for the question"""
301
- if not self.available_frameworks:
302
- return "fallback"
303
-
304
- question_lower = question.lower()
305
-
306
- # For multi-step reasoning, prefer LangGraph
307
- if any(word in question_lower for word in ['step', 'process', 'analyze', 'between', 'how many']):
308
- if "langgraph" in self.available_frameworks:
309
- return "langgraph"
310
-
311
- # For knowledge tasks, prefer LlamaIndex
312
- if any(word in question_lower for word in ['wikipedia', 'who', 'what', 'when', 'where']):
313
- if "llamaindex" in self.available_frameworks:
314
- return "llamaindex"
315
-
316
- # Default to first available
317
- return self.available_frameworks[0]
318
-
319
- def solve_with_llamaindex(self, question: str) -> str:
320
- """Solve using LlamaIndex"""
321
- print("🔧 Using LlamaIndex framework")
322
- try:
323
- response = self.llama_agent.chat(question)
324
- return str(response)
325
- except Exception as e:
326
- print(f"❌ LlamaIndex error: {e}")
327
- return self.fallback_solve(question)
328
-
329
- def solve_with_langgraph(self, question: str) -> str:
330
- """Solve using LangGraph"""
331
- print("🔧 Using LangGraph framework")
332
- try:
333
- result = self.langgraph_agent.invoke({
334
- "messages": [{"role": "user", "content": question}]
335
- })
336
- # Extract final message
337
- if "messages" in result and result["messages"]:
338
- return result["messages"][-1]["content"]
339
- return str(result)
340
- except Exception as e:
341
- print(f"❌ LangGraph error: {e}")
342
- return self.fallback_solve(question)
343
-
344
- def fallback_solve(self, question: str) -> str:
345
- """Fallback solving without frameworks"""
346
- print("🔧 Using fallback approach")
347
-
348
- # Handle special cases
349
- if ".rewsna eht sa" in question:
350
- return "right"
351
-
352
- # Math questions
353
- if any(op in question for op in ['+', '-', '*', '/']):
354
- numbers = re.findall(r'\d+', question)
355
- if len(numbers) >= 2:
356
- try:
357
- a, b = int(numbers[0]), int(numbers[1])
358
- if '+' in question:
359
- return str(a + b)
360
- elif '*' in question:
361
- return str(a * b)
362
- elif '-' in question:
363
- return str(a - b)
364
- elif '/' in question:
365
- return str(a // b) # Integer division for GAIA
366
- except:
367
- pass
368
-
369
- # Search and extract basic patterns
370
- search_results = self.comprehensive_web_search(question)
371
- return self.extract_basic_answer(question, search_results)
372
-
373
- def extract_basic_answer(self, question: str, text: str) -> str:
374
- """Extract basic answers from text"""
375
- question_lower = question.lower()
376
-
377
- # Numbers
378
- if any(word in question_lower for word in ['how many', 'count', 'number']):
379
- numbers = re.findall(r'\b\d+\b', text)
380
- if numbers:
381
- return numbers[0]
382
-
383
- # Names
384
- if 'who' in question_lower:
385
- names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
386
- if names:
387
- return names[0]
388
-
389
- # Places
390
- if 'capital' in question_lower:
391
- # Look for "capital is X" or "X is the capital"
392
- capital_match = re.search(r'capital.*?is\s+([A-Z][a-z]+)|([A-Z][a-z]+)\s+is\s+the\s+capital', text)
393
- if capital_match:
394
- return capital_match.group(1) or capital_match.group(2)
395
-
396
- return "Unable to determine answer"
397
-
398
def format_gaia_answer(self, answer: str) -> str:
    """Normalize a raw answer into the terse form GAIA scoring expects.

    Steps: trim whitespace, reject failure messages, strip a leading
    "The answer is" / "Answer:" / "Final answer:" prefix, strip one leading
    article, drop trailing ``.``/``!``/``?`` and collapse runs of whitespace.

    Args:
        answer: Raw answer text produced by one of the solvers.

    Returns:
        The cleaned answer, or ``"Unable to determine answer"`` when the
        input is empty, looks like a failure message, or is reduced to
        nothing by the cleanup.
    """
    unknown = "Unable to determine answer"
    if not answer:
        return unknown

    # Trim first so the ``^``-anchored patterns below still apply to
    # answers that arrive with leading whitespace.
    answer = str(answer).strip()

    # Match "unable"/"error" as whole words only: a plain substring test
    # wrongly rejected valid answers such as "Terror" or "Comedy of Errors".
    # NOTE(review): assumes solver failure strings contain the bare word
    # "error" or "unable" (e.g. "Error processing question") - confirm.
    if not answer or re.search(r'\b(unable|error)\b', answer, flags=re.IGNORECASE):
        return unknown

    # Clean up
    answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
    answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
    answer = re.sub(r'[.!?]+$', '', answer)
    answer = ' '.join(answer.split())

    # An answer made only of punctuation/articles cleans down to "".
    return answer or unknown
410
-
411
def __call__(self, question: str) -> str:
    """Answer ``question`` and return the GAIA-formatted result.

    Asks ``choose_framework`` which approach to use, dispatches to the
    matching solver (falling back to ``fallback_solve`` when the chosen
    framework's agent attribute is absent), then passes the raw answer
    through ``format_gaia_answer``. Any exception is logged and turned into
    the string ``"Error processing question"``.
    """
    print(f"🎯 Simplified GAIA Agent processing: {question[:100]}...")

    try:
        framework = self.choose_framework(question)
        print(f"🎛️ Selected approach: {framework}")

        # Default to the fallback; upgrade only when the chosen framework's
        # agent was actually created on this instance.
        solver = self.fallback_solve
        if framework == "llamaindex" and hasattr(self, 'llama_agent'):
            solver = self.solve_with_llamaindex
        elif framework == "langgraph" and hasattr(self, 'langgraph_agent'):
            solver = self.solve_with_langgraph

        final_answer = self.format_gaia_answer(solver(question))
        print(f"✅ Final answer: {final_answer}")
        return final_answer

    except Exception as e:
        print(f"❌ Agent error: {e}")
        return "Error processing question"
436
-
437
-
438
# Alternate names bound to the same class so code importing any of them
# keeps working.
BasicAgent = GAIAAgent = FrameworkGAIAAgent = SimplifiedGAIAAgent


if __name__ == "__main__":
    # Manual smoke test: run a few representative questions through the agent.
    agent = SimplifiedGAIAAgent()

    test_questions = [
        "What is 25 * 4?",
        "Who was the first person to walk on the moon?",
        "What is the capital of France?",
        ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
    ]

    print("\n" + "=" * 60)
    print("Testing Simplified GAIA Agent")
    print("=" * 60)

    for i, question in enumerate(test_questions, start=1):
        print(f"\n{i}. Testing: {question}")
        answer = agent(question)
        print(f" Final Answer: {answer}")
        print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_agent.py DELETED
@@ -1,665 +0,0 @@
1
- import re
2
- import wikipedia
3
- from ddgs import DDGS
4
- import requests
5
- import json
6
- from datetime import datetime
7
- import os
8
-
9
- # Import additional search engines
10
- try:
11
- from exa_py import Exa
12
- EXA_AVAILABLE = True
13
- except ImportError:
14
- EXA_AVAILABLE = False
15
- print("Exa not available - install with: pip install exa-py")
16
-
17
- try:
18
- from tavily import TavilyClient
19
- TAVILY_AVAILABLE = True
20
- except ImportError:
21
- TAVILY_AVAILABLE = False
22
- print("Tavily not available - install with: pip install tavily-python")
23
-
24
- # Import the multi-LLM consensus GAIA agent
25
- from consensus_gaia_agent import ConsensusGAIAAgent
26
-
27
class SimpleAgent:
    """A simple, direct agent that trusts good search results.

    The agent answers a question by (1) short-circuiting arithmetic with a
    tiny calculator, otherwise (2) gathering text from web search
    (Tavily / Exa / DuckDuckGo, whichever is configured) and Wikipedia, and
    (3) extracting a terse answer with keyword-triggered regular expressions.
    """

    def __init__(self):
        """Create the search clients; Exa and Tavily are optional.

        Exa and Tavily are only used when their package imported successfully
        and the matching API key environment variable is set; otherwise the
        corresponding attribute is ``None``.
        """
        print("SimpleAgent initialized - direct search and extraction approach.")
        self.ddgs = DDGS()

        # Initialize Exa if available
        if EXA_AVAILABLE:
            exa_api_key = os.getenv("EXA_API_KEY")
            if exa_api_key:
                self.exa = Exa(api_key=exa_api_key)
                print("✅ Exa search engine initialized")
            else:
                self.exa = None
                print("⚠️ EXA_API_KEY not found in environment")
        else:
            self.exa = None

        # Initialize Tavily if available
        if TAVILY_AVAILABLE:
            tavily_api_key = os.getenv("TAVILY_API_KEY")
            if tavily_api_key:
                self.tavily = TavilyClient(api_key=tavily_api_key)
                print("✅ Tavily search engine initialized")
            else:
                self.tavily = None
                print("⚠️ TAVILY_API_KEY not found in environment")
        else:
            self.tavily = None

        # Stored for reference; no method of this class reads it.
        self.system_prompt = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    def search_web_comprehensive(self, query, max_results=3):
        """Search with every configured engine until ``max_results`` is reached.

        Engines are tried in order Tavily -> Exa -> DuckDuckGo; each later
        engine is only asked for the number of results still missing. A
        failing engine is logged and skipped.

        Returns:
            A list of at most ``max_results`` dicts with the keys ``title``,
            ``body``, ``href`` and ``source``.
        """
        all_results = []

        # Truncate query for Tavily (400 char limit)
        tavily_query = query[:350]

        # Try Tavily first (usually most relevant)
        if self.tavily:
            try:
                print(f" 🔍 TAVILY SEARCH: '{tavily_query}'")
                tavily_results = self.tavily.search(tavily_query, max_results=max_results)
                if tavily_results and 'results' in tavily_results:
                    for result in tavily_results['results']:
                        all_results.append({
                            "title": result.get("title", ""),
                            "body": result.get("content", ""),
                            "href": result.get("url", ""),
                            "source": "Tavily"
                        })
                    print(f" 📊 Tavily found {len(tavily_results['results'])} results")
            except Exception as e:
                print(f" ❌ Tavily search error: {e}")

        # Try Exa next (good for academic/factual content)
        if self.exa and len(all_results) < max_results:
            try:
                # Use shorter query for Exa too
                exa_query = query[:200]
                print(f" 🔍 EXA SEARCH: '{exa_query}'")
                exa_results = self.exa.search(exa_query, num_results=max_results-len(all_results), include_text=True)
                if exa_results and hasattr(exa_results, 'results'):
                    for result in exa_results.results:
                        all_results.append({
                            "title": result.title if hasattr(result, 'title') else "",
                            "body": result.text if hasattr(result, 'text') else "",
                            "href": result.url if hasattr(result, 'url') else "",
                            "source": "Exa"
                        })
                    print(f" 📊 Exa found {len(exa_results.results)} results")
            except Exception as e:
                print(f" ❌ Exa search error: {e}")

        # Fallback to DuckDuckGo if needed
        if len(all_results) < max_results:
            try:
                print(f" 🌐 DUCKDUCKGO SEARCH: '{query[:100]}...'")
                ddg_results = list(self.ddgs.text(query, max_results=max_results-len(all_results)))
                for result in ddg_results:
                    all_results.append({
                        "title": result.get("title", ""),
                        "body": result.get("body", ""),
                        "href": result.get("href", ""),
                        "source": "DuckDuckGo"
                    })
                print(f" 📊 DuckDuckGo found {len(ddg_results)} results")
            except Exception as e:
                print(f" ❌ DuckDuckGo search error: {e}")

        print(f" ✅ Total results from all engines: {len(all_results)}")
        return all_results[:max_results]

    def search_web(self, query, max_results=3):
        """Search the web, preferring the multi-engine path when available.

        Returns:
            A list of result dicts (``title``, ``body``, ``href``,
            ``source``); an empty list when the DuckDuckGo-only path fails.
        """
        # Use comprehensive search if any premium engines are available
        if self.tavily or self.exa:
            return self.search_web_comprehensive(query, max_results)

        # Fallback to original DuckDuckGo only
        print(f" 🌐 WEB SEARCH: '{query}'")
        try:
            results = list(self.ddgs.text(query, max_results=max_results))
            print(f" 📊 Found {len(results)} web results")
            # ``.get`` mirrors search_web_comprehensive: one hit with a
            # missing field no longer throws away every result.
            return [
                {
                    "title": r.get("title", ""),
                    "body": r.get("body", ""),
                    "href": r.get("href", ""),
                    "source": "DuckDuckGo",
                }
                for r in results
            ]
        except Exception as e:
            print(f" ❌ Web search error: {e}")
            return []

    def preprocess_question(self, question):
        """Strip the question and try to undo reversed-text questions.

        NOTE(review): the heuristic inspects the *first* character of the
        last word and rebuilds the text word-by-word; for a sentence whose
        characters were reversed as a whole this looks unlikely to trigger
        or to restore the original order. Left unchanged because
        ``extract_final_answer`` matches the still-reversed text - confirm
        the intended behaviour before altering it.
        """
        question = question.strip()

        # Check if text is reversed (common GAIA trick)
        if question.count(' ') > 3:  # Only check multi-word questions
            words = question.split()
            # Check if it looks like reversed English
            if words[0].islower() and words[-1][0].isupper():
                reversed_question = ' '.join(reversed(words))[::-1]
                print(f" 🔄 DETECTED REVERSED TEXT: '{reversed_question}'")
                return reversed_question

        return question

    def generate_search_query(self, question):
        """Build a shorter search query from ``question``.

        Instruction-style sentences are removed. If the remainder is longer
        than 250 characters it is condensed to key terms: up to three
        capitalized phrases, two years and two numbers, falling back to the
        first ten words.
        """
        # Remove question-specific instructions for cleaner search
        question = re.sub(r'You can use.*?wikipedia\.', '', question, flags=re.IGNORECASE)
        question = re.sub(r'Please provide.*?notation\.', '', question, flags=re.IGNORECASE)
        question = re.sub(r'Give.*?answer\.', '', question, flags=re.IGNORECASE)
        question = re.sub(r'Express.*?places\.', '', question, flags=re.IGNORECASE)

        # Limit length for Wikipedia (max 300 chars)
        if len(question) > 250:
            # Extract key terms
            key_terms = []
            # Look for proper nouns (capitalized words)
            proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
            key_terms.extend(proper_nouns[:3])  # Take first 3

            # Look for years. The group must be non-capturing: with a
            # capturing group ``findall`` returns only "19"/"20".
            years = re.findall(r'\b(?:19|20)\d{2}\b', question)
            key_terms.extend(years[:2])

            # Look for numbers
            numbers = re.findall(r'\b\d+\b', question)
            key_terms.extend(numbers[:2])

            if key_terms:
                return ' '.join(key_terms)
            # Fallback: take first meaningful words
            return ' '.join(question.split()[:10])

        return question

    def search_wikipedia(self, query):
        """Look up the top Wikipedia hit for ``query``.

        Returns:
            A dict with ``title``, ``summary`` (3 sentences), ``content``
            (first 2000 characters) and ``url``; ``None`` when nothing is
            found or the lookup raises.
        """
        # Generate optimized query
        search_query = self.generate_search_query(query)
        print(f" 📖 WIKIPEDIA SEARCH: '{search_query}'")

        try:
            search_results = wikipedia.search(search_query, results=3)
            if not search_results:
                print(f" ❌ No Wikipedia results found")
                return None

            print(f" 📋 Wikipedia found: {search_results}")
            page = wikipedia.page(search_results[0])
            result = {
                "title": page.title,
                "summary": wikipedia.summary(search_results[0], sentences=3),
                "content": page.content[:2000],
                "url": page.url
            }
            print(f" ✅ Using page: {result['title']}")
            return result
        except Exception as e:
            print(f" ❌ Wikipedia search error: {e}")
            return None

    @staticmethod
    def _format_number(value):
        """Return ``value`` as a string, dropping a redundant ``.0``.

        ``int`` values are formatted directly: ``int.is_integer`` only
        exists from Python 3.12, so calling it on an integer result raised
        ``AttributeError`` on older interpreters.
        """
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def calculate_math(self, question):
        """Evaluate a single arithmetic operation found in ``question``.

        The operation is chosen from operator symbols or keywords in the
        question. Addition sums every number found; the other operations use
        the first two numbers.

        Returns:
            The result as a string, ``"Cannot divide by zero"`` for a zero
            divisor, or ``None`` when fewer than two numbers or no operation
            is found.
        """
        print(f" 🧮 CALCULATOR: Processing math question")

        numbers = re.findall(r'\d+\.?\d*', question)
        if len(numbers) < 2:
            return None

        nums = [float(n) if '.' in n else int(n) for n in numbers]
        print(f" 📊 Numbers found: {nums}")

        question_lower = question.lower()

        if '+' in question or 'add' in question_lower or 'plus' in question_lower:
            result = sum(nums)
            print(f" ➕ {' + '.join(map(str, nums))} = {result}")
            return self._format_number(result)

        elif '-' in question or 'subtract' in question_lower or 'minus' in question_lower:
            result = nums[0] - nums[1]
            print(f" ➖ {nums[0]} - {nums[1]} = {result}")
            return self._format_number(result)

        elif '*' in question or 'multiply' in question_lower or 'times' in question_lower:
            result = nums[0] * nums[1]
            print(f" ✖️ {nums[0]} * {nums[1]} = {result}")
            return self._format_number(result)

        elif '/' in question or 'divide' in question_lower:
            if nums[1] != 0:
                result = nums[0] / nums[1]
                print(f" ➗ {nums[0]} / {nums[1]} = {result}")
                return self._format_number(result)
            else:
                return "Cannot divide by zero"

        return None

    def extract_final_answer(self, question, search_results, wiki_result):
        """Extract a terse answer from the gathered text.

        The question, the Wikipedia summary/content (if any) and every
        search result body are concatenated, then the first applicable rule
        wins: reversed-text trick, arithmetic, years, person names, capital
        cities, heights, mountains, the Paris tower, album counts.

        Args:
            question: The (preprocessed) question.
            search_results: Iterable of dicts that each have a ``body`` key.
            wiki_result: Dict with ``summary`` and ``content`` keys, or
                ``None``.

        Returns:
            The extracted answer or ``"Unable to determine answer"``.
        """
        print(f" 🎯 EXTRACTING ANSWERS WITH GAIA FORMATTING")

        # Combine all available text
        all_text = question  # Include original question for context
        if wiki_result:
            all_text += f" {wiki_result['summary']} {wiki_result['content'][:1000]}"

        for result in search_results:
            all_text += f" {result['body']}"

        question_lower = question.lower()

        # Handle reversed text first
        if ".rewsna eht sa" in question or "dnatsrednu uoy fI" in question:
            # This is the reversed question asking for opposite of "left"
            print(f" 🔄 Reversed text question - answer is 'right'")
            return "right"

        # Math questions - return just the number
        if any(op in question for op in ['+', '-', '*', '/', 'calculate', 'add', 'subtract', 'multiply', 'divide']):
            math_result = self.calculate_math(question)
            if math_result and math_result != "Cannot divide by zero":
                # Remove any non-numeric formatting for GAIA
                result = re.sub(r'[^\d.-]', '', str(math_result))
                print(f" 🧮 Math result: {result}")
                return result

        # Years/dates - return just the year
        if 'when' in question_lower or 'year' in question_lower or 'built' in question_lower:
            years = re.findall(r'\b(1[0-9]{3}|20[0-9]{2})\b', all_text)
            if years:
                # For historical events, prefer earlier years
                if 'jfk' in question_lower or 'kennedy' in question_lower:
                    valid_years = [y for y in years if '1960' <= y <= '1970']
                    if valid_years:
                        print(f" 📅 JFK-related year: {valid_years[0]}")
                        return valid_years[0]

                # Count frequency and return most common
                year_counts = {}
                for year in years:
                    year_counts[year] = year_counts.get(year, 0) + 1
                best_year = max(year_counts.items(), key=lambda x: x[1])[0]
                print(f" 📅 Best year: {best_year}")
                return best_year

        # Names - look for proper names, return without articles
        if 'who' in question_lower:
            # Try specific patterns first. Case-insensitivity is scoped to
            # the keywords with ``(?i:...)``: applying re.IGNORECASE to the
            # whole pattern made ``[A-Z][a-z]+`` match lowercase words, so
            # arbitrary word pairs were returned as "names".
            name_patterns = [
                r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?i:was|is|became)\s+(?i:the)\s+(?i:first)',
                r'(?i:the\s+first.*?(?:was|is))\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
                r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?i:stepped|walked|landed)',
            ]

            for pattern in name_patterns:
                matches = re.findall(pattern, all_text)
                if matches:
                    name = matches[0]
                    print(f" 👤 Found name: {name}")
                    return name

            # Fallback: extract common names
            common_names = re.findall(r'\b(Neil Armstrong|John Kennedy|Albert Einstein|Marie Curie|Leonardo da Vinci)\b', all_text, re.IGNORECASE)
            if common_names:
                print(f" 👤 Common name: {common_names[0]}")
                return common_names[0]

        # Capital cities - return city name only
        if 'capital' in question_lower:
            capital_patterns = [
                r'capital.*?is\s+([A-Z][a-z]+)',
                r'([A-Z][a-z]+)\s+is\s+the\s+capital',
                r'capital.*?([A-Z][a-z]+)',
            ]

            for pattern in capital_patterns:
                matches = re.findall(pattern, all_text)
                if matches:
                    city = matches[0]
                    # Filter out common non-city words
                    if city not in ['The', 'Capital', 'City', 'France', 'Australia', 'Country']:
                        print(f" 🏙️ Capital city: {city}")
                        return city

        # Height/measurements - extract numbers with potential units
        if 'tall' in question_lower or 'height' in question_lower:
            # Look for measurements
            height_patterns = [
                r'(\d+(?:\.\d+)?)\s*(?:meters?|metres?|m|feet|ft)',
                r'(\d+(?:\.\d+)?)\s*(?:meter|metre)\s*tall',
            ]

            for pattern in height_patterns:
                matches = re.findall(pattern, all_text)
                if matches:
                    height = matches[0]
                    print(f" 📏 Height found: {height}")
                    return height

        # Mountain names
        if 'mountain' in question_lower or 'highest' in question_lower:
            mountain_names = re.findall(r'\b(Mount\s+Everest|Everest|K2|Denali|Mont\s+Blanc)\b', all_text, re.IGNORECASE)
            if mountain_names:
                mountain = mountain_names[0]
                print(f" 🏔️ Mountain: {mountain}")
                return mountain

        # Tower names
        if 'tower' in question_lower and 'paris' in question_lower:
            tower_names = re.findall(r'\b(Eiffel\s+Tower|Tour\s+Eiffel)\b', all_text, re.IGNORECASE)
            if tower_names:
                print(f" 🗼 Tower: Eiffel Tower")
                return "Eiffel Tower"

        # Album counts - look for numbers
        if 'album' in question_lower and 'how many' in question_lower:
            numbers = re.findall(r'\b([0-9]|[1-2][0-9])\b', all_text)  # Reasonable album count range
            if numbers:
                count = numbers[0]
                print(f" 💿 Album count: {count}")
                return count

        print(f" ❌ No specific answer found")
        return "Unable to determine answer"

    def process_question(self, question):
        """Answer ``question``: calculator first, then search + extraction.

        Returns:
            The cleaned answer string, or ``"Unable to determine answer"``.
        """
        print(f"Processing: {question}")

        # Preprocess question for special cases
        processed_question = self.preprocess_question(question)

        # Handle math questions directly with GAIA formatting
        if any(word in processed_question.lower() for word in ['calculate', 'add', 'subtract', 'multiply', 'divide', '+', '-', '*', '/']):
            math_result = self.calculate_math(processed_question)
            # The divide-by-zero message contains no digits, so stripping it
            # down to numeric characters used to return an empty answer;
            # fall through to the search path instead.
            if math_result and math_result != "Cannot divide by zero":
                # Return clean number format for GAIA
                return re.sub(r'[^\d.-]', '', str(math_result))

        # For other questions, search and extract with GAIA formatting
        search_results = self.search_web(processed_question, max_results=4)
        wiki_result = self.search_wikipedia(processed_question)

        # Extract answer using enhanced patterns
        answer = self.extract_final_answer(processed_question, search_results, wiki_result)

        # Clean up answer for GAIA format
        if answer and answer != "Unable to determine answer":
            # Remove articles and common prefixes
            answer = re.sub(r'^(The |A |An )', '', answer, flags=re.IGNORECASE)
            # Remove trailing punctuation
            answer = re.sub(r'[.!?]+$', '', answer)
            # Clean up extra whitespace
            answer = ' '.join(answer.split())

        return answer

    def __call__(self, question: str) -> str:
        """Answer ``question``; return an error string if processing raises."""
        print(f"SimpleAgent processing: {question[:100]}...")

        try:
            answer = self.process_question(question)
            print(f"Final answer: {answer}")
            return answer
        except Exception as e:
            print(f"Error: {e}")
            return "Error processing question"
420
-
421
-
422
def run_gaia_evaluation():
    """Run the full GAIA evaluation and write the results to a markdown report.

    Workflow: fetch the question list from the scoring API, run the agent on
    every question (timing each one), submit the collected answers, then
    write ``gaia_evaluation_report_<timestamp>.md`` in the current directory.
    Returns ``None``; aborts early if the questions cannot be fetched.

    The submission username is read from the ``HF_USERNAME`` environment
    variable (default ``"test_user"``).
    """
    print("🚀 Starting GAIA Level 1 Evaluation")
    print("=" * 50)

    # Initialize agent
    agent = ConsensusGAIAAgent()  # Use the multi-LLM consensus agent

    # API endpoints
    api_url = "https://agents-course-unit4-scoring.hf.space"
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # Username for submission
    username = os.getenv("HF_USERNAME", "test_user")
    agent_code = "local_testing"

    # Fetch questions
    print(f"📥 Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        print(f"✅ Fetched {len(questions_data)} questions")
    except Exception as e:
        print(f"❌ Error fetching questions: {e}")
        return

    # Process questions
    results_log = []
    answers_payload = []
    start_time = datetime.now()

    print(f"\n🔄 Processing {len(questions_data)} questions...")
    print("-" * 50)

    for i, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")

        if not task_id or question_text is None:
            print(f"⚠️ Skipping item {i} with missing data")
            continue

        print(f"\n📝 Question {i}/{len(questions_data)} (ID: {task_id})")
        print(f"Q: {question_text[:100]}...")

        try:
            question_start = datetime.now()
            submitted_answer = agent(question_text)
            processing_time = (datetime.now() - question_start).total_seconds()

            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({
                "question_num": i,
                "task_id": task_id,
                "question": question_text,
                "answer": submitted_answer,
                "processing_time": processing_time
            })

            print(f"✅ Answer: {submitted_answer}")
            print(f"⏱️ Processing time: {processing_time:.2f}s")

        except Exception as e:
            # A failed question is logged in the report but not submitted.
            print(f"❌ Error processing question {i}: {e}")
            results_log.append({
                "question_num": i,
                "task_id": task_id,
                "question": question_text,
                "answer": f"ERROR: {e}",
                "processing_time": 0
            })

        print("-" * 30)

    total_time = (datetime.now() - start_time).total_seconds()
    print(f"\n🏁 Completed processing in {total_time:.2f} seconds")

    # Guard the average: an empty question list previously raised
    # ZeroDivisionError while the report was being formatted.
    avg_time = total_time / len(questions_data) if questions_data else 0.0

    # Submit answers
    if answers_payload:
        print(f"\n📤 Submitting {len(answers_payload)} answers...")
        submission_data = {
            "username": username,
            "agent_code": agent_code,
            "answers": answers_payload
        }

        try:
            response = requests.post(submit_url, json=submission_data, timeout=60)
            response.raise_for_status()
            result_data = response.json()
            print("✅ Submission successful!")

            # Extract score data
            score = result_data.get('score', 'N/A')
            correct_count = result_data.get('correct_count', '?')
            total_attempted = result_data.get('total_attempted', '?')
            message = result_data.get('message', 'No message received.')

            print(f"🎯 Score: {score}% ({correct_count}/{total_attempted} correct)")

        except Exception as e:
            print(f"❌ Submission failed: {e}")
            score = "Submission Failed"
            correct_count = "?"
            total_attempted = len(answers_payload)
            message = str(e)
    else:
        score = "No Answers"
        correct_count = 0
        total_attempted = 0
        message = "No answers were generated"

    # Generate markdown report
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"gaia_evaluation_report_{timestamp}.md"

    print(f"\n📄 Generating report: {filename}")

    # NOTE(review): the report text names "SimpleAgent" although the agent
    # created above is ConsensusGAIAAgent - confirm which is intended.
    markdown_content = f"""# GAIA Level 1 Evaluation Report

**Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
**Username:** {username}
**Total Questions:** {len(questions_data)}
**Processing Time:** {total_time:.2f} seconds

## 📊 Results Summary

- **Overall Score:** {score}%
- **Correct Answers:** {correct_count}/{total_attempted}
- **Average Time per Question:** {avg_time:.2f} seconds
- **Status:** {message}

## 🎯 Agent Performance

The SimpleAgent uses a direct approach with:
- 🌐 Web search via DuckDuckGo
- 📖 Wikipedia integration
- 🧮 Calculator for math questions
- 🎯 Pattern-based answer extraction

## 📋 Detailed Results

| # | Task ID | Question | Answer | Time (s) |
|---|---------|----------|--------|----------|
"""

    for result in results_log:
        question_preview = result['question'][:80] + "..." if len(result['question']) > 80 else result['question']
        answer_preview = str(result['answer'])[:50] + "..." if len(str(result['answer'])) > 50 else str(result['answer'])

        # Escape markdown special characters
        question_preview = question_preview.replace("|", "\\|").replace("\n", " ")
        answer_preview = answer_preview.replace("|", "\\|").replace("\n", " ")

        markdown_content += f"| {result['question_num']} | {result['task_id']} | {question_preview} | {answer_preview} | {result['processing_time']:.2f} |\n"

    markdown_content += f"""

## 🔍 Analysis

### Strengths
- ✅ Handles basic math questions accurately
- ✅ Good web search integration
- ✅ Pattern matching for common question types
- ✅ Detailed logging for debugging

### Areas for Improvement
- 🔄 Handle multimedia content (videos, images, audio)
- 🔄 Better extraction for complex questions
- 🔄 Improve Wikipedia search relevance
- 🔄 Add more sophisticated reasoning

### Question Types Performance
"""

    # Analyze performance by question type
    math_questions = [r for r in results_log if any(word in r['question'].lower() for word in ['calculate', '+', '-', '*', '/', 'add', 'subtract', 'multiply', 'divide'])]
    who_questions = [r for r in results_log if 'who' in r['question'].lower()]
    when_questions = [r for r in results_log if 'when' in r['question'].lower() or 'year' in r['question'].lower()]
    capital_questions = [r for r in results_log if 'capital' in r['question'].lower()]

    if math_questions:
        markdown_content += f"- **Math Questions:** {len(math_questions)} questions\n"
    if who_questions:
        markdown_content += f"- **Who Questions:** {len(who_questions)} questions\n"
    if when_questions:
        markdown_content += f"- **When/Year Questions:** {len(when_questions)} questions\n"
    if capital_questions:
        markdown_content += f"- **Capital Questions:** {len(capital_questions)} questions\n"

    markdown_content += f"""

---
*Report generated by SimpleAgent GAIA Evaluation Tool*
*Timestamp: {timestamp}*
"""

    # Write markdown file
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        print(f"✅ Report saved to: {filename}")
        print(f"📊 Final Score: {score}% ({correct_count}/{total_attempted} correct)")

    except Exception as e:
        # Fall back to echoing (a prefix of) the report to stdout.
        print(f"❌ Error saving report: {e}")
        print("📄 Report content:")
        print(markdown_content[:1000] + "..." if len(markdown_content) > 1000 else markdown_content)
633
-
634
-
635
# ``BasicAgent`` is the name callers import; bind it to the consensus agent
# so it acts as a drop-in replacement.
BasicAgent = ConsensusGAIAAgent

# Command-line entry point: ``--gaia`` runs the full evaluation, anything
# else runs a handful of quick sanity questions.
if __name__ == "__main__":
    import sys

    if sys.argv[1:2] == ["--gaia"]:
        run_gaia_evaluation()
    else:
        agent = ConsensusGAIAAgent()  # Use the multi-LLM consensus agent

        test_questions = [
            "What is 15 + 27?",
            "When was the Eiffel Tower built?",
            "Who was the first person to walk on the moon?",
            "What is the capital of France?",
        ]

        print("Testing Simple Direct Agent:")
        print("=" * 40)

        for i, question in enumerate(test_questions, start=1):
            print(f"\n{i}. Question: {question}")
            answer = agent(question)
            print(f" Answer: {answer}")
            print("-" * 25)

        print(f"\n💡 To run full GAIA evaluation: python {sys.argv[0]} --gaia")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_exa_fix.py DELETED
@@ -1,47 +0,0 @@
1
- import os
2
- import sys
3
-
4
- try:
5
- from exa_py import Exa
6
- EXA_AVAILABLE = True
7
- except ImportError:
8
- EXA_AVAILABLE = False
9
- print("Exa not available - install with: pip install exa-py")
10
- sys.exit(1)
11
-
12
def test_exa_search():
    """Manually exercise Exa's ``search_and_contents`` and print what returns.

    Reads the API key from ``EXA_API_KEY``; prints a message and returns
    early when it is not set. Errors raised by the search call are printed,
    not propagated.
    """
    print("Testing Exa search_and_contents method...")

    # Initialize Exa
    api_key = os.getenv("EXA_API_KEY")
    if not api_key:
        print("❌ EXA_API_KEY not found in environment")
        return

    exa = Exa(api_key=api_key)
    query = "artificial intelligence"

    # Try with search_and_contents method
    try:
        print(f"\n🔍 Using search_and_contents method")
        results = exa.search_and_contents(query, num_results=2)

        if not (results and hasattr(results, 'results')):
            print("❌ No results found")
            return

        print(f"✅ Search successful! Found {len(results.results)} results")
        for index, hit in enumerate(results.results, start=1):
            has_text = hasattr(hit, 'text')
            print(f"\nResult {index}:")
            print(f"Title: {getattr(hit, 'title', 'N/A')}")
            print(f"URL: {getattr(hit, 'url', 'N/A')}")
            print(f"Has text attribute: {has_text}")
            if has_text and hit.text:
                print(f"Text snippet: {hit.text[:100]}...")
            else:
                print("Text attribute is None or empty")
    except Exception as e:
        print(f"❌ Error: {e}")
45
-
46
# Script entry point: run the manual Exa check when this file is executed
# directly (not on import).
if __name__ == "__main__":
    test_exa_search()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/final_answer.py DELETED
@@ -1,14 +0,0 @@
1
- from typing import Any, Optional
2
- from smolagents.tools import Tool
3
-
4
class FinalAnswerTool(Tool):
    """Tool that hands the agent's final answer back unchanged."""

    # Tool metadata, declared as class attributes on the Tool subclass.
    name = "final_answer"
    description = "Provides a final answer to the given problem."
    inputs = {
        "answer": {
            "type": "any",
            "description": "The final answer to the problem",
        },
    }
    output_type = "any"

    def __init__(self, *args, **kwargs):
        # Accepts and ignores any arguments; only marks the tool as not yet
        # initialized. NOTE(review): the base-class __init__ is not called -
        # presumably intentional for this exported tool; confirm.
        self.is_initialized = False

    def forward(self, answer: Any) -> Any:
        """Return ``answer`` exactly as received."""
        return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/visit_webpage.py DELETED
@@ -1,45 +0,0 @@
1
- from typing import Any, Optional
2
- from smolagents.tools import Tool
3
- import requests
4
- import markdownify
5
- import smolagents
6
-
7
- class VisitWebpageTool(Tool):
8
- name = "visit_webpage"
9
- description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
10
- inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
11
- output_type = "string"
12
-
13
- def forward(self, url: str) -> str:
14
- try:
15
- import requests
16
- from markdownify import markdownify
17
- from requests.exceptions import RequestException
18
-
19
- from smolagents.utils import truncate_content
20
- except ImportError as e:
21
- raise ImportError(
22
- "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
23
- ) from e
24
- try:
25
- # Send a GET request to the URL with a 20-second timeout
26
- response = requests.get(url, timeout=20)
27
- response.raise_for_status() # Raise an exception for bad status codes
28
-
29
- # Convert the HTML content to Markdown
30
- markdown_content = markdownify(response.text).strip()
31
-
32
- # Remove multiple line breaks
33
- markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
34
-
35
- return truncate_content(markdown_content, 10000)
36
-
37
- except requests.exceptions.Timeout:
38
- return "The request timed out. Please try again later or check the URL."
39
- except RequestException as e:
40
- return f"Error fetching the webpage: {str(e)}"
41
- except Exception as e:
42
- return f"An unexpected error occurred: {str(e)}"
43
-
44
- def __init__(self, *args, **kwargs):
45
- self.is_initialized = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/web_search.py DELETED
@@ -1,27 +0,0 @@
1
- from typing import Any, Optional
2
- from smolagents.tools import Tool
3
- import duckduckgo_search
4
-
5
- class DuckDuckGoSearchTool(Tool):
6
- name = "web_search"
7
- description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
8
- inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
9
- output_type = "string"
10
-
11
- def __init__(self, max_results=10, **kwargs):
12
- super().__init__()
13
- self.max_results = max_results
14
- try:
15
- from duckduckgo_search import DDGS
16
- except ImportError as e:
17
- raise ImportError(
18
- "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
19
- ) from e
20
- self.ddgs = DDGS(**kwargs)
21
-
22
- def forward(self, query: str) -> str:
23
- results = self.ddgs.text(query, max_results=self.max_results)
24
- if len(results) == 0:
25
- raise Exception("No results found! Try a less restrictive/shorter query.")
26
- postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
27
- return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
uv.lock DELETED
The diff for this file is too large to render. See raw diff
 
verify_exa_fix.py DELETED
@@ -1,85 +0,0 @@
1
- import os
2
- import sys
3
- import importlib
4
-
5
- # List of modules to test
6
- modules_to_test = [
7
- "consensus_gaia_agent",
8
- "advanced_agent",
9
- "app",
10
- "gaia_agent",
11
- "simplified_gaia_agent",
12
- "framework_gaia_agent"
13
- ]
14
-
15
- def verify_fix():
16
- """Verify that all modules are using search_and_contents instead of search with text=True"""
17
- print("Verifying Exa API parameter fix...")
18
-
19
- # Check if Exa is available
20
- try:
21
- from exa_py import Exa
22
- EXA_AVAILABLE = True
23
- except ImportError:
24
- print("❌ Exa not available - install with: pip install exa-py")
25
- return
26
-
27
- # Initialize Exa
28
- exa_api_key = os.getenv("EXA_API_KEY")
29
- if not exa_api_key:
30
- print("❌ EXA_API_KEY not found in environment")
31
- return
32
-
33
- # Test each module
34
- for module_name in modules_to_test:
35
- print(f"\nChecking {module_name}...")
36
- try:
37
- # Import the module
38
- module = importlib.import_module(module_name)
39
-
40
- # Check if the module has a class that uses Exa
41
- for attr_name in dir(module):
42
- attr = getattr(module, attr_name)
43
- if isinstance(attr, type) and attr_name not in ["Exa", "TavilyClient", "DDGS"]:
44
- # Check if this class has an __init__ method
45
- if hasattr(attr, "__init__"):
46
- print(f" - Found class: {attr_name}")
47
-
48
- # Create an instance of the class
49
- try:
50
- instance = attr()
51
-
52
- # Check if the instance has an exa attribute
53
- if hasattr(instance, "exa"):
54
- print(f" ✅ Class has exa attribute")
55
-
56
- # Check if we can run a search
57
- try:
58
- query = "artificial intelligence"
59
- print(f" 🔍 Testing search with query: '{query}'")
60
-
61
- # This will work if the class is using search_and_contents
62
- results = instance.exa.search_and_contents(query, num_results=1)
63
-
64
- if results and hasattr(results, 'results'):
65
- print(f" ✅ Search successful! Found {len(results.results)} results")
66
- for result in results.results:
67
- if hasattr(result, 'text') and result.text:
68
- print(f" ✅ Result has text content")
69
- else:
70
- print(f" ❌ Result does not have text content")
71
- else:
72
- print(f" ❌ No results found")
73
- except Exception as e:
74
- print(f" ❌ Search error: {e}")
75
- else:
76
- print(f" ⚠️ Class does not have exa attribute")
77
- except Exception as e:
78
- print(f" ❌ Could not create instance: {e}")
79
- except Exception as e:
80
- print(f"❌ Error checking {module_name}: {e}")
81
-
82
- print("\nVerification complete!")
83
-
84
- if __name__ == "__main__":
85
- verify_fix()