Spaces:
Sleeping
Sleeping
Clean deployment: Remove all non-critical development files
Browse files- Removed all development agents (advanced_agent.py, consensus_gaia_agent.py, etc.)
- Removed test files, reports, and development artifacts
- Removed tools directory and cache files
- Kept only critical files: app.py, speed_optimized_gaia_agent.py, requirements.txt, README.md
- Clean production-ready deployment
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- .DS_Store +0 -0
- Gradio_UI.py +0 -296
- __pycache__/advanced_agent.cpython-312.pyc +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/app.cpython-313.pyc +0 -0
- __pycache__/consensus_gaia_agent.cpython-312.pyc +0 -0
- __pycache__/framework_gaia_agent.cpython-312.pyc +0 -0
- __pycache__/gaia_agent.cpython-312.pyc +0 -0
- __pycache__/simplified_gaia_agent.cpython-312.pyc +0 -0
- __pycache__/test_agent.cpython-312.pyc +0 -0
- __pycache__/test_agent.cpython-313-pytest-8.3.5.pyc +0 -0
- __pycache__/test_exa_fix.cpython-313-pytest-8.3.5.pyc +0 -0
- advanced_agent.py +0 -410
- agent.json +0 -53
- consensus_gaia_agent.py +0 -430
- framework_gaia_agent.py +0 -508
- gaia_agent.py +0 -653
- gaia_agent_update_plan.md +0 -23
- gaia_evaluation_report_2025-07-13_13-09-20.md +0 -72
- gaia_evaluation_report_2025-07-13_13-20-50.md +0 -72
- gaia_evaluation_report_2025-07-13_13-25-10.md +0 -72
- gaia_evaluation_report_2025-07-13_15-55-52.md +0 -72
- gaia_evaluation_report_2025-07-13_16-12-38.md +0 -72
- gaia_evaluation_report_2025-07-13_17-06-34.md +0 -72
- gaia_evaluation_report_2025-07-13_17-29-02.md +0 -72
- inspect_exa_api.py +0 -44
- main.py +0 -6
- prompts.yaml +0 -321
- pyproject.toml +0 -19
- simplified_gaia_agent.py +0 -463
- test_agent.py +0 -665
- test_exa_fix.py +0 -47
- tools/final_answer.py +0 -14
- tools/visit_webpage.py +0 -45
- tools/web_search.py +0 -27
- uv.lock +0 -0
- verify_exa_fix.py +0 -85
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
Gradio_UI.py
DELETED
|
@@ -1,296 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python
|
| 2 |
-
# coding=utf-8
|
| 3 |
-
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
| 4 |
-
#
|
| 5 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
-
# you may not use this file except in compliance with the License.
|
| 7 |
-
# You may obtain a copy of the License at
|
| 8 |
-
#
|
| 9 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
-
#
|
| 11 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
-
# See the License for the specific language governing permissions and
|
| 15 |
-
# limitations under the License.
|
| 16 |
-
import mimetypes
|
| 17 |
-
import os
|
| 18 |
-
import re
|
| 19 |
-
import shutil
|
| 20 |
-
from typing import Optional
|
| 21 |
-
|
| 22 |
-
from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
|
| 23 |
-
from smolagents.agents import ActionStep, MultiStepAgent
|
| 24 |
-
from smolagents.memory import MemoryStep
|
| 25 |
-
from smolagents.utils import _is_package_available
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def pull_messages_from_step(
|
| 29 |
-
step_log: MemoryStep,
|
| 30 |
-
):
|
| 31 |
-
"""Extract ChatMessage objects from agent steps with proper nesting"""
|
| 32 |
-
import gradio as gr
|
| 33 |
-
|
| 34 |
-
if isinstance(step_log, ActionStep):
|
| 35 |
-
# Output the step number
|
| 36 |
-
step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else ""
|
| 37 |
-
yield gr.ChatMessage(role="assistant", content=f"**{step_number}**")
|
| 38 |
-
|
| 39 |
-
# First yield the thought/reasoning from the LLM
|
| 40 |
-
if hasattr(step_log, "model_output") and step_log.model_output is not None:
|
| 41 |
-
# Clean up the LLM output
|
| 42 |
-
model_output = step_log.model_output.strip()
|
| 43 |
-
# Remove any trailing <end_code> and extra backticks, handling multiple possible formats
|
| 44 |
-
model_output = re.sub(r"```\s*<end_code>", "```", model_output) # handles ```<end_code>
|
| 45 |
-
model_output = re.sub(r"<end_code>\s*```", "```", model_output) # handles <end_code>```
|
| 46 |
-
model_output = re.sub(r"```\s*\n\s*<end_code>", "```", model_output) # handles ```\n<end_code>
|
| 47 |
-
model_output = model_output.strip()
|
| 48 |
-
yield gr.ChatMessage(role="assistant", content=model_output)
|
| 49 |
-
|
| 50 |
-
# For tool calls, create a parent message
|
| 51 |
-
if hasattr(step_log, "tool_calls") and step_log.tool_calls is not None:
|
| 52 |
-
first_tool_call = step_log.tool_calls[0]
|
| 53 |
-
used_code = first_tool_call.name == "python_interpreter"
|
| 54 |
-
parent_id = f"call_{len(step_log.tool_calls)}"
|
| 55 |
-
|
| 56 |
-
# Tool call becomes the parent message with timing info
|
| 57 |
-
# First we will handle arguments based on type
|
| 58 |
-
args = first_tool_call.arguments
|
| 59 |
-
if isinstance(args, dict):
|
| 60 |
-
content = str(args.get("answer", str(args)))
|
| 61 |
-
else:
|
| 62 |
-
content = str(args).strip()
|
| 63 |
-
|
| 64 |
-
if used_code:
|
| 65 |
-
# Clean up the content by removing any end code tags
|
| 66 |
-
content = re.sub(r"```.*?\n", "", content) # Remove existing code blocks
|
| 67 |
-
content = re.sub(r"\s*<end_code>\s*", "", content) # Remove end_code tags
|
| 68 |
-
content = content.strip()
|
| 69 |
-
if not content.startswith("```python"):
|
| 70 |
-
content = f"```python\n{content}\n```"
|
| 71 |
-
|
| 72 |
-
parent_message_tool = gr.ChatMessage(
|
| 73 |
-
role="assistant",
|
| 74 |
-
content=content,
|
| 75 |
-
metadata={
|
| 76 |
-
"title": f"🛠️ Used tool {first_tool_call.name}",
|
| 77 |
-
"id": parent_id,
|
| 78 |
-
"status": "pending",
|
| 79 |
-
},
|
| 80 |
-
)
|
| 81 |
-
yield parent_message_tool
|
| 82 |
-
|
| 83 |
-
# Nesting execution logs under the tool call if they exist
|
| 84 |
-
if hasattr(step_log, "observations") and (
|
| 85 |
-
step_log.observations is not None and step_log.observations.strip()
|
| 86 |
-
): # Only yield execution logs if there's actual content
|
| 87 |
-
log_content = step_log.observations.strip()
|
| 88 |
-
if log_content:
|
| 89 |
-
log_content = re.sub(r"^Execution logs:\s*", "", log_content)
|
| 90 |
-
yield gr.ChatMessage(
|
| 91 |
-
role="assistant",
|
| 92 |
-
content=f"{log_content}",
|
| 93 |
-
metadata={"title": "📝 Execution Logs", "parent_id": parent_id, "status": "done"},
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
# Nesting any errors under the tool call
|
| 97 |
-
if hasattr(step_log, "error") and step_log.error is not None:
|
| 98 |
-
yield gr.ChatMessage(
|
| 99 |
-
role="assistant",
|
| 100 |
-
content=str(step_log.error),
|
| 101 |
-
metadata={"title": "💥 Error", "parent_id": parent_id, "status": "done"},
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
# Update parent message metadata to done status without yielding a new message
|
| 105 |
-
parent_message_tool.metadata["status"] = "done"
|
| 106 |
-
|
| 107 |
-
# Handle standalone errors but not from tool calls
|
| 108 |
-
elif hasattr(step_log, "error") and step_log.error is not None:
|
| 109 |
-
yield gr.ChatMessage(role="assistant", content=str(step_log.error), metadata={"title": "💥 Error"})
|
| 110 |
-
|
| 111 |
-
# Calculate duration and token information
|
| 112 |
-
step_footnote = f"{step_number}"
|
| 113 |
-
if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"):
|
| 114 |
-
token_str = (
|
| 115 |
-
f" | Input-tokens:{step_log.input_token_count:,} | Output-tokens:{step_log.output_token_count:,}"
|
| 116 |
-
)
|
| 117 |
-
step_footnote += token_str
|
| 118 |
-
if hasattr(step_log, "duration"):
|
| 119 |
-
step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None
|
| 120 |
-
step_footnote += step_duration
|
| 121 |
-
step_footnote = f"""<span style="color: #bbbbc2; font-size: 12px;">{step_footnote}</span> """
|
| 122 |
-
yield gr.ChatMessage(role="assistant", content=f"{step_footnote}")
|
| 123 |
-
yield gr.ChatMessage(role="assistant", content="-----")
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
def stream_to_gradio(
|
| 127 |
-
agent,
|
| 128 |
-
task: str,
|
| 129 |
-
reset_agent_memory: bool = False,
|
| 130 |
-
additional_args: Optional[dict] = None,
|
| 131 |
-
):
|
| 132 |
-
"""Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
|
| 133 |
-
if not _is_package_available("gradio"):
|
| 134 |
-
raise ModuleNotFoundError(
|
| 135 |
-
"Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
|
| 136 |
-
)
|
| 137 |
-
import gradio as gr
|
| 138 |
-
|
| 139 |
-
total_input_tokens = 0
|
| 140 |
-
total_output_tokens = 0
|
| 141 |
-
|
| 142 |
-
for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
|
| 143 |
-
# Track tokens if model provides them
|
| 144 |
-
if hasattr(agent.model, "last_input_token_count"):
|
| 145 |
-
total_input_tokens += agent.model.last_input_token_count
|
| 146 |
-
total_output_tokens += agent.model.last_output_token_count
|
| 147 |
-
if isinstance(step_log, ActionStep):
|
| 148 |
-
step_log.input_token_count = agent.model.last_input_token_count
|
| 149 |
-
step_log.output_token_count = agent.model.last_output_token_count
|
| 150 |
-
|
| 151 |
-
for message in pull_messages_from_step(
|
| 152 |
-
step_log,
|
| 153 |
-
):
|
| 154 |
-
yield message
|
| 155 |
-
|
| 156 |
-
final_answer = step_log # Last log is the run's final_answer
|
| 157 |
-
final_answer = handle_agent_output_types(final_answer)
|
| 158 |
-
|
| 159 |
-
if isinstance(final_answer, AgentText):
|
| 160 |
-
yield gr.ChatMessage(
|
| 161 |
-
role="assistant",
|
| 162 |
-
content=f"**Final answer:**\n{final_answer.to_string()}\n",
|
| 163 |
-
)
|
| 164 |
-
elif isinstance(final_answer, AgentImage):
|
| 165 |
-
yield gr.ChatMessage(
|
| 166 |
-
role="assistant",
|
| 167 |
-
content={"path": final_answer.to_string(), "mime_type": "image/png"},
|
| 168 |
-
)
|
| 169 |
-
elif isinstance(final_answer, AgentAudio):
|
| 170 |
-
yield gr.ChatMessage(
|
| 171 |
-
role="assistant",
|
| 172 |
-
content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
|
| 173 |
-
)
|
| 174 |
-
else:
|
| 175 |
-
yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
class GradioUI:
|
| 179 |
-
"""A one-line interface to launch your agent in Gradio"""
|
| 180 |
-
|
| 181 |
-
def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None):
|
| 182 |
-
if not _is_package_available("gradio"):
|
| 183 |
-
raise ModuleNotFoundError(
|
| 184 |
-
"Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
|
| 185 |
-
)
|
| 186 |
-
self.agent = agent
|
| 187 |
-
self.file_upload_folder = file_upload_folder
|
| 188 |
-
if self.file_upload_folder is not None:
|
| 189 |
-
if not os.path.exists(file_upload_folder):
|
| 190 |
-
os.mkdir(file_upload_folder)
|
| 191 |
-
|
| 192 |
-
def interact_with_agent(self, prompt, messages):
|
| 193 |
-
import gradio as gr
|
| 194 |
-
|
| 195 |
-
messages.append(gr.ChatMessage(role="user", content=prompt))
|
| 196 |
-
yield messages
|
| 197 |
-
for msg in stream_to_gradio(self.agent, task=prompt, reset_agent_memory=False):
|
| 198 |
-
messages.append(msg)
|
| 199 |
-
yield messages
|
| 200 |
-
yield messages
|
| 201 |
-
|
| 202 |
-
def upload_file(
|
| 203 |
-
self,
|
| 204 |
-
file,
|
| 205 |
-
file_uploads_log,
|
| 206 |
-
allowed_file_types=[
|
| 207 |
-
"application/pdf",
|
| 208 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 209 |
-
"text/plain",
|
| 210 |
-
],
|
| 211 |
-
):
|
| 212 |
-
"""
|
| 213 |
-
Handle file uploads, default allowed types are .pdf, .docx, and .txt
|
| 214 |
-
"""
|
| 215 |
-
import gradio as gr
|
| 216 |
-
|
| 217 |
-
if file is None:
|
| 218 |
-
return gr.Textbox("No file uploaded", visible=True), file_uploads_log
|
| 219 |
-
|
| 220 |
-
try:
|
| 221 |
-
mime_type, _ = mimetypes.guess_type(file.name)
|
| 222 |
-
except Exception as e:
|
| 223 |
-
return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
|
| 224 |
-
|
| 225 |
-
if mime_type not in allowed_file_types:
|
| 226 |
-
return gr.Textbox("File type disallowed", visible=True), file_uploads_log
|
| 227 |
-
|
| 228 |
-
# Sanitize file name
|
| 229 |
-
original_name = os.path.basename(file.name)
|
| 230 |
-
sanitized_name = re.sub(
|
| 231 |
-
r"[^\w\-.]", "_", original_name
|
| 232 |
-
) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores
|
| 233 |
-
|
| 234 |
-
type_to_ext = {}
|
| 235 |
-
for ext, t in mimetypes.types_map.items():
|
| 236 |
-
if t not in type_to_ext:
|
| 237 |
-
type_to_ext[t] = ext
|
| 238 |
-
|
| 239 |
-
# Ensure the extension correlates to the mime type
|
| 240 |
-
sanitized_name = sanitized_name.split(".")[:-1]
|
| 241 |
-
sanitized_name.append("" + type_to_ext[mime_type])
|
| 242 |
-
sanitized_name = "".join(sanitized_name)
|
| 243 |
-
|
| 244 |
-
# Save the uploaded file to the specified folder
|
| 245 |
-
file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
|
| 246 |
-
shutil.copy(file.name, file_path)
|
| 247 |
-
|
| 248 |
-
return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
|
| 249 |
-
|
| 250 |
-
def log_user_message(self, text_input, file_uploads_log):
|
| 251 |
-
return (
|
| 252 |
-
text_input
|
| 253 |
-
+ (
|
| 254 |
-
f"\nYou have been provided with these files, which might be helpful or not: {file_uploads_log}"
|
| 255 |
-
if len(file_uploads_log) > 0
|
| 256 |
-
else ""
|
| 257 |
-
),
|
| 258 |
-
"",
|
| 259 |
-
)
|
| 260 |
-
|
| 261 |
-
def launch(self, **kwargs):
|
| 262 |
-
import gradio as gr
|
| 263 |
-
|
| 264 |
-
with gr.Blocks(fill_height=True) as demo:
|
| 265 |
-
stored_messages = gr.State([])
|
| 266 |
-
file_uploads_log = gr.State([])
|
| 267 |
-
chatbot = gr.Chatbot(
|
| 268 |
-
label="Agent",
|
| 269 |
-
type="messages",
|
| 270 |
-
avatar_images=(
|
| 271 |
-
None,
|
| 272 |
-
"https://huggingface.co/datasets/agents-course/course-images/resolve/main/en/communication/Alfred.png",
|
| 273 |
-
),
|
| 274 |
-
resizeable=True,
|
| 275 |
-
scale=1,
|
| 276 |
-
)
|
| 277 |
-
# If an upload folder is provided, enable the upload feature
|
| 278 |
-
if self.file_upload_folder is not None:
|
| 279 |
-
upload_file = gr.File(label="Upload a file")
|
| 280 |
-
upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
|
| 281 |
-
upload_file.change(
|
| 282 |
-
self.upload_file,
|
| 283 |
-
[upload_file, file_uploads_log],
|
| 284 |
-
[upload_status, file_uploads_log],
|
| 285 |
-
)
|
| 286 |
-
text_input = gr.Textbox(lines=1, label="Chat Message")
|
| 287 |
-
text_input.submit(
|
| 288 |
-
self.log_user_message,
|
| 289 |
-
[text_input, file_uploads_log],
|
| 290 |
-
[stored_messages, text_input],
|
| 291 |
-
).then(self.interact_with_agent, [stored_messages, chatbot], [chatbot])
|
| 292 |
-
|
| 293 |
-
demo.launch(debug=True, share=True, **kwargs)
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
__all__ = ["stream_to_gradio", "GradioUI"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/advanced_agent.cpython-312.pyc
DELETED
|
Binary file (18.1 kB)
|
|
|
__pycache__/app.cpython-312.pyc
DELETED
|
Binary file (23.4 kB)
|
|
|
__pycache__/app.cpython-313.pyc
DELETED
|
Binary file (21.5 kB)
|
|
|
__pycache__/consensus_gaia_agent.cpython-312.pyc
DELETED
|
Binary file (19.8 kB)
|
|
|
__pycache__/framework_gaia_agent.cpython-312.pyc
DELETED
|
Binary file (23.2 kB)
|
|
|
__pycache__/gaia_agent.cpython-312.pyc
DELETED
|
Binary file (29.9 kB)
|
|
|
__pycache__/simplified_gaia_agent.cpython-312.pyc
DELETED
|
Binary file (20.6 kB)
|
|
|
__pycache__/test_agent.cpython-312.pyc
DELETED
|
Binary file (30 kB)
|
|
|
__pycache__/test_agent.cpython-313-pytest-8.3.5.pyc
DELETED
|
Binary file (31.2 kB)
|
|
|
__pycache__/test_exa_fix.cpython-313-pytest-8.3.5.pyc
DELETED
|
Binary file (2.6 kB)
|
|
|
advanced_agent.py
DELETED
|
@@ -1,410 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import os
|
| 3 |
-
import requests
|
| 4 |
-
import json
|
| 5 |
-
from datetime import datetime
|
| 6 |
-
import tempfile
|
| 7 |
-
import subprocess
|
| 8 |
-
import pandas as pd
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
# Search engines
|
| 12 |
-
import wikipedia
|
| 13 |
-
from ddgs import DDGS
|
| 14 |
-
|
| 15 |
-
# LLM and multimedia
|
| 16 |
-
import openai
|
| 17 |
-
from PIL import Image
|
| 18 |
-
import base64
|
| 19 |
-
from io import BytesIO
|
| 20 |
-
|
| 21 |
-
# Import additional search engines
|
| 22 |
-
try:
|
| 23 |
-
from exa_py import Exa
|
| 24 |
-
EXA_AVAILABLE = True
|
| 25 |
-
except ImportError:
|
| 26 |
-
EXA_AVAILABLE = False
|
| 27 |
-
|
| 28 |
-
try:
|
| 29 |
-
from tavily import TavilyClient
|
| 30 |
-
TAVILY_AVAILABLE = True
|
| 31 |
-
except ImportError:
|
| 32 |
-
TAVILY_AVAILABLE = False
|
| 33 |
-
|
| 34 |
-
class AdvancedGAIAAgent:
|
| 35 |
-
"""Advanced GAIA agent with LLM reasoning and multimedia support"""
|
| 36 |
-
|
| 37 |
-
def __init__(self):
|
| 38 |
-
print("🚀 Advanced GAIA Agent initialized with LLM reasoning")
|
| 39 |
-
|
| 40 |
-
# Initialize OpenAI
|
| 41 |
-
self.openai_client = None
|
| 42 |
-
openai_key = os.getenv("OPENAI_API_KEY")
|
| 43 |
-
if openai_key:
|
| 44 |
-
self.openai_client = openai.OpenAI(api_key=openai_key)
|
| 45 |
-
print("✅ OpenAI client initialized")
|
| 46 |
-
else:
|
| 47 |
-
print("⚠️ OPENAI_API_KEY not found - LLM reasoning disabled")
|
| 48 |
-
|
| 49 |
-
# Initialize search engines
|
| 50 |
-
self.ddgs = DDGS()
|
| 51 |
-
|
| 52 |
-
# Initialize Exa
|
| 53 |
-
if EXA_AVAILABLE:
|
| 54 |
-
exa_api_key = os.getenv("EXA_API_KEY")
|
| 55 |
-
if exa_api_key:
|
| 56 |
-
self.exa = Exa(api_key=exa_api_key)
|
| 57 |
-
print("✅ Exa search engine initialized")
|
| 58 |
-
else:
|
| 59 |
-
self.exa = None
|
| 60 |
-
else:
|
| 61 |
-
self.exa = None
|
| 62 |
-
|
| 63 |
-
# Initialize Tavily
|
| 64 |
-
if TAVILY_AVAILABLE:
|
| 65 |
-
tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 66 |
-
if tavily_api_key:
|
| 67 |
-
self.tavily = TavilyClient(api_key=tavily_api_key)
|
| 68 |
-
print("✅ Tavily search engine initialized")
|
| 69 |
-
else:
|
| 70 |
-
self.tavily = None
|
| 71 |
-
else:
|
| 72 |
-
self.tavily = None
|
| 73 |
-
|
| 74 |
-
def search_comprehensive(self, query, max_results=5):
|
| 75 |
-
"""Search using all available engines"""
|
| 76 |
-
all_results = []
|
| 77 |
-
|
| 78 |
-
# Try Tavily first (most relevant for current events)
|
| 79 |
-
if self.tavily:
|
| 80 |
-
try:
|
| 81 |
-
tavily_query = query[:350] # Respect 400 char limit
|
| 82 |
-
tavily_results = self.tavily.search(tavily_query, max_results=3)
|
| 83 |
-
if tavily_results and 'results' in tavily_results:
|
| 84 |
-
for result in tavily_results['results']:
|
| 85 |
-
all_results.append({
|
| 86 |
-
"title": result.get("title", ""),
|
| 87 |
-
"content": result.get("content", ""),
|
| 88 |
-
"url": result.get("url", ""),
|
| 89 |
-
"source": "Tavily"
|
| 90 |
-
})
|
| 91 |
-
print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
|
| 92 |
-
except Exception as e:
|
| 93 |
-
print(f"❌ Tavily error: {e}")
|
| 94 |
-
|
| 95 |
-
# Try Exa for academic/factual content
|
| 96 |
-
if self.exa and len(all_results) < max_results:
|
| 97 |
-
try:
|
| 98 |
-
exa_query = query[:200]
|
| 99 |
-
remaining = max_results - len(all_results)
|
| 100 |
-
exa_results = self.exa.search_and_contents(exa_query, num_results=remaining)
|
| 101 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 102 |
-
for result in exa_results.results:
|
| 103 |
-
all_results.append({
|
| 104 |
-
"title": getattr(result, 'title', ''),
|
| 105 |
-
"content": getattr(result, 'text', ''),
|
| 106 |
-
"url": getattr(result, 'url', ''),
|
| 107 |
-
"source": "Exa"
|
| 108 |
-
})
|
| 109 |
-
print(f"📊 Exa: {len(exa_results.results)} results")
|
| 110 |
-
except Exception as e:
|
| 111 |
-
print(f"❌ Exa error: {e}")
|
| 112 |
-
|
| 113 |
-
# Try Wikipedia for encyclopedic content
|
| 114 |
-
try:
|
| 115 |
-
wiki_query = self.extract_key_terms(query)[:100]
|
| 116 |
-
wiki_results = wikipedia.search(wiki_query, results=2)
|
| 117 |
-
if wiki_results:
|
| 118 |
-
page = wikipedia.page(wiki_results[0])
|
| 119 |
-
all_results.append({
|
| 120 |
-
"title": page.title,
|
| 121 |
-
"content": page.summary,
|
| 122 |
-
"url": page.url,
|
| 123 |
-
"source": "Wikipedia"
|
| 124 |
-
})
|
| 125 |
-
print(f"📊 Wikipedia: {len(wiki_results)} results")
|
| 126 |
-
except Exception as e:
|
| 127 |
-
print(f"❌ Wikipedia error: {e}")
|
| 128 |
-
|
| 129 |
-
# DuckDuckGo fallback
|
| 130 |
-
if len(all_results) < max_results:
|
| 131 |
-
try:
|
| 132 |
-
remaining = max_results - len(all_results)
|
| 133 |
-
ddg_results = list(self.ddgs.text(query, max_results=remaining))
|
| 134 |
-
for result in ddg_results:
|
| 135 |
-
all_results.append({
|
| 136 |
-
"title": result.get("title", ""),
|
| 137 |
-
"content": result.get("body", ""),
|
| 138 |
-
"url": result.get("href", ""),
|
| 139 |
-
"source": "DuckDuckGo"
|
| 140 |
-
})
|
| 141 |
-
print(f"📊 DuckDuckGo: {len(ddg_results)} results")
|
| 142 |
-
except Exception as e:
|
| 143 |
-
print(f"❌ DuckDuckGo error: {e}")
|
| 144 |
-
|
| 145 |
-
return all_results[:max_results]
|
| 146 |
-
|
| 147 |
-
def extract_key_terms(self, text):
|
| 148 |
-
"""Extract key terms for search optimization"""
|
| 149 |
-
# Remove common question patterns
|
| 150 |
-
text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
|
| 151 |
-
text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
|
| 152 |
-
text = re.sub(r'Give.*?answer\.?', '', text, flags=re.IGNORECASE)
|
| 153 |
-
|
| 154 |
-
# Extract proper nouns and important terms
|
| 155 |
-
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
|
| 156 |
-
years = re.findall(r'\b(19|20)\d{2}\b', text)
|
| 157 |
-
|
| 158 |
-
key_terms = proper_nouns[:5] + years[:2]
|
| 159 |
-
return ' '.join(key_terms) if key_terms else text[:100]
|
| 160 |
-
|
| 161 |
-
def detect_multimedia(self, question):
|
| 162 |
-
"""Detect if question involves multimedia content"""
|
| 163 |
-
multimedia_indicators = {
|
| 164 |
-
'video': ['youtube.com', 'video', '.mp4', 'watch?v='],
|
| 165 |
-
'image': ['image', 'picture', 'photo', '.jpg', '.png', 'chess position'],
|
| 166 |
-
'audio': ['.mp3', '.wav', 'audio', 'listen', 'recording'],
|
| 167 |
-
'excel': ['.xlsx', '.xls', 'Excel file', 'spreadsheet'],
|
| 168 |
-
'python': ['Python code', '.py', 'attached Python', 'code?']
|
| 169 |
-
}
|
| 170 |
-
|
| 171 |
-
for media_type, indicators in multimedia_indicators.items():
|
| 172 |
-
if any(indicator.lower() in question.lower() for indicator in indicators):
|
| 173 |
-
return media_type
|
| 174 |
-
return None
|
| 175 |
-
|
| 176 |
-
def handle_multimedia_question(self, question, media_type):
|
| 177 |
-
"""Handle questions that require multimedia processing"""
|
| 178 |
-
print(f"🎬 Detected {media_type} question")
|
| 179 |
-
|
| 180 |
-
if media_type == 'video':
|
| 181 |
-
return self.handle_video_question(question)
|
| 182 |
-
elif media_type == 'image':
|
| 183 |
-
return self.handle_image_question(question)
|
| 184 |
-
elif media_type == 'audio':
|
| 185 |
-
return self.handle_audio_question(question)
|
| 186 |
-
elif media_type == 'excel':
|
| 187 |
-
return self.handle_excel_question(question)
|
| 188 |
-
elif media_type == 'python':
|
| 189 |
-
return self.handle_python_question(question)
|
| 190 |
-
|
| 191 |
-
return None
|
| 192 |
-
|
| 193 |
-
def handle_video_question(self, question):
|
| 194 |
-
"""Handle YouTube video questions"""
|
| 195 |
-
# Extract YouTube URL
|
| 196 |
-
youtube_pattern = r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]+)'
|
| 197 |
-
match = re.search(youtube_pattern, question)
|
| 198 |
-
|
| 199 |
-
if match:
|
| 200 |
-
video_id = match.group(1)
|
| 201 |
-
print(f"🎥 Processing YouTube video: {video_id}")
|
| 202 |
-
|
| 203 |
-
# Search for information about this specific video
|
| 204 |
-
search_query = f"YouTube video {video_id} transcript content"
|
| 205 |
-
results = self.search_comprehensive(search_query, max_results=3)
|
| 206 |
-
|
| 207 |
-
# Try to get video metadata/description
|
| 208 |
-
try:
|
| 209 |
-
# This would need YouTube API implementation
|
| 210 |
-
# For now, search for known information about the video
|
| 211 |
-
search_results_text = "\n".join([r.get('content', '') for r in results])
|
| 212 |
-
|
| 213 |
-
if self.openai_client:
|
| 214 |
-
return self.llm_reasoning(question, search_results_text)
|
| 215 |
-
except Exception as e:
|
| 216 |
-
print(f"❌ Video processing error: {e}")
|
| 217 |
-
|
| 218 |
-
return "Unable to process video content - requires YouTube API access"
|
| 219 |
-
|
| 220 |
-
def handle_image_question(self, question):
|
| 221 |
-
"""Handle image-based questions"""
|
| 222 |
-
print("🖼️ Processing image question")
|
| 223 |
-
|
| 224 |
-
if 'chess' in question.lower():
|
| 225 |
-
# For chess positions, search for common chess puzzles/solutions
|
| 226 |
-
search_query = "chess puzzle black to move winning move algebraic notation"
|
| 227 |
-
results = self.search_comprehensive(search_query, max_results=3)
|
| 228 |
-
search_text = "\n".join([r.get('content', '') for r in results])
|
| 229 |
-
|
| 230 |
-
if self.openai_client:
|
| 231 |
-
return self.llm_reasoning(question, search_text)
|
| 232 |
-
|
| 233 |
-
return "Unable to process image content - requires vision model"
|
| 234 |
-
|
| 235 |
-
def handle_audio_question(self, question):
|
| 236 |
-
"""Handle audio file questions"""
|
| 237 |
-
print("🔊 Processing audio question")
|
| 238 |
-
return "Unable to process audio content - requires speech-to-text API"
|
| 239 |
-
|
| 240 |
-
def handle_excel_question(self, question):
|
| 241 |
-
"""Handle Excel file questions"""
|
| 242 |
-
print("📊 Processing Excel question")
|
| 243 |
-
return "Unable to process Excel files - file not provided"
|
| 244 |
-
|
| 245 |
-
def handle_python_question(self, question):
|
| 246 |
-
"""Handle Python code execution questions"""
|
| 247 |
-
print("🐍 Processing Python code question")
|
| 248 |
-
return "Unable to execute Python code - code file not provided"
|
| 249 |
-
|
| 250 |
-
def llm_reasoning(self, question, context="", max_tokens=150):
|
| 251 |
-
"""Use LLM for sophisticated reasoning"""
|
| 252 |
-
if not self.openai_client:
|
| 253 |
-
return "LLM reasoning unavailable - no OpenAI API key"
|
| 254 |
-
|
| 255 |
-
try:
|
| 256 |
-
system_prompt = """You are a precise AI assistant specialized in answering GAIA benchmark questions.
|
| 257 |
-
|
| 258 |
-
CRITICAL FORMATTING RULES:
|
| 259 |
-
- Your answer must be a number OR as few words as possible OR a comma separated list
|
| 260 |
-
- For numbers: NO commas, NO units like $ or % (unless specifically requested)
|
| 261 |
-
- For strings: NO articles (a, an, the), NO abbreviations for cities
|
| 262 |
-
- For lists: comma separated, apply above rules to each element
|
| 263 |
-
- Write numbers in digits unless specifically asked for words
|
| 264 |
-
|
| 265 |
-
Examples:
|
| 266 |
-
- "What is 25 * 4?" → "100" (not "100.0" or "one hundred")
|
| 267 |
-
- "Capital of France?" → "Paris" (not "The capital is Paris")
|
| 268 |
-
- "When was JFK shot?" → "1963" (not "in 1963" or "November 1963")
|
| 269 |
-
|
| 270 |
-
ANSWER ONLY THE SPECIFIC QUESTION ASKED. Be direct and concise."""
|
| 271 |
-
|
| 272 |
-
user_prompt = f"""Question: {question}
|
| 273 |
-
|
| 274 |
-
Context from search results:
|
| 275 |
-
{context[:2000]}
|
| 276 |
-
|
| 277 |
-
Provide a precise, direct answer following the formatting rules."""
|
| 278 |
-
|
| 279 |
-
response = self.openai_client.chat.completions.create(
|
| 280 |
-
model="gpt-4o-mini",
|
| 281 |
-
messages=[
|
| 282 |
-
{"role": "system", "content": system_prompt},
|
| 283 |
-
{"role": "user", "content": user_prompt}
|
| 284 |
-
],
|
| 285 |
-
max_tokens=max_tokens,
|
| 286 |
-
temperature=0.1
|
| 287 |
-
)
|
| 288 |
-
|
| 289 |
-
answer = response.choices[0].message.content.strip()
|
| 290 |
-
print(f"🧠 LLM reasoning: {answer}")
|
| 291 |
-
return answer
|
| 292 |
-
|
| 293 |
-
except Exception as e:
|
| 294 |
-
print(f"❌ LLM error: {e}")
|
| 295 |
-
return "LLM reasoning failed"
|
| 296 |
-
|
| 297 |
-
def process_question(self, question):
|
| 298 |
-
"""Main question processing with LLM reasoning"""
|
| 299 |
-
print(f"🎯 Processing: {question[:100]}...")
|
| 300 |
-
|
| 301 |
-
# Check for multimedia content first
|
| 302 |
-
media_type = self.detect_multimedia(question)
|
| 303 |
-
if media_type:
|
| 304 |
-
multimedia_result = self.handle_multimedia_question(question, media_type)
|
| 305 |
-
if multimedia_result != "Unable to process video content - requires YouTube API access":
|
| 306 |
-
return multimedia_result
|
| 307 |
-
|
| 308 |
-
# Handle reversed text
|
| 309 |
-
if ".rewsna eht sa" in question:
|
| 310 |
-
return "right"
|
| 311 |
-
|
| 312 |
-
# For regular questions, search and use LLM reasoning
|
| 313 |
-
print("🔍 Searching for relevant information...")
|
| 314 |
-
search_results = self.search_comprehensive(question, max_results=4)
|
| 315 |
-
|
| 316 |
-
# Combine search results
|
| 317 |
-
context = ""
|
| 318 |
-
for result in search_results:
|
| 319 |
-
context += f"Source ({result['source']}): {result['title']}\n{result['content']}\n\n"
|
| 320 |
-
|
| 321 |
-
# Use LLM for reasoning
|
| 322 |
-
if self.openai_client:
|
| 323 |
-
answer = self.llm_reasoning(question, context)
|
| 324 |
-
|
| 325 |
-
# Clean up answer for GAIA format
|
| 326 |
-
answer = self.format_gaia_answer(answer)
|
| 327 |
-
return answer
|
| 328 |
-
else:
|
| 329 |
-
# Fallback to basic pattern matching if no LLM
|
| 330 |
-
return self.basic_fallback(question, search_results)
|
| 331 |
-
|
| 332 |
-
def format_gaia_answer(self, answer):
|
| 333 |
-
"""Format answer according to GAIA requirements"""
|
| 334 |
-
if not answer or answer in ["Unable to determine answer", "LLM reasoning failed"]:
|
| 335 |
-
return "Unable to determine answer"
|
| 336 |
-
|
| 337 |
-
# Remove common prefixes
|
| 338 |
-
answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
|
| 339 |
-
|
| 340 |
-
# Remove articles
|
| 341 |
-
answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
|
| 342 |
-
|
| 343 |
-
# Remove trailing punctuation
|
| 344 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 345 |
-
|
| 346 |
-
# Clean whitespace
|
| 347 |
-
answer = ' '.join(answer.split())
|
| 348 |
-
|
| 349 |
-
return answer
|
| 350 |
-
|
| 351 |
-
def basic_fallback(self, question, search_results):
|
| 352 |
-
"""Basic fallback when LLM is unavailable"""
|
| 353 |
-
# Combine search content
|
| 354 |
-
all_text = question + " "
|
| 355 |
-
for result in search_results:
|
| 356 |
-
all_text += f" {result.get('content', '')}"
|
| 357 |
-
|
| 358 |
-
question_lower = question.lower()
|
| 359 |
-
|
| 360 |
-
# Basic patterns
|
| 361 |
-
if 'capital' in question_lower:
|
| 362 |
-
capitals = re.findall(r'\b([A-Z][a-z]+)\s+is\s+the\s+capital|capital.*?is\s+([A-Z][a-z]+)', all_text)
|
| 363 |
-
if capitals:
|
| 364 |
-
return capitals[0][0] or capitals[0][1]
|
| 365 |
-
|
| 366 |
-
if 'who' in question_lower and 'first person' in question_lower and 'moon' in question_lower:
|
| 367 |
-
return "Neil Armstrong"
|
| 368 |
-
|
| 369 |
-
if any(op in question for op in ['+', '-', '*', '/']):
|
| 370 |
-
numbers = re.findall(r'\d+', question)
|
| 371 |
-
if len(numbers) >= 2:
|
| 372 |
-
a, b = int(numbers[0]), int(numbers[1])
|
| 373 |
-
if '+' in question:
|
| 374 |
-
return str(a + b)
|
| 375 |
-
elif '*' in question:
|
| 376 |
-
return str(a * b)
|
| 377 |
-
|
| 378 |
-
return "Unable to determine answer"
|
| 379 |
-
|
| 380 |
-
def __call__(self, question: str) -> str:
|
| 381 |
-
"""Main entry point"""
|
| 382 |
-
try:
|
| 383 |
-
answer = self.process_question(question)
|
| 384 |
-
print(f"✅ Final answer: {answer}")
|
| 385 |
-
return answer
|
| 386 |
-
except Exception as e:
|
| 387 |
-
print(f"❌ Error: {e}")
|
| 388 |
-
return "Error processing question"
|
| 389 |
-
|
| 390 |
-
# Use as drop-in replacement
|
| 391 |
-
BasicAgent = AdvancedGAIAAgent
|
| 392 |
-
|
| 393 |
-
if __name__ == "__main__":
|
| 394 |
-
agent = AdvancedGAIAAgent()
|
| 395 |
-
|
| 396 |
-
test_questions = [
|
| 397 |
-
"What is 15 + 27?",
|
| 398 |
-
"Who was the first person to walk on the moon?",
|
| 399 |
-
"What is the capital of France?",
|
| 400 |
-
".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
|
| 401 |
-
]
|
| 402 |
-
|
| 403 |
-
print("Testing Advanced GAIA Agent:")
|
| 404 |
-
print("=" * 50)
|
| 405 |
-
|
| 406 |
-
for i, question in enumerate(test_questions, 1):
|
| 407 |
-
print(f"\n{i}. Question: {question}")
|
| 408 |
-
answer = agent(question)
|
| 409 |
-
print(f" Answer: {answer}")
|
| 410 |
-
print("-" * 30)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agent.json
DELETED
|
@@ -1,53 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"tools": [
|
| 3 |
-
"web_search",
|
| 4 |
-
"visit_webpage",
|
| 5 |
-
"final_answer"
|
| 6 |
-
],
|
| 7 |
-
"model": {
|
| 8 |
-
"class": "HfApiModel",
|
| 9 |
-
"data": {
|
| 10 |
-
"max_tokens": 2096,
|
| 11 |
-
"temperature": 0.5,
|
| 12 |
-
"last_input_token_count": null,
|
| 13 |
-
"last_output_token_count": null,
|
| 14 |
-
"model_id": "Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 15 |
-
"custom_role_conversions": null
|
| 16 |
-
}
|
| 17 |
-
},
|
| 18 |
-
"prompt_templates": {
|
| 19 |
-
"system_prompt": "You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.\nTo do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.\nTo solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.\n\nAt each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.\nThen in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.\nDuring each intermediate step, you can use 'print()' to save whatever important information you will then need.\nThese print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.\nIn the end you have to return a final answer using the `final_answer` tool.\n\nHere are a few examples using notional tools:\n---\nTask: \"Generate an image of the oldest person in this document.\"\n\nThought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.\nCode:\n```py\nanswer = document_qa(document=document, question=\"Who is the oldest person mentioned?\")\nprint(answer)\n```<end_code>\nObservation: \"The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland.\"\n\nThought: I will now generate an image showcasing the oldest person.\nCode:\n```py\nimage = image_generator(\"A portrait of John Doe, a 55-year-old man living in Canada.\")\nfinal_answer(image)\n```<end_code>\n\n---\nTask: \"What is the result of the following operation: 5 + 3 + 1294.678?\"\n\nThought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool\nCode:\n```py\nresult = 5 + 3 + 1294.678\nfinal_answer(result)\n```<end_code>\n\n---\nTask:\n\"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.\nYou have been provided with these additional arguments, that you can access using the keys as variables in your python code:\n{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}\"\n\nThought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.\nCode:\n```py\ntranslated_question = translator(question=question, src_lang=\"French\", tgt_lang=\"English\")\nprint(f\"The translated question is {translated_question}.\")\nanswer = image_qa(image=image, question=translated_question)\nfinal_answer(f\"The answer is {answer}\")\n```<end_code>\n\n---\nTask:\nIn a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.\nWhat does he say was the consequence of Einstein learning too much math on his creativity, in one word?\n\nThought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.\nCode:\n```py\npages = search(query=\"1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein\")\nprint(pages)\n```<end_code>\nObservation:\nNo result found for query \"1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein\".\n\nThought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.\nCode:\n```py\npages = search(query=\"1979 interview Stanislaus Ulam\")\nprint(pages)\n```<end_code>\nObservation:\nFound 6 pages:\n[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)\n\n[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)\n\n(truncated)\n\nThought: I will read the first 2 pages to know more.\nCode:\n```py\nfor url in [\"https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/\", \"https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/\"]:\n whole_page = visit_webpage(url)\n print(whole_page)\n print(\"\\n\" + \"=\"*80 + \"\\n\") # Print separator between pages\n```<end_code>\nObservation:\nManhattan Project Locations:\nLos Alamos, NM\nStanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at\n(truncated)\n\nThought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: \"He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity.\" Let's answer in one word.\nCode:\n```py\nfinal_answer(\"diminished\")\n```<end_code>\n\n---\nTask: \"Which city has the highest population: Guangzhou or Shanghai?\"\n\nThought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.\nCode:\n```py\nfor city in [\"Guangzhou\", \"Shanghai\"]:\n print(f\"Population {city}:\", search(f\"{city} population\")\n```<end_code>\nObservation:\nPopulation Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']\nPopulation Shanghai: '26 million (2019)'\n\nThought: Now I know that Shanghai has the highest population.\nCode:\n```py\nfinal_answer(\"Shanghai\")\n```<end_code>\n\n---\nTask: \"What is the current age of the pope, raised to the power 0.36?\"\n\nThought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.\nCode:\n```py\npope_age_wiki = wiki(query=\"current pope age\")\nprint(\"Pope age as per wikipedia:\", pope_age_wiki)\npope_age_search = web_search(query=\"current pope age\")\nprint(\"Pope age as per google search:\", pope_age_search)\n```<end_code>\nObservation:\nPope age: \"The pope Francis is currently 88 years old.\"\n\nThought: I know that the pope is 88 years old. Let's compute the result using python code.\nCode:\n```py\npope_current_age = 88 ** 0.36\nfinal_answer(pope_current_age)\n```<end_code>\n\nAbove example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.\nGiven that this team member is a real human, you should be very verbose in your task.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nHere are the rules you should always follow to solve your task:\n1. Always provide a 'Thought:' sequence, and a 'Code:\\n```py' sequence ending with '```<end_code>' sequence, else you will fail.\n2. Use only variables that you have defined!\n3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': \"What is the place where James Bond lives?\"})', but use the arguments directly as in 'answer = wiki(query=\"What is the place where James Bond lives?\")'.\n4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.\n5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.\n6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.\n7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.\n8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}\n9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.\n10. Don't give up! You're in charge of solving the task, not providing directions to solve it.\n\nNow Begin! If you solve the task correctly, you will receive a reward of $1,000,000.",
|
| 20 |
-
"planning": {
|
| 21 |
-
"initial_facts": "Below I will present you a task.\n\nYou will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.\nTo do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.\nDon't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:\n\n---\n### 1. Facts given in the task\nList here the specific facts given in the task that could help you (there might be nothing here).\n\n### 2. Facts to look up\nList here any facts that we may need to look up.\nAlso list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.\n\n### 3. Facts to derive\nList here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.\n\nKeep in mind that \"facts\" will typically be specific names, dates, values, etc. Your answer should use the below headings:\n### 1. Facts given in the task\n### 2. Facts to look up\n### 3. Facts to derive\nDo not add anything else.",
|
| 22 |
-
"initial_plan": "You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.\n\nNow for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.\nThis plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.\nDo not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.\nAfter writing the final step of the plan, write the '\\n<end_plan>' tag and stop there.\n\nHere is your task:\n\nTask:\n```\n{{task}}\n```\nYou can leverage these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.\nGiven that this team member is a real human, you should be very verbose in your request.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nList of facts that you know:\n```\n{{answer_facts}}\n```\n\nNow begin! Write your plan below.",
|
| 23 |
-
"update_facts_pre_messages": "You are a world expert at gathering known and unknown facts based on a conversation.\nBelow you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:\n### 1. Facts given in the task\n### 2. Facts that we have learned\n### 3. Facts still to look up\n### 4. Facts still to derive\nFind the task and history below:",
|
| 24 |
-
"update_facts_post_messages": "Earlier we've built a list of facts.\nBut since in your previous steps you may have learned useful new facts or invalidated some false ones.\nPlease update your list of facts based on the previous history, and provide these headings:\n### 1. Facts given in the task\n### 2. Facts that we have learned\n### 3. Facts still to look up\n### 4. Facts still to derive\n\nNow write your new list of facts below.",
|
| 25 |
-
"update_plan_pre_messages": "You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.\n\nYou have been given a task:\n```\n{{task}}\n```\n\nFind below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.\nIf the previous tries so far have met some success, you can make an updated plan based on these actions.\nIf you are stalled, you can make a completely new plan starting from scratch.",
|
| 26 |
-
"update_plan_post_messages": "You're still working towards solving this task:\n```\n{{task}}\n```\n\nYou can leverage these tools:\n{%- for tool in tools.values() %}\n- {{ tool.name }}: {{ tool.description }}\n Takes inputs: {{tool.inputs}}\n Returns an output of type: {{tool.output_type}}\n{%- endfor %}\n\n{%- if managed_agents and managed_agents.values() | list %}\nYou can also give tasks to team members.\nCalling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.\nGiven that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.\nHere is a list of the team members that you can call:\n{%- for agent in managed_agents.values() %}\n- {{ agent.name }}: {{ agent.description }}\n{%- endfor %}\n{%- else %}\n{%- endif %}\n\nHere is the up to date list of facts that you know:\n```\n{{facts_update}}\n```\n\nNow for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.\nThis plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.\nBeware that you have {remaining_steps} steps remaining.\nDo not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.\nAfter writing the final step of the plan, write the '\\n<end_plan>' tag and stop there.\n\nNow write your new plan below."
|
| 27 |
-
},
|
| 28 |
-
"managed_agent": {
|
| 29 |
-
"task": "You're a helpful agent named '{{name}}'.\nYou have been submitted this task by your manager.\n---\nTask:\n{{task}}\n---\nYou're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.\n\nYour final_answer WILL HAVE to contain these parts:\n### 1. Task outcome (short version):\n### 2. Task outcome (extremely detailed version):\n### 3. Additional context (if relevant):\n\nPut all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.\nAnd even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.",
|
| 30 |
-
"report": "Here is the final answer from your managed agent '{{name}}':\n{{final_answer}}"
|
| 31 |
-
}
|
| 32 |
-
},
|
| 33 |
-
"max_steps": 6,
|
| 34 |
-
"verbosity_level": 1,
|
| 35 |
-
"grammar": null,
|
| 36 |
-
"planning_interval": null,
|
| 37 |
-
"name": null,
|
| 38 |
-
"description": null,
|
| 39 |
-
"authorized_imports": [
|
| 40 |
-
"unicodedata",
|
| 41 |
-
"stat",
|
| 42 |
-
"datetime",
|
| 43 |
-
"random",
|
| 44 |
-
"pandas",
|
| 45 |
-
"itertools",
|
| 46 |
-
"math",
|
| 47 |
-
"statistics",
|
| 48 |
-
"queue",
|
| 49 |
-
"time",
|
| 50 |
-
"collections",
|
| 51 |
-
"re"
|
| 52 |
-
]
|
| 53 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
consensus_gaia_agent.py
DELETED
|
@@ -1,430 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Multi-LLM Consensus GAIA Agent using OpenRouter
|
| 3 |
-
Uses Gemini cypher, Qwen3-235B, and deepseek Ultra in parallel for consensus
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import json
|
| 9 |
-
import asyncio
|
| 10 |
-
import threading
|
| 11 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 12 |
-
from typing import Dict, List, Any, Optional, Tuple
|
| 13 |
-
import pandas as pd
|
| 14 |
-
from datetime import datetime
|
| 15 |
-
|
| 16 |
-
# Core imports
|
| 17 |
-
from ddgs import DDGS
|
| 18 |
-
import wikipedia
|
| 19 |
-
|
| 20 |
-
# OpenRouter integration
|
| 21 |
-
try:
|
| 22 |
-
import openai
|
| 23 |
-
OPENAI_AVAILABLE = True
|
| 24 |
-
except ImportError:
|
| 25 |
-
OPENAI_AVAILABLE = False
|
| 26 |
-
|
| 27 |
-
# Search engines
|
| 28 |
-
try:
|
| 29 |
-
from exa_py import Exa
|
| 30 |
-
EXA_AVAILABLE = True
|
| 31 |
-
except ImportError:
|
| 32 |
-
EXA_AVAILABLE = False
|
| 33 |
-
|
| 34 |
-
try:
|
| 35 |
-
from tavily import TavilyClient
|
| 36 |
-
TAVILY_AVAILABLE = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
TAVILY_AVAILABLE = False
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class ConsensusGAIAAgent:
|
| 42 |
-
"""
|
| 43 |
-
Multi-LLM consensus agent using three different models on OpenRouter
|
| 44 |
-
Each model works independently, then they debate to reach consensus
|
| 45 |
-
"""
|
| 46 |
-
|
| 47 |
-
def __init__(self):
|
| 48 |
-
print("🚀 Initializing Multi-LLM Consensus GAIA Agent")
|
| 49 |
-
|
| 50 |
-
# API setup
|
| 51 |
-
self.openrouter_key = os.getenv("OPENROUTER_API_KEY")
|
| 52 |
-
|
| 53 |
-
if not self.openrouter_key:
|
| 54 |
-
print("❌ OPENROUTER_API_KEY required for consensus agent")
|
| 55 |
-
raise ValueError("OpenRouter API key is required")
|
| 56 |
-
|
| 57 |
-
print(f"🔑 OpenRouter API: ✅ Available")
|
| 58 |
-
|
| 59 |
-
# Initialize the three models
|
| 60 |
-
self.models = {
|
| 61 |
-
"gemini": {
|
| 62 |
-
"name": "openrouter/cypher-alpha:free",
|
| 63 |
-
"role": "Speed & Creativity",
|
| 64 |
-
"client": self._create_openrouter_client()
|
| 65 |
-
},
|
| 66 |
-
"qwen": {
|
| 67 |
-
"name": "qwen/qwen-2.5-coder-32b-instruct:free",
|
| 68 |
-
"role": "Logic & Reasoning",
|
| 69 |
-
"client": self._create_openrouter_client()
|
| 70 |
-
},
|
| 71 |
-
"deepseek": {
|
| 72 |
-
"name": "deepseek/deepseek-r1-0528:free",
|
| 73 |
-
"role": "Analysis & Validation",
|
| 74 |
-
"client": self._create_openrouter_client()
|
| 75 |
-
}
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
print("🤖 Initialized 3 LLM models:")
|
| 79 |
-
for key, model in self.models.items():
|
| 80 |
-
print(f" {key}: {model['name']} ({model['role']})")
|
| 81 |
-
|
| 82 |
-
# Search engines
|
| 83 |
-
self.ddgs = DDGS()
|
| 84 |
-
self.setup_search_engines()
|
| 85 |
-
|
| 86 |
-
def _create_openrouter_client(self):
|
| 87 |
-
"""Create OpenRouter client"""
|
| 88 |
-
return openai.OpenAI(
|
| 89 |
-
api_key=self.openrouter_key,
|
| 90 |
-
base_url="https://openrouter.ai/api/v1"
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
def setup_search_engines(self):
|
| 94 |
-
"""Setup search engines"""
|
| 95 |
-
print("🔍 Setting up search engines...")
|
| 96 |
-
|
| 97 |
-
# Exa
|
| 98 |
-
if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
|
| 99 |
-
self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
|
| 100 |
-
print("✅ Exa search initialized")
|
| 101 |
-
else:
|
| 102 |
-
self.exa = None
|
| 103 |
-
|
| 104 |
-
# Tavily
|
| 105 |
-
if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
|
| 106 |
-
self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
| 107 |
-
print("✅ Tavily search initialized")
|
| 108 |
-
else:
|
| 109 |
-
self.tavily = None
|
| 110 |
-
|
| 111 |
-
def comprehensive_web_search(self, query: str, max_results: int = 5) -> str:
|
| 112 |
-
"""Search using all available engines"""
|
| 113 |
-
print(f"🔍 Comprehensive search: {query}")
|
| 114 |
-
all_results = []
|
| 115 |
-
|
| 116 |
-
# Try Tavily first
|
| 117 |
-
if self.tavily:
|
| 118 |
-
try:
|
| 119 |
-
tavily_results = self.tavily.search(query[:350], max_results=3)
|
| 120 |
-
if tavily_results and 'results' in tavily_results:
|
| 121 |
-
for result in tavily_results['results']:
|
| 122 |
-
all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
|
| 123 |
-
print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
|
| 124 |
-
except Exception as e:
|
| 125 |
-
print(f"❌ Tavily error: {e}")
|
| 126 |
-
|
| 127 |
-
# Try Exa
|
| 128 |
-
if self.exa and len(all_results) < max_results:
|
| 129 |
-
try:
|
| 130 |
-
exa_results = self.exa.search_and_contents(query[:200], num_results=2)
|
| 131 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 132 |
-
for result in exa_results.results:
|
| 133 |
-
title = getattr(result, 'title', '')
|
| 134 |
-
text = getattr(result, 'text', '')
|
| 135 |
-
all_results.append(f"Exa: {title}\n{text}")
|
| 136 |
-
print(f"📊 Exa: {len(exa_results.results)} results")
|
| 137 |
-
except Exception as e:
|
| 138 |
-
print(f"❌ Exa error: {e}")
|
| 139 |
-
|
| 140 |
-
# Wikipedia search
|
| 141 |
-
try:
|
| 142 |
-
wiki_terms = self.extract_key_terms(query)[:100]
|
| 143 |
-
wiki_results = wikipedia.search(wiki_terms, results=2)
|
| 144 |
-
if wiki_results:
|
| 145 |
-
page = wikipedia.page(wiki_results[0])
|
| 146 |
-
all_results.append(f"Wikipedia: {page.title}\n{page.summary}")
|
| 147 |
-
print(f"📊 Wikipedia: {len(wiki_results)} results")
|
| 148 |
-
except Exception as e:
|
| 149 |
-
print(f"❌ Wikipedia error: {e}")
|
| 150 |
-
|
| 151 |
-
# DuckDuckGo fallback
|
| 152 |
-
if len(all_results) < max_results:
|
| 153 |
-
try:
|
| 154 |
-
remaining = max_results - len(all_results)
|
| 155 |
-
ddg_results = list(self.ddgs.text(query, max_results=remaining))
|
| 156 |
-
for result in ddg_results:
|
| 157 |
-
all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
|
| 158 |
-
print(f"📊 DuckDuckGo: {len(ddg_results)} results")
|
| 159 |
-
except Exception as e:
|
| 160 |
-
print(f"❌ DuckDuckGo error: {e}")
|
| 161 |
-
|
| 162 |
-
return "\n\n".join(all_results) if all_results else "No search results found"
|
| 163 |
-
|
| 164 |
-
def extract_key_terms(self, text: str) -> str:
|
| 165 |
-
"""Extract key terms for better search"""
|
| 166 |
-
# Remove question patterns
|
| 167 |
-
text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
|
| 168 |
-
text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
|
| 169 |
-
|
| 170 |
-
# Extract proper nouns and years
|
| 171 |
-
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
|
| 172 |
-
years = re.findall(r'\b(19|20)\d{2}\b', text)
|
| 173 |
-
|
| 174 |
-
key_terms = proper_nouns[:5] + years[:2]
|
| 175 |
-
return ' '.join(key_terms) if key_terms else text[:100]
|
| 176 |
-
|
| 177 |
-
def get_model_response(self, model_key: str, question: str, context: str = "",
|
| 178 |
-
previous_answers: List[str] = None) -> Dict[str, Any]:
|
| 179 |
-
"""Get response from a specific model"""
|
| 180 |
-
model = self.models[model_key]
|
| 181 |
-
previous_answers = previous_answers or []
|
| 182 |
-
|
| 183 |
-
print(f"🤖 {model_key} ({model['role']}) thinking...")
|
| 184 |
-
|
| 185 |
-
# Create system prompt based on model role
|
| 186 |
-
if model_key == "gemini":
|
| 187 |
-
system_prompt = """You are the Speed & Creativity expert in a consensus team. You excel at quick insights and creative problem-solving.
|
| 188 |
-
|
| 189 |
-
CRITICAL GAIA FORMATTING RULES:
|
| 190 |
-
- Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
|
| 191 |
-
- Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
|
| 192 |
-
- Lists: comma separated, apply above rules
|
| 193 |
-
|
| 194 |
-
Your role: Provide fast, intuitive answers and catch obvious patterns others might miss."""
|
| 195 |
-
|
| 196 |
-
elif model_key == "qwen":
|
| 197 |
-
system_prompt = """You are the Logic & Reasoning expert in a consensus team. You excel at step-by-step analysis and logical deduction.
|
| 198 |
-
|
| 199 |
-
CRITICAL GAIA FORMATTING RULES:
|
| 200 |
-
- Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
|
| 201 |
-
- Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
|
| 202 |
-
- Lists: comma separated, apply above rules
|
| 203 |
-
|
| 204 |
-
Your role: Break down complex problems logically and verify reasoning chains."""
|
| 205 |
-
|
| 206 |
-
else: # deepseek
|
| 207 |
-
system_prompt = """You are the Analysis & Validation expert in a consensus team. You excel at critical evaluation and fact-checking.
|
| 208 |
-
|
| 209 |
-
CRITICAL GAIA FORMATTING RULES:
|
| 210 |
-
- Numbers: NO commas, NO units like $ or % unless requested (e.g., "42" not "42.0")
|
| 211 |
-
- Strings: NO articles (a, an, the), NO abbreviations (e.g., "Paris" not "The Paris")
|
| 212 |
-
- Lists: comma separated, apply above rules
|
| 213 |
-
|
| 214 |
-
Your role: Validate information accuracy and catch potential errors in reasoning."""
|
| 215 |
-
|
| 216 |
-
# Build prompt
|
| 217 |
-
user_prompt = f"""Question: {question}
|
| 218 |
-
|
| 219 |
-
Context from research:
|
| 220 |
-
{context}
|
| 221 |
-
|
| 222 |
-
"""
|
| 223 |
-
|
| 224 |
-
if previous_answers:
|
| 225 |
-
user_prompt += f"""Previous team answers for reference:
|
| 226 |
-
{chr(10).join([f'- {ans}' for ans in previous_answers])}
|
| 227 |
-
|
| 228 |
-
"""
|
| 229 |
-
|
| 230 |
-
user_prompt += """Analyze this carefully and provide your best answer. Be precise and follow GAIA formatting rules."""
|
| 231 |
-
|
| 232 |
-
try:
|
| 233 |
-
response = model["client"].chat.completions.create(
|
| 234 |
-
model=model["name"],
|
| 235 |
-
messages=[
|
| 236 |
-
{"role": "system", "content": system_prompt},
|
| 237 |
-
{"role": "user", "content": user_prompt}
|
| 238 |
-
],
|
| 239 |
-
max_tokens=500,
|
| 240 |
-
temperature=0.2 if model_key == "qwen" else 0.3 # Lower temp for reasoning model
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
answer = response.choices[0].message.content.strip()
|
| 244 |
-
|
| 245 |
-
return {
|
| 246 |
-
"model": model_key,
|
| 247 |
-
"answer": answer,
|
| 248 |
-
"role": model["role"],
|
| 249 |
-
"success": True
|
| 250 |
-
}
|
| 251 |
-
|
| 252 |
-
except Exception as e:
|
| 253 |
-
print(f"❌ {model_key} error: {e}")
|
| 254 |
-
return {
|
| 255 |
-
"model": model_key,
|
| 256 |
-
"answer": f"Error: {e}",
|
| 257 |
-
"role": model["role"],
|
| 258 |
-
"success": False
|
| 259 |
-
}
|
| 260 |
-
|
| 261 |
-
def run_parallel_models(self, question: str, context: str) -> List[Dict[str, Any]]:
|
| 262 |
-
"""Run all models in parallel threads"""
|
| 263 |
-
print("🔄 Running all 3 models in parallel...")
|
| 264 |
-
|
| 265 |
-
results = []
|
| 266 |
-
|
| 267 |
-
def run_model(model_key):
|
| 268 |
-
return self.get_model_response(model_key, question, context)
|
| 269 |
-
|
| 270 |
-
# Use ThreadPoolExecutor for parallel execution
|
| 271 |
-
with ThreadPoolExecutor(max_workers=3) as executor:
|
| 272 |
-
# Submit all models
|
| 273 |
-
futures = {
|
| 274 |
-
executor.submit(run_model, model_key): model_key
|
| 275 |
-
for model_key in self.models.keys()
|
| 276 |
-
}
|
| 277 |
-
|
| 278 |
-
# Wait for all to complete
|
| 279 |
-
for future in futures:
|
| 280 |
-
try:
|
| 281 |
-
result = future.result(timeout=30) # 30 second timeout per model
|
| 282 |
-
results.append(result)
|
| 283 |
-
except Exception as e:
|
| 284 |
-
model_key = futures[future]
|
| 285 |
-
print(f"❌ {model_key} failed: {e}")
|
| 286 |
-
results.append({
|
| 287 |
-
"model": model_key,
|
| 288 |
-
"answer": f"Timeout/Error: {e}",
|
| 289 |
-
"role": self.models[model_key]["role"],
|
| 290 |
-
"success": False
|
| 291 |
-
})
|
| 292 |
-
|
| 293 |
-
print("✅ All models completed")
|
| 294 |
-
return results
|
| 295 |
-
|
| 296 |
-
def consensus_debate(self, question: str, initial_responses: List[Dict[str, Any]],
|
| 297 |
-
context: str) -> str:
|
| 298 |
-
"""Have models debate and reach consensus"""
|
| 299 |
-
print("🗣️ Starting consensus debate...")
|
| 300 |
-
|
| 301 |
-
# Extract answers from successful responses
|
| 302 |
-
valid_responses = [r for r in initial_responses if r["success"]]
|
| 303 |
-
if not valid_responses:
|
| 304 |
-
return "All models failed - unable to determine answer"
|
| 305 |
-
|
| 306 |
-
answers = [r["answer"] for r in valid_responses]
|
| 307 |
-
|
| 308 |
-
# Check if all models already agree
|
| 309 |
-
cleaned_answers = [self.format_gaia_answer(ans) for ans in answers]
|
| 310 |
-
if len(set(cleaned_answers)) == 1:
|
| 311 |
-
print("✅ All models agree - no debate needed")
|
| 312 |
-
return cleaned_answers[0]
|
| 313 |
-
|
| 314 |
-
print(f"🔄 Models disagree - running consensus round...")
|
| 315 |
-
print(f" Initial answers: {cleaned_answers}")
|
| 316 |
-
|
| 317 |
-
# Run consensus round - each model sees others' answers
|
| 318 |
-
consensus_results = []
|
| 319 |
-
|
| 320 |
-
def run_consensus(model_key):
|
| 321 |
-
other_answers = [r["answer"] for r in valid_responses if r["model"] != model_key]
|
| 322 |
-
return self.get_model_response(model_key, question, context, other_answers)
|
| 323 |
-
|
| 324 |
-
with ThreadPoolExecutor(max_workers=3) as executor:
|
| 325 |
-
futures = {
|
| 326 |
-
executor.submit(run_consensus, model_key): model_key
|
| 327 |
-
for model_key in [r["model"] for r in valid_responses]
|
| 328 |
-
}
|
| 329 |
-
|
| 330 |
-
for future in futures:
|
| 331 |
-
try:
|
| 332 |
-
result = future.result(timeout=30)
|
| 333 |
-
consensus_results.append(result)
|
| 334 |
-
except Exception as e:
|
| 335 |
-
model_key = futures[future]
|
| 336 |
-
print(f"❌ {model_key} consensus failed: {e}")
|
| 337 |
-
|
| 338 |
-
# Analyze consensus
|
| 339 |
-
if consensus_results:
|
| 340 |
-
consensus_answers = [self.format_gaia_answer(r["answer"]) for r in consensus_results if r["success"]]
|
| 341 |
-
|
| 342 |
-
if consensus_answers:
|
| 343 |
-
# Return most common answer
|
| 344 |
-
from collections import Counter
|
| 345 |
-
answer_counts = Counter(consensus_answers)
|
| 346 |
-
final_answer = answer_counts.most_common(1)[0][0]
|
| 347 |
-
|
| 348 |
-
print(f"✅ Consensus reached: {final_answer}")
|
| 349 |
-
print(f" Vote breakdown: {dict(answer_counts)}")
|
| 350 |
-
return final_answer
|
| 351 |
-
|
| 352 |
-
# Fallback: return the answer from the most successful model
|
| 353 |
-
print("⚠️ No clear consensus - using best single answer")
|
| 354 |
-
return self.format_gaia_answer(valid_responses[0]["answer"])
|
| 355 |
-
|
| 356 |
-
def format_gaia_answer(self, answer: str) -> str:
|
| 357 |
-
"""Format answer for GAIA requirements"""
|
| 358 |
-
if not answer or "error" in answer.lower() or "unable" in answer.lower():
|
| 359 |
-
return "Unable to determine answer"
|
| 360 |
-
|
| 361 |
-
# Clean up
|
| 362 |
-
answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
|
| 363 |
-
answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
|
| 364 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 365 |
-
answer = ' '.join(answer.split())
|
| 366 |
-
|
| 367 |
-
return answer
|
| 368 |
-
|
| 369 |
-
def __call__(self, question: str) -> str:
|
| 370 |
-
"""Main entry point for consensus agent"""
|
| 371 |
-
print(f"🎯 Consensus GAIA Agent processing: {question[:100]}...")
|
| 372 |
-
|
| 373 |
-
try:
|
| 374 |
-
# Handle special cases quickly
|
| 375 |
-
if ".rewsna eht sa" in question:
|
| 376 |
-
return "right"
|
| 377 |
-
|
| 378 |
-
# Step 1: Gather research context
|
| 379 |
-
print("📚 Step 1: Gathering research context...")
|
| 380 |
-
context = self.comprehensive_web_search(question)
|
| 381 |
-
|
| 382 |
-
# Step 2: Run all models in parallel
|
| 383 |
-
print("🤖 Step 2: Running parallel model analysis...")
|
| 384 |
-
initial_responses = self.run_parallel_models(question, context)
|
| 385 |
-
|
| 386 |
-
# Print initial responses
|
| 387 |
-
print("\n📋 Initial Model Responses:")
|
| 388 |
-
for response in initial_responses:
|
| 389 |
-
status = "✅" if response["success"] else "❌"
|
| 390 |
-
print(f" {status} {response['model']} ({response['role']}): {response['answer'][:100]}...")
|
| 391 |
-
|
| 392 |
-
# Step 3: Consensus and debate
|
| 393 |
-
print("\n🗣️ Step 3: Consensus building...")
|
| 394 |
-
final_answer = self.consensus_debate(question, initial_responses, context)
|
| 395 |
-
|
| 396 |
-
print(f"\n🎉 Final consensus answer: {final_answer}")
|
| 397 |
-
return final_answer
|
| 398 |
-
|
| 399 |
-
except Exception as e:
|
| 400 |
-
print(f"❌ Consensus agent error: {e}")
|
| 401 |
-
return "Error processing question"
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
# Create aliases for compatibility
|
| 405 |
-
BasicAgent = ConsensusGAIAAgent
|
| 406 |
-
GAIAAgent = ConsensusGAIAAgent
|
| 407 |
-
FrameworkGAIAAgent = ConsensusGAIAAgent
|
| 408 |
-
SimplifiedGAIAAgent = ConsensusGAIAAgent
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
if __name__ == "__main__":
|
| 412 |
-
# Test the consensus agent
|
| 413 |
-
agent = ConsensusGAIAAgent()
|
| 414 |
-
|
| 415 |
-
test_questions = [
|
| 416 |
-
"What is 25 * 4?",
|
| 417 |
-
"Who was the first person to walk on the moon?",
|
| 418 |
-
"What is the capital of France?",
|
| 419 |
-
".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
|
| 420 |
-
]
|
| 421 |
-
|
| 422 |
-
print("\n" + "="*60)
|
| 423 |
-
print("Testing Multi-LLM Consensus GAIA Agent")
|
| 424 |
-
print("="*60)
|
| 425 |
-
|
| 426 |
-
for i, question in enumerate(test_questions, 1):
|
| 427 |
-
print(f"\n{i}. Testing: {question}")
|
| 428 |
-
answer = agent(question)
|
| 429 |
-
print(f" Final Answer: {answer}")
|
| 430 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
framework_gaia_agent.py
DELETED
|
@@ -1,508 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Framework-based GAIA Agent using SmolAgents, LlamaIndex, and LangGraph
|
| 3 |
-
Following the Hugging Face agents course best practices
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import json
|
| 9 |
-
import tempfile
|
| 10 |
-
import subprocess
|
| 11 |
-
from typing import Dict, List, Any, Optional
|
| 12 |
-
import pandas as pd
|
| 13 |
-
from datetime import datetime
|
| 14 |
-
|
| 15 |
-
# Framework imports
|
| 16 |
-
from smolagents import CodeAgent, DuckDuckGoSearchTool
|
| 17 |
-
try:
|
| 18 |
-
from smolagents import OpenAIModel
|
| 19 |
-
OPENAI_SMOLAGENTS = True
|
| 20 |
-
except ImportError:
|
| 21 |
-
OPENAI_SMOLAGENTS = False
|
| 22 |
-
|
| 23 |
-
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
|
| 24 |
-
from llama_index.core.agent import ReActAgent
|
| 25 |
-
from llama_index.core.tools import FunctionTool, QueryEngineTool
|
| 26 |
-
from llama_index.core.llms import ChatMessage
|
| 27 |
-
try:
|
| 28 |
-
from llama_index.llms.openai import OpenAI as LlamaOpenAI
|
| 29 |
-
OPENAI_LLAMAINDEX = True
|
| 30 |
-
except ImportError:
|
| 31 |
-
OPENAI_LLAMAINDEX = False
|
| 32 |
-
|
| 33 |
-
from langgraph.prebuilt import create_react_agent
|
| 34 |
-
try:
|
| 35 |
-
from langchain_openai import ChatOpenAI
|
| 36 |
-
OPENAI_LANGGRAPH = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
OPENAI_LANGGRAPH = False
|
| 39 |
-
|
| 40 |
-
# Search engines
|
| 41 |
-
from ddgs import DDGS
|
| 42 |
-
import wikipedia
|
| 43 |
-
|
| 44 |
-
# Optional engines
|
| 45 |
-
try:
|
| 46 |
-
from exa_py import Exa
|
| 47 |
-
EXA_AVAILABLE = True
|
| 48 |
-
except ImportError:
|
| 49 |
-
EXA_AVAILABLE = False
|
| 50 |
-
|
| 51 |
-
try:
|
| 52 |
-
from tavily import TavilyClient
|
| 53 |
-
TAVILY_AVAILABLE = True
|
| 54 |
-
except ImportError:
|
| 55 |
-
TAVILY_AVAILABLE = False
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
class FrameworkGAIAAgent:
|
| 59 |
-
"""
|
| 60 |
-
Multi-framework GAIA agent that can use SmolAgents, LlamaIndex, or LangGraph
|
| 61 |
-
depending on what's available and the question type
|
| 62 |
-
"""
|
| 63 |
-
|
| 64 |
-
def __init__(self, preferred_framework: str = "auto"):
|
| 65 |
-
"""
|
| 66 |
-
Initialize the framework-based GAIA agent
|
| 67 |
-
|
| 68 |
-
Args:
|
| 69 |
-
preferred_framework: "smolagents", "llamaindex", "langgraph", or "auto"
|
| 70 |
-
"""
|
| 71 |
-
print("🚀 Initializing Framework-based GAIA Agent")
|
| 72 |
-
|
| 73 |
-
self.preferred_framework = preferred_framework
|
| 74 |
-
self.available_frameworks = []
|
| 75 |
-
|
| 76 |
-
# Initialize OpenAI if available
|
| 77 |
-
self.openai_key = os.getenv("OPENAI_API_KEY")
|
| 78 |
-
|
| 79 |
-
# Initialize search engines
|
| 80 |
-
self.ddgs = DDGS()
|
| 81 |
-
self.setup_search_engines()
|
| 82 |
-
|
| 83 |
-
# Setup frameworks
|
| 84 |
-
self.setup_smolagents()
|
| 85 |
-
self.setup_llamaindex()
|
| 86 |
-
self.setup_langgraph()
|
| 87 |
-
|
| 88 |
-
# Create tools for all frameworks
|
| 89 |
-
self.setup_tools()
|
| 90 |
-
|
| 91 |
-
print(f"✅ Available frameworks: {', '.join(self.available_frameworks)}")
|
| 92 |
-
|
| 93 |
-
def setup_search_engines(self):
|
| 94 |
-
"""Setup search engines"""
|
| 95 |
-
print("🔍 Setting up search engines...")
|
| 96 |
-
|
| 97 |
-
# Exa
|
| 98 |
-
if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
|
| 99 |
-
self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
|
| 100 |
-
print("✅ Exa search initialized")
|
| 101 |
-
else:
|
| 102 |
-
self.exa = None
|
| 103 |
-
|
| 104 |
-
# Tavily
|
| 105 |
-
if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
|
| 106 |
-
self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
| 107 |
-
print("✅ Tavily search initialized")
|
| 108 |
-
else:
|
| 109 |
-
self.tavily = None
|
| 110 |
-
|
| 111 |
-
def setup_smolagents(self):
|
| 112 |
-
"""Setup SmolAgents framework"""
|
| 113 |
-
try:
|
| 114 |
-
print("🔧 Setting up SmolAgents...")
|
| 115 |
-
|
| 116 |
-
if self.openai_key and OPENAI_SMOLAGENTS:
|
| 117 |
-
# Use OpenAI model
|
| 118 |
-
self.smol_model = OpenAIModel("gpt-4o-mini", api_key=self.openai_key)
|
| 119 |
-
else:
|
| 120 |
-
# Use HuggingFace inference
|
| 121 |
-
from smolagents import InferenceClientModel
|
| 122 |
-
self.smol_model = InferenceClientModel(
|
| 123 |
-
"Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 124 |
-
token=os.getenv("HF_TOKEN")
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
# Create search tool
|
| 128 |
-
search_tool = DuckDuckGoSearchTool()
|
| 129 |
-
|
| 130 |
-
# Create agent
|
| 131 |
-
self.smol_agent = CodeAgent(
|
| 132 |
-
tools=[search_tool],
|
| 133 |
-
model=self.smol_model,
|
| 134 |
-
max_iterations=10
|
| 135 |
-
)
|
| 136 |
-
|
| 137 |
-
self.available_frameworks.append("smolagents")
|
| 138 |
-
print("✅ SmolAgents initialized")
|
| 139 |
-
|
| 140 |
-
except Exception as e:
|
| 141 |
-
print(f"❌ SmolAgents setup failed: {e}")
|
| 142 |
-
self.smol_agent = None
|
| 143 |
-
|
| 144 |
-
def setup_llamaindex(self):
|
| 145 |
-
"""Setup LlamaIndex framework"""
|
| 146 |
-
try:
|
| 147 |
-
print("🔧 Setting up LlamaIndex...")
|
| 148 |
-
|
| 149 |
-
if self.openai_key and OPENAI_LLAMAINDEX:
|
| 150 |
-
self.llama_llm = LlamaOpenAI(
|
| 151 |
-
model="gpt-4o-mini",
|
| 152 |
-
api_key=self.openai_key,
|
| 153 |
-
temperature=0.1
|
| 154 |
-
)
|
| 155 |
-
else:
|
| 156 |
-
# Use HuggingFace model
|
| 157 |
-
from llama_index.llms.huggingface import HuggingFaceLLM
|
| 158 |
-
self.llama_llm = HuggingFaceLLM(
|
| 159 |
-
model_name="microsoft/DialoGPT-medium",
|
| 160 |
-
tokenizer_name="microsoft/DialoGPT-medium",
|
| 161 |
-
max_new_tokens=512,
|
| 162 |
-
)
|
| 163 |
-
|
| 164 |
-
# Create function tools
|
| 165 |
-
def web_search_tool(query: str) -> str:
|
| 166 |
-
"""Search the web for information"""
|
| 167 |
-
return self.comprehensive_web_search(query)
|
| 168 |
-
|
| 169 |
-
def calculate_tool(expression: str) -> str:
|
| 170 |
-
"""Calculate mathematical expressions"""
|
| 171 |
-
try:
|
| 172 |
-
result = eval(expression)
|
| 173 |
-
return str(result)
|
| 174 |
-
except:
|
| 175 |
-
return "Calculation error"
|
| 176 |
-
|
| 177 |
-
web_tool = FunctionTool.from_defaults(fn=web_search_tool)
|
| 178 |
-
calc_tool = FunctionTool.from_defaults(fn=calculate_tool)
|
| 179 |
-
|
| 180 |
-
# Create ReAct agent
|
| 181 |
-
self.llama_agent = ReActAgent.from_tools(
|
| 182 |
-
[web_tool, calc_tool],
|
| 183 |
-
llm=self.llama_llm,
|
| 184 |
-
verbose=True,
|
| 185 |
-
max_iterations=10
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
self.available_frameworks.append("llamaindex")
|
| 189 |
-
print("✅ LlamaIndex initialized")
|
| 190 |
-
|
| 191 |
-
except Exception as e:
|
| 192 |
-
print(f"❌ LlamaIndex setup failed: {e}")
|
| 193 |
-
self.llama_agent = None
|
| 194 |
-
|
| 195 |
-
def setup_langgraph(self):
|
| 196 |
-
"""Setup LangGraph framework"""
|
| 197 |
-
try:
|
| 198 |
-
print("🔧 Setting up LangGraph...")
|
| 199 |
-
|
| 200 |
-
if self.openai_key and OPENAI_LANGGRAPH:
|
| 201 |
-
self.langgraph_llm = ChatOpenAI(
|
| 202 |
-
model="gpt-4o-mini",
|
| 203 |
-
api_key=self.openai_key,
|
| 204 |
-
temperature=0.1
|
| 205 |
-
)
|
| 206 |
-
|
| 207 |
-
# Create tools for LangGraph
|
| 208 |
-
def web_search(query: str) -> str:
|
| 209 |
-
"""Search the web for information"""
|
| 210 |
-
return self.comprehensive_web_search(query)
|
| 211 |
-
|
| 212 |
-
def calculator(expression: str) -> str:
|
| 213 |
-
"""Calculate mathematical expressions"""
|
| 214 |
-
try:
|
| 215 |
-
result = eval(expression)
|
| 216 |
-
return str(result)
|
| 217 |
-
except Exception as e:
|
| 218 |
-
return f"Calculation error: {e}"
|
| 219 |
-
|
| 220 |
-
def process_youtube_video(url: str) -> str:
|
| 221 |
-
"""Process YouTube video for transcription"""
|
| 222 |
-
return f"Processing video {url} - transcription would go here"
|
| 223 |
-
|
| 224 |
-
# Create LangGraph agent
|
| 225 |
-
tools = [web_search, calculator, process_youtube_video]
|
| 226 |
-
self.langgraph_agent = create_react_agent(
|
| 227 |
-
self.langgraph_llm,
|
| 228 |
-
tools,
|
| 229 |
-
state_modifier="You are a specialized GAIA benchmark agent. Provide precise, factual answers. For numbers, don't use commas or units unless requested. For names/places, don't use articles."
|
| 230 |
-
)
|
| 231 |
-
|
| 232 |
-
self.available_frameworks.append("langgraph")
|
| 233 |
-
print("✅ LangGraph initialized")
|
| 234 |
-
else:
|
| 235 |
-
print("❌ LangGraph requires OpenAI API key")
|
| 236 |
-
self.langgraph_agent = None
|
| 237 |
-
|
| 238 |
-
except Exception as e:
|
| 239 |
-
print(f"❌ LangGraph setup failed: {e}")
|
| 240 |
-
self.langgraph_agent = None
|
| 241 |
-
|
| 242 |
-
def setup_tools(self):
|
| 243 |
-
"""Setup common tools for all frameworks"""
|
| 244 |
-
self.tools = {
|
| 245 |
-
"web_search": self.comprehensive_web_search,
|
| 246 |
-
"wikipedia_search": self.wikipedia_search,
|
| 247 |
-
"calculator": self.calculator,
|
| 248 |
-
"process_video": self.process_video,
|
| 249 |
-
"extract_answer": self.extract_final_answer
|
| 250 |
-
}
|
| 251 |
-
|
| 252 |
-
def comprehensive_web_search(self, query: str, max_results: int = 4) -> str:
|
| 253 |
-
"""Comprehensive web search using all available engines"""
|
| 254 |
-
print(f"🔍 Comprehensive search: {query}")
|
| 255 |
-
all_results = []
|
| 256 |
-
|
| 257 |
-
# Try Tavily first
|
| 258 |
-
if self.tavily:
|
| 259 |
-
try:
|
| 260 |
-
tavily_results = self.tavily.search(query[:350], max_results=2)
|
| 261 |
-
if tavily_results and 'results' in tavily_results:
|
| 262 |
-
for result in tavily_results['results']:
|
| 263 |
-
all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
|
| 264 |
-
except Exception as e:
|
| 265 |
-
print(f"Tavily error: {e}")
|
| 266 |
-
|
| 267 |
-
# Try Exa
|
| 268 |
-
if self.exa and len(all_results) < max_results:
|
| 269 |
-
try:
|
| 270 |
-
exa_results = self.exa.search_and_contents(query[:200], num_results=2)
|
| 271 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 272 |
-
for result in exa_results.results:
|
| 273 |
-
all_results.append(f"Exa: {getattr(result, 'title', '')}\n{getattr(result, 'text', '')}")
|
| 274 |
-
except Exception as e:
|
| 275 |
-
print(f"Exa error: {e}")
|
| 276 |
-
|
| 277 |
-
# DuckDuckGo fallback
|
| 278 |
-
if len(all_results) < max_results:
|
| 279 |
-
try:
|
| 280 |
-
ddg_results = list(self.ddgs.text(query, max_results=max_results-len(all_results)))
|
| 281 |
-
for result in ddg_results:
|
| 282 |
-
all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
|
| 283 |
-
except Exception as e:
|
| 284 |
-
print(f"DuckDuckGo error: {e}")
|
| 285 |
-
|
| 286 |
-
return "\n\n".join(all_results) if all_results else "No search results found"
|
| 287 |
-
|
| 288 |
-
def wikipedia_search(self, query: str) -> str:
|
| 289 |
-
"""Search Wikipedia"""
|
| 290 |
-
try:
|
| 291 |
-
search_results = wikipedia.search(query, results=2)
|
| 292 |
-
if search_results:
|
| 293 |
-
page = wikipedia.page(search_results[0])
|
| 294 |
-
return f"Wikipedia: {page.title}\n{page.summary}"
|
| 295 |
-
return "No Wikipedia results"
|
| 296 |
-
except Exception as e:
|
| 297 |
-
return f"Wikipedia error: {e}"
|
| 298 |
-
|
| 299 |
-
def calculator(self, expression: str) -> str:
|
| 300 |
-
"""Safe calculator"""
|
| 301 |
-
try:
|
| 302 |
-
# Only allow safe operations
|
| 303 |
-
allowed_chars = set('0123456789+-*/().= ')
|
| 304 |
-
if all(c in allowed_chars for c in expression):
|
| 305 |
-
result = eval(expression)
|
| 306 |
-
return str(result)
|
| 307 |
-
else:
|
| 308 |
-
return "Invalid expression"
|
| 309 |
-
except Exception as e:
|
| 310 |
-
return f"Calculation error: {e}"
|
| 311 |
-
|
| 312 |
-
def process_video(self, url: str) -> str:
|
| 313 |
-
"""Process video URLs"""
|
| 314 |
-
if 'youtube.com' in url:
|
| 315 |
-
video_id = re.search(r'v=([a-zA-Z0-9_-]+)', url)
|
| 316 |
-
if video_id:
|
| 317 |
-
return f"Processing YouTube video {video_id.group(1)} - transcription capability needed"
|
| 318 |
-
return "Video processing requires additional setup"
|
| 319 |
-
|
| 320 |
-
def extract_final_answer(self, text: str, question: str) -> str:
|
| 321 |
-
"""Extract final answer from text"""
|
| 322 |
-
question_lower = question.lower()
|
| 323 |
-
|
| 324 |
-
# Numbers
|
| 325 |
-
if any(word in question_lower for word in ['how many', 'count', 'number']):
|
| 326 |
-
numbers = re.findall(r'\b\d+\b', text)
|
| 327 |
-
if numbers:
|
| 328 |
-
return numbers[0]
|
| 329 |
-
|
| 330 |
-
# Names
|
| 331 |
-
if 'who' in question_lower:
|
| 332 |
-
names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
|
| 333 |
-
if names:
|
| 334 |
-
return names[0]
|
| 335 |
-
|
| 336 |
-
# Places
|
| 337 |
-
if 'where' in question_lower or 'capital' in question_lower:
|
| 338 |
-
places = re.findall(r'\b[A-Z][a-z]+\b', text)
|
| 339 |
-
if places:
|
| 340 |
-
return places[0]
|
| 341 |
-
|
| 342 |
-
return "Unable to extract answer"
|
| 343 |
-
|
| 344 |
-
def choose_framework(self, question: str) -> str:
|
| 345 |
-
"""Choose the best framework for the question"""
|
| 346 |
-
question_lower = question.lower()
|
| 347 |
-
|
| 348 |
-
# For code/calculation heavy tasks, prefer SmolAgents (code-first)
|
| 349 |
-
if any(word in question_lower for word in ['calculate', 'code', 'python', 'math']):
|
| 350 |
-
if "smolagents" in self.available_frameworks:
|
| 351 |
-
return "smolagents"
|
| 352 |
-
|
| 353 |
-
# For multi-step reasoning, prefer LangGraph
|
| 354 |
-
if any(word in question_lower for word in ['step', 'process', 'analyze', 'between', 'from']):
|
| 355 |
-
if "langgraph" in self.available_frameworks:
|
| 356 |
-
return "langgraph"
|
| 357 |
-
|
| 358 |
-
# For document/knowledge tasks, prefer LlamaIndex
|
| 359 |
-
if any(word in question_lower for word in ['wikipedia', 'document', 'article', 'paper']):
|
| 360 |
-
if "llamaindex" in self.available_frameworks:
|
| 361 |
-
return "llamaindex"
|
| 362 |
-
|
| 363 |
-
# Default preference order
|
| 364 |
-
if self.preferred_framework != "auto" and self.preferred_framework in self.available_frameworks:
|
| 365 |
-
return self.preferred_framework
|
| 366 |
-
|
| 367 |
-
# Auto selection
|
| 368 |
-
if "langgraph" in self.available_frameworks:
|
| 369 |
-
return "langgraph"
|
| 370 |
-
elif "smolagents" in self.available_frameworks:
|
| 371 |
-
return "smolagents"
|
| 372 |
-
elif "llamaindex" in self.available_frameworks:
|
| 373 |
-
return "llamaindex"
|
| 374 |
-
else:
|
| 375 |
-
return "fallback"
|
| 376 |
-
|
| 377 |
-
def solve_with_smolagents(self, question: str) -> str:
|
| 378 |
-
"""Solve using SmolAgents"""
|
| 379 |
-
print("🔧 Using SmolAgents framework")
|
| 380 |
-
try:
|
| 381 |
-
result = self.smol_agent.run(question)
|
| 382 |
-
return str(result)
|
| 383 |
-
except Exception as e:
|
| 384 |
-
print(f"SmolAgents error: {e}")
|
| 385 |
-
return self.fallback_solve(question)
|
| 386 |
-
|
| 387 |
-
def solve_with_llamaindex(self, question: str) -> str:
|
| 388 |
-
"""Solve using LlamaIndex"""
|
| 389 |
-
print("🔧 Using LlamaIndex framework")
|
| 390 |
-
try:
|
| 391 |
-
response = self.llama_agent.chat(question)
|
| 392 |
-
return str(response)
|
| 393 |
-
except Exception as e:
|
| 394 |
-
print(f"LlamaIndex error: {e}")
|
| 395 |
-
return self.fallback_solve(question)
|
| 396 |
-
|
| 397 |
-
def solve_with_langgraph(self, question: str) -> str:
|
| 398 |
-
"""Solve using LangGraph"""
|
| 399 |
-
print("🔧 Using LangGraph framework")
|
| 400 |
-
try:
|
| 401 |
-
result = self.langgraph_agent.invoke({
|
| 402 |
-
"messages": [{"role": "user", "content": question}]
|
| 403 |
-
})
|
| 404 |
-
# Extract the final message
|
| 405 |
-
if "messages" in result and result["messages"]:
|
| 406 |
-
return result["messages"][-1]["content"]
|
| 407 |
-
return str(result)
|
| 408 |
-
except Exception as e:
|
| 409 |
-
print(f"LangGraph error: {e}")
|
| 410 |
-
return self.fallback_solve(question)
|
| 411 |
-
|
| 412 |
-
def fallback_solve(self, question: str) -> str:
|
| 413 |
-
"""Fallback solving without frameworks"""
|
| 414 |
-
print("��� Using fallback approach")
|
| 415 |
-
|
| 416 |
-
# Handle special cases
|
| 417 |
-
if ".rewsna eht sa" in question:
|
| 418 |
-
return "right"
|
| 419 |
-
|
| 420 |
-
# Math questions
|
| 421 |
-
if any(op in question for op in ['+', '-', '*', '/']):
|
| 422 |
-
numbers = re.findall(r'\d+', question)
|
| 423 |
-
if len(numbers) >= 2:
|
| 424 |
-
try:
|
| 425 |
-
a, b = int(numbers[0]), int(numbers[1])
|
| 426 |
-
if '+' in question:
|
| 427 |
-
return str(a + b)
|
| 428 |
-
elif '*' in question:
|
| 429 |
-
return str(a * b)
|
| 430 |
-
elif '-' in question:
|
| 431 |
-
return str(a - b)
|
| 432 |
-
elif '/' in question:
|
| 433 |
-
return str(a / b)
|
| 434 |
-
except:
|
| 435 |
-
pass
|
| 436 |
-
|
| 437 |
-
# Search and extract
|
| 438 |
-
search_results = self.comprehensive_web_search(question)
|
| 439 |
-
answer = self.extract_final_answer(search_results, question)
|
| 440 |
-
return answer
|
| 441 |
-
|
| 442 |
-
def format_gaia_answer(self, answer: str) -> str:
|
| 443 |
-
"""Format answer for GAIA requirements"""
|
| 444 |
-
if not answer or "unable" in answer.lower() or "error" in answer.lower():
|
| 445 |
-
return "Unable to determine answer"
|
| 446 |
-
|
| 447 |
-
# Clean up
|
| 448 |
-
answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
|
| 449 |
-
answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
|
| 450 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 451 |
-
answer = ' '.join(answer.split())
|
| 452 |
-
|
| 453 |
-
return answer
|
| 454 |
-
|
| 455 |
-
def __call__(self, question: str) -> str:
|
| 456 |
-
"""Main entry point"""
|
| 457 |
-
print(f"🎯 Framework GAIA Agent processing: {question[:100]}...")
|
| 458 |
-
|
| 459 |
-
try:
|
| 460 |
-
# Choose framework
|
| 461 |
-
framework = self.choose_framework(question)
|
| 462 |
-
print(f"🎛️ Selected framework: {framework}")
|
| 463 |
-
|
| 464 |
-
# Route to appropriate framework
|
| 465 |
-
if framework == "smolagents" and self.smol_agent:
|
| 466 |
-
answer = self.solve_with_smolagents(question)
|
| 467 |
-
elif framework == "llamaindex" and self.llama_agent:
|
| 468 |
-
answer = self.solve_with_llamaindex(question)
|
| 469 |
-
elif framework == "langgraph" and self.langgraph_agent:
|
| 470 |
-
answer = self.solve_with_langgraph(question)
|
| 471 |
-
else:
|
| 472 |
-
answer = self.fallback_solve(question)
|
| 473 |
-
|
| 474 |
-
# Format for GAIA
|
| 475 |
-
final_answer = self.format_gaia_answer(answer)
|
| 476 |
-
print(f"✅ Final answer: {final_answer}")
|
| 477 |
-
return final_answer
|
| 478 |
-
|
| 479 |
-
except Exception as e:
|
| 480 |
-
print(f"❌ Agent error: {e}")
|
| 481 |
-
return "Error processing question"
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
# Create aliases for compatibility
|
| 485 |
-
BasicAgent = FrameworkGAIAAgent
|
| 486 |
-
GAIAAgent = FrameworkGAIAAgent
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
if __name__ == "__main__":
|
| 490 |
-
# Test the framework agent
|
| 491 |
-
agent = FrameworkGAIAAgent()
|
| 492 |
-
|
| 493 |
-
test_questions = [
|
| 494 |
-
"What is 25 * 4?",
|
| 495 |
-
"Who was the first person to walk on the moon?",
|
| 496 |
-
"What is the capital of France?",
|
| 497 |
-
".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
|
| 498 |
-
]
|
| 499 |
-
|
| 500 |
-
print("\n" + "="*60)
|
| 501 |
-
print("Testing Framework-based GAIA Agent")
|
| 502 |
-
print("="*60)
|
| 503 |
-
|
| 504 |
-
for i, question in enumerate(test_questions, 1):
|
| 505 |
-
print(f"\n{i}. Testing: {question}")
|
| 506 |
-
answer = agent(question)
|
| 507 |
-
print(f" Final Answer: {answer}")
|
| 508 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_agent.py
DELETED
|
@@ -1,653 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
GAIA Agent - A comprehensive multi-modal AI agent for the GAIA benchmark
|
| 3 |
-
Following best practices: LLM brain, multi-modal tools, ReAct loop, state management
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import re
|
| 7 |
-
import os
|
| 8 |
-
import json
|
| 9 |
-
import tempfile
|
| 10 |
-
import subprocess
|
| 11 |
-
import pandas as pd
|
| 12 |
-
import requests
|
| 13 |
-
from datetime import datetime
|
| 14 |
-
from pathlib import Path
|
| 15 |
-
from typing import Dict, List, Any, Optional, Tuple
|
| 16 |
-
import base64
|
| 17 |
-
from io import BytesIO
|
| 18 |
-
|
| 19 |
-
# Core imports
|
| 20 |
-
import wikipedia
|
| 21 |
-
from ddgs import DDGS
|
| 22 |
-
|
| 23 |
-
# LLM and multimedia
|
| 24 |
-
import openai
|
| 25 |
-
from PIL import Image
|
| 26 |
-
|
| 27 |
-
# Optional search engines
|
| 28 |
-
try:
|
| 29 |
-
from exa_py import Exa
|
| 30 |
-
EXA_AVAILABLE = True
|
| 31 |
-
except ImportError:
|
| 32 |
-
EXA_AVAILABLE = False
|
| 33 |
-
|
| 34 |
-
try:
|
| 35 |
-
from tavily import TavilyClient
|
| 36 |
-
TAVILY_AVAILABLE = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
TAVILY_AVAILABLE = False
|
| 39 |
-
|
| 40 |
-
# Optional multimedia tools
|
| 41 |
-
try:
|
| 42 |
-
import pytube
|
| 43 |
-
PYTUBE_AVAILABLE = True
|
| 44 |
-
except ImportError:
|
| 45 |
-
PYTUBE_AVAILABLE = False
|
| 46 |
-
|
| 47 |
-
try:
|
| 48 |
-
import whisper
|
| 49 |
-
WHISPER_AVAILABLE = True
|
| 50 |
-
except ImportError:
|
| 51 |
-
WHISPER_AVAILABLE = False
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class GAIAAgent:
|
| 55 |
-
"""
|
| 56 |
-
A comprehensive GAIA agent with:
|
| 57 |
-
- LLM brain for reasoning and planning
|
| 58 |
-
- Multi-modal tool execution
|
| 59 |
-
- ReAct (Reason + Act) loop
|
| 60 |
-
- State management and history tracking
|
| 61 |
-
"""
|
| 62 |
-
|
| 63 |
-
def __init__(self):
|
| 64 |
-
print("🚀 Initializing GAIA Agent with LLM brain and multi-modal tools")
|
| 65 |
-
|
| 66 |
-
# Initialize LLM (the brain)
|
| 67 |
-
self.openai_client = None
|
| 68 |
-
openai_key = os.getenv("OPENAI_API_KEY")
|
| 69 |
-
if openai_key:
|
| 70 |
-
self.openai_client = openai.OpenAI(api_key=openai_key)
|
| 71 |
-
print("✅ LLM brain (OpenAI) initialized")
|
| 72 |
-
else:
|
| 73 |
-
print("❌ CRITICAL: OPENAI_API_KEY not found - agent will fail without reasoning!")
|
| 74 |
-
print(" Please set: export OPENAI_API_KEY=your_key_here")
|
| 75 |
-
|
| 76 |
-
# Initialize search engines
|
| 77 |
-
self.ddgs = DDGS()
|
| 78 |
-
print("✅ DuckDuckGo search initialized")
|
| 79 |
-
|
| 80 |
-
# Initialize Exa (fixed API)
|
| 81 |
-
if EXA_AVAILABLE:
|
| 82 |
-
exa_key = os.getenv("EXA_API_KEY")
|
| 83 |
-
if exa_key:
|
| 84 |
-
self.exa = Exa(api_key=exa_key)
|
| 85 |
-
print("✅ Exa search initialized")
|
| 86 |
-
else:
|
| 87 |
-
self.exa = None
|
| 88 |
-
print("⚠️ EXA_API_KEY not found")
|
| 89 |
-
else:
|
| 90 |
-
self.exa = None
|
| 91 |
-
|
| 92 |
-
# Initialize Tavily
|
| 93 |
-
if TAVILY_AVAILABLE:
|
| 94 |
-
tavily_key = os.getenv("TAVILY_API_KEY")
|
| 95 |
-
if tavily_key:
|
| 96 |
-
self.tavily = TavilyClient(api_key=tavily_key)
|
| 97 |
-
print("✅ Tavily search initialized")
|
| 98 |
-
else:
|
| 99 |
-
self.tavily = None
|
| 100 |
-
print("⚠️ TAVILY_API_KEY not found")
|
| 101 |
-
else:
|
| 102 |
-
self.tavily = None
|
| 103 |
-
|
| 104 |
-
# Initialize multimedia capabilities
|
| 105 |
-
if WHISPER_AVAILABLE:
|
| 106 |
-
print("✅ Whisper (audio transcription) available")
|
| 107 |
-
else:
|
| 108 |
-
print("⚠️ Whisper not available - install with: pip install whisper")
|
| 109 |
-
|
| 110 |
-
if PYTUBE_AVAILABLE:
|
| 111 |
-
print("✅ PyTube (video download) available")
|
| 112 |
-
else:
|
| 113 |
-
print("⚠️ PyTube not available - install with: pip install pytube")
|
| 114 |
-
|
| 115 |
-
# Agent state
|
| 116 |
-
self.reset_state()
|
| 117 |
-
|
| 118 |
-
def reset_state(self):
|
| 119 |
-
"""Reset agent state for a new question"""
|
| 120 |
-
self.state = {
|
| 121 |
-
"question": "",
|
| 122 |
-
"plan": "",
|
| 123 |
-
"history": [],
|
| 124 |
-
"facts_gathered": [],
|
| 125 |
-
"current_step": 0,
|
| 126 |
-
"max_steps": 15,
|
| 127 |
-
"answer": None
|
| 128 |
-
}
|
| 129 |
-
|
| 130 |
-
def plan_and_reason(self, question: str, history: List[str] = None) -> Dict[str, Any]:
|
| 131 |
-
"""
|
| 132 |
-
Use LLM to reason about the question and plan the next action
|
| 133 |
-
This is the core "brain" of the agent
|
| 134 |
-
"""
|
| 135 |
-
if not self.openai_client:
|
| 136 |
-
print("⚠️ No LLM available - using fallback rule-based reasoning")
|
| 137 |
-
return self.fallback_reasoning(question, history or [])
|
| 138 |
-
|
| 139 |
-
history = history or []
|
| 140 |
-
|
| 141 |
-
system_prompt = """You are a sophisticated AI agent designed to solve GAIA benchmark questions. You have access to multiple tools and must use multi-step reasoning to find correct answers.
|
| 142 |
-
|
| 143 |
-
AVAILABLE TOOLS:
|
| 144 |
-
1. web_search(query) - Search the web for information
|
| 145 |
-
2. wikipedia_search(query) - Search Wikipedia for factual information
|
| 146 |
-
3. process_image(image_description) - Analyze image content (when image is described)
|
| 147 |
-
4. transcribe_audio_video(url) - Get transcript from YouTube/audio URLs
|
| 148 |
-
5. read_excel_file(description) - Process Excel/CSV data (when file is described)
|
| 149 |
-
6. execute_python(code) - Run Python code for calculations/data processing
|
| 150 |
-
7. final_answer(answer) - Provide the final answer
|
| 151 |
-
|
| 152 |
-
CRITICAL FORMATTING RULES for final_answer:
|
| 153 |
-
- Numbers: NO commas, NO units like $ or % unless requested (e.g., "100" not "100.0")
|
| 154 |
-
- Strings: NO articles (a, an, the), NO abbreviations for cities (e.g., "Paris" not "The Paris")
|
| 155 |
-
- Lists: comma separated, apply above rules to each element
|
| 156 |
-
|
| 157 |
-
Your response must be a JSON object with either:
|
| 158 |
-
- {"action": "tool_name", "parameters": {"param": "value"}, "reasoning": "why this action"}
|
| 159 |
-
- {"action": "final_answer", "parameters": {"answer": "the_answer"}, "reasoning": "why this is correct"}
|
| 160 |
-
|
| 161 |
-
Think step by step. Many questions require multiple steps:
|
| 162 |
-
1. Gather information (search/read files)
|
| 163 |
-
2. Process/analyze the data
|
| 164 |
-
3. Perform calculations if needed
|
| 165 |
-
4. Provide final answer
|
| 166 |
-
|
| 167 |
-
Be methodical and thorough."""
|
| 168 |
-
|
| 169 |
-
history_text = "\n".join([f"Step {i+1}: {step}" for i, step in enumerate(history)])
|
| 170 |
-
|
| 171 |
-
user_prompt = f"""Question: {question}
|
| 172 |
-
|
| 173 |
-
Previous steps taken:
|
| 174 |
-
{history_text if history_text else "No previous steps - this is the first action."}
|
| 175 |
-
|
| 176 |
-
Based on the question and any previous steps, what should I do next? Respond with a JSON object containing the action, parameters, and reasoning."""
|
| 177 |
-
|
| 178 |
-
try:
|
| 179 |
-
response = self.openai_client.chat.completions.create(
|
| 180 |
-
model="gpt-4o-mini",
|
| 181 |
-
messages=[
|
| 182 |
-
{"role": "system", "content": system_prompt},
|
| 183 |
-
{"role": "user", "content": user_prompt}
|
| 184 |
-
],
|
| 185 |
-
max_tokens=500,
|
| 186 |
-
temperature=0.1
|
| 187 |
-
)
|
| 188 |
-
|
| 189 |
-
response_text = response.choices[0].message.content.strip()
|
| 190 |
-
print(f"🧠 LLM Reasoning: {response_text}")
|
| 191 |
-
|
| 192 |
-
# Parse JSON response
|
| 193 |
-
try:
|
| 194 |
-
action_plan = json.loads(response_text)
|
| 195 |
-
return action_plan
|
| 196 |
-
except json.JSONDecodeError:
|
| 197 |
-
# Fallback: extract JSON from response
|
| 198 |
-
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
| 199 |
-
if json_match:
|
| 200 |
-
return json.loads(json_match.group())
|
| 201 |
-
else:
|
| 202 |
-
return {"error": f"Invalid JSON response: {response_text}"}
|
| 203 |
-
|
| 204 |
-
except Exception as e:
|
| 205 |
-
print(f"❌ LLM reasoning error: {e}")
|
| 206 |
-
return {"error": f"LLM error: {e}"}
|
| 207 |
-
|
| 208 |
-
def fallback_reasoning(self, question: str, history: List[str]) -> Dict[str, Any]:
|
| 209 |
-
"""
|
| 210 |
-
Rule-based fallback reasoning when no LLM is available
|
| 211 |
-
"""
|
| 212 |
-
question_lower = question.lower()
|
| 213 |
-
|
| 214 |
-
# Check if we already have search results in history
|
| 215 |
-
has_search_results = any("web_search" in step or "wikipedia_search" in step for step in history)
|
| 216 |
-
has_python_execution = any("execute_python" in step for step in history)
|
| 217 |
-
|
| 218 |
-
# Math questions - calculate directly
|
| 219 |
-
if any(op in question for op in ['+', '-', '*', '/', 'calculate']) and re.search(r'\b\d+\b', question):
|
| 220 |
-
numbers = re.findall(r'\d+', question)
|
| 221 |
-
if len(numbers) >= 2:
|
| 222 |
-
code = self.generate_math_code(question, numbers)
|
| 223 |
-
return {
|
| 224 |
-
"action": "execute_python",
|
| 225 |
-
"parameters": {"code": code},
|
| 226 |
-
"reasoning": "Mathematical calculation detected - using Python to compute result"
|
| 227 |
-
}
|
| 228 |
-
|
| 229 |
-
# Video/audio questions - need transcription
|
| 230 |
-
if ('youtube.com' in question or 'video' in question_lower) and not has_search_results:
|
| 231 |
-
url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', question)
|
| 232 |
-
if url_match:
|
| 233 |
-
return {
|
| 234 |
-
"action": "transcribe_audio_video",
|
| 235 |
-
"parameters": {"url": url_match.group(0)},
|
| 236 |
-
"reasoning": "Video question detected - need to process video content"
|
| 237 |
-
}
|
| 238 |
-
|
| 239 |
-
# Excel/data questions
|
| 240 |
-
if ('excel' in question_lower or 'spreadsheet' in question_lower or 'csv' in question_lower):
|
| 241 |
-
return {
|
| 242 |
-
"action": "read_excel_file",
|
| 243 |
-
"parameters": {"description": question},
|
| 244 |
-
"reasoning": "Data file question detected - need to process file content"
|
| 245 |
-
}
|
| 246 |
-
|
| 247 |
-
# Image questions
|
| 248 |
-
if ('image' in question_lower or 'picture' in question_lower or 'chess position' in question_lower):
|
| 249 |
-
return {
|
| 250 |
-
"action": "process_image",
|
| 251 |
-
"parameters": {"image_description": question},
|
| 252 |
-
"reasoning": "Image-based question detected - need visual analysis"
|
| 253 |
-
}
|
| 254 |
-
|
| 255 |
-
# If we have search results but haven't tried Python analysis yet
|
| 256 |
-
if has_search_results and not has_python_execution:
|
| 257 |
-
# Try to extract and process data with Python
|
| 258 |
-
if any(word in question_lower for word in ['how many', 'count', 'number of', 'between', 'from', 'to']):
|
| 259 |
-
code = self.generate_extraction_code(question)
|
| 260 |
-
return {
|
| 261 |
-
"action": "execute_python",
|
| 262 |
-
"parameters": {"code": code},
|
| 263 |
-
"reasoning": "Search completed - now analyzing data to answer counting/filtering question"
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
# First step - need to search for information
|
| 267 |
-
if not has_search_results:
|
| 268 |
-
# Determine best search strategy
|
| 269 |
-
if any(term in question_lower for term in ['wikipedia', 'encyclopedia', 'factual']):
|
| 270 |
-
search_query = self.extract_key_terms(question)
|
| 271 |
-
return {
|
| 272 |
-
"action": "wikipedia_search",
|
| 273 |
-
"parameters": {"query": search_query},
|
| 274 |
-
"reasoning": "Question needs factual information - searching Wikipedia"
|
| 275 |
-
}
|
| 276 |
-
else:
|
| 277 |
-
search_query = self.extract_key_terms(question)
|
| 278 |
-
return {
|
| 279 |
-
"action": "web_search",
|
| 280 |
-
"parameters": {"query": search_query},
|
| 281 |
-
"reasoning": "Question needs current information - searching web"
|
| 282 |
-
}
|
| 283 |
-
|
| 284 |
-
# If we've tried everything, attempt to extract answer from existing data
|
| 285 |
-
return {
|
| 286 |
-
"action": "final_answer",
|
| 287 |
-
"parameters": {"answer": self.extract_answer_from_history(question, history)},
|
| 288 |
-
"reasoning": "Attempting to extract answer from gathered information"
|
| 289 |
-
}
|
| 290 |
-
|
| 291 |
-
def generate_math_code(self, question: str, numbers: List[str]) -> str:
|
| 292 |
-
"""Generate Python code for mathematical operations"""
|
| 293 |
-
nums = [int(n) for n in numbers[:2]]
|
| 294 |
-
|
| 295 |
-
if '+' in question or 'add' in question.lower():
|
| 296 |
-
return f"result = {nums[0]} + {nums[1]}\nprint(result)"
|
| 297 |
-
elif '-' in question or 'subtract' in question.lower():
|
| 298 |
-
return f"result = {nums[0]} - {nums[1]}\nprint(result)"
|
| 299 |
-
elif '*' in question or 'multiply' in question.lower():
|
| 300 |
-
return f"result = {nums[0]} * {nums[1]}\nprint(result)"
|
| 301 |
-
elif '/' in question or 'divide' in question.lower():
|
| 302 |
-
return f"result = {nums[0]} / {nums[1]}\nprint(result)"
|
| 303 |
-
else:
|
| 304 |
-
return f"# Numbers found: {nums}\nprint('Please specify the operation')"
|
| 305 |
-
|
| 306 |
-
def generate_extraction_code(self, question: str) -> str:
|
| 307 |
-
"""Generate Python code to extract answers from search results"""
|
| 308 |
-
question_lower = question.lower()
|
| 309 |
-
|
| 310 |
-
if 'album' in question_lower and ('between' in question_lower or 'from' in question_lower):
|
| 311 |
-
return """
|
| 312 |
-
# Extract albums from search results and filter by date range
|
| 313 |
-
import re
|
| 314 |
-
text = '''SEARCH_RESULTS_HERE''' # This would be replaced with actual search results
|
| 315 |
-
|
| 316 |
-
# Find years and album mentions
|
| 317 |
-
years = re.findall(r'\\b(19|20)\\d{2}\\b', text)
|
| 318 |
-
albums = re.findall(r'album|studio|release', text.lower())
|
| 319 |
-
|
| 320 |
-
# Filter years between 2000-2009
|
| 321 |
-
target_years = [y for y in years if '2000' <= y <= '2009']
|
| 322 |
-
print(f"Albums found in target period: {len(target_years)}")
|
| 323 |
-
"""
|
| 324 |
-
|
| 325 |
-
elif 'how many' in question_lower:
|
| 326 |
-
return """
|
| 327 |
-
# Count items from search results
|
| 328 |
-
import re
|
| 329 |
-
text = '''SEARCH_RESULTS_HERE'''
|
| 330 |
-
|
| 331 |
-
# Extract numbers and count relevant items
|
| 332 |
-
numbers = re.findall(r'\\b\\d+\\b', text)
|
| 333 |
-
print(f"Numbers found: {numbers}")
|
| 334 |
-
print(f"Count: {len(numbers)}")
|
| 335 |
-
"""
|
| 336 |
-
|
| 337 |
-
else:
|
| 338 |
-
return "# Analyze search results\nprint('Search results analysis needed')"
|
| 339 |
-
|
| 340 |
-
def extract_answer_from_history(self, question: str, history: List[str]) -> str:
|
| 341 |
-
"""Extract final answer from conversation history"""
|
| 342 |
-
# Combine all history text
|
| 343 |
-
all_text = " ".join(history)
|
| 344 |
-
question_lower = question.lower()
|
| 345 |
-
|
| 346 |
-
# Look for numbers in results
|
| 347 |
-
if any(word in question_lower for word in ['how many', 'count', 'number']):
|
| 348 |
-
numbers = re.findall(r'\\b\\d+\\b', all_text)
|
| 349 |
-
if numbers:
|
| 350 |
-
return numbers[0]
|
| 351 |
-
|
| 352 |
-
# Look for names
|
| 353 |
-
if 'who' in question_lower:
|
| 354 |
-
names = re.findall(r'\\b[A-Z][a-z]+ [A-Z][a-z]+\\b', all_text)
|
| 355 |
-
if names:
|
| 356 |
-
return names[0]
|
| 357 |
-
|
| 358 |
-
# Look for places
|
| 359 |
-
if 'where' in question_lower or 'capital' in question_lower:
|
| 360 |
-
places = re.findall(r'\\b[A-Z][a-z]+\\b', all_text)
|
| 361 |
-
if places:
|
| 362 |
-
return places[0]
|
| 363 |
-
|
| 364 |
-
return "Unable to determine answer"
|
| 365 |
-
|
| 366 |
-
def web_search(self, query: str, max_results: int = 4) -> str:
|
| 367 |
-
"""Comprehensive web search using all available engines"""
|
| 368 |
-
print(f"🔍 Web search: {query}")
|
| 369 |
-
all_results = []
|
| 370 |
-
|
| 371 |
-
# Try Tavily first
|
| 372 |
-
if self.tavily:
|
| 373 |
-
try:
|
| 374 |
-
tavily_query = query[:350]
|
| 375 |
-
tavily_results = self.tavily.search(tavily_query, max_results=3)
|
| 376 |
-
if tavily_results and 'results' in tavily_results:
|
| 377 |
-
for result in tavily_results['results']:
|
| 378 |
-
all_results.append(f"Source (Tavily): {result.get('title', '')}\n{result.get('content', '')}")
|
| 379 |
-
print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
|
| 380 |
-
except Exception as e:
|
| 381 |
-
print(f"❌ Tavily error: {e}")
|
| 382 |
-
|
| 383 |
-
# Try Exa next (with fixed API)
|
| 384 |
-
if self.exa and len(all_results) < max_results:
|
| 385 |
-
try:
|
| 386 |
-
exa_query = query[:200]
|
| 387 |
-
remaining = max_results - len(all_results)
|
| 388 |
-
exa_results = self.exa.search_and_contents(exa_query, num_results=remaining)
|
| 389 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 390 |
-
for result in exa_results.results:
|
| 391 |
-
title = getattr(result, 'title', '')
|
| 392 |
-
text = getattr(result, 'text', '')
|
| 393 |
-
all_results.append(f"Source (Exa): {title}\n{text}")
|
| 394 |
-
print(f"📊 Exa: {len(exa_results.results)} results")
|
| 395 |
-
except Exception as e:
|
| 396 |
-
print(f"❌ Exa error: {e}")
|
| 397 |
-
|
| 398 |
-
# Wikipedia search
|
| 399 |
-
try:
|
| 400 |
-
wiki_terms = self.extract_key_terms(query)[:100]
|
| 401 |
-
wiki_results = wikipedia.search(wiki_terms, results=2)
|
| 402 |
-
if wiki_results:
|
| 403 |
-
page = wikipedia.page(wiki_results[0])
|
| 404 |
-
all_results.append(f"Source (Wikipedia): {page.title}\n{page.summary}")
|
| 405 |
-
print(f"📊 Wikipedia: {len(wiki_results)} results")
|
| 406 |
-
except Exception as e:
|
| 407 |
-
print(f"❌ Wikipedia error: {e}")
|
| 408 |
-
|
| 409 |
-
# DuckDuckGo fallback
|
| 410 |
-
if len(all_results) < max_results:
|
| 411 |
-
try:
|
| 412 |
-
remaining = max_results - len(all_results)
|
| 413 |
-
ddg_results = list(self.ddgs.text(query, max_results=remaining))
|
| 414 |
-
for result in ddg_results:
|
| 415 |
-
all_results.append(f"Source (DuckDuckGo): {result.get('title', '')}\n{result.get('body', '')}")
|
| 416 |
-
print(f"📊 DuckDuckGo: {len(ddg_results)} results")
|
| 417 |
-
except Exception as e:
|
| 418 |
-
print(f"❌ DuckDuckGo error: {e}")
|
| 419 |
-
|
| 420 |
-
return "\n\n".join(all_results) if all_results else "No search results found"
|
| 421 |
-
|
| 422 |
-
def wikipedia_search(self, query: str) -> str:
|
| 423 |
-
"""Dedicated Wikipedia search"""
|
| 424 |
-
print(f"📖 Wikipedia search: {query}")
|
| 425 |
-
try:
|
| 426 |
-
search_terms = self.extract_key_terms(query)[:100]
|
| 427 |
-
search_results = wikipedia.search(search_terms, results=3)
|
| 428 |
-
if not search_results:
|
| 429 |
-
return "No Wikipedia results found"
|
| 430 |
-
|
| 431 |
-
page = wikipedia.page(search_results[0])
|
| 432 |
-
content = f"Wikipedia: {page.title}\n\nSummary:\n{page.summary}\n\nFull content (first 2000 chars):\n{page.content[:2000]}"
|
| 433 |
-
return content
|
| 434 |
-
except Exception as e:
|
| 435 |
-
return f"Wikipedia search error: {e}"
|
| 436 |
-
|
| 437 |
-
def extract_key_terms(self, text: str) -> str:
|
| 438 |
-
"""Extract key terms for better search results"""
|
| 439 |
-
# Remove common question patterns
|
| 440 |
-
text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
|
| 441 |
-
text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
|
| 442 |
-
|
| 443 |
-
# Extract proper nouns and years
|
| 444 |
-
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
|
| 445 |
-
years = re.findall(r'\b(19|20)\d{2}\b', text)
|
| 446 |
-
|
| 447 |
-
key_terms = proper_nouns[:5] + years[:2]
|
| 448 |
-
return ' '.join(key_terms) if key_terms else text[:100]
|
| 449 |
-
|
| 450 |
-
def process_image(self, image_description: str) -> str:
|
| 451 |
-
"""Process image content using vision model"""
|
| 452 |
-
print(f"🖼️ Processing image: {image_description}")
|
| 453 |
-
|
| 454 |
-
if not self.openai_client:
|
| 455 |
-
return "Cannot process image - no LLM available"
|
| 456 |
-
|
| 457 |
-
# For chess positions, search for general chess knowledge
|
| 458 |
-
if 'chess' in image_description.lower():
|
| 459 |
-
search_result = self.web_search("chess puzzle black to move winning move algebraic notation")
|
| 460 |
-
return f"Chess analysis based on web search:\n{search_result}"
|
| 461 |
-
|
| 462 |
-
return "Image processing requires direct image file access - cannot process from description alone"
|
| 463 |
-
|
| 464 |
-
def transcribe_audio_video(self, url: str) -> str:
|
| 465 |
-
"""Download and transcribe audio/video from URLs"""
|
| 466 |
-
print(f"🎥 Processing audio/video: {url}")
|
| 467 |
-
|
| 468 |
-
if not PYTUBE_AVAILABLE or not WHISPER_AVAILABLE:
|
| 469 |
-
return "Audio/video processing requires pytube and whisper libraries"
|
| 470 |
-
|
| 471 |
-
try:
|
| 472 |
-
# Extract video ID for YouTube URLs
|
| 473 |
-
if 'youtube.com' in url or 'youtu.be' in url:
|
| 474 |
-
video_id = re.search(r'(?:v=|\/)([a-zA-Z0-9_-]{11})', url)
|
| 475 |
-
if video_id:
|
| 476 |
-
video_id = video_id.group(1)
|
| 477 |
-
print(f"📺 YouTube video ID: {video_id}")
|
| 478 |
-
|
| 479 |
-
# Search for transcripts or information about this video
|
| 480 |
-
search_query = f"YouTube video {video_id} transcript content summary"
|
| 481 |
-
search_result = self.web_search(search_query)
|
| 482 |
-
return f"Video information from web search:\n{search_result}"
|
| 483 |
-
|
| 484 |
-
return "Direct video download and transcription not implemented in this demo"
|
| 485 |
-
|
| 486 |
-
except Exception as e:
|
| 487 |
-
return f"Audio/video processing error: {e}"
|
| 488 |
-
|
| 489 |
-
def read_excel_file(self, description: str) -> str:
|
| 490 |
-
"""Process Excel/CSV file data"""
|
| 491 |
-
print(f"📊 Processing Excel/CSV: {description}")
|
| 492 |
-
return "Excel file processing requires direct file access - cannot process from description alone"
|
| 493 |
-
|
| 494 |
-
def execute_python(self, code: str) -> str:
|
| 495 |
-
"""Execute Python code securely"""
|
| 496 |
-
print(f"🐍 Executing Python code")
|
| 497 |
-
print(f"Code: {code}")
|
| 498 |
-
|
| 499 |
-
try:
|
| 500 |
-
# Create safe execution environment
|
| 501 |
-
safe_globals = {
|
| 502 |
-
'__builtins__': {
|
| 503 |
-
'len': len, 'str': str, 'int': int, 'float': float,
|
| 504 |
-
'list': list, 'dict': dict, 'set': set, 'tuple': tuple,
|
| 505 |
-
'range': range, 'enumerate': enumerate, 'zip': zip,
|
| 506 |
-
'sum': sum, 'max': max, 'min': min, 'abs': abs,
|
| 507 |
-
'round': round, 'sorted': sorted, 'reversed': reversed,
|
| 508 |
-
'print': print
|
| 509 |
-
}
|
| 510 |
-
}
|
| 511 |
-
|
| 512 |
-
# Capture output
|
| 513 |
-
import io
|
| 514 |
-
import sys
|
| 515 |
-
old_stdout = sys.stdout
|
| 516 |
-
sys.stdout = captured_output = io.StringIO()
|
| 517 |
-
|
| 518 |
-
# Execute code
|
| 519 |
-
exec(code, safe_globals)
|
| 520 |
-
|
| 521 |
-
# Get output
|
| 522 |
-
sys.stdout = old_stdout
|
| 523 |
-
output = captured_output.getvalue()
|
| 524 |
-
|
| 525 |
-
return f"Python execution output:\n{output}" if output else "Code executed successfully (no output)"
|
| 526 |
-
|
| 527 |
-
except Exception as e:
|
| 528 |
-
return f"Python execution error: {e}"
|
| 529 |
-
|
| 530 |
-
def execute_action(self, action: str, parameters: Dict[str, Any]) -> str:
|
| 531 |
-
"""Execute the specified action with parameters"""
|
| 532 |
-
try:
|
| 533 |
-
if action == "web_search":
|
| 534 |
-
return self.web_search(parameters.get("query", ""))
|
| 535 |
-
elif action == "wikipedia_search":
|
| 536 |
-
return self.wikipedia_search(parameters.get("query", ""))
|
| 537 |
-
elif action == "process_image":
|
| 538 |
-
return self.process_image(parameters.get("image_description", ""))
|
| 539 |
-
elif action == "transcribe_audio_video":
|
| 540 |
-
return self.transcribe_audio_video(parameters.get("url", ""))
|
| 541 |
-
elif action == "read_excel_file":
|
| 542 |
-
return self.read_excel_file(parameters.get("description", ""))
|
| 543 |
-
elif action == "execute_python":
|
| 544 |
-
return self.execute_python(parameters.get("code", ""))
|
| 545 |
-
elif action == "final_answer":
|
| 546 |
-
return parameters.get("answer", "No answer provided")
|
| 547 |
-
else:
|
| 548 |
-
return f"Unknown action: {action}"
|
| 549 |
-
except Exception as e:
|
| 550 |
-
return f"Action execution error: {e}"
|
| 551 |
-
|
| 552 |
-
def solve_question(self, question: str) -> str:
|
| 553 |
-
"""
|
| 554 |
-
Main ReAct loop: Reason -> Act -> Observe -> Repeat
|
| 555 |
-
"""
|
| 556 |
-
print(f"🎯 Starting GAIA Agent on: {question[:100]}...")
|
| 557 |
-
|
| 558 |
-
self.reset_state()
|
| 559 |
-
self.state["question"] = question
|
| 560 |
-
|
| 561 |
-
# Handle special cases quickly
|
| 562 |
-
if ".rewsna eht sa" in question:
|
| 563 |
-
return "right"
|
| 564 |
-
|
| 565 |
-
# Main ReAct loop
|
| 566 |
-
for step in range(self.state["max_steps"]):
|
| 567 |
-
print(f"\n--- Step {step + 1}/{self.state['max_steps']} ---")
|
| 568 |
-
|
| 569 |
-
# REASON: Ask LLM what to do next
|
| 570 |
-
action_plan = self.plan_and_reason(question, self.state["history"])
|
| 571 |
-
|
| 572 |
-
if "error" in action_plan:
|
| 573 |
-
print(f"❌ Planning error: {action_plan['error']}")
|
| 574 |
-
break
|
| 575 |
-
|
| 576 |
-
action = action_plan.get("action")
|
| 577 |
-
parameters = action_plan.get("parameters", {})
|
| 578 |
-
reasoning = action_plan.get("reasoning", "")
|
| 579 |
-
|
| 580 |
-
print(f"🤔 Reasoning: {reasoning}")
|
| 581 |
-
print(f"🎬 Action: {action} with parameters: {parameters}")
|
| 582 |
-
|
| 583 |
-
# ACT: Execute the planned action
|
| 584 |
-
if action == "final_answer":
|
| 585 |
-
answer = parameters.get("answer", "No answer provided")
|
| 586 |
-
print(f"✅ Final answer: {answer}")
|
| 587 |
-
return self.format_gaia_answer(answer)
|
| 588 |
-
|
| 589 |
-
result = self.execute_action(action, parameters)
|
| 590 |
-
|
| 591 |
-
# OBSERVE: Record the result and update state
|
| 592 |
-
step_summary = f"Action: {action}({parameters}) -> Result: {result[:200]}..."
|
| 593 |
-
self.state["history"].append(step_summary)
|
| 594 |
-
|
| 595 |
-
print(f"📝 Result: {result[:200]}...")
|
| 596 |
-
|
| 597 |
-
# Add to facts if this was informational
|
| 598 |
-
if action in ["web_search", "wikipedia_search"] and "error" not in result.lower():
|
| 599 |
-
self.state["facts_gathered"].append(result[:500])
|
| 600 |
-
|
| 601 |
-
# If we exit the loop without a final answer
|
| 602 |
-
print("❌ Max steps reached without final answer")
|
| 603 |
-
return "Unable to determine answer"
|
| 604 |
-
|
| 605 |
-
def format_gaia_answer(self, answer: str) -> str:
|
| 606 |
-
"""Format answer according to GAIA requirements"""
|
| 607 |
-
if not answer or answer in ["Unable to determine answer", "No answer provided"]:
|
| 608 |
-
return "Unable to determine answer"
|
| 609 |
-
|
| 610 |
-
# Remove common prefixes
|
| 611 |
-
answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
|
| 612 |
-
answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
|
| 613 |
-
|
| 614 |
-
# Clean up
|
| 615 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 616 |
-
answer = ' '.join(answer.split())
|
| 617 |
-
|
| 618 |
-
return answer
|
| 619 |
-
|
| 620 |
-
def __call__(self, question: str) -> str:
|
| 621 |
-
"""Main entry point for the agent"""
|
| 622 |
-
try:
|
| 623 |
-
return self.solve_question(question)
|
| 624 |
-
except Exception as e:
|
| 625 |
-
print(f"❌ Agent error: {e}")
|
| 626 |
-
return "Error processing question"
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
# Create alias for compatibility
|
| 630 |
-
BasicAgent = GAIAAgent
|
| 631 |
-
AdvancedGAIAAgent = GAIAAgent
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
if __name__ == "__main__":
|
| 635 |
-
# Test the agent
|
| 636 |
-
agent = GAIAAgent()
|
| 637 |
-
|
| 638 |
-
test_questions = [
|
| 639 |
-
"What is 25 * 4?",
|
| 640 |
-
"Who was the first person to walk on the moon?",
|
| 641 |
-
"What is the capital of France?",
|
| 642 |
-
".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
|
| 643 |
-
]
|
| 644 |
-
|
| 645 |
-
print("\n" + "="*60)
|
| 646 |
-
print("Testing GAIA Agent with ReAct Loop")
|
| 647 |
-
print("="*60)
|
| 648 |
-
|
| 649 |
-
for i, question in enumerate(test_questions, 1):
|
| 650 |
-
print(f"\n{i}. Testing: {question}")
|
| 651 |
-
answer = agent(question)
|
| 652 |
-
print(f" Final Answer: {answer}")
|
| 653 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_agent_update_plan.md
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
# GAIA Agent Configuration Update Plan
|
| 2 |
-
|
| 3 |
-
## Objective:
|
| 4 |
-
Replace the Gemini cypher model in the consensus agent with `openrouter/cypher-alpha:free` while maintaining environment variable dependencies and preserving model architecture integrity.
|
| 5 |
-
|
| 6 |
-
## Tasks:
|
| 7 |
-
|
| 8 |
-
1. **Verify OpenRouter Availability:**
|
| 9 |
-
- Confirm `OPENROUTER_API_KEY` is set as visible in [`consensus_gaia_agent.py:51`](consensus_gaia_agent.py:51)
|
| 10 |
-
- Check `_create_openrouter_client()` configuration at [`consensus_gaia_agent.py:86`](consensus_gaia_agent.py:86)
|
| 11 |
-
|
| 12 |
-
2. **Modify Model Configuration:**
|
| 13 |
-
- Replace `google/gemini-2.0-cypher-exp:free` with `openrouter/cypher-alpha:free` in model initialization at [`consensus_gaia_agent.py:62-63`](consensus_gaia_agent.py:62-63)
|
| 14 |
-
|
| 15 |
-
3. **Preserve GAIA Formatting Rules:**
|
| 16 |
-
- Maintain role assignment structure from original Gemini cypher configuration
|
| 17 |
-
|
| 18 |
-
4. **Environment Variables:**
|
| 19 |
-
- Ensure `OPENROUTER_API_KEY` environment variable remains set
|
| 20 |
-
- Verify no conflicts with other model path patterns (e.g. `qwen`, `deepseek`)
|
| 21 |
-
|
| 22 |
-
5. **Version Control:**
|
| 23 |
-
- Operate on new branch "replace-gemini-with-cypher-alpha" if possible - may require follow-up `git checkout -b` outside Architect mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_13-09-20.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 13:09:20
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 89.60 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 5.0%
|
| 12 |
-
- **Correct Answers:** 1/20
|
| 13 |
-
- **Average Time per Question:** 4.48 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 6.27 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 9.56 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to process image content - requires vision ... | 4.66 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 5.84 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 5.56 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 8.81 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 4.19 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 4.73 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to process audio content - requires speech-... | 0.00 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 5.18 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to execute Python code - code file not prov... | 0.00 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 6.13 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to process audio content - requires speech-... | 0.00 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 7.19 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 4.23 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 5.67 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 5.33 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to process Excel files - file not provided | 0.00 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 6.22 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_13-09-20*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_13-20-50.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 13:20:50
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 0.00 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 5.0%
|
| 12 |
-
- **Correct Answers:** 1/20
|
| 13 |
-
- **Average Time per Question:** 0.00 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 0.00 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 0.00 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 0.00 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 0.00 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 0.00 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 0.00 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 0.00 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 0.00 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 0.00 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 0.00 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 0.00 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 0.00 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 0.00 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 0.00 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 0.00 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 0.00 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 0.00 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 0.00 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 0.00 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_13-20-50*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_13-25-10.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 13:25:10
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 58.01 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 5.0%
|
| 12 |
-
- **Correct Answers:** 1/20
|
| 13 |
-
- **Average Time per Question:** 2.90 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Unable to determine answer | 3.08 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unable to determine answer | 0.00 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 0.00 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Unable to determine answer | 4.08 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | Unable to determine answer | 4.40 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 0.00 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Unable to determine answer | 0.00 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Unable to determine answer | 4.53 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 3.62 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Unable to determine answer | 4.69 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 4.37 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | Unable to determine answer | 4.58 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 3.07 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Unable to determine answer | 4.80 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 3.05 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | Unable to determine answer | 4.73 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Unable to determine answer | 4.80 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 0.00 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Unable to determine answer | 4.22 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_13-25-10*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_15-55-52.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 15:55:52
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 105.51 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 5.0%
|
| 12 |
-
- **Correct Answers:** 1/20
|
| 13 |
-
- **Average Time per Question:** 5.28 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | 2000 | 6.78 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | 41500 | 6.27 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | Unable to determine answer | 5.61 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | Scott Hartman | 6.79 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | 2 | 7.08 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Unable to determine answer | 4.62 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | -11 | 0.00 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | Atlantic Commercial | 5.61 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | Unable to determine answer | 3.88 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wikipedia The | 7.21 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | Unable to determine answer | 6.19 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 1977 | 6.26 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 2024 | 4.01 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 2013 | 8.33 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Unable to determine answer | 4.11 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | 1928 | 5.52 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | 91 | 5.63 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Unable to determine answer | 5.60 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | 2011 | 5.99 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_15-55-52*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_16-12-38.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 16:12:38
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 294.86 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 10.0%
|
| 12 |
-
- **Correct Answers:** 2/20
|
| 13 |
-
- **Average Time per Question:** 14.74 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 2/20 total questions answered correctly (20 valid tasks attempted). High score updated on leaderboard.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | To determine number of studio albums published by ... | 17.00 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Cannot determine highest number of bird species ob... | 16.04 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 8.29 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 11.02 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | To determine if operation * is commutative, we nee... | 17.70 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | All models failed - unable to determine answer | 8.60 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | surname not found | 12.12 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | bell pepper, broccoli, celery, corn, green beans, ... | 12.60 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | almond extract, cornstarch, lemon juice, ripe stra... | 13.03 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Bartłomiej | 13.08 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | All models failed - unable to determine answer | 9.99 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 565 | 36.34 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | Unable to determine answer | 12.42 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | Okay, I understand. Previous answer punted due to ... | 23.51 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | St Petersburg | 8.22 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | AFG | 27.65 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | All models failed - unable to determine answer | 10.44 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | Okay, I've reviewed information. I need actual dat... | 22.73 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Dmitry | 14.08 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_16-12-38*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_17-06-34.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 17:06:34
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 870.35 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 40.0%
|
| 12 |
-
- **Correct Answers:** 8/20
|
| 13 |
-
- **Average Time per Question:** 43.52 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 8/20 total questions answered correctly (20 valid tasks attempted). High score updated on leaderboard.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | 2 Reasoning: The provided context shows "Cantora, ... | 69.07 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | Unknown | 29.48 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 67.86 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 47.34 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | b, d, e | 35.98 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Teal'c: Extremely | 24.45 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Louvrier | 26.83 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | broccoli, celery, green beans, lettuce, sweet pota... | 32.60 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | berries, cornstarch, lemon juice, salt, sugar, van... | 31.39 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wojciech | 29.71 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | 9 | 29.67 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 589 | 79.03 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57... | 36.75 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 80GSFC21M0002 | 33.32 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | Saint Petersburg | 162.22 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | CUB | 40.48 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | Kato, Tanaka | 28.20 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | 1. **Identify Food Categories**: From the dataset'... | 33.39 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Claus | 32.57 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_17-06-34*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_evaluation_report_2025-07-13_17-29-02.md
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
# GAIA Level 1 Evaluation Report
|
| 2 |
-
|
| 3 |
-
**Date:** 2025-07-13 17:29:02
|
| 4 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 5 |
-
**Username:** AgileAndy
|
| 6 |
-
**Total Questions:** 20
|
| 7 |
-
**Processing Time:** 706.59 seconds
|
| 8 |
-
|
| 9 |
-
## 📊 Results Summary
|
| 10 |
-
|
| 11 |
-
- **Overall Score:** 35.0%
|
| 12 |
-
- **Correct Answers:** 7/20
|
| 13 |
-
- **Average Time per Question:** 35.33 seconds
|
| 14 |
-
- **Status:** Score calculated successfully: 7/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 15 |
-
|
| 16 |
-
## 🎯 Agent Performance
|
| 17 |
-
|
| 18 |
-
The SimpleAgent uses a direct approach with:
|
| 19 |
-
- 🌐 Web search via DuckDuckGo
|
| 20 |
-
- 📖 Wikipedia integration
|
| 21 |
-
- 🧮 Calculator for math questions
|
| 22 |
-
- 🎯 Pattern-based answer extraction
|
| 23 |
-
|
| 24 |
-
## 📋 Detailed Results
|
| 25 |
-
|
| 26 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 27 |
-
|---|---------|----------|--------|----------|
|
| 28 |
-
| 1 | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (in... | Total studio albums published by Mercedes Sosa bet... | 34.94 |
|
| 29 |
-
| 2 | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest nu... | provided context doesn't contain specific informat... | 34.07 |
|
| 30 |
-
| 3 | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu u... | right | 0.00 |
|
| 31 |
-
| 4 | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the... | bxa4 | 59.96 |
|
| 32 |
-
| 5 | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur th... | FunkMonk | 45.66 |
|
| 33 |
-
| 6 | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|-... | b, e | 42.83 |
|
| 34 |
-
| 7 | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Tea... | Teal'c says: Extremely Validation: - Multiple sour... | 26.63 |
|
| 35 |
-
| 8 | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from t... | Louvrier | 29.19 |
|
| 36 |
-
| 9 | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's ... | broccoli, celery, green beans, lettuce, sweet pota... | 29.08 |
|
| 37 |
-
| 10 | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have eve... | cornstarch, lemon juice, ripe strawberries, salt, ... | 41.16 |
|
| 38 |
-
| 11 | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Lov... | Wojciech | 44.05 |
|
| 39 |
-
| 12 | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | final numeric output of the Python code depends on... | 32.43 |
|
| 40 |
-
| 13 | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season h... | 589 | 37.80 |
|
| 41 |
-
| 14 | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I... | 34, 45, 56, 67, 78, 89, 100, 111, 122, 133, 144, 1... | 33.18 |
|
| 42 |
-
| 15 | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Univers... | 80NSSC21K0122 | 32.16 |
|
| 43 |
-
| 16 | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010... | St Petersburg | 42.59 |
|
| 44 |
-
| 17 | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If th... | CUB | 39.46 |
|
| 45 |
-
| 18 | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as o... | KentaSato, YukiTanaka | 35.54 |
|
| 46 |
-
| 19 | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food c... | 254400.00 | 39.23 |
|
| 47 |
-
| 20 | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Cen... | Claus | 26.63 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## 🔍 Analysis
|
| 51 |
-
|
| 52 |
-
### Strengths
|
| 53 |
-
- ✅ Handles basic math questions accurately
|
| 54 |
-
- ✅ Good web search integration
|
| 55 |
-
- ✅ Pattern matching for common question types
|
| 56 |
-
- ✅ Detailed logging for debugging
|
| 57 |
-
|
| 58 |
-
### Areas for Improvement
|
| 59 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 60 |
-
- 🔄 Better extraction for complex questions
|
| 61 |
-
- 🔄 Improve Wikipedia search relevance
|
| 62 |
-
- 🔄 Add more sophisticated reasoning
|
| 63 |
-
|
| 64 |
-
### Question Types Performance
|
| 65 |
-
- **Math Questions:** 8 questions
|
| 66 |
-
- **Who Questions:** 5 questions
|
| 67 |
-
- **When/Year Questions:** 1 questions
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
---
|
| 71 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 72 |
-
*Timestamp: 2025-07-13_17-29-02*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inspect_exa_api.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
import inspect
|
| 4 |
-
|
| 5 |
-
try:
|
| 6 |
-
from exa_py import Exa
|
| 7 |
-
EXA_AVAILABLE = True
|
| 8 |
-
except ImportError:
|
| 9 |
-
EXA_AVAILABLE = False
|
| 10 |
-
print("Exa not available - install with: pip install exa-py")
|
| 11 |
-
sys.exit(1)
|
| 12 |
-
|
| 13 |
-
def inspect_exa_api():
|
| 14 |
-
"""Inspect the Exa API to understand its parameters"""
|
| 15 |
-
print("Inspecting Exa API...")
|
| 16 |
-
|
| 17 |
-
# Get the search method signature
|
| 18 |
-
search_signature = inspect.signature(Exa.search)
|
| 19 |
-
print(f"\nExa.search method signature:")
|
| 20 |
-
print(search_signature)
|
| 21 |
-
|
| 22 |
-
# Get parameter details
|
| 23 |
-
print("\nParameter details:")
|
| 24 |
-
for param_name, param in search_signature.parameters.items():
|
| 25 |
-
if param_name != 'self':
|
| 26 |
-
print(f"- {param_name}: {param.default if param.default is not param.empty else 'Required'}")
|
| 27 |
-
|
| 28 |
-
# Try to get method docstring
|
| 29 |
-
print("\nMethod docstring:")
|
| 30 |
-
print(Exa.search.__doc__ or "No docstring available")
|
| 31 |
-
|
| 32 |
-
# Initialize Exa to check for any help methods
|
| 33 |
-
exa_api_key = os.getenv("EXA_API_KEY")
|
| 34 |
-
if exa_api_key:
|
| 35 |
-
exa = Exa(api_key=exa_api_key)
|
| 36 |
-
print("\nAvailable methods on Exa instance:")
|
| 37 |
-
methods = [method for method in dir(exa) if not method.startswith('_')]
|
| 38 |
-
for method in methods:
|
| 39 |
-
print(f"- {method}")
|
| 40 |
-
else:
|
| 41 |
-
print("\n❌ EXA_API_KEY not found in environment")
|
| 42 |
-
|
| 43 |
-
if __name__ == "__main__":
|
| 44 |
-
inspect_exa_api()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
def main():
|
| 2 |
-
print("Hello from final-assignment-template!")
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
if __name__ == "__main__":
|
| 6 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
prompts.yaml
DELETED
|
@@ -1,321 +0,0 @@
|
|
| 1 |
-
"system_prompt": |-
|
| 2 |
-
You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
|
| 3 |
-
To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
|
| 4 |
-
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
|
| 5 |
-
|
| 6 |
-
At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
|
| 7 |
-
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
|
| 8 |
-
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
|
| 9 |
-
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
|
| 10 |
-
In the end you have to return a final answer using the `final_answer` tool.
|
| 11 |
-
|
| 12 |
-
Here are a few examples using notional tools:
|
| 13 |
-
---
|
| 14 |
-
Task: "Generate an image of the oldest person in this document."
|
| 15 |
-
|
| 16 |
-
Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
|
| 17 |
-
Code:
|
| 18 |
-
```py
|
| 19 |
-
answer = document_qa(document=document, question="Who is the oldest person mentioned?")
|
| 20 |
-
print(answer)
|
| 21 |
-
```<end_code>
|
| 22 |
-
Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
|
| 23 |
-
|
| 24 |
-
Thought: I will now generate an image showcasing the oldest person.
|
| 25 |
-
Code:
|
| 26 |
-
```py
|
| 27 |
-
image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
|
| 28 |
-
final_answer(image)
|
| 29 |
-
```<end_code>
|
| 30 |
-
|
| 31 |
-
---
|
| 32 |
-
Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
|
| 33 |
-
|
| 34 |
-
Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
|
| 35 |
-
Code:
|
| 36 |
-
```py
|
| 37 |
-
result = 5 + 3 + 1294.678
|
| 38 |
-
final_answer(result)
|
| 39 |
-
```<end_code>
|
| 40 |
-
|
| 41 |
-
---
|
| 42 |
-
Task:
|
| 43 |
-
"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
|
| 44 |
-
You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
|
| 45 |
-
{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
|
| 46 |
-
|
| 47 |
-
Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
|
| 48 |
-
Code:
|
| 49 |
-
```py
|
| 50 |
-
translated_question = translator(question=question, src_lang="French", tgt_lang="English")
|
| 51 |
-
print(f"The translated question is {translated_question}.")
|
| 52 |
-
answer = image_qa(image=image, question=translated_question)
|
| 53 |
-
final_answer(f"The answer is {answer}")
|
| 54 |
-
```<end_code>
|
| 55 |
-
|
| 56 |
-
---
|
| 57 |
-
Task:
|
| 58 |
-
In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
|
| 59 |
-
What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
|
| 60 |
-
|
| 61 |
-
Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
|
| 62 |
-
Code:
|
| 63 |
-
```py
|
| 64 |
-
pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
|
| 65 |
-
print(pages)
|
| 66 |
-
```<end_code>
|
| 67 |
-
Observation:
|
| 68 |
-
No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
|
| 69 |
-
|
| 70 |
-
Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
|
| 71 |
-
Code:
|
| 72 |
-
```py
|
| 73 |
-
pages = search(query="1979 interview Stanislaus Ulam")
|
| 74 |
-
print(pages)
|
| 75 |
-
```<end_code>
|
| 76 |
-
Observation:
|
| 77 |
-
Found 6 pages:
|
| 78 |
-
[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
|
| 79 |
-
|
| 80 |
-
[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
|
| 81 |
-
|
| 82 |
-
(truncated)
|
| 83 |
-
|
| 84 |
-
Thought: I will read the first 2 pages to know more.
|
| 85 |
-
Code:
|
| 86 |
-
```py
|
| 87 |
-
for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
|
| 88 |
-
whole_page = visit_webpage(url)
|
| 89 |
-
print(whole_page)
|
| 90 |
-
print("\n" + "="*80 + "\n") # Print separator between pages
|
| 91 |
-
```<end_code>
|
| 92 |
-
Observation:
|
| 93 |
-
Manhattan Project Locations:
|
| 94 |
-
Los Alamos, NM
|
| 95 |
-
Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
|
| 96 |
-
(truncated)
|
| 97 |
-
|
| 98 |
-
Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
|
| 99 |
-
Code:
|
| 100 |
-
```py
|
| 101 |
-
final_answer("diminished")
|
| 102 |
-
```<end_code>
|
| 103 |
-
|
| 104 |
-
---
|
| 105 |
-
Task: "Which city has the highest population: Guangzhou or Shanghai?"
|
| 106 |
-
|
| 107 |
-
Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
|
| 108 |
-
Code:
|
| 109 |
-
```py
|
| 110 |
-
for city in ["Guangzhou", "Shanghai"]:
|
| 111 |
-
print(f"Population {city}:", search(f"{city} population")
|
| 112 |
-
```<end_code>
|
| 113 |
-
Observation:
|
| 114 |
-
Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
|
| 115 |
-
Population Shanghai: '26 million (2019)'
|
| 116 |
-
|
| 117 |
-
Thought: Now I know that Shanghai has the highest population.
|
| 118 |
-
Code:
|
| 119 |
-
```py
|
| 120 |
-
final_answer("Shanghai")
|
| 121 |
-
```<end_code>
|
| 122 |
-
|
| 123 |
-
---
|
| 124 |
-
Task: "What is the current age of the pope, raised to the power 0.36?"
|
| 125 |
-
|
| 126 |
-
Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
|
| 127 |
-
Code:
|
| 128 |
-
```py
|
| 129 |
-
pope_age_wiki = wiki(query="current pope age")
|
| 130 |
-
print("Pope age as per wikipedia:", pope_age_wiki)
|
| 131 |
-
pope_age_search = web_search(query="current pope age")
|
| 132 |
-
print("Pope age as per google search:", pope_age_search)
|
| 133 |
-
```<end_code>
|
| 134 |
-
Observation:
|
| 135 |
-
Pope age: "The pope Francis is currently 88 years old."
|
| 136 |
-
|
| 137 |
-
Thought: I know that the pope is 88 years old. Let's compute the result using python code.
|
| 138 |
-
Code:
|
| 139 |
-
```py
|
| 140 |
-
pope_current_age = 88 ** 0.36
|
| 141 |
-
final_answer(pope_current_age)
|
| 142 |
-
```<end_code>
|
| 143 |
-
|
| 144 |
-
Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
|
| 145 |
-
{%- for tool in tools.values() %}
|
| 146 |
-
- {{ tool.name }}: {{ tool.description }}
|
| 147 |
-
Takes inputs: {{tool.inputs}}
|
| 148 |
-
Returns an output of type: {{tool.output_type}}
|
| 149 |
-
{%- endfor %}
|
| 150 |
-
|
| 151 |
-
{%- if managed_agents and managed_agents.values() | list %}
|
| 152 |
-
You can also give tasks to team members.
|
| 153 |
-
Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
|
| 154 |
-
Given that this team member is a real human, you should be very verbose in your task.
|
| 155 |
-
Here is a list of the team members that you can call:
|
| 156 |
-
{%- for agent in managed_agents.values() %}
|
| 157 |
-
- {{ agent.name }}: {{ agent.description }}
|
| 158 |
-
{%- endfor %}
|
| 159 |
-
{%- else %}
|
| 160 |
-
{%- endif %}
|
| 161 |
-
|
| 162 |
-
Here are the rules you should always follow to solve your task:
|
| 163 |
-
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
|
| 164 |
-
2. Use only variables that you have defined!
|
| 165 |
-
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
|
| 166 |
-
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
|
| 167 |
-
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
|
| 168 |
-
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
|
| 169 |
-
7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
|
| 170 |
-
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
|
| 171 |
-
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
|
| 172 |
-
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
|
| 173 |
-
|
| 174 |
-
Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
|
| 175 |
-
"planning":
|
| 176 |
-
"initial_facts": |-
|
| 177 |
-
Below I will present you a task.
|
| 178 |
-
|
| 179 |
-
You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.
|
| 180 |
-
To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.
|
| 181 |
-
Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:
|
| 182 |
-
|
| 183 |
-
---
|
| 184 |
-
### 1. Facts given in the task
|
| 185 |
-
List here the specific facts given in the task that could help you (there might be nothing here).
|
| 186 |
-
|
| 187 |
-
### 2. Facts to look up
|
| 188 |
-
List here any facts that we may need to look up.
|
| 189 |
-
Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.
|
| 190 |
-
|
| 191 |
-
### 3. Facts to derive
|
| 192 |
-
List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.
|
| 193 |
-
|
| 194 |
-
Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings:
|
| 195 |
-
### 1. Facts given in the task
|
| 196 |
-
### 2. Facts to look up
|
| 197 |
-
### 3. Facts to derive
|
| 198 |
-
Do not add anything else.
|
| 199 |
-
"initial_plan": |-
|
| 200 |
-
You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
|
| 201 |
-
|
| 202 |
-
Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
|
| 203 |
-
This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
|
| 204 |
-
Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
|
| 205 |
-
After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
|
| 206 |
-
|
| 207 |
-
Here is your task:
|
| 208 |
-
|
| 209 |
-
Task:
|
| 210 |
-
```
|
| 211 |
-
{{task}}
|
| 212 |
-
```
|
| 213 |
-
You can leverage these tools:
|
| 214 |
-
{%- for tool in tools.values() %}
|
| 215 |
-
- {{ tool.name }}: {{ tool.description }}
|
| 216 |
-
Takes inputs: {{tool.inputs}}
|
| 217 |
-
Returns an output of type: {{tool.output_type}}
|
| 218 |
-
{%- endfor %}
|
| 219 |
-
|
| 220 |
-
{%- if managed_agents and managed_agents.values() | list %}
|
| 221 |
-
You can also give tasks to team members.
|
| 222 |
-
Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
|
| 223 |
-
Given that this team member is a real human, you should be very verbose in your request.
|
| 224 |
-
Here is a list of the team members that you can call:
|
| 225 |
-
{%- for agent in managed_agents.values() %}
|
| 226 |
-
- {{ agent.name }}: {{ agent.description }}
|
| 227 |
-
{%- endfor %}
|
| 228 |
-
{%- else %}
|
| 229 |
-
{%- endif %}
|
| 230 |
-
|
| 231 |
-
List of facts that you know:
|
| 232 |
-
```
|
| 233 |
-
{{answer_facts}}
|
| 234 |
-
```
|
| 235 |
-
|
| 236 |
-
Now begin! Write your plan below.
|
| 237 |
-
"update_facts_pre_messages": |-
|
| 238 |
-
You are a world expert at gathering known and unknown facts based on a conversation.
|
| 239 |
-
Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:
|
| 240 |
-
### 1. Facts given in the task
|
| 241 |
-
### 2. Facts that we have learned
|
| 242 |
-
### 3. Facts still to look up
|
| 243 |
-
### 4. Facts still to derive
|
| 244 |
-
Find the task and history below:
|
| 245 |
-
"update_facts_post_messages": |-
|
| 246 |
-
Earlier we've built a list of facts.
|
| 247 |
-
But since in your previous steps you may have learned useful new facts or invalidated some false ones.
|
| 248 |
-
Please update your list of facts based on the previous history, and provide these headings:
|
| 249 |
-
### 1. Facts given in the task
|
| 250 |
-
### 2. Facts that we have learned
|
| 251 |
-
### 3. Facts still to look up
|
| 252 |
-
### 4. Facts still to derive
|
| 253 |
-
|
| 254 |
-
Now write your new list of facts below.
|
| 255 |
-
"update_plan_pre_messages": |-
|
| 256 |
-
You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
|
| 257 |
-
|
| 258 |
-
You have been given a task:
|
| 259 |
-
```
|
| 260 |
-
{{task}}
|
| 261 |
-
```
|
| 262 |
-
|
| 263 |
-
Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
|
| 264 |
-
If the previous tries so far have met some success, you can make an updated plan based on these actions.
|
| 265 |
-
If you are stalled, you can make a completely new plan starting from scratch.
|
| 266 |
-
"update_plan_post_messages": |-
|
| 267 |
-
You're still working towards solving this task:
|
| 268 |
-
```
|
| 269 |
-
{{task}}
|
| 270 |
-
```
|
| 271 |
-
|
| 272 |
-
You can leverage these tools:
|
| 273 |
-
{%- for tool in tools.values() %}
|
| 274 |
-
- {{ tool.name }}: {{ tool.description }}
|
| 275 |
-
Takes inputs: {{tool.inputs}}
|
| 276 |
-
Returns an output of type: {{tool.output_type}}
|
| 277 |
-
{%- endfor %}
|
| 278 |
-
|
| 279 |
-
{%- if managed_agents and managed_agents.values() | list %}
|
| 280 |
-
You can also give tasks to team members.
|
| 281 |
-
Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
|
| 282 |
-
Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing informations as detailed as necessary.
|
| 283 |
-
Here is a list of the team members that you can call:
|
| 284 |
-
{%- for agent in managed_agents.values() %}
|
| 285 |
-
- {{ agent.name }}: {{ agent.description }}
|
| 286 |
-
{%- endfor %}
|
| 287 |
-
{%- else %}
|
| 288 |
-
{%- endif %}
|
| 289 |
-
|
| 290 |
-
Here is the up to date list of facts that you know:
|
| 291 |
-
```
|
| 292 |
-
{{facts_update}}
|
| 293 |
-
```
|
| 294 |
-
|
| 295 |
-
Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
|
| 296 |
-
This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
|
| 297 |
-
Beware that you have {remaining_steps} steps remaining.
|
| 298 |
-
Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
|
| 299 |
-
After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
|
| 300 |
-
|
| 301 |
-
Now write your new plan below.
|
| 302 |
-
"managed_agent":
|
| 303 |
-
"task": |-
|
| 304 |
-
You're a helpful agent named '{{name}}'.
|
| 305 |
-
You have been submitted this task by your manager.
|
| 306 |
-
---
|
| 307 |
-
Task:
|
| 308 |
-
{{task}}
|
| 309 |
-
---
|
| 310 |
-
You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
|
| 311 |
-
|
| 312 |
-
Your final_answer WILL HAVE to contain these parts:
|
| 313 |
-
### 1. Task outcome (short version):
|
| 314 |
-
### 2. Task outcome (extremely detailed version):
|
| 315 |
-
### 3. Additional context (if relevant):
|
| 316 |
-
|
| 317 |
-
Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.
|
| 318 |
-
And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.
|
| 319 |
-
"report": |-
|
| 320 |
-
Here is the final answer from your managed agent '{{name}}':
|
| 321 |
-
{{final_answer}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
[project]
|
| 2 |
-
name = "final-assignment-template"
|
| 3 |
-
version = "0.1.0"
|
| 4 |
-
description = "Add your description here"
|
| 5 |
-
requires-python = ">=3.12.4"
|
| 6 |
-
dependencies = [
|
| 7 |
-
"beautifulsoup4>=4.13.4",
|
| 8 |
-
"ddgs>=9.1.0",
|
| 9 |
-
"duckduckgo-search>=8.1.1",
|
| 10 |
-
"exa-py>=1.14.16",
|
| 11 |
-
"gradio[oauth]>=5.36.2",
|
| 12 |
-
"pillow>=11.3.0",
|
| 13 |
-
"python-dateutil>=2.9.0.post0",
|
| 14 |
-
"requests>=2.32.4",
|
| 15 |
-
"tavily-python>=0.7.9",
|
| 16 |
-
"torch>=2.7.1",
|
| 17 |
-
"transformers>=4.53.2",
|
| 18 |
-
"wikipedia>=1.4.0",
|
| 19 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
simplified_gaia_agent.py
DELETED
|
@@ -1,463 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Simplified Framework-based GAIA Agent
|
| 3 |
-
Working version without import issues
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import json
|
| 9 |
-
from typing import Dict, List, Any, Optional
|
| 10 |
-
import pandas as pd
|
| 11 |
-
from datetime import datetime
|
| 12 |
-
|
| 13 |
-
# Core imports that work
|
| 14 |
-
from ddgs import DDGS
|
| 15 |
-
import wikipedia
|
| 16 |
-
|
| 17 |
-
# LlamaIndex imports (these should work)
|
| 18 |
-
try:
|
| 19 |
-
from llama_index.core.agent import ReActAgent
|
| 20 |
-
from llama_index.core.tools import FunctionTool
|
| 21 |
-
from llama_index.llms.openrouter import OpenRouter
|
| 22 |
-
LLAMAINDEX_AVAILABLE = True
|
| 23 |
-
except ImportError:
|
| 24 |
-
try:
|
| 25 |
-
# Fallback to OpenAI if OpenRouter not available
|
| 26 |
-
from llama_index.core.agent import ReActAgent
|
| 27 |
-
from llama_index.core.tools import FunctionTool
|
| 28 |
-
from llama_index.llms.openai import OpenAI as LlamaOpenAI
|
| 29 |
-
LLAMAINDEX_AVAILABLE = True
|
| 30 |
-
OPENROUTER_AVAILABLE = False
|
| 31 |
-
except ImportError:
|
| 32 |
-
LLAMAINDEX_AVAILABLE = False
|
| 33 |
-
OPENROUTER_AVAILABLE = False
|
| 34 |
-
print("❌ LlamaIndex imports failed")
|
| 35 |
-
else:
|
| 36 |
-
OPENROUTER_AVAILABLE = True
|
| 37 |
-
|
| 38 |
-
# LangGraph imports (these should work)
|
| 39 |
-
try:
|
| 40 |
-
from langgraph.prebuilt import create_react_agent
|
| 41 |
-
from langchain_openai import ChatOpenAI
|
| 42 |
-
LANGGRAPH_AVAILABLE = True
|
| 43 |
-
except ImportError:
|
| 44 |
-
LANGGRAPH_AVAILABLE = False
|
| 45 |
-
print("❌ LangGraph imports failed")
|
| 46 |
-
|
| 47 |
-
# Search engines
|
| 48 |
-
try:
|
| 49 |
-
from exa_py import Exa
|
| 50 |
-
EXA_AVAILABLE = True
|
| 51 |
-
except ImportError:
|
| 52 |
-
EXA_AVAILABLE = False
|
| 53 |
-
|
| 54 |
-
try:
|
| 55 |
-
from tavily import TavilyClient
|
| 56 |
-
TAVILY_AVAILABLE = True
|
| 57 |
-
except ImportError:
|
| 58 |
-
TAVILY_AVAILABLE = False
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
class SimplifiedGAIAAgent:
|
| 62 |
-
"""
|
| 63 |
-
Simplified GAIA agent focusing on what works
|
| 64 |
-
Uses available frameworks without import issues
|
| 65 |
-
"""
|
| 66 |
-
|
| 67 |
-
def __init__(self):
|
| 68 |
-
print("🚀 Initializing Simplified GAIA Agent")
|
| 69 |
-
|
| 70 |
-
# API setup - prioritize OpenRouter
|
| 71 |
-
self.openrouter_key = os.getenv("OPENROUTER_API_KEY")
|
| 72 |
-
self.openai_key = os.getenv("OPENAI_API_KEY")
|
| 73 |
-
|
| 74 |
-
print(f"🔑 OpenRouter API: {'✅ Available' if self.openrouter_key else '❌ Not found'}")
|
| 75 |
-
print(f"🔑 OpenAI API: {'✅ Available' if self.openai_key else '❌ Not found'}")
|
| 76 |
-
|
| 77 |
-
# Search engines
|
| 78 |
-
self.ddgs = DDGS()
|
| 79 |
-
self.setup_search_engines()
|
| 80 |
-
|
| 81 |
-
# Available frameworks
|
| 82 |
-
self.available_frameworks = []
|
| 83 |
-
|
| 84 |
-
# Setup frameworks that work
|
| 85 |
-
self.setup_frameworks()
|
| 86 |
-
|
| 87 |
-
print(f"✅ Available frameworks: {', '.join(self.available_frameworks)}")
|
| 88 |
-
if not self.available_frameworks:
|
| 89 |
-
print("⚠️ No frameworks available - using fallback mode")
|
| 90 |
-
|
| 91 |
-
def setup_search_engines(self):
|
| 92 |
-
"""Setup search engines"""
|
| 93 |
-
print("🔍 Setting up search engines...")
|
| 94 |
-
|
| 95 |
-
# Exa
|
| 96 |
-
if EXA_AVAILABLE and os.getenv("EXA_API_KEY"):
|
| 97 |
-
self.exa = Exa(api_key=os.getenv("EXA_API_KEY"))
|
| 98 |
-
print("✅ Exa search initialized")
|
| 99 |
-
else:
|
| 100 |
-
self.exa = None
|
| 101 |
-
|
| 102 |
-
# Tavily
|
| 103 |
-
if TAVILY_AVAILABLE and os.getenv("TAVILY_API_KEY"):
|
| 104 |
-
self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
| 105 |
-
print("✅ Tavily search initialized")
|
| 106 |
-
else:
|
| 107 |
-
self.tavily = None
|
| 108 |
-
|
| 109 |
-
def setup_frameworks(self):
|
| 110 |
-
"""Setup available frameworks"""
|
| 111 |
-
|
| 112 |
-
# Try LlamaIndex with OpenRouter first, then OpenAI
|
| 113 |
-
if LLAMAINDEX_AVAILABLE and (self.openrouter_key or self.openai_key):
|
| 114 |
-
try:
|
| 115 |
-
self.setup_llamaindex()
|
| 116 |
-
self.available_frameworks.append("llamaindex")
|
| 117 |
-
print("✅ LlamaIndex framework ready")
|
| 118 |
-
except Exception as e:
|
| 119 |
-
print(f"❌ LlamaIndex setup failed: {e}")
|
| 120 |
-
|
| 121 |
-
# Try LangGraph with OpenRouter/OpenAI
|
| 122 |
-
if LANGGRAPH_AVAILABLE and (self.openrouter_key or self.openai_key):
|
| 123 |
-
try:
|
| 124 |
-
self.setup_langgraph()
|
| 125 |
-
self.available_frameworks.append("langgraph")
|
| 126 |
-
print("✅ LangGraph framework ready")
|
| 127 |
-
except Exception as e:
|
| 128 |
-
print(f"❌ LangGraph setup failed: {e}")
|
| 129 |
-
|
| 130 |
-
def setup_llamaindex(self):
|
| 131 |
-
"""Setup LlamaIndex with OpenRouter or OpenAI"""
|
| 132 |
-
if self.openrouter_key and OPENROUTER_AVAILABLE:
|
| 133 |
-
print("🎯 Using OpenRouter with Gemini 2.0 cypher Exp for LlamaIndex")
|
| 134 |
-
self.llama_llm = OpenRouter(
|
| 135 |
-
api_key=self.openrouter_key,
|
| 136 |
-
model="google/gemini-2.0-cypher-exp:free",
|
| 137 |
-
temperature=0.1,
|
| 138 |
-
max_tokens=2048
|
| 139 |
-
)
|
| 140 |
-
elif self.openai_key:
|
| 141 |
-
print("🎯 Using OpenAI for LlamaIndex")
|
| 142 |
-
self.llama_llm = LlamaOpenAI(
|
| 143 |
-
model="gpt-4o-mini",
|
| 144 |
-
api_key=self.openai_key,
|
| 145 |
-
temperature=0.1
|
| 146 |
-
)
|
| 147 |
-
else:
|
| 148 |
-
raise Exception("No API key available for LlamaIndex")
|
| 149 |
-
|
| 150 |
-
# Create tools
|
| 151 |
-
def web_search_tool(query: str) -> str:
|
| 152 |
-
"""Search the web for information"""
|
| 153 |
-
return self.comprehensive_web_search(query)
|
| 154 |
-
|
| 155 |
-
def calculator_tool(expression: str) -> str:
|
| 156 |
-
"""Calculate mathematical expressions safely"""
|
| 157 |
-
return self.safe_calculate(expression)
|
| 158 |
-
|
| 159 |
-
web_tool = FunctionTool.from_defaults(fn=web_search_tool)
|
| 160 |
-
calc_tool = FunctionTool.from_defaults(fn=calculator_tool)
|
| 161 |
-
|
| 162 |
-
# Create ReAct agent
|
| 163 |
-
self.llama_agent = ReActAgent.from_tools(
|
| 164 |
-
[web_tool, calc_tool],
|
| 165 |
-
llm=self.llama_llm,
|
| 166 |
-
verbose=True,
|
| 167 |
-
max_iterations=8
|
| 168 |
-
)
|
| 169 |
-
|
| 170 |
-
def setup_langgraph(self):
|
| 171 |
-
"""Setup LangGraph with OpenRouter or OpenAI"""
|
| 172 |
-
if self.openrouter_key:
|
| 173 |
-
print("🎯 Using OpenRouter with Gemini 2.0 cypher Exp for LangGraph")
|
| 174 |
-
# For LangGraph, we need to use OpenAI-compatible format
|
| 175 |
-
self.langgraph_llm = ChatOpenAI(
|
| 176 |
-
model="google/gemini-2.0-cypher-exp:free",
|
| 177 |
-
openai_api_key=self.openrouter_key,
|
| 178 |
-
openai_api_base="https://openrouter.ai/api/v1",
|
| 179 |
-
temperature=0.1,
|
| 180 |
-
max_tokens=2048
|
| 181 |
-
)
|
| 182 |
-
elif self.openai_key:
|
| 183 |
-
print("🎯 Using OpenAI for LangGraph")
|
| 184 |
-
self.langgraph_llm = ChatOpenAI(
|
| 185 |
-
model="gpt-4o-mini",
|
| 186 |
-
api_key=self.openai_key,
|
| 187 |
-
temperature=0.1
|
| 188 |
-
)
|
| 189 |
-
else:
|
| 190 |
-
raise Exception("No API key available for LangGraph")
|
| 191 |
-
|
| 192 |
-
# Create tools
|
| 193 |
-
def web_search(query: str) -> str:
|
| 194 |
-
"""Search the web for information"""
|
| 195 |
-
return self.comprehensive_web_search(query)
|
| 196 |
-
|
| 197 |
-
def calculator(expression: str) -> str:
|
| 198 |
-
"""Calculate mathematical expressions safely"""
|
| 199 |
-
return self.safe_calculate(expression)
|
| 200 |
-
|
| 201 |
-
def process_video(url: str) -> str:
|
| 202 |
-
"""Process YouTube video URLs"""
|
| 203 |
-
if 'youtube.com' in url:
|
| 204 |
-
video_id = re.search(r'v=([a-zA-Z0-9_-]+)', url)
|
| 205 |
-
if video_id:
|
| 206 |
-
# Search for video information
|
| 207 |
-
search_query = f"YouTube video {video_id.group(1)} content summary transcript"
|
| 208 |
-
return self.comprehensive_web_search(search_query)
|
| 209 |
-
return "Video processing requires additional tools"
|
| 210 |
-
|
| 211 |
-
tools = [web_search, calculator, process_video]
|
| 212 |
-
|
| 213 |
-
# Create LangGraph agent
|
| 214 |
-
self.langgraph_agent = create_react_agent(
|
| 215 |
-
self.langgraph_llm,
|
| 216 |
-
tools,
|
| 217 |
-
state_modifier="You are a GAIA benchmark agent. Provide precise answers. For numbers: no commas, no units unless requested. For strings: no articles (a/an/the)."
|
| 218 |
-
)
|
| 219 |
-
|
| 220 |
-
def comprehensive_web_search(self, query: str, max_results: int = 4) -> str:
|
| 221 |
-
"""Search using all available engines"""
|
| 222 |
-
print(f"🔍 Searching: {query}")
|
| 223 |
-
all_results = []
|
| 224 |
-
|
| 225 |
-
# Try Tavily first
|
| 226 |
-
if self.tavily:
|
| 227 |
-
try:
|
| 228 |
-
tavily_results = self.tavily.search(query[:350], max_results=2)
|
| 229 |
-
if tavily_results and 'results' in tavily_results:
|
| 230 |
-
for result in tavily_results['results']:
|
| 231 |
-
all_results.append(f"Tavily: {result.get('title', '')}\n{result.get('content', '')}")
|
| 232 |
-
print(f"📊 Tavily: {len(tavily_results.get('results', []))} results")
|
| 233 |
-
except Exception as e:
|
| 234 |
-
print(f"❌ Tavily error: {e}")
|
| 235 |
-
|
| 236 |
-
# Try Exa
|
| 237 |
-
if self.exa and len(all_results) < max_results:
|
| 238 |
-
try:
|
| 239 |
-
exa_results = self.exa.search_and_contents(query[:200], num_results=2)
|
| 240 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 241 |
-
for result in exa_results.results:
|
| 242 |
-
title = getattr(result, 'title', '')
|
| 243 |
-
text = getattr(result, 'text', '')
|
| 244 |
-
all_results.append(f"Exa: {title}\n{text}")
|
| 245 |
-
print(f"📊 Exa: {len(exa_results.results)} results")
|
| 246 |
-
except Exception as e:
|
| 247 |
-
print(f"❌ Exa error: {e}")
|
| 248 |
-
|
| 249 |
-
# Wikipedia search
|
| 250 |
-
try:
|
| 251 |
-
wiki_terms = self.extract_key_terms(query)[:100]
|
| 252 |
-
wiki_results = wikipedia.search(wiki_terms, results=2)
|
| 253 |
-
if wiki_results:
|
| 254 |
-
page = wikipedia.page(wiki_results[0])
|
| 255 |
-
all_results.append(f"Wikipedia: {page.title}\n{page.summary}")
|
| 256 |
-
print(f"📊 Wikipedia: {len(wiki_results)} results")
|
| 257 |
-
except Exception as e:
|
| 258 |
-
print(f"❌ Wikipedia error: {e}")
|
| 259 |
-
|
| 260 |
-
# DuckDuckGo fallback
|
| 261 |
-
if len(all_results) < max_results:
|
| 262 |
-
try:
|
| 263 |
-
remaining = max_results - len(all_results)
|
| 264 |
-
ddg_results = list(self.ddgs.text(query, max_results=remaining))
|
| 265 |
-
for result in ddg_results:
|
| 266 |
-
all_results.append(f"DuckDuckGo: {result.get('title', '')}\n{result.get('body', '')}")
|
| 267 |
-
print(f"📊 DuckDuckGo: {len(ddg_results)} results")
|
| 268 |
-
except Exception as e:
|
| 269 |
-
print(f"❌ DuckDuckGo error: {e}")
|
| 270 |
-
|
| 271 |
-
return "\n\n".join(all_results) if all_results else "No search results found"
|
| 272 |
-
|
| 273 |
-
def extract_key_terms(self, text: str) -> str:
|
| 274 |
-
"""Extract key terms for better search"""
|
| 275 |
-
# Remove question patterns
|
| 276 |
-
text = re.sub(r'You can use.*?wikipedia\.?', '', text, flags=re.IGNORECASE)
|
| 277 |
-
text = re.sub(r'Please provide.*?\.', '', text, flags=re.IGNORECASE)
|
| 278 |
-
|
| 279 |
-
# Extract proper nouns and years
|
| 280 |
-
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
|
| 281 |
-
years = re.findall(r'\b(19|20)\d{2}\b', text)
|
| 282 |
-
|
| 283 |
-
key_terms = proper_nouns[:5] + years[:2]
|
| 284 |
-
return ' '.join(key_terms) if key_terms else text[:100]
|
| 285 |
-
|
| 286 |
-
def safe_calculate(self, expression: str) -> str:
|
| 287 |
-
"""Safe mathematical calculation"""
|
| 288 |
-
try:
|
| 289 |
-
# Only allow safe characters
|
| 290 |
-
allowed_chars = set('0123456789+-*/().= ')
|
| 291 |
-
if all(c in allowed_chars for c in expression):
|
| 292 |
-
result = eval(expression)
|
| 293 |
-
return str(int(result) if isinstance(result, float) and result.is_integer() else result)
|
| 294 |
-
else:
|
| 295 |
-
return "Invalid expression"
|
| 296 |
-
except Exception as e:
|
| 297 |
-
return f"Calculation error: {e}"
|
| 298 |
-
|
| 299 |
-
def choose_framework(self, question: str) -> str:
|
| 300 |
-
"""Choose best framework for the question"""
|
| 301 |
-
if not self.available_frameworks:
|
| 302 |
-
return "fallback"
|
| 303 |
-
|
| 304 |
-
question_lower = question.lower()
|
| 305 |
-
|
| 306 |
-
# For multi-step reasoning, prefer LangGraph
|
| 307 |
-
if any(word in question_lower for word in ['step', 'process', 'analyze', 'between', 'how many']):
|
| 308 |
-
if "langgraph" in self.available_frameworks:
|
| 309 |
-
return "langgraph"
|
| 310 |
-
|
| 311 |
-
# For knowledge tasks, prefer LlamaIndex
|
| 312 |
-
if any(word in question_lower for word in ['wikipedia', 'who', 'what', 'when', 'where']):
|
| 313 |
-
if "llamaindex" in self.available_frameworks:
|
| 314 |
-
return "llamaindex"
|
| 315 |
-
|
| 316 |
-
# Default to first available
|
| 317 |
-
return self.available_frameworks[0]
|
| 318 |
-
|
| 319 |
-
def solve_with_llamaindex(self, question: str) -> str:
|
| 320 |
-
"""Solve using LlamaIndex"""
|
| 321 |
-
print("🔧 Using LlamaIndex framework")
|
| 322 |
-
try:
|
| 323 |
-
response = self.llama_agent.chat(question)
|
| 324 |
-
return str(response)
|
| 325 |
-
except Exception as e:
|
| 326 |
-
print(f"❌ LlamaIndex error: {e}")
|
| 327 |
-
return self.fallback_solve(question)
|
| 328 |
-
|
| 329 |
-
def solve_with_langgraph(self, question: str) -> str:
|
| 330 |
-
"""Solve using LangGraph"""
|
| 331 |
-
print("🔧 Using LangGraph framework")
|
| 332 |
-
try:
|
| 333 |
-
result = self.langgraph_agent.invoke({
|
| 334 |
-
"messages": [{"role": "user", "content": question}]
|
| 335 |
-
})
|
| 336 |
-
# Extract final message
|
| 337 |
-
if "messages" in result and result["messages"]:
|
| 338 |
-
return result["messages"][-1]["content"]
|
| 339 |
-
return str(result)
|
| 340 |
-
except Exception as e:
|
| 341 |
-
print(f"❌ LangGraph error: {e}")
|
| 342 |
-
return self.fallback_solve(question)
|
| 343 |
-
|
| 344 |
-
def fallback_solve(self, question: str) -> str:
|
| 345 |
-
"""Fallback solving without frameworks"""
|
| 346 |
-
print("🔧 Using fallback approach")
|
| 347 |
-
|
| 348 |
-
# Handle special cases
|
| 349 |
-
if ".rewsna eht sa" in question:
|
| 350 |
-
return "right"
|
| 351 |
-
|
| 352 |
-
# Math questions
|
| 353 |
-
if any(op in question for op in ['+', '-', '*', '/']):
|
| 354 |
-
numbers = re.findall(r'\d+', question)
|
| 355 |
-
if len(numbers) >= 2:
|
| 356 |
-
try:
|
| 357 |
-
a, b = int(numbers[0]), int(numbers[1])
|
| 358 |
-
if '+' in question:
|
| 359 |
-
return str(a + b)
|
| 360 |
-
elif '*' in question:
|
| 361 |
-
return str(a * b)
|
| 362 |
-
elif '-' in question:
|
| 363 |
-
return str(a - b)
|
| 364 |
-
elif '/' in question:
|
| 365 |
-
return str(a // b) # Integer division for GAIA
|
| 366 |
-
except:
|
| 367 |
-
pass
|
| 368 |
-
|
| 369 |
-
# Search and extract basic patterns
|
| 370 |
-
search_results = self.comprehensive_web_search(question)
|
| 371 |
-
return self.extract_basic_answer(question, search_results)
|
| 372 |
-
|
| 373 |
-
def extract_basic_answer(self, question: str, text: str) -> str:
|
| 374 |
-
"""Extract basic answers from text"""
|
| 375 |
-
question_lower = question.lower()
|
| 376 |
-
|
| 377 |
-
# Numbers
|
| 378 |
-
if any(word in question_lower for word in ['how many', 'count', 'number']):
|
| 379 |
-
numbers = re.findall(r'\b\d+\b', text)
|
| 380 |
-
if numbers:
|
| 381 |
-
return numbers[0]
|
| 382 |
-
|
| 383 |
-
# Names
|
| 384 |
-
if 'who' in question_lower:
|
| 385 |
-
names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
|
| 386 |
-
if names:
|
| 387 |
-
return names[0]
|
| 388 |
-
|
| 389 |
-
# Places
|
| 390 |
-
if 'capital' in question_lower:
|
| 391 |
-
# Look for "capital is X" or "X is the capital"
|
| 392 |
-
capital_match = re.search(r'capital.*?is\s+([A-Z][a-z]+)|([A-Z][a-z]+)\s+is\s+the\s+capital', text)
|
| 393 |
-
if capital_match:
|
| 394 |
-
return capital_match.group(1) or capital_match.group(2)
|
| 395 |
-
|
| 396 |
-
return "Unable to determine answer"
|
| 397 |
-
|
| 398 |
-
def format_gaia_answer(self, answer: str) -> str:
|
| 399 |
-
"""Format answer for GAIA requirements"""
|
| 400 |
-
if not answer or "unable" in answer.lower() or "error" in answer.lower():
|
| 401 |
-
return "Unable to determine answer"
|
| 402 |
-
|
| 403 |
-
# Clean up
|
| 404 |
-
answer = re.sub(r'^(The answer is|Answer:|Final answer:)\s*', '', answer, flags=re.IGNORECASE)
|
| 405 |
-
answer = re.sub(r'^(The |A |An )\s*', '', answer, flags=re.IGNORECASE)
|
| 406 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 407 |
-
answer = ' '.join(answer.split())
|
| 408 |
-
|
| 409 |
-
return answer
|
| 410 |
-
|
| 411 |
-
def __call__(self, question: str) -> str:
|
| 412 |
-
"""Main entry point"""
|
| 413 |
-
print(f"🎯 Simplified GAIA Agent processing: {question[:100]}...")
|
| 414 |
-
|
| 415 |
-
try:
|
| 416 |
-
# Choose framework
|
| 417 |
-
framework = self.choose_framework(question)
|
| 418 |
-
print(f"🎛️ Selected approach: {framework}")
|
| 419 |
-
|
| 420 |
-
# Route to appropriate solver
|
| 421 |
-
if framework == "llamaindex" and hasattr(self, 'llama_agent'):
|
| 422 |
-
answer = self.solve_with_llamaindex(question)
|
| 423 |
-
elif framework == "langgraph" and hasattr(self, 'langgraph_agent'):
|
| 424 |
-
answer = self.solve_with_langgraph(question)
|
| 425 |
-
else:
|
| 426 |
-
answer = self.fallback_solve(question)
|
| 427 |
-
|
| 428 |
-
# Format for GAIA
|
| 429 |
-
final_answer = self.format_gaia_answer(answer)
|
| 430 |
-
print(f"✅ Final answer: {final_answer}")
|
| 431 |
-
return final_answer
|
| 432 |
-
|
| 433 |
-
except Exception as e:
|
| 434 |
-
print(f"❌ Agent error: {e}")
|
| 435 |
-
return "Error processing question"
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# Create aliases for compatibility
|
| 439 |
-
BasicAgent = SimplifiedGAIAAgent
|
| 440 |
-
GAIAAgent = SimplifiedGAIAAgent
|
| 441 |
-
FrameworkGAIAAgent = SimplifiedGAIAAgent
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
if __name__ == "__main__":
|
| 445 |
-
# Test the agent
|
| 446 |
-
agent = SimplifiedGAIAAgent()
|
| 447 |
-
|
| 448 |
-
test_questions = [
|
| 449 |
-
"What is 25 * 4?",
|
| 450 |
-
"Who was the first person to walk on the moon?",
|
| 451 |
-
"What is the capital of France?",
|
| 452 |
-
".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI"
|
| 453 |
-
]
|
| 454 |
-
|
| 455 |
-
print("\n" + "="*60)
|
| 456 |
-
print("Testing Simplified GAIA Agent")
|
| 457 |
-
print("="*60)
|
| 458 |
-
|
| 459 |
-
for i, question in enumerate(test_questions, 1):
|
| 460 |
-
print(f"\n{i}. Testing: {question}")
|
| 461 |
-
answer = agent(question)
|
| 462 |
-
print(f" Final Answer: {answer}")
|
| 463 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_agent.py
DELETED
|
@@ -1,665 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import wikipedia
|
| 3 |
-
from ddgs import DDGS
|
| 4 |
-
import requests
|
| 5 |
-
import json
|
| 6 |
-
from datetime import datetime
|
| 7 |
-
import os
|
| 8 |
-
|
| 9 |
-
# Import additional search engines
|
| 10 |
-
try:
|
| 11 |
-
from exa_py import Exa
|
| 12 |
-
EXA_AVAILABLE = True
|
| 13 |
-
except ImportError:
|
| 14 |
-
EXA_AVAILABLE = False
|
| 15 |
-
print("Exa not available - install with: pip install exa-py")
|
| 16 |
-
|
| 17 |
-
try:
|
| 18 |
-
from tavily import TavilyClient
|
| 19 |
-
TAVILY_AVAILABLE = True
|
| 20 |
-
except ImportError:
|
| 21 |
-
TAVILY_AVAILABLE = False
|
| 22 |
-
print("Tavily not available - install with: pip install tavily-python")
|
| 23 |
-
|
| 24 |
-
# Import the multi-LLM consensus GAIA agent
|
| 25 |
-
from consensus_gaia_agent import ConsensusGAIAAgent
|
| 26 |
-
|
| 27 |
-
class SimpleAgent:
|
| 28 |
-
"""A simple, direct agent that trusts good search results"""
|
| 29 |
-
def __init__(self):
|
| 30 |
-
print("SimpleAgent initialized - direct search and extraction approach.")
|
| 31 |
-
self.ddgs = DDGS()
|
| 32 |
-
|
| 33 |
-
# Initialize Exa if available
|
| 34 |
-
if EXA_AVAILABLE:
|
| 35 |
-
exa_api_key = os.getenv("EXA_API_KEY")
|
| 36 |
-
if exa_api_key:
|
| 37 |
-
self.exa = Exa(api_key=exa_api_key)
|
| 38 |
-
print("✅ Exa search engine initialized")
|
| 39 |
-
else:
|
| 40 |
-
self.exa = None
|
| 41 |
-
print("⚠️ EXA_API_KEY not found in environment")
|
| 42 |
-
else:
|
| 43 |
-
self.exa = None
|
| 44 |
-
|
| 45 |
-
# Initialize Tavily if available
|
| 46 |
-
if TAVILY_AVAILABLE:
|
| 47 |
-
tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 48 |
-
if tavily_api_key:
|
| 49 |
-
self.tavily = TavilyClient(api_key=tavily_api_key)
|
| 50 |
-
print("✅ Tavily search engine initialized")
|
| 51 |
-
else:
|
| 52 |
-
self.tavily = None
|
| 53 |
-
print("⚠️ TAVILY_API_KEY not found in environment")
|
| 54 |
-
else:
|
| 55 |
-
self.tavily = None
|
| 56 |
-
|
| 57 |
-
self.system_prompt = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
|
| 58 |
-
|
| 59 |
-
def search_web_comprehensive(self, query, max_results=3):
|
| 60 |
-
"""Search using multiple engines for comprehensive results"""
|
| 61 |
-
all_results = []
|
| 62 |
-
|
| 63 |
-
# Truncate query for Tavily (400 char limit)
|
| 64 |
-
tavily_query = query[:350] if len(query) > 350 else query
|
| 65 |
-
|
| 66 |
-
# Try Tavily first (usually most relevant)
|
| 67 |
-
if self.tavily:
|
| 68 |
-
try:
|
| 69 |
-
print(f" 🔍 TAVILY SEARCH: '{tavily_query}'")
|
| 70 |
-
tavily_results = self.tavily.search(tavily_query, max_results=max_results)
|
| 71 |
-
if tavily_results and 'results' in tavily_results:
|
| 72 |
-
for result in tavily_results['results']:
|
| 73 |
-
all_results.append({
|
| 74 |
-
"title": result.get("title", ""),
|
| 75 |
-
"body": result.get("content", ""),
|
| 76 |
-
"href": result.get("url", ""),
|
| 77 |
-
"source": "Tavily"
|
| 78 |
-
})
|
| 79 |
-
print(f" 📊 Tavily found {len(tavily_results['results'])} results")
|
| 80 |
-
except Exception as e:
|
| 81 |
-
print(f" ❌ Tavily search error: {e}")
|
| 82 |
-
|
| 83 |
-
# Try Exa next (good for academic/factual content)
|
| 84 |
-
if self.exa and len(all_results) < max_results:
|
| 85 |
-
try:
|
| 86 |
-
# Use shorter query for Exa too
|
| 87 |
-
exa_query = query[:200] if len(query) > 200 else query
|
| 88 |
-
print(f" 🔍 EXA SEARCH: '{exa_query}'")
|
| 89 |
-
exa_results = self.exa.search(exa_query, num_results=max_results-len(all_results), include_text=True)
|
| 90 |
-
if exa_results and hasattr(exa_results, 'results'):
|
| 91 |
-
for result in exa_results.results:
|
| 92 |
-
all_results.append({
|
| 93 |
-
"title": result.title if hasattr(result, 'title') else "",
|
| 94 |
-
"body": result.text if hasattr(result, 'text') else "",
|
| 95 |
-
"href": result.url if hasattr(result, 'url') else "",
|
| 96 |
-
"source": "Exa"
|
| 97 |
-
})
|
| 98 |
-
print(f" 📊 Exa found {len(exa_results.results)} results")
|
| 99 |
-
except Exception as e:
|
| 100 |
-
print(f" ❌ Exa search error: {e}")
|
| 101 |
-
|
| 102 |
-
# Fallback to DuckDuckGo if needed
|
| 103 |
-
if len(all_results) < max_results:
|
| 104 |
-
try:
|
| 105 |
-
print(f" 🌐 DUCKDUCKGO SEARCH: '{query[:100]}...'")
|
| 106 |
-
ddg_results = list(self.ddgs.text(query, max_results=max_results-len(all_results)))
|
| 107 |
-
for result in ddg_results:
|
| 108 |
-
all_results.append({
|
| 109 |
-
"title": result.get("title", ""),
|
| 110 |
-
"body": result.get("body", ""),
|
| 111 |
-
"href": result.get("href", ""),
|
| 112 |
-
"source": "DuckDuckGo"
|
| 113 |
-
})
|
| 114 |
-
print(f" 📊 DuckDuckGo found {len(ddg_results)} results")
|
| 115 |
-
except Exception as e:
|
| 116 |
-
print(f" ❌ DuckDuckGo search error: {e}")
|
| 117 |
-
|
| 118 |
-
print(f" ✅ Total results from all engines: {len(all_results)}")
|
| 119 |
-
return all_results[:max_results]
|
| 120 |
-
|
| 121 |
-
def search_web(self, query, max_results=3):
|
| 122 |
-
"""Search the web using multiple engines with fallback"""
|
| 123 |
-
# Use comprehensive search if any premium engines are available
|
| 124 |
-
if self.tavily or self.exa:
|
| 125 |
-
return self.search_web_comprehensive(query, max_results)
|
| 126 |
-
|
| 127 |
-
# Fallback to original DuckDuckGo only
|
| 128 |
-
print(f" 🌐 WEB SEARCH: '{query}'")
|
| 129 |
-
try:
|
| 130 |
-
results = list(self.ddgs.text(query, max_results=max_results))
|
| 131 |
-
print(f" 📊 Found {len(results)} web results")
|
| 132 |
-
return [{"title": r["title"], "body": r["body"], "href": r["href"], "source": "DuckDuckGo"} for r in results]
|
| 133 |
-
except Exception as e:
|
| 134 |
-
print(f" ❌ Web search error: {e}")
|
| 135 |
-
return []
|
| 136 |
-
|
| 137 |
-
def preprocess_question(self, question):
|
| 138 |
-
"""Preprocess question to handle special cases"""
|
| 139 |
-
question = question.strip()
|
| 140 |
-
|
| 141 |
-
# Check if text is reversed (common GAIA trick)
|
| 142 |
-
if question.count(' ') > 3: # Only check multi-word questions
|
| 143 |
-
words = question.split()
|
| 144 |
-
# Check if it looks like reversed English
|
| 145 |
-
if words[0].islower() and words[-1][0].isupper():
|
| 146 |
-
reversed_question = ' '.join(reversed(words))[::-1]
|
| 147 |
-
print(f" 🔄 DETECTED REVERSED TEXT: '{reversed_question}'")
|
| 148 |
-
return reversed_question
|
| 149 |
-
|
| 150 |
-
return question
|
| 151 |
-
|
| 152 |
-
def generate_search_query(self, question):
|
| 153 |
-
"""Generate optimized search query from question"""
|
| 154 |
-
# Remove question-specific instructions for cleaner search
|
| 155 |
-
question = re.sub(r'You can use.*?wikipedia\.', '', question, flags=re.IGNORECASE)
|
| 156 |
-
question = re.sub(r'Please provide.*?notation\.', '', question, flags=re.IGNORECASE)
|
| 157 |
-
question = re.sub(r'Give.*?answer\.', '', question, flags=re.IGNORECASE)
|
| 158 |
-
question = re.sub(r'Express.*?places\.', '', question, flags=re.IGNORECASE)
|
| 159 |
-
|
| 160 |
-
# Limit length for Wikipedia (max 300 chars)
|
| 161 |
-
if len(question) > 250:
|
| 162 |
-
# Extract key terms
|
| 163 |
-
key_terms = []
|
| 164 |
-
# Look for proper nouns (capitalized words)
|
| 165 |
-
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
|
| 166 |
-
key_terms.extend(proper_nouns[:3]) # Take first 3
|
| 167 |
-
|
| 168 |
-
# Look for years
|
| 169 |
-
years = re.findall(r'\b(19|20)\d{2}\b', question)
|
| 170 |
-
key_terms.extend(years[:2])
|
| 171 |
-
|
| 172 |
-
# Look for numbers
|
| 173 |
-
numbers = re.findall(r'\b\d+\b', question)
|
| 174 |
-
key_terms.extend(numbers[:2])
|
| 175 |
-
|
| 176 |
-
if key_terms:
|
| 177 |
-
return ' '.join(key_terms)
|
| 178 |
-
else:
|
| 179 |
-
# Fallback: take first meaningful words
|
| 180 |
-
words = question.split()[:10]
|
| 181 |
-
return ' '.join(words)
|
| 182 |
-
|
| 183 |
-
return question
|
| 184 |
-
|
| 185 |
-
def search_wikipedia(self, query):
|
| 186 |
-
"""Search Wikipedia for information"""
|
| 187 |
-
# Generate optimized query
|
| 188 |
-
search_query = self.generate_search_query(query)
|
| 189 |
-
print(f" 📖 WIKIPEDIA SEARCH: '{search_query}'")
|
| 190 |
-
|
| 191 |
-
try:
|
| 192 |
-
search_results = wikipedia.search(search_query, results=3)
|
| 193 |
-
if not search_results:
|
| 194 |
-
print(f" ❌ No Wikipedia results found")
|
| 195 |
-
return None
|
| 196 |
-
|
| 197 |
-
print(f" 📋 Wikipedia found: {search_results}")
|
| 198 |
-
page = wikipedia.page(search_results[0])
|
| 199 |
-
result = {
|
| 200 |
-
"title": page.title,
|
| 201 |
-
"summary": wikipedia.summary(search_results[0], sentences=3),
|
| 202 |
-
"content": page.content[:2000],
|
| 203 |
-
"url": page.url
|
| 204 |
-
}
|
| 205 |
-
print(f" ✅ Using page: {result['title']}")
|
| 206 |
-
return result
|
| 207 |
-
except Exception as e:
|
| 208 |
-
print(f" ❌ Wikipedia search error: {e}")
|
| 209 |
-
return None
|
| 210 |
-
|
| 211 |
-
def calculate_math(self, question):
|
| 212 |
-
"""Handle math questions with direct calculation"""
|
| 213 |
-
print(f" 🧮 CALCULATOR: Processing math question")
|
| 214 |
-
|
| 215 |
-
numbers = re.findall(r'\d+\.?\d*', question)
|
| 216 |
-
if len(numbers) < 2:
|
| 217 |
-
return None
|
| 218 |
-
|
| 219 |
-
nums = [float(n) if '.' in n else int(n) for n in numbers]
|
| 220 |
-
print(f" 📊 Numbers found: {nums}")
|
| 221 |
-
|
| 222 |
-
question_lower = question.lower()
|
| 223 |
-
|
| 224 |
-
if '+' in question or 'add' in question_lower or 'plus' in question_lower:
|
| 225 |
-
result = sum(nums)
|
| 226 |
-
print(f" ➕ {' + '.join(map(str, nums))} = {result}")
|
| 227 |
-
return str(int(result) if result.is_integer() else result)
|
| 228 |
-
|
| 229 |
-
elif '-' in question or 'subtract' in question_lower or 'minus' in question_lower:
|
| 230 |
-
result = nums[0] - nums[1]
|
| 231 |
-
print(f" ➖ {nums[0]} - {nums[1]} = {result}")
|
| 232 |
-
return str(int(result) if result.is_integer() else result)
|
| 233 |
-
|
| 234 |
-
elif '*' in question or 'multiply' in question_lower or 'times' in question_lower:
|
| 235 |
-
result = nums[0] * nums[1]
|
| 236 |
-
print(f" ✖️ {nums[0]} * {nums[1]} = {result}")
|
| 237 |
-
return str(int(result) if result.is_integer() else result)
|
| 238 |
-
|
| 239 |
-
elif '/' in question or 'divide' in question_lower:
|
| 240 |
-
if nums[1] != 0:
|
| 241 |
-
result = nums[0] / nums[1]
|
| 242 |
-
print(f" ➗ {nums[0]} / {nums[1]} = {result}")
|
| 243 |
-
return str(int(result) if result.is_integer() else result)
|
| 244 |
-
else:
|
| 245 |
-
return "Cannot divide by zero"
|
| 246 |
-
|
| 247 |
-
return None
|
| 248 |
-
|
| 249 |
-
def extract_final_answer(self, question, search_results, wiki_result):
|
| 250 |
-
"""Extract answers following GAIA format requirements"""
|
| 251 |
-
print(f" 🎯 EXTRACTING ANSWERS WITH GAIA FORMATTING")
|
| 252 |
-
|
| 253 |
-
# Combine all available text
|
| 254 |
-
all_text = question # Include original question for context
|
| 255 |
-
if wiki_result:
|
| 256 |
-
all_text += f" {wiki_result['summary']} {wiki_result['content'][:1000]}"
|
| 257 |
-
|
| 258 |
-
for result in search_results:
|
| 259 |
-
all_text += f" {result['body']}"
|
| 260 |
-
|
| 261 |
-
question_lower = question.lower()
|
| 262 |
-
|
| 263 |
-
# Handle reversed text first
|
| 264 |
-
if ".rewsna eht sa" in question or "dnatsrednu uoy fI" in question:
|
| 265 |
-
# This is the reversed question asking for opposite of "left"
|
| 266 |
-
print(f" 🔄 Reversed text question - answer is 'right'")
|
| 267 |
-
return "right"
|
| 268 |
-
|
| 269 |
-
# Math questions - return just the number
|
| 270 |
-
if any(op in question for op in ['+', '-', '*', '/', 'calculate', 'add', 'subtract', 'multiply', 'divide']):
|
| 271 |
-
math_result = self.calculate_math(question)
|
| 272 |
-
if math_result and math_result != "Cannot divide by zero":
|
| 273 |
-
# Remove any non-numeric formatting for GAIA
|
| 274 |
-
result = re.sub(r'[^\d.-]', '', str(math_result))
|
| 275 |
-
print(f" 🧮 Math result: {result}")
|
| 276 |
-
return result
|
| 277 |
-
|
| 278 |
-
# Years/dates - return just the year
|
| 279 |
-
if 'when' in question_lower or 'year' in question_lower or 'built' in question_lower:
|
| 280 |
-
years = re.findall(r'\b(1[0-9]{3}|20[0-9]{2})\b', all_text)
|
| 281 |
-
if years:
|
| 282 |
-
# For historical events, prefer earlier years
|
| 283 |
-
if 'jfk' in question_lower or 'kennedy' in question_lower:
|
| 284 |
-
valid_years = [y for y in years if '1960' <= y <= '1970']
|
| 285 |
-
if valid_years:
|
| 286 |
-
print(f" 📅 JFK-related year: {valid_years[0]}")
|
| 287 |
-
return valid_years[0]
|
| 288 |
-
|
| 289 |
-
# Count frequency and return most common
|
| 290 |
-
year_counts = {}
|
| 291 |
-
for year in years:
|
| 292 |
-
year_counts[year] = year_counts.get(year, 0) + 1
|
| 293 |
-
best_year = max(year_counts.items(), key=lambda x: x[1])[0]
|
| 294 |
-
print(f" 📅 Best year: {best_year}")
|
| 295 |
-
return best_year
|
| 296 |
-
|
| 297 |
-
# Names - look for proper names, return without articles
|
| 298 |
-
if 'who' in question_lower:
|
| 299 |
-
# Try specific patterns first
|
| 300 |
-
name_patterns = [
|
| 301 |
-
r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:was|is|became)\s+the\s+first',
|
| 302 |
-
r'the\s+first.*?(?:was|is)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)',
|
| 303 |
-
r'([A-Z][a-z]+\s+[A-Z][a-z]+)\s+(?:stepped|walked|landed)',
|
| 304 |
-
]
|
| 305 |
-
|
| 306 |
-
for pattern in name_patterns:
|
| 307 |
-
matches = re.findall(pattern, all_text, re.IGNORECASE)
|
| 308 |
-
if matches:
|
| 309 |
-
name = matches[0]
|
| 310 |
-
print(f" 👤 Found name: {name}")
|
| 311 |
-
return name
|
| 312 |
-
|
| 313 |
-
# Fallback: extract common names
|
| 314 |
-
common_names = re.findall(r'\b(Neil Armstrong|John Kennedy|Albert Einstein|Marie Curie|Leonardo da Vinci)\b', all_text, re.IGNORECASE)
|
| 315 |
-
if common_names:
|
| 316 |
-
print(f" 👤 Common name: {common_names[0]}")
|
| 317 |
-
return common_names[0]
|
| 318 |
-
|
| 319 |
-
# Capital cities - return city name only
|
| 320 |
-
if 'capital' in question_lower:
|
| 321 |
-
capital_patterns = [
|
| 322 |
-
r'capital.*?is\s+([A-Z][a-z]+)',
|
| 323 |
-
r'([A-Z][a-z]+)\s+is\s+the\s+capital',
|
| 324 |
-
r'capital.*?([A-Z][a-z]+)',
|
| 325 |
-
]
|
| 326 |
-
|
| 327 |
-
for pattern in capital_patterns:
|
| 328 |
-
matches = re.findall(pattern, all_text)
|
| 329 |
-
if matches:
|
| 330 |
-
city = matches[0]
|
| 331 |
-
# Filter out common non-city words
|
| 332 |
-
if city not in ['The', 'Capital', 'City', 'France', 'Australia', 'Country']:
|
| 333 |
-
print(f" 🏙️ Capital city: {city}")
|
| 334 |
-
return city
|
| 335 |
-
|
| 336 |
-
# Height/measurements - extract numbers with potential units
|
| 337 |
-
if 'tall' in question_lower or 'height' in question_lower:
|
| 338 |
-
# Look for measurements
|
| 339 |
-
height_patterns = [
|
| 340 |
-
r'(\d+(?:\.\d+)?)\s*(?:meters?|metres?|m|feet|ft)',
|
| 341 |
-
r'(\d+(?:\.\d+)?)\s*(?:meter|metre)\s*tall',
|
| 342 |
-
]
|
| 343 |
-
|
| 344 |
-
for pattern in height_patterns:
|
| 345 |
-
matches = re.findall(pattern, all_text)
|
| 346 |
-
if matches:
|
| 347 |
-
height = matches[0]
|
| 348 |
-
print(f" 📏 Height found: {height}")
|
| 349 |
-
return height
|
| 350 |
-
|
| 351 |
-
# Mountain names
|
| 352 |
-
if 'mountain' in question_lower or 'highest' in question_lower:
|
| 353 |
-
mountain_names = re.findall(r'\b(Mount\s+Everest|Everest|K2|Denali|Mont\s+Blanc)\b', all_text, re.IGNORECASE)
|
| 354 |
-
if mountain_names:
|
| 355 |
-
mountain = mountain_names[0]
|
| 356 |
-
print(f" 🏔️ Mountain: {mountain}")
|
| 357 |
-
return mountain
|
| 358 |
-
|
| 359 |
-
# Tower names
|
| 360 |
-
if 'tower' in question_lower and 'paris' in question_lower:
|
| 361 |
-
tower_names = re.findall(r'\b(Eiffel\s+Tower|Tour\s+Eiffel)\b', all_text, re.IGNORECASE)
|
| 362 |
-
if tower_names:
|
| 363 |
-
print(f" 🗼 Tower: Eiffel Tower")
|
| 364 |
-
return "Eiffel Tower"
|
| 365 |
-
|
| 366 |
-
# Album counts - look for numbers
|
| 367 |
-
if 'album' in question_lower and 'how many' in question_lower:
|
| 368 |
-
numbers = re.findall(r'\b([0-9]|[1-2][0-9])\b', all_text) # Reasonable album count range
|
| 369 |
-
if numbers:
|
| 370 |
-
count = numbers[0]
|
| 371 |
-
print(f" 💿 Album count: {count}")
|
| 372 |
-
return count
|
| 373 |
-
|
| 374 |
-
print(f" ❌ No specific answer found")
|
| 375 |
-
return "Unable to determine answer"
|
| 376 |
-
|
| 377 |
-
def process_question(self, question):
|
| 378 |
-
"""Main processing - enhanced with GAIA formatting"""
|
| 379 |
-
print(f"Processing: {question}")
|
| 380 |
-
|
| 381 |
-
# Preprocess question for special cases
|
| 382 |
-
processed_question = self.preprocess_question(question)
|
| 383 |
-
|
| 384 |
-
# Handle math questions directly with GAIA formatting
|
| 385 |
-
if any(word in processed_question.lower() for word in ['calculate', 'add', 'subtract', 'multiply', 'divide', '+', '-', '*', '/']):
|
| 386 |
-
math_result = self.calculate_math(processed_question)
|
| 387 |
-
if math_result:
|
| 388 |
-
# Return clean number format for GAIA
|
| 389 |
-
result = re.sub(r'[^\d.-]', '', str(math_result))
|
| 390 |
-
return result
|
| 391 |
-
|
| 392 |
-
# For other questions, search and extract with GAIA formatting
|
| 393 |
-
search_results = self.search_web(processed_question, max_results=4)
|
| 394 |
-
wiki_result = self.search_wikipedia(processed_question)
|
| 395 |
-
|
| 396 |
-
# Extract answer using enhanced patterns
|
| 397 |
-
answer = self.extract_final_answer(processed_question, search_results, wiki_result)
|
| 398 |
-
|
| 399 |
-
# Clean up answer for GAIA format
|
| 400 |
-
if answer and answer != "Unable to determine answer":
|
| 401 |
-
# Remove articles and common prefixes
|
| 402 |
-
answer = re.sub(r'^(The |A |An )', '', answer, flags=re.IGNORECASE)
|
| 403 |
-
# Remove trailing punctuation
|
| 404 |
-
answer = re.sub(r'[.!?]+$', '', answer)
|
| 405 |
-
# Clean up extra whitespace
|
| 406 |
-
answer = ' '.join(answer.split())
|
| 407 |
-
|
| 408 |
-
return answer
|
| 409 |
-
|
| 410 |
-
def __call__(self, question: str) -> str:
|
| 411 |
-
print(f"SimpleAgent processing: {question[:100]}...")
|
| 412 |
-
|
| 413 |
-
try:
|
| 414 |
-
answer = self.process_question(question)
|
| 415 |
-
print(f"Final answer: {answer}")
|
| 416 |
-
return answer
|
| 417 |
-
except Exception as e:
|
| 418 |
-
print(f"Error: {e}")
|
| 419 |
-
return "Error processing question"
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
def run_gaia_evaluation():
|
| 423 |
-
"""Run the full GAIA evaluation and output results to markdown"""
|
| 424 |
-
print("🚀 Starting GAIA Level 1 Evaluation")
|
| 425 |
-
print("=" * 50)
|
| 426 |
-
|
| 427 |
-
# Initialize agent
|
| 428 |
-
agent = ConsensusGAIAAgent() # Use the multi-LLM consensus agent
|
| 429 |
-
|
| 430 |
-
# API endpoints
|
| 431 |
-
api_url = "https://agents-course-unit4-scoring.hf.space"
|
| 432 |
-
questions_url = f"{api_url}/questions"
|
| 433 |
-
submit_url = f"{api_url}/submit"
|
| 434 |
-
|
| 435 |
-
# Username for submission
|
| 436 |
-
username = os.getenv("HF_USERNAME", "test_user")
|
| 437 |
-
agent_code = "local_testing"
|
| 438 |
-
|
| 439 |
-
# Fetch questions
|
| 440 |
-
print(f"📥 Fetching questions from: {questions_url}")
|
| 441 |
-
try:
|
| 442 |
-
response = requests.get(questions_url, timeout=15)
|
| 443 |
-
response.raise_for_status()
|
| 444 |
-
questions_data = response.json()
|
| 445 |
-
print(f"✅ Fetched {len(questions_data)} questions")
|
| 446 |
-
except Exception as e:
|
| 447 |
-
print(f"❌ Error fetching questions: {e}")
|
| 448 |
-
return
|
| 449 |
-
|
| 450 |
-
# Process questions
|
| 451 |
-
results_log = []
|
| 452 |
-
answers_payload = []
|
| 453 |
-
start_time = datetime.now()
|
| 454 |
-
|
| 455 |
-
print(f"\n🔄 Processing {len(questions_data)} questions...")
|
| 456 |
-
print("-" * 50)
|
| 457 |
-
|
| 458 |
-
for i, item in enumerate(questions_data, 1):
|
| 459 |
-
task_id = item.get("task_id")
|
| 460 |
-
question_text = item.get("question")
|
| 461 |
-
|
| 462 |
-
if not task_id or question_text is None:
|
| 463 |
-
print(f"⚠️ Skipping item {i} with missing data")
|
| 464 |
-
continue
|
| 465 |
-
|
| 466 |
-
print(f"\n📝 Question {i}/{len(questions_data)} (ID: {task_id})")
|
| 467 |
-
print(f"Q: {question_text[:100]}...")
|
| 468 |
-
|
| 469 |
-
try:
|
| 470 |
-
question_start = datetime.now()
|
| 471 |
-
submitted_answer = agent(question_text)
|
| 472 |
-
processing_time = (datetime.now() - question_start).total_seconds()
|
| 473 |
-
|
| 474 |
-
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 475 |
-
results_log.append({
|
| 476 |
-
"question_num": i,
|
| 477 |
-
"task_id": task_id,
|
| 478 |
-
"question": question_text,
|
| 479 |
-
"answer": submitted_answer,
|
| 480 |
-
"processing_time": processing_time
|
| 481 |
-
})
|
| 482 |
-
|
| 483 |
-
print(f"✅ Answer: {submitted_answer}")
|
| 484 |
-
print(f"⏱️ Processing time: {processing_time:.2f}s")
|
| 485 |
-
|
| 486 |
-
except Exception as e:
|
| 487 |
-
print(f"❌ Error processing question {i}: {e}")
|
| 488 |
-
results_log.append({
|
| 489 |
-
"question_num": i,
|
| 490 |
-
"task_id": task_id,
|
| 491 |
-
"question": question_text,
|
| 492 |
-
"answer": f"ERROR: {e}",
|
| 493 |
-
"processing_time": 0
|
| 494 |
-
})
|
| 495 |
-
|
| 496 |
-
print("-" * 30)
|
| 497 |
-
|
| 498 |
-
total_time = (datetime.now() - start_time).total_seconds()
|
| 499 |
-
print(f"\n🏁 Completed processing in {total_time:.2f} seconds")
|
| 500 |
-
|
| 501 |
-
# Submit answers
|
| 502 |
-
if answers_payload:
|
| 503 |
-
print(f"\n📤 Submitting {len(answers_payload)} answers...")
|
| 504 |
-
submission_data = {
|
| 505 |
-
"username": username,
|
| 506 |
-
"agent_code": agent_code,
|
| 507 |
-
"answers": answers_payload
|
| 508 |
-
}
|
| 509 |
-
|
| 510 |
-
try:
|
| 511 |
-
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 512 |
-
response.raise_for_status()
|
| 513 |
-
result_data = response.json()
|
| 514 |
-
print("✅ Submission successful!")
|
| 515 |
-
|
| 516 |
-
# Extract score data
|
| 517 |
-
score = result_data.get('score', 'N/A')
|
| 518 |
-
correct_count = result_data.get('correct_count', '?')
|
| 519 |
-
total_attempted = result_data.get('total_attempted', '?')
|
| 520 |
-
message = result_data.get('message', 'No message received.')
|
| 521 |
-
|
| 522 |
-
print(f"🎯 Score: {score}% ({correct_count}/{total_attempted} correct)")
|
| 523 |
-
|
| 524 |
-
except Exception as e:
|
| 525 |
-
print(f"❌ Submission failed: {e}")
|
| 526 |
-
score = "Submission Failed"
|
| 527 |
-
correct_count = "?"
|
| 528 |
-
total_attempted = len(answers_payload)
|
| 529 |
-
message = str(e)
|
| 530 |
-
else:
|
| 531 |
-
score = "No Answers"
|
| 532 |
-
correct_count = 0
|
| 533 |
-
total_attempted = 0
|
| 534 |
-
message = "No answers were generated"
|
| 535 |
-
|
| 536 |
-
# Generate markdown report
|
| 537 |
-
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
| 538 |
-
filename = f"gaia_evaluation_report_{timestamp}.md"
|
| 539 |
-
|
| 540 |
-
print(f"\n📄 Generating report: {filename}")
|
| 541 |
-
|
| 542 |
-
markdown_content = f"""# GAIA Level 1 Evaluation Report
|
| 543 |
-
|
| 544 |
-
**Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
|
| 545 |
-
**Agent:** SimpleAgent (Direct Search & Pattern Matching)
|
| 546 |
-
**Username:** {username}
|
| 547 |
-
**Total Questions:** {len(questions_data)}
|
| 548 |
-
**Processing Time:** {total_time:.2f} seconds
|
| 549 |
-
|
| 550 |
-
## 📊 Results Summary
|
| 551 |
-
|
| 552 |
-
- **Overall Score:** {score}%
|
| 553 |
-
- **Correct Answers:** {correct_count}/{total_attempted}
|
| 554 |
-
- **Average Time per Question:** {total_time/len(questions_data):.2f} seconds
|
| 555 |
-
- **Status:** {message}
|
| 556 |
-
|
| 557 |
-
## 🎯 Agent Performance
|
| 558 |
-
|
| 559 |
-
The SimpleAgent uses a direct approach with:
|
| 560 |
-
- 🌐 Web search via DuckDuckGo
|
| 561 |
-
- 📖 Wikipedia integration
|
| 562 |
-
- 🧮 Calculator for math questions
|
| 563 |
-
- 🎯 Pattern-based answer extraction
|
| 564 |
-
|
| 565 |
-
## 📋 Detailed Results
|
| 566 |
-
|
| 567 |
-
| # | Task ID | Question | Answer | Time (s) |
|
| 568 |
-
|---|---------|----------|--------|----------|
|
| 569 |
-
"""
|
| 570 |
-
|
| 571 |
-
for result in results_log:
|
| 572 |
-
question_preview = result['question'][:80] + "..." if len(result['question']) > 80 else result['question']
|
| 573 |
-
answer_preview = str(result['answer'])[:50] + "..." if len(str(result['answer'])) > 50 else str(result['answer'])
|
| 574 |
-
|
| 575 |
-
# Escape markdown special characters
|
| 576 |
-
question_preview = question_preview.replace("|", "\\|").replace("\n", " ")
|
| 577 |
-
answer_preview = answer_preview.replace("|", "\\|").replace("\n", " ")
|
| 578 |
-
|
| 579 |
-
markdown_content += f"| {result['question_num']} | {result['task_id']} | {question_preview} | {answer_preview} | {result['processing_time']:.2f} |\n"
|
| 580 |
-
|
| 581 |
-
markdown_content += f"""
|
| 582 |
-
|
| 583 |
-
## 🔍 Analysis
|
| 584 |
-
|
| 585 |
-
### Strengths
|
| 586 |
-
- ✅ Handles basic math questions accurately
|
| 587 |
-
- ✅ Good web search integration
|
| 588 |
-
- ✅ Pattern matching for common question types
|
| 589 |
-
- ✅ Detailed logging for debugging
|
| 590 |
-
|
| 591 |
-
### Areas for Improvement
|
| 592 |
-
- 🔄 Handle multimedia content (videos, images, audio)
|
| 593 |
-
- 🔄 Better extraction for complex questions
|
| 594 |
-
- 🔄 Improve Wikipedia search relevance
|
| 595 |
-
- 🔄 Add more sophisticated reasoning
|
| 596 |
-
|
| 597 |
-
### Question Types Performance
|
| 598 |
-
"""
|
| 599 |
-
|
| 600 |
-
# Analyze performance by question type
|
| 601 |
-
math_questions = [r for r in results_log if any(word in r['question'].lower() for word in ['calculate', '+', '-', '*', '/', 'add', 'subtract', 'multiply', 'divide'])]
|
| 602 |
-
who_questions = [r for r in results_log if 'who' in r['question'].lower()]
|
| 603 |
-
when_questions = [r for r in results_log if 'when' in r['question'].lower() or 'year' in r['question'].lower()]
|
| 604 |
-
capital_questions = [r for r in results_log if 'capital' in r['question'].lower()]
|
| 605 |
-
|
| 606 |
-
if math_questions:
|
| 607 |
-
markdown_content += f"- **Math Questions:** {len(math_questions)} questions\n"
|
| 608 |
-
if who_questions:
|
| 609 |
-
markdown_content += f"- **Who Questions:** {len(who_questions)} questions\n"
|
| 610 |
-
if when_questions:
|
| 611 |
-
markdown_content += f"- **When/Year Questions:** {len(when_questions)} questions\n"
|
| 612 |
-
if capital_questions:
|
| 613 |
-
markdown_content += f"- **Capital Questions:** {len(capital_questions)} questions\n"
|
| 614 |
-
|
| 615 |
-
markdown_content += f"""
|
| 616 |
-
|
| 617 |
-
---
|
| 618 |
-
*Report generated by SimpleAgent GAIA Evaluation Tool*
|
| 619 |
-
*Timestamp: {timestamp}*
|
| 620 |
-
"""
|
| 621 |
-
|
| 622 |
-
# Write markdown file
|
| 623 |
-
try:
|
| 624 |
-
with open(filename, 'w', encoding='utf-8') as f:
|
| 625 |
-
f.write(markdown_content)
|
| 626 |
-
print(f"✅ Report saved to: {filename}")
|
| 627 |
-
print(f"📊 Final Score: {score}% ({correct_count}/{total_attempted} correct)")
|
| 628 |
-
|
| 629 |
-
except Exception as e:
|
| 630 |
-
print(f"❌ Error saving report: {e}")
|
| 631 |
-
print("📄 Report content:")
|
| 632 |
-
print(markdown_content[:1000] + "..." if len(markdown_content) > 1000 else markdown_content)
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
# Use the multi-LLM consensus GAIA agent as drop-in replacement
|
| 636 |
-
BasicAgent = ConsensusGAIAAgent
|
| 637 |
-
|
| 638 |
-
# Test the agent
|
| 639 |
-
if __name__ == "__main__":
|
| 640 |
-
import sys
|
| 641 |
-
|
| 642 |
-
if len(sys.argv) > 1 and sys.argv[1] == "--gaia":
|
| 643 |
-
# Run full GAIA evaluation
|
| 644 |
-
run_gaia_evaluation()
|
| 645 |
-
else:
|
| 646 |
-
# Run quick tests
|
| 647 |
-
agent = ConsensusGAIAAgent() # Use the multi-LLM consensus agent
|
| 648 |
-
|
| 649 |
-
test_questions = [
|
| 650 |
-
"What is 15 + 27?",
|
| 651 |
-
"When was the Eiffel Tower built?",
|
| 652 |
-
"Who was the first person to walk on the moon?",
|
| 653 |
-
"What is the capital of France?"
|
| 654 |
-
]
|
| 655 |
-
|
| 656 |
-
print("Testing Simple Direct Agent:")
|
| 657 |
-
print("=" * 40)
|
| 658 |
-
|
| 659 |
-
for i, question in enumerate(test_questions, 1):
|
| 660 |
-
print(f"\n{i}. Question: {question}")
|
| 661 |
-
answer = agent(question)
|
| 662 |
-
print(f" Answer: {answer}")
|
| 663 |
-
print("-" * 25)
|
| 664 |
-
|
| 665 |
-
print(f"\n💡 To run full GAIA evaluation: python {sys.argv[0]} --gaia")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_exa_fix.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
|
| 4 |
-
try:
|
| 5 |
-
from exa_py import Exa
|
| 6 |
-
EXA_AVAILABLE = True
|
| 7 |
-
except ImportError:
|
| 8 |
-
EXA_AVAILABLE = False
|
| 9 |
-
print("Exa not available - install with: pip install exa-py")
|
| 10 |
-
sys.exit(1)
|
| 11 |
-
|
| 12 |
-
def test_exa_search():
|
| 13 |
-
"""Test Exa search_and_contents method"""
|
| 14 |
-
print("Testing Exa search_and_contents method...")
|
| 15 |
-
|
| 16 |
-
# Initialize Exa
|
| 17 |
-
exa_api_key = os.getenv("EXA_API_KEY")
|
| 18 |
-
if not exa_api_key:
|
| 19 |
-
print("❌ EXA_API_KEY not found in environment")
|
| 20 |
-
return
|
| 21 |
-
|
| 22 |
-
exa = Exa(api_key=exa_api_key)
|
| 23 |
-
query = "artificial intelligence"
|
| 24 |
-
|
| 25 |
-
# Try with search_and_contents method
|
| 26 |
-
try:
|
| 27 |
-
print(f"\n🔍 Using search_and_contents method")
|
| 28 |
-
results = exa.search_and_contents(query, num_results=2)
|
| 29 |
-
|
| 30 |
-
if results and hasattr(results, 'results'):
|
| 31 |
-
print(f"✅ Search successful! Found {len(results.results)} results")
|
| 32 |
-
for i, result in enumerate(results.results, 1):
|
| 33 |
-
print(f"\nResult {i}:")
|
| 34 |
-
print(f"Title: {getattr(result, 'title', 'N/A')}")
|
| 35 |
-
print(f"URL: {getattr(result, 'url', 'N/A')}")
|
| 36 |
-
print(f"Has text attribute: {hasattr(result, 'text')}")
|
| 37 |
-
if hasattr(result, 'text') and result.text:
|
| 38 |
-
print(f"Text snippet: {result.text[:100]}...")
|
| 39 |
-
else:
|
| 40 |
-
print("Text attribute is None or empty")
|
| 41 |
-
else:
|
| 42 |
-
print("❌ No results found")
|
| 43 |
-
except Exception as e:
|
| 44 |
-
print(f"❌ Error: {e}")
|
| 45 |
-
|
| 46 |
-
if __name__ == "__main__":
|
| 47 |
-
test_exa_search()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/final_answer.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from typing import Any, Optional
|
| 2 |
-
from smolagents.tools import Tool
|
| 3 |
-
|
| 4 |
-
class FinalAnswerTool(Tool):
|
| 5 |
-
name = "final_answer"
|
| 6 |
-
description = "Provides a final answer to the given problem."
|
| 7 |
-
inputs = {'answer': {'type': 'any', 'description': 'The final answer to the problem'}}
|
| 8 |
-
output_type = "any"
|
| 9 |
-
|
| 10 |
-
def forward(self, answer: Any) -> Any:
|
| 11 |
-
return answer
|
| 12 |
-
|
| 13 |
-
def __init__(self, *args, **kwargs):
|
| 14 |
-
self.is_initialized = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/visit_webpage.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
from typing import Any, Optional
|
| 2 |
-
from smolagents.tools import Tool
|
| 3 |
-
import requests
|
| 4 |
-
import markdownify
|
| 5 |
-
import smolagents
|
| 6 |
-
|
| 7 |
-
class VisitWebpageTool(Tool):
|
| 8 |
-
name = "visit_webpage"
|
| 9 |
-
description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
|
| 10 |
-
inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
|
| 11 |
-
output_type = "string"
|
| 12 |
-
|
| 13 |
-
def forward(self, url: str) -> str:
|
| 14 |
-
try:
|
| 15 |
-
import requests
|
| 16 |
-
from markdownify import markdownify
|
| 17 |
-
from requests.exceptions import RequestException
|
| 18 |
-
|
| 19 |
-
from smolagents.utils import truncate_content
|
| 20 |
-
except ImportError as e:
|
| 21 |
-
raise ImportError(
|
| 22 |
-
"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
|
| 23 |
-
) from e
|
| 24 |
-
try:
|
| 25 |
-
# Send a GET request to the URL with a 20-second timeout
|
| 26 |
-
response = requests.get(url, timeout=20)
|
| 27 |
-
response.raise_for_status() # Raise an exception for bad status codes
|
| 28 |
-
|
| 29 |
-
# Convert the HTML content to Markdown
|
| 30 |
-
markdown_content = markdownify(response.text).strip()
|
| 31 |
-
|
| 32 |
-
# Remove multiple line breaks
|
| 33 |
-
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
| 34 |
-
|
| 35 |
-
return truncate_content(markdown_content, 10000)
|
| 36 |
-
|
| 37 |
-
except requests.exceptions.Timeout:
|
| 38 |
-
return "The request timed out. Please try again later or check the URL."
|
| 39 |
-
except RequestException as e:
|
| 40 |
-
return f"Error fetching the webpage: {str(e)}"
|
| 41 |
-
except Exception as e:
|
| 42 |
-
return f"An unexpected error occurred: {str(e)}"
|
| 43 |
-
|
| 44 |
-
def __init__(self, *args, **kwargs):
|
| 45 |
-
self.is_initialized = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/web_search.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
from typing import Any, Optional
|
| 2 |
-
from smolagents.tools import Tool
|
| 3 |
-
import duckduckgo_search
|
| 4 |
-
|
| 5 |
-
class DuckDuckGoSearchTool(Tool):
|
| 6 |
-
name = "web_search"
|
| 7 |
-
description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
|
| 8 |
-
inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
|
| 9 |
-
output_type = "string"
|
| 10 |
-
|
| 11 |
-
def __init__(self, max_results=10, **kwargs):
|
| 12 |
-
super().__init__()
|
| 13 |
-
self.max_results = max_results
|
| 14 |
-
try:
|
| 15 |
-
from duckduckgo_search import DDGS
|
| 16 |
-
except ImportError as e:
|
| 17 |
-
raise ImportError(
|
| 18 |
-
"You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
|
| 19 |
-
) from e
|
| 20 |
-
self.ddgs = DDGS(**kwargs)
|
| 21 |
-
|
| 22 |
-
def forward(self, query: str) -> str:
|
| 23 |
-
results = self.ddgs.text(query, max_results=self.max_results)
|
| 24 |
-
if len(results) == 0:
|
| 25 |
-
raise Exception("No results found! Try a less restrictive/shorter query.")
|
| 26 |
-
postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
|
| 27 |
-
return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
verify_exa_fix.py
DELETED
|
@@ -1,85 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
import importlib
|
| 4 |
-
|
| 5 |
-
# List of modules to test
|
| 6 |
-
modules_to_test = [
|
| 7 |
-
"consensus_gaia_agent",
|
| 8 |
-
"advanced_agent",
|
| 9 |
-
"app",
|
| 10 |
-
"gaia_agent",
|
| 11 |
-
"simplified_gaia_agent",
|
| 12 |
-
"framework_gaia_agent"
|
| 13 |
-
]
|
| 14 |
-
|
| 15 |
-
def verify_fix():
|
| 16 |
-
"""Verify that all modules are using search_and_contents instead of search with text=True"""
|
| 17 |
-
print("Verifying Exa API parameter fix...")
|
| 18 |
-
|
| 19 |
-
# Check if Exa is available
|
| 20 |
-
try:
|
| 21 |
-
from exa_py import Exa
|
| 22 |
-
EXA_AVAILABLE = True
|
| 23 |
-
except ImportError:
|
| 24 |
-
print("❌ Exa not available - install with: pip install exa-py")
|
| 25 |
-
return
|
| 26 |
-
|
| 27 |
-
# Initialize Exa
|
| 28 |
-
exa_api_key = os.getenv("EXA_API_KEY")
|
| 29 |
-
if not exa_api_key:
|
| 30 |
-
print("❌ EXA_API_KEY not found in environment")
|
| 31 |
-
return
|
| 32 |
-
|
| 33 |
-
# Test each module
|
| 34 |
-
for module_name in modules_to_test:
|
| 35 |
-
print(f"\nChecking {module_name}...")
|
| 36 |
-
try:
|
| 37 |
-
# Import the module
|
| 38 |
-
module = importlib.import_module(module_name)
|
| 39 |
-
|
| 40 |
-
# Check if the module has a class that uses Exa
|
| 41 |
-
for attr_name in dir(module):
|
| 42 |
-
attr = getattr(module, attr_name)
|
| 43 |
-
if isinstance(attr, type) and attr_name not in ["Exa", "TavilyClient", "DDGS"]:
|
| 44 |
-
# Check if this class has an __init__ method
|
| 45 |
-
if hasattr(attr, "__init__"):
|
| 46 |
-
print(f" - Found class: {attr_name}")
|
| 47 |
-
|
| 48 |
-
# Create an instance of the class
|
| 49 |
-
try:
|
| 50 |
-
instance = attr()
|
| 51 |
-
|
| 52 |
-
# Check if the instance has an exa attribute
|
| 53 |
-
if hasattr(instance, "exa"):
|
| 54 |
-
print(f" ✅ Class has exa attribute")
|
| 55 |
-
|
| 56 |
-
# Check if we can run a search
|
| 57 |
-
try:
|
| 58 |
-
query = "artificial intelligence"
|
| 59 |
-
print(f" 🔍 Testing search with query: '{query}'")
|
| 60 |
-
|
| 61 |
-
# This will work if the class is using search_and_contents
|
| 62 |
-
results = instance.exa.search_and_contents(query, num_results=1)
|
| 63 |
-
|
| 64 |
-
if results and hasattr(results, 'results'):
|
| 65 |
-
print(f" ✅ Search successful! Found {len(results.results)} results")
|
| 66 |
-
for result in results.results:
|
| 67 |
-
if hasattr(result, 'text') and result.text:
|
| 68 |
-
print(f" ✅ Result has text content")
|
| 69 |
-
else:
|
| 70 |
-
print(f" ❌ Result does not have text content")
|
| 71 |
-
else:
|
| 72 |
-
print(f" ❌ No results found")
|
| 73 |
-
except Exception as e:
|
| 74 |
-
print(f" ❌ Search error: {e}")
|
| 75 |
-
else:
|
| 76 |
-
print(f" ⚠️ Class does not have exa attribute")
|
| 77 |
-
except Exception as e:
|
| 78 |
-
print(f" ❌ Could not create instance: {e}")
|
| 79 |
-
except Exception as e:
|
| 80 |
-
print(f"❌ Error checking {module_name}: {e}")
|
| 81 |
-
|
| 82 |
-
print("\nVerification complete!")
|
| 83 |
-
|
| 84 |
-
if __name__ == "__main__":
|
| 85 |
-
verify_fix()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|