# Source: commit 6cc7969 by Francesco-A — "FIX: removed LocalAgent"
# Generic agent
import os
from typing import Optional
import pandas as pd
# Smolagents imports
from smolagents import (
CodeAgent,
InferenceClientModel,
TransformersModel,
LiteLLMModel,
Tool,
tool,
DuckDuckGoSearchTool,
VisitWebpageTool,
WikipediaSearchTool,
PythonInterpreterTool,
FinalAnswerTool,
)
# Import your custom tools (to be used in app, not in local notebook)
from tools.download_file import download_file_from_url
from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
from tools.audio_tools import youtube_to_text, transcribe_audio
# Tool registry handed to every agent defined in this module.
# The list is assembled from two groups; the relative order of the entries
# is the same as in the flat list it replaces.

# Tools shipped with smolagents (instantiated once at import time).
_BUILTIN_TOOLS = [
    DuckDuckGoSearchTool(),   # Internet search
    VisitWebpageTool(),       # Retrieve webpage content
    PythonInterpreterTool(),  # Executes agent-generated Python code
    FinalAnswerTool(),        # Ends agent reasoning and returns final answer
]

# Project-specific tools imported from the local `tools` package.
_CUSTOM_TOOLS = [
    download_file_from_url,  # file downloader
    text_file_to_string,     # .txt, .md, .json, etc.
    pdf_to_text,             # PDF to text (original note: PyMuPDF-based safe parser)
    image_to_text,           # OCR for images
    youtube_to_text,         # YouTube audio to text
    transcribe_audio,        # Audio file to text
]

AGENT_TOOLS = [*_BUILTIN_TOOLS, *_CUSTOM_TOOLS]
# System prompt
# Markdown-formatted instructions that both agents prepend to every question.
# The arrows in the "PLAN → ACT → OBSERVE" line were previously mis-encoded
# (mojibake); they are restored here. Everything else is unchanged, including
# the trailing "\n\n" escape, which expands to two extra newlines at the end.
SYSTEM_PROMPT = """
You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
### 1. Reason-Act-Observe
Follow a **PLAN → ACT → OBSERVE** loop:
- **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
- **ACT:** Write and run one self-contained Python block per step.
- **OBSERVE:** Examine outputs or errors before proceeding.
### 2. File Handling
- When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
**Important rules:**
- When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
- You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
- For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
### 3. Data Analysis & Answer
- Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
- Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
- Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
### 4. Additional instructions for the following tasks provided by GAIA team
- You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
- Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
- Example of how to end the task:
Thought: I have found the answer. I will now provide it.
<code>
final_answer("FINAL ANSWER: The capital of France is Paris")
</code>
\n\n
"""
class BasicAgent:
    """Callable wrapper around a smolagents ``CodeAgent`` using ``InferenceClientModel``.

    Calling an instance builds a prompt from the module-level ``SYSTEM_PROMPT``
    and the question (plus an optional file hint) and runs the agent on it.
    """

    def __init__(self):
        """Create the model, attach the shared tool list and build the agent."""
        self.system_prompt = SYSTEM_PROMPT
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            # NOTE(review): value kept as-is; 8192 may have been intended — confirm.
            max_tokens=8196,
        )
        self.tools = AGENT_TOOLS
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            # SECURITY NOTE(review): 'subprocess' and 'os' are authorized for
            # agent-generated code, which allows spawning processes and touching
            # the filesystem — confirm this is intended for the deployment.
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        # The check-mark emoji was previously mis-encoded (mojibake).
        print("✅ Basic agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Return the full prompt: system prompt, question and optional file hint."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}\n\n"
        if file_path:
            # Only mention a file when one was actually supplied (non-empty).
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                "Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on ``question``.

        Args:
            question: The task text to answer.
            file_path: Optional location of a file associated with the question;
                when truthy it is mentioned in the prompt.

        Returns:
            Whatever ``CodeAgent.run`` returns for the assembled prompt
            (annotated as ``str`` by this class).
        """
        return self.basic_agent.run(self._build_prompt(question, file_path))
class GeminiAgent:
    """Callable wrapper around a smolagents ``CodeAgent`` using ``LiteLLMModel`` (Gemini).

    Calling an instance builds a prompt from the module-level ``SYSTEM_PROMPT``
    and the question (plus an optional file hint) and runs the agent on it.
    """

    def __init__(self):
        """Create the Gemini model, attach the shared tool list and build the agent.

        Raises:
            RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is
                missing or empty.
        """
        # Validate credentials first so a misconfigured environment fails
        # before any other state is set up.
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )
        self.system_prompt = SYSTEM_PROMPT
        self.model = LiteLLMModel(
            model_id="gemini/gemini-2.0-flash",
            api_key=api_key,
            temperature=0.0,
            top_p=1.0,
            # NOTE(review): value kept as-is; 8192 may have been intended — confirm.
            max_tokens=8196,
        )
        self.tools = AGENT_TOOLS
        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            # SECURITY NOTE(review): 'subprocess' and 'os' are authorized for
            # agent-generated code, which allows spawning processes and touching
            # the filesystem — confirm this is intended for the deployment.
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        # The check-mark emoji was previously mis-encoded (mojibake).
        print("✅ Gemini agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Return the full prompt: system prompt, question and optional file hint."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}\n\n"
        if file_path:
            # Only mention a file when one was actually supplied (non-empty).
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                "Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on ``question``.

        Args:
            question: The task text to answer.
            file_path: Optional location of a file associated with the question;
                when truthy it is mentioned in the prompt.

        Returns:
            Whatever ``CodeAgent.run`` returns for the assembled prompt
            (annotated as ``str`` by this class).
        """
        return self.gemini_agent.run(self._build_prompt(question, file_path))