File size: 8,028 Bytes
15a3001 f4c14e9 15a3001 f4c14e9 15a3001 f4c14e9 15a3001 f4c14e9 15a3001 f4c14e9 15a3001 f4c14e9 15a3001 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
# Generic agent
import os
from typing import Optional
import pandas as pd
# Smolagents imports
from smolagents import (
CodeAgent,
InferenceClientModel,
TransformersModel,
LiteLLMModel,
Tool,
tool,
DuckDuckGoSearchTool,
VisitWebpageTool,
WikipediaSearchTool,
PythonInterpreterTool,
FinalAnswerTool,
)
# Import your custom tools (to be used in app, not in local notebook)
from tools.download_file import download_file_from_url
from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
from tools.audio_tools import youtube_to_text, transcribe_audio
# Define tools
# Tools that ship with smolagents.
_DEFAULT_TOOLS = [
    DuckDuckGoSearchTool(),   # web search
    VisitWebpageTool(),       # fetches the content of a web page
    PythonInterpreterTool(),  # runs Python code generated by the agent
    FinalAnswerTool(),        # ends the agent's reasoning with the final answer
]

# Project tools imported from the local ``tools`` package.
_CUSTOM_TOOLS = [
    download_file_from_url,   # file downloader
    text_file_to_string,      # plain-text files: .txt, .md, .json, etc.
    pdf_to_text,              # PDF parser (PyMuPDF-based, per the original note)
    image_to_text,            # OCR for images
    youtube_to_text,          # YouTube audio to text
    transcribe_audio,         # audio file to text
]

# Complete toolbox handed to the agents; default tools first, then custom ones.
AGENT_TOOLS = _DEFAULT_TOOLS + _CUSTOM_TOOLS
# System prompt. The agents in this module prepend it to every question.
# NOTE: the text contains non-ASCII characters (arrows, en dash); keep this
# file encoded as UTF-8 so they are not garbled again.
SYSTEM_PROMPT = """
You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
### 1. Reason-Act-Observe
Follow a **PLAN → ACT → OBSERVE** loop:
- **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
- **ACT:** Write and run one self-contained Python block per step.
- **OBSERVE:** Examine outputs or errors before proceeding.
### 2. File Handling
- When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
**Important rules:**
- When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
- You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
- For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
### 3. Data Analysis & Answer
- Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
- Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
- Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
### 4. Additional instructions for the following tasks provided by GAIA team
- You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
- Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
- Example of how to end the task:
Thought: I have found the answer. I will now provide it.
<code>
final_answer("FINAL ANSWER: The capital of France is Paris")
</code>
\n\n
"""
class BasicAgent:
    """Callable wrapper around a smolagents ``CodeAgent`` using ``InferenceClientModel``.

    Calling the instance with a question (and optionally a file path) builds
    a prompt from ``SYSTEM_PROMPT`` and the question, runs the agent on it and
    returns the agent's final answer as a string.
    """

    def __init__(self):
        """Create the model and the underlying ``CodeAgent``."""
        self.system_prompt = SYSTEM_PROMPT
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): possibly meant 8192 - confirm
        )
        self.tools = AGENT_TOOLS
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            # NOTE(review): 'subprocess' and 'os' are authorized for the code
            # the agent writes and runs, which presumably allows shell and
            # filesystem access - confirm this is intended and sandboxed.
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Basic agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Return the system prompt followed by the question and optional file hint."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}\n\n"
        if file_path:
            # Only mention a file when one was actually supplied (None and ""
            # are both treated as "no file").
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                "Use the appropriate tool to download it (if necessary) "
                "and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on ``question`` and return its final answer.

        Args:
            question: The task text to solve.
            file_path: Optional path of a file associated with the question;
                when given, the prompt tells the agent to read it first.

        Returns:
            The value returned by the agent's ``run`` call, converted to
            ``str`` so the declared return type holds even when the agent
            produces a non-string answer (e.g. a number).
        """
        answer = self.basic_agent.run(self._build_prompt(question, file_path))
        return str(answer)
class GeminiAgent:
    """Callable wrapper around a smolagents ``CodeAgent`` using Gemini via ``LiteLLMModel``.

    Calling the instance with a question (and optionally a file path) builds
    a prompt from ``SYSTEM_PROMPT`` and the question, runs the agent on it and
    returns the agent's final answer as a string.
    """

    def __init__(self):
        """Create the Gemini model and the underlying ``CodeAgent``.

        Raises:
            RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is
                unset or empty.
        """
        # Fail fast: check the credential before building anything else.
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )
        self.system_prompt = SYSTEM_PROMPT
        self.model = LiteLLMModel(
            model_id="gemini/gemini-2.0-flash",
            api_key=api_key,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): possibly meant 8192 - confirm
        )
        self.tools = AGENT_TOOLS
        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            # NOTE(review): 'subprocess' and 'os' are authorized for the code
            # the agent writes and runs, which presumably allows shell and
            # filesystem access - confirm this is intended and sandboxed.
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Gemini agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Return the system prompt followed by the question and optional file hint."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}\n\n"
        if file_path:
            # Only mention a file when one was actually supplied (None and ""
            # are both treated as "no file").
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                "Use the appropriate tool to download it (if necessary) "
                "and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on ``question`` and return its final answer.

        Args:
            question: The task text to solve.
            file_path: Optional path of a file associated with the question;
                when given, the prompt tells the agent to read it first.

        Returns:
            The value returned by the agent's ``run`` call, converted to
            ``str`` so the declared return type holds even when the agent
            produces a non-string answer (e.g. a number).
        """
        answer = self.gemini_agent.run(self._build_prompt(question, file_path))
        return str(answer)
|