vandenn's picture
Remove done image interpretation tooling comment
f81ffba
import mimetypes
import random
import time
from io import BytesIO
from typing import Any, Optional, TypedDict

import pandas as pd
import requests
from PIL import Image
from smolagents import (
    DuckDuckGoSearchTool,
    LiteLLMModel,
    PythonInterpreterTool,
    ToolCallingAgent,
    VisitWebpageTool,
)
from smolagents.agents import FinalAnswerStep

from src.settings import settings
from src.tools import FinalAnswerTool
from src.utils import BaseAgent, InputTokenRateLimiter
class ParsedFile(TypedDict):
    """Outcome of parsing an attached file.

    Both fields start out as ``None``; at most one of them is filled in,
    depending on whether the file could be read as text or as an image.
    """

    # Textual content of the file, or None when no text was extracted.
    text: Optional[str]
    # Opened PIL image (the ``Image.Image`` class, not the ``PIL.Image``
    # module), or None when the file is not an image.
    image: Optional[Image.Image]
class GaiaAgent(BaseAgent):
    """Tool-calling agent that answers a question, optionally using an attached file.

    The agent wraps a ``ToolCallingAgent`` equipped with web search, web page
    visiting, a Python interpreter and a final-answer tool. Runs are streamed
    step by step so that input-token usage can be fed to a rate limiter, and
    overload / rate-limit errors are retried with exponential backoff.
    """

    # Seconds to wait on the file download before giving up, so a stalled
    # server cannot block the agent forever.
    DOWNLOAD_TIMEOUT_SECONDS = 30

    def __init__(self):
        """Create the model, the agent with its tools, and the retry settings."""
        self.model = LiteLLMModel(
            model_id=settings.llm_model_id, api_key=settings.llm_api_key
        )
        self.agent = ToolCallingAgent(
            tools=[
                DuckDuckGoSearchTool(max_results=3),
                VisitWebpageTool(max_output_length=20000),
                PythonInterpreterTool(),
                FinalAnswerTool(),
                # TODO: MP3 interpretation
            ],
            max_steps=10,
            planning_interval=5,
            model=self.model,
        )
        self.token_rate_limiter = InputTokenRateLimiter()
        # Token budget passed to the rate limiter before each step is accounted.
        self.expected_tokens_per_step = 10000
        # Retry policy for overload / rate-limit errors: the delay is
        # base_delay * 2**retry_count plus up to one second of jitter.
        self.max_retries = 3
        self.base_delay = 5

    def run(self, question: str, file_name: str = "", file_url: str = "") -> Any:
        """Answer ``question``, attaching the parsed file content when available.

        Args:
            question: The question to answer.
            file_name: Name of the attached file; empty when there is none.
            file_url: URL the attached file is downloaded from.

        Returns:
            The output of the agent's final-answer step, or ``None`` when the
            run ended without one (non-retryable error or retries exhausted).
        """
        final_answer = None
        retry_count = 0
        parsed_file = self._parse_file(file_name, file_url)

        # The prompt text is kept flush-left so that no source indentation
        # ends up inside the string sent to the model.
        prompt = f"""
Answer the following QUESTION as concisely as possible.
If available, a FILE NAME and the actual FILE will be attached after your planning stage.
If the FILE NAME is not N/A, assume that the FILE is available.
Make the shortest possible execution plan to answer this QUESTION.
QUESTION: {question}
FILE NAME: {file_name if file_name else "N/A"}
"""
        if parsed_file["text"]:
            prompt = prompt + f"\nFILE CONTENT: {parsed_file['text']}"

        input_images = None
        if parsed_file["image"] is not None:
            input_images = [parsed_file["image"]]

        while True:
            try:
                for step in self.agent.run(prompt, images=input_images, stream=True):
                    self.token_rate_limiter.maybe_wait(self.expected_tokens_per_step)
                    # Not every streamed step carries usage info; count 0 then.
                    token_usage_info = getattr(step, "token_usage", None)
                    tokens_used = getattr(token_usage_info, "input_tokens", 0) or 0
                    self.token_rate_limiter.add_tokens(tokens_used)
                    if isinstance(step, FinalAnswerStep):
                        final_answer = step.output
                break
            except Exception as e:
                if not self._is_retryable_error(e):
                    print(f"Error occurred: {e}")
                    break
                if retry_count >= self.max_retries:
                    print(f"Max retries reached. Error: {e}")
                    break
                delay = self.base_delay * (2**retry_count) + random.uniform(0, 1)
                print(
                    f"Anthropic server error due to overload or rate limit. Retrying in {delay:.1f} seconds.."
                )
                print(f"The error was: {e}")
                time.sleep(delay)
                retry_count += 1
        return final_answer

    @staticmethod
    def _is_retryable_error(error: Exception) -> bool:
        """Return True when the error text mentions overload, rate limiting or 529."""
        message = str(error)
        lowered = message.lower()
        return "overloaded" in lowered or "rate limit" in lowered or "529" in message

    def _parse_file(self, file_name: str, file_url: str) -> ParsedFile:
        """Download the attached file and extract its text or image content.

        Every failure (download, decoding, parsing, unsupported type) is
        printed and results in an empty ``ParsedFile``, so callers can always
        index the result by ``"text"`` and ``"image"``.

        Args:
            file_name: Name of the file; its extension selects the parser.
            file_url: URL to download the file from.

        Returns:
            A ``ParsedFile`` with ``text`` set for text, Python and Excel
            files, ``image`` set for image files, and both ``None`` otherwise.
        """
        result = ParsedFile(text=None, image=None)
        if not file_name or not file_url:
            return result

        try:
            response = requests.get(file_url, timeout=self.DOWNLOAD_TIMEOUT_SECONDS)
            response.raise_for_status()
        except Exception as e:
            print(f"Failed to download file: {e}")
            return result

        # Try to handle the 'no file' JSON case
        try:
            payload = response.json()
        except ValueError:
            payload = None  # Not JSON, so it's probably the file content
        detail = payload.get("detail") if isinstance(payload, dict) else None
        if isinstance(detail, str) and "No file path associated" in detail:
            print(f"No file found for {file_name} at {file_url}")
            return result

        file_type, _ = mimetypes.guess_type(file_name)
        lowered_name = file_name.lower()
        is_text = bool(file_type and file_type.startswith("text"))

        if is_text or lowered_name.endswith(".py"):
            label = "text" if is_text else "Python"
            try:
                result["text"] = response.content.decode("utf-8")
            except UnicodeDecodeError:
                print(f"Failed to decode {label} file as utf-8.")
        elif lowered_name.endswith(".xlsx"):
            try:
                df = pd.read_excel(BytesIO(response.content))
                result["text"] = df.to_string()
            except Exception as e:
                print(f"Failed to parse Excel file: {e}")
        elif file_type and file_type.startswith("image"):
            try:
                result["image"] = Image.open(BytesIO(response.content))
            except Exception as e:
                print(f"Failed to decode image file: {e}")
        else:
            print(
                f"[{file_name} is a binary file of type {file_type or 'unknown'} and cannot be parsed as text.]"
            )
        return result
if __name__ == "__main__":
    # Manual smoke test: ask one sample question without an attached file
    # and print whatever the agent returns.
    sample_question = (
        "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? "
        "You can use the latest 2022 version of English wikipedia."
    )
    gaia_agent = GaiaAgent()
    response_text = gaia_agent.run(sample_question)
    print(f"Response: {response_text}")