# agent.py — RCaz, commit 9d6412f (verified)
import math
from typing import Optional, Tuple, Literal
from smolagents import tool
import base64
from openai import OpenAI
import joblib
import os
@tool
def download_and_get_path_for_provided_file(path: str) -> str:
    """
    Download and cache a GAIA benchmark attachment; return the local cached path.

    The attachment's split is not known in advance, so the 2023 "test" split is
    tried first, then "validation".

    Args:
        path (str): File name of the attachment as referenced by the task.

    Returns:
        str: Local filesystem path of the downloaded file, or an error message
            string when the file exists in neither split.
    """
    from huggingface_hub import hf_hub_download
    for dataset in ["test", "validation"]:
        try:
            file_path = hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"2023/{dataset}/{path}",
                repo_type="dataset",
                token=os.environ['HF_TOKEN'])
            if file_path:
                return file_path
        except Exception as e:
            # The file may simply not exist in this split (or HF_TOKEN may be
            # missing); log and try the next split.
            print(e)
            continue
    # Previously fell off the end and returned None, violating the declared
    # str return type; return an explicit error message instead.
    return f"Error: file '{path}' not found in GAIA test or validation splits."
@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file given its path.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Raises:
        Exception: Propagated from the OpenAI client on transcription failure.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    try:
        # Cache hit: a previous call dumped the transcription over this path.
        return joblib.load(f"{file_path}")
    except Exception:
        client = OpenAI()
        # Use a context manager so the audio file handle is always closed
        # (the original leaked it).
        with open(file_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="gpt-4o-transcribe",
                file=audio_file,
                response_format="text"
            )
        # NOTE(review): this overwrites the original audio file with a pickle
        # of the transcription so the next call short-circuits via joblib.load.
        joblib.dump(transcription, f"{file_path}")
        return transcription
@tool
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image given its path.

    Args:
        request (str): The information to retrieve from the image. The request
            must be simple, short and precise.
        file_path (str): Path to the image file to analyze (e.g. .jpg, .png).

    Returns:
        str: The model's answer to the request about the image.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    try:
        # Cache hit: a previous call dumped its answer over this path.
        return joblib.load(f"{file_path}")
    except Exception:
        client = OpenAI()

        def encode_image(image_path):
            # Base64-encode the raw image bytes for the data-URL payload.
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")

        base64_image = encode_image(file_path)
        response = client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": request},
                        {
                            "type": "input_image",
                            "image_url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    ],
                }
            ],
        )
        # NOTE(review): the cache key is only file_path, not request — a second
        # call with a different request returns the first answer. It also
        # clobbers the original image file with a pickle. Confirm intended.
        joblib.dump(response.output_text, f"{file_path}")
        return response.output_text
@tool
def get_transcript_from_youtube_file_id(file_id: str) -> str:
    """
    Retrieve the transcript for a YouTube video given its id.

    Args:
        file_id (str): The YouTube video ID (the alphanumeric string that appears after
            'v=' in a YouTube URL, e.g., 'dQw4w9WgXcQ').

    Returns:
        str: The transcript text, one snippet per line, each prefixed with its
            start timestamp in seconds.
    """
    from youtube_transcript_api import YouTubeTranscriptApi
    ytt_api = YouTubeTranscriptApi()
    transcript = ytt_api.fetch(file_id)
    # fetch() returns a FetchedTranscript object, not the declared str;
    # flatten its snippets into readable timestamped text.
    return "\n".join(f"[{snippet.start:.1f}s] {snippet.text}" for snippet in transcript)
@tool
def parse_python_file(path: str) -> str:
    """
    Read and return the contents of a Python file from its path.

    Args:
        path (str): The file path to the Python file to be read.

    Returns:
        str: The complete contents of the Python file as a string.
    """
    with open(path, "r") as source:
        contents = source.read()
    return contents
@tool
def parse_pdf_file(path: str) -> str:
    """
    Read and return the text contents of a pdf file from its path.

    Args:
        path (str): The file path to the pdf file to be read.

    Returns:
        str: The extracted text of every page (newline-terminated per page),
            or an error message when the path does not end with ".pdf".
    """
    from pypdf import PdfReader
    if not path.endswith(".pdf"):
        return "file does not end with .pdf"
    reader = PdfReader(path)
    # Bug fix: the original indexed reader.pages[0] on every loop iteration,
    # so it returned the first page repeated len(pages) times. Iterate the
    # pages collection directly instead.
    out = ""
    for page in reader.pages:
        out += page.extract_text() + "\n"
    return out
class TestAgent:
    """Wrapper around a smolagents CodeAgent wired with GAIA-oriented tools.

    Builds a CodeAgent backed by gpt-4.1-mini with web search, webpage
    visiting, wikipedia, and the file/audio/image/pdf/youtube tools defined
    in this module, plus answer-format guidance appended to the system prompt.
    """

    def __init__(self):
        # import code agent and basic tools from smolagents
        from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool, FinalAnswerTool, VisitWebpageTool, MCPClient
        # import additional tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        from langchain_community.agent_toolkits.load_tools import load_tools
        from smolagents import Tool

        # Bridge the langchain wikipedia tool into smolagents and widen its result set.
        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        wikipedia_tool.top_k_results = 3

        model = OpenAIServerModel(model_id="gpt-4.1-mini")

        # Instantiate the agent with all custom and built-in tools.
        self.agent = CodeAgent(
            tools=[download_and_get_path_for_provided_file,  # V4. get attached file
                   DuckDuckGoSearchTool(),  # basic tools from smolagents
                   VisitWebpageTool(),
                   wikipedia_tool,  # tool from langchain with extra parameters
                   get_transcript_from_youtube_file_id,  # V4
                   parse_python_file,  # V4
                   describe_image,  # V4
                   extract_text_from_audio,  # V4
                   parse_pdf_file,  # V5
                   FinalAnswerTool()],
            additional_authorized_imports=["pandas","markdownify","requests","chess","os"],  # V2 add markdownify & requests; V5 add chess and os
            model=model,
            max_steps=6,  # V3 increase steps
            planning_interval=3,  # V3 add structure
            verbosity_level=0,
            use_structured_outputs_internally=True  # V3. Adds structure
        )

        # V4. use the answer-format prompt from the GAIA paper as guidance
        # (typo fix: "foillowing" -> "following").
        prompt = """\n\n
It is very important to remember the following: You are a general AI assistant. I will ask you a question. Report your thoughts, and
finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated
list of numbers and/or strings.
If you are asked for a number, don’t use comma to write your number neither use units such as $ or
percent sign unless specified otherwise.
If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the
digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element
to be put in the list is a number or a string.
\n\n
Now it's your turn.
"""
        self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt

    def __call__(self, question: str) -> str:
        """Run the agent on a question and return its final answer."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer