abliznyuk's picture
fixes
5e11dec
raw
history blame
2.77 kB
import requests
from openai import OpenAI
from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool
from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
def get_prompt() -> str:
    """Return the full contents of ``prompt.txt``.

    The path is relative, so the file is resolved against the current
    working directory of the process.

    Returns:
        The entire text of the file as a single string.

    Raises:
        OSError: If ``prompt.txt`` cannot be opened (for example, it is missing).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII prompt text is read the same way everywhere.
    with open("prompt.txt", "r", encoding="utf-8") as f:
        return f.read()
@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question (str): A natural language string containing the question to be answered based on the provided image.

    Returns:
        str: The model-generated answer to the provided question based on the analysis of the image. An empty string is returned when the model produces no text content.
    """
    # NOTE: this docstring is not just documentation -- the `@tool` decorator
    # is expected to derive the tool/argument descriptions from it, so each
    # argument entry must keep the `name (type): description` shape.

    # Imported locally so the tool body does not rely on module-level names.
    from openai import OpenAI

    # One user message carrying both the question text and the image reference.
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
            "detail": "low",
        },
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            image_part,
        ],
    }

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )

    # `content` can be None (e.g. when the model returns no text); keep the
    # declared `str` return type instead of leaking None to the caller.
    answer = response.choices[0].message.content
    return answer if answer is not None else ""
@tool
def transcribe_audio(audio_url: str) -> str:
    """
    Provides functionality to perform audio transcription.

    Args:
        audio_url (str): A URL pointing to the location of the audio to be analyzed.

    Returns:
        str: Audio transcription.

    Raises:
        requests.HTTPError: If downloading the audio returns an HTTP error status.
    """
    # Imported locally so the tool body is self-contained, the same way
    # visual_qa imports its own client.
    import os
    from urllib.parse import urlparse

    import requests
    from openai import OpenAI

    # Bound the download and fail loudly on 4xx/5xx instead of sending an
    # HTML error page to the transcription endpoint.
    download = requests.get(audio_url, timeout=60)
    download.raise_for_status()

    # Upload as a (filename, bytes) pair rather than bare bytes so the request
    # carries a file name; the service appears to rely on the extension to
    # recognise the audio format. The "audio.mp3" fallback is a guess used
    # only when the URL path has no final component -- TODO confirm.
    filename = os.path.basename(urlparse(audio_url).path) or "audio.mp3"

    client = OpenAI()
    result = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=(filename, download.content),
        response_format="text",
    )

    # With response_format="text" the SDK hands back a plain string, which has
    # no `.text` attribute; object-style responses are still supported.
    return result if isinstance(result, str) else result.text
class GAIAAgent:
    """Callable wrapper that pairs a ``CodeAgent`` with the prompt from ``get_prompt()``.

    Instances are invoked like a function: ``GAIAAgent()(question)``.
    """

    def __init__(self):
        """Construct the underlying agent, then load the prompt text."""
        # Search, browsing, Python execution and final-answer tools, followed
        # by the two custom tools defined in this module.
        toolbox = [
            GoogleSearchTool(provider="serper"),
            VisitWebpageTool(),
            WikipediaSearchTool(),
            PythonInterpreterTool(),
            FinalAnswerTool(),
            visual_qa,
            transcribe_audio,
        ]
        llm = OpenAIServerModel(
            model_id="gpt-4.1-mini",
            max_tokens=4096,
            temperature=0,
        )
        self.agent = CodeAgent(
            tools=toolbox,
            model=llm,
            add_base_tools=False,
            max_steps=15,
            additional_authorized_imports=["pandas"],
        )
        self.prompt = get_prompt()

    def __call__(self, question: str) -> str:
        """Run the agent on the stored prompt, supplying ``question`` as an extra argument.

        Args:
            question: The question text; forwarded to the agent under the
                ``"question"`` key of ``additional_args``.

        Returns:
            Whatever ``self.agent.run`` returns for this run.
        """
        # The task given to the agent is always the stored prompt; the
        # question itself travels alongside it as an additional argument.
        return self.agent.run(self.prompt, additional_args={"question": question})
# Script entry point: running this module directly only constructs the agent;
# no question is asked here.
if __name__ == "__main__":
    agent = GAIAAgent()