|
|
import requests |
|
|
from openai import OpenAI |
|
|
from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool |
|
|
from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool |
|
|
|
|
|
|
|
|
def get_prompt():
    """Load the agent's task prompt template from prompt.txt.

    Returns:
        str: The full contents of prompt.txt in the current working directory.

    Raises:
        FileNotFoundError: If prompt.txt does not exist.
    """
    # Pin the encoding so the read does not depend on the platform default
    # (e.g. cp1252 on Windows), which could corrupt non-ASCII prompt text.
    with open("prompt.txt", "r", encoding="utf-8") as f:
        return f.read()
|
|
|
|
|
|
|
|
@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question (str): A natural language string containing the question to be answered based on the provided image.

    Returns:
        str: The model-generated answer to the provided question based on the analysis of the image.
    """
    # OpenAI is already imported at module level; no local re-import needed.
    # The client reads OPENAI_API_KEY from the environment.
    client = OpenAI()

    # Send the question and the image URL in a single multimodal user
    # message; "detail": "low" keeps token usage down for image input.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                        "detail": "low"
                    },
                },
            ],
        }],
    )
    return response.choices[0].message.content
|
|
|
|
|
|
|
|
@tool
def transcribe_audio(audio_url: str) -> str:
    """
    Provides functionality to perform audio transcription.

    Args:
        audio_url (str): A URL pointing to the location of the audio to be analyzed.

    Returns:
        str: Audio transcription.

    Raises:
        requests.HTTPError: If the audio download fails.
    """
    import io

    client = OpenAI()

    # Fail fast on a bad download instead of uploading an HTML error page
    # body to the transcription endpoint.
    resp = requests.get(audio_url, timeout=60)
    resp.raise_for_status()

    # The OpenAI SDK expects a named file-like object, not raw bytes:
    # the server uses the filename to infer the audio container format.
    audio_file = io.BytesIO(resp.content)
    audio_file.name = audio_url.rsplit("/", 1)[-1] or "audio.mp3"

    r = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=audio_file,
        response_format="text",
    )
    # With response_format="text" the SDK returns the transcript as a
    # plain str (there is no .text attribute on it).
    return r
|
|
|
|
|
|
|
|
class GAIAAgent:
    """Wraps a smolagents CodeAgent wired with search, browsing and media tools."""

    def __init__(self):
        """Assemble the toolbox, the LLM backend, and the prompt template."""
        # Web search / browsing, local code execution, media understanding,
        # and the mandatory final-answer tool.
        toolbox = [
            GoogleSearchTool(provider="serper"),
            VisitWebpageTool(),
            WikipediaSearchTool(),
            PythonInterpreterTool(),
            FinalAnswerTool(),
            visual_qa,
            transcribe_audio,
        ]
        llm = OpenAIServerModel(model_id='gpt-4.1-mini', max_tokens=4096, temperature=0)
        self.agent = CodeAgent(
            tools=toolbox,
            model=llm,
            add_base_tools=False,
            max_steps=15,
            additional_authorized_imports=["pandas"],
        )
        # Prompt template read from prompt.txt; the question is injected
        # at call time via additional_args.
        self.prompt = get_prompt()

    def __call__(self, question: str) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.agent.run(self.prompt, additional_args={"question": question})
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Smoke test: constructing the agent exercises tool wiring and reads
    # prompt.txt via get_prompt().
    # NOTE(review): the agent is built but never invoked here — presumably
    # callers use GAIAAgent()("...") elsewhere, or this entry point is
    # incomplete; confirm intended CLI behavior.
    agent = GAIAAgent()