import requests
from openai import OpenAI
from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool
from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool


def get_prompt():
    with open("prompt.txt", "r") as f:
        return f.read()


@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question (str): A natural language string containing the question to be answered based on the provided image.

    Returns:
        str: The model-generated answer to the provided question based on the analysis of the image.
    """
    from openai import OpenAI
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                        "detail": "low"
                    },
                },
            ],
        }],
    )
    return response.choices[0].message.content


@tool
def transcribe_audio(audio_url: str) -> str:
    """
    Provides functionality to perform audio transcription.

    Args:
        audio_url (str): A URL pointing to the location of the audio to be analyzed.

    Returns:
        str: Audio transcription.
    """
    client = OpenAI()
    # Download the audio and pass it as a (filename, bytes) pair so the
    # transcription endpoint can infer the audio format from the file extension.
    audio_bytes = requests.get(audio_url).content
    filename = audio_url.split("/")[-1] or "audio.mp3"
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=(filename, audio_bytes),
        response_format="text",
    )
    # With response_format="text" the API returns the transcript as a plain string.
    return transcription


class GAIAAgent:
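    """GAIA agent: a smolagents CodeAgent equipped with web search, webpage
    browsing, Wikipedia, a Python interpreter, visual QA, and audio
    transcription tools, driven by the prompt loaded from prompt.txt."""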
    def __init__(self):
        self.agent = CodeAgent(
            tools=[
                GoogleSearchTool(provider="serper"),
                VisitWebpageTool(),
                WikipediaSearchTool(),
                PythonInterpreterTool(),
                FinalAnswerTool(),
                visual_qa,
                transcribe_audio,
            ],
            model=OpenAIServerModel(model_id='gpt-4.1-mini', max_tokens=4096, temperature=0),
            add_base_tools=False,
            max_steps=15,
            additional_authorized_imports=["pandas"],
        )
        self.prompt = get_prompt()

    def __call__(self, question: str) -> str:
        args = {"question": question}
        return self.agent.run(self.prompt, additional_args=args)


if __name__ == '__main__':
    agent = GAIAAgent()
    # Quick smoke test with an illustrative question (not a real GAIA task).
    print(agent("What is the capital of France?"))