import requests
from openai import OpenAI
from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool
from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool


def get_prompt():
    with open("prompt.txt", "r") as f:
        return f.read()


@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question (str): A natural language string containing the question to be answered based on the provided image.

    Returns:
        str: The model-generated answer to the provided question based on the analysis of the image.
    """
    from openai import OpenAI
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                        "detail": "low"
                    },
                },
            ],
        }],
    )
    return response.choices[0].message.content


@tool
def transcribe_audio(audio_url: str) -> str:
    """
    Provides functionality to perform audio transcription.

    Args:
        audio_url (str): A URL pointing to the location of the audio to be analyzed.

    Returns:
        str: Audio transcription.
    """
    client = OpenAI()
    # Download the audio and pass it as a (filename, bytes) pair so the
    # transcription endpoint can infer the audio format from the file extension.
    audio_bytes = requests.get(audio_url).content
    filename = audio_url.split("/")[-1] or "audio.mp3"
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=(filename, audio_bytes),
        response_format="text",
    )
    # With response_format="text" the API returns the transcript as a plain string.
    return transcription


class GAIAAgent:
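    """GAIA agent: a smolagents CodeAgent equipped with web search, webpage
    browsing, Wikipedia, a Python interpreter, visual QA, and audio
    transcription tools, driven by the prompt loaded from prompt.txt."""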
    def __init__(self):
        self.agent = CodeAgent(
            tools=[
                GoogleSearchTool(provider="serper"),
                VisitWebpageTool(),
                WikipediaSearchTool(),
                PythonInterpreterTool(),
                FinalAnswerTool(),
                visual_qa,
                transcribe_audio,
            ],
            model=OpenAIServerModel(model_id='gpt-4.1-mini', max_tokens=4096, temperature=0),
            add_base_tools=False,
            max_steps=15,
            additional_authorized_imports=["pandas"],
        )
        self.prompt = get_prompt()

    def __call__(self, question: str) -> str:
        args = {"question": question}
        return self.agent.run(self.prompt, additional_args=args)


if __name__ == '__main__':
    agent = GAIAAgent()
    # Quick smoke test with an illustrative question (not a real GAIA task).
    print(agent("What is the capital of France?"))