abliznyuk committed on
Commit
a73f583
·
1 Parent(s): e9204df

switch to 4o-mini, add VQA tool

Browse files
Files changed (2) hide show
  1. agent.py +62 -4
  2. requirements.txt +2 -1
agent.py CHANGED
@@ -1,4 +1,4 @@
1
- from smolagents import CodeAgent, OpenAIServerModel
2
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
3
 
4
 
@@ -7,6 +7,63 @@ def get_prompt():
7
  return f.read()
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  class GAIAAgent:
11
  def __init__(self):
12
  self.agent = CodeAgent(
@@ -15,10 +72,12 @@ class GAIAAgent:
15
  VisitWebpageTool(),
16
  WikipediaSearchTool(),
17
  PythonInterpreterTool(),
 
 
18
  ],
19
- model=OpenAIServerModel(model_id='gpt-4.1', max_tokens=4096, temperature=0),
20
  add_base_tools=False,
21
- max_steps=15,
22
  )
23
  self.prompt = get_prompt()
24
 
@@ -28,4 +87,3 @@ class GAIAAgent:
28
 
29
  if __name__ == '__main__':
30
  agent = GAIAAgent()
31
- agent("What is the meaning of life?")
 
1
+ from smolagents import CodeAgent, OpenAIServerModel, tool, Tool
2
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
3
 
4
 
 
7
  return f.read()
8
 
9
 
10
@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url: A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question: A natural language string containing the question to be answered based on the provided image.

    Returns:
        The model-generated answer to the provided question based on the
        analysis of the image, or an empty string when the model returns no
        text content.
    """
    # Kept function-local, as in the original, rather than moved to the top
    # of the module.
    from openai import OpenAI

    client = OpenAI()

    # One user message carrying both the question text and the image reference.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                    "detail": "low",
                },
            },
        ],
    }

    # Errors raised by the client (connection problems, invalid inputs, ...)
    # are not caught here and propagate to the caller.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )

    # The message content is optional in the API response; never hand None
    # back to a caller that was promised a string.
    content = response.choices[0].message.content
    return content if content is not None else ""
53
+
54
+
55
class FinalAnswerTool(Tool):
    """Tool that returns the agent's final answer.

    When the answer is a string containing the marker ``final answer:``
    (matched case-insensitively), only the text following the first marker
    (up to a second marker, if any) is returned, stripped of surrounding
    whitespace. Anything else is returned unchanged.
    """

    name = "final_answer"
    description = "Provides a final answer to the given problem."
    inputs = {"answer": {"type": "any", "description": "The final answer to the problem"}}
    output_type = "any"

    def forward(self, answer: object) -> object:
        """Return the final answer, without any ``final answer:`` prefix.

        Args:
            answer: The final answer to the problem. Declared as "any" in
                ``inputs``, so it is not necessarily a string.

        Returns:
            For string input containing the marker, the stripped text after
            the first marker with its original casing preserved; otherwise
            the input unchanged.
        """
        import re

        # Non-string answers (numbers, lists, ...) have no marker to strip
        # and no .lower(); pass them through instead of raising.
        if not isinstance(answer, str):
            return answer

        # Split on the marker case-insensitively so the answer text keeps its
        # original casing (lowercasing the whole string would alter it).
        parts = re.split("final answer:", answer, flags=re.IGNORECASE)
        if len(parts) > 1:
            return parts[1].strip()
        return answer
65
+
66
+
67
  class GAIAAgent:
68
  def __init__(self):
69
  self.agent = CodeAgent(
 
72
  VisitWebpageTool(),
73
  WikipediaSearchTool(),
74
  PythonInterpreterTool(),
75
+ FinalAnswerTool(),
76
+ visual_qa,
77
  ],
78
+ model=OpenAIServerModel(model_id='gpt-4o-mini', max_tokens=4096, temperature=0),
79
  add_base_tools=False,
80
+ max_steps=10,
81
  )
82
  self.prompt = get_prompt()
83
 
 
87
 
88
  if __name__ == '__main__':
89
  agent = GAIAAgent()
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ smolagents[openai]==1.19.0
2
  wikipedia-api==0.8.1
3
  duckduckgo-search==8.0.4
4
  gradio==5.35.0
5
- requests
 
 
2
  wikipedia-api==0.8.1
3
  duckduckgo-search==8.0.4
4
  gradio==5.35.0
5
+ requests
6
+ markdownify