abliznyuk commited on
Commit
d5e37c5
·
1 Parent(s): c432527

audio processing, rework task file source passing

Browse files
Files changed (2) hide show
  1. agent.py +26 -26
  2. app.py +1 -1
agent.py CHANGED
@@ -1,4 +1,6 @@
1
- from smolagents import CodeAgent, OpenAIServerModel, tool, Tool
 
 
2
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
3
 
4
 
@@ -13,22 +15,11 @@ def visual_qa(image_url: str, question: str) -> str:
13
  Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.
14
 
15
  Args:
16
- image_url: str
17
- A URL pointing to the location of the image to be analyzed. The URL
18
- should be accessible and point to a valid image file.
19
- question: str
20
- A natural language string containing the question to be answered
21
- based on the provided image.
22
 
23
  Returns:
24
- str
25
- The model-generated answer to the provided question based on the
26
- analysis of the image.
27
-
28
- Raises:
29
- Exception
30
- If there is any issue with the API request, such as connection
31
- errors or invalid inputs.
32
  """
33
  from openai import OpenAI
34
  client = OpenAI()
@@ -52,16 +43,23 @@ def visual_qa(image_url: str, question: str) -> str:
52
  return response.choices[0].message.content
53
 
54
 
55
- class FinalAnswerTool(Tool):
56
- name = "final_answer"
57
- description = "Provides a final answer to the given problem."
58
- inputs = {"answer": {"type": "any", "description": "The final answer to the problem"}}
59
- output_type = "any"
 
 
60
 
61
- def forward(self, answer: str) -> str:
62
- if "final answer:" in answer.lower():
63
- return answer.lower().split("final answer:")[1].strip()
64
- return answer
 
 
 
 
 
65
 
66
 
67
  class GAIAAgent:
@@ -74,6 +72,7 @@ class GAIAAgent:
74
  PythonInterpreterTool(),
75
  FinalAnswerTool(),
76
  visual_qa,
 
77
  ],
78
  model=OpenAIServerModel(model_id='gpt-4o-mini', max_tokens=4096, temperature=0),
79
  add_base_tools=False,
@@ -81,8 +80,9 @@ class GAIAAgent:
81
  )
82
  self.prompt = get_prompt()
83
 
84
- def __call__(self, question: str) -> str:
85
- return self.agent.run(self.prompt, additional_args={"question": question})
 
86
 
87
 
88
  if __name__ == '__main__':
 
1
+ import requests
2
+ from openai import OpenAI
3
+ from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool
4
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
5
 
6
 
 
15
  Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.
16
 
17
  Args:
18
+ image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
19
+ question (str): A natural language string containing the question to be answered based on the provided image.
 
 
 
 
20
 
21
  Returns:
22
+ str: The model-generated answer to the provided question based on the analysis of the image.
 
 
 
 
 
 
 
23
  """
24
  from openai import OpenAI
25
  client = OpenAI()
 
43
  return response.choices[0].message.content
44
 
45
 
46
+ @tool
47
+ def transcribe_audio(audio_url):
48
+ """
49
+ Provides functionality to perform audio transcription.
50
+
51
+ Args:
52
+ audio_url (str): A URL pointing to the location of the audio to be analyzed.
53
 
54
+ Returns:
55
+ str: Audio transcription.
56
+ """
57
+ client = OpenAI()
58
+ r = client.audio.transcriptions.create(
59
+ model="gpt-4o-mini-transcribe",
60
+ file=requests.get(audio_url).content,
61
+ response_format="text",
62
+ )
63
 
64
 
65
  class GAIAAgent:
 
72
  PythonInterpreterTool(),
73
  FinalAnswerTool(),
74
  visual_qa,
75
+ transcribe_audio,
76
  ],
77
  model=OpenAIServerModel(model_id='gpt-4o-mini', max_tokens=4096, temperature=0),
78
  add_base_tools=False,
 
80
  )
81
  self.prompt = get_prompt()
82
 
83
+ def __call__(self, question: str, source: str = None) -> str:
84
+ args = {"question": question, "task_file_source": source} if source else {"question": question, }
85
+ return self.agent.run(self.prompt, additional_args=args)
86
 
87
 
88
  if __name__ == '__main__':
app.py CHANGED
@@ -75,7 +75,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
75
  continue
76
  try:
77
  if data_url:
78
- submitted_answer = agent(question_text + f"Task file source URL: {data_url}.")
79
  else:
80
  submitted_answer = agent(question_text)
81
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
75
  continue
76
  try:
77
  if data_url:
78
+ submitted_answer = agent(question_text, data_url)
79
  else:
80
  submitted_answer = agent(question_text)
81
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})