abliznyuk commited on
Commit
d5e37c5
·
1 Parent(s): c432527

audio processing, rework task file source passing

Browse files
Files changed (2) hide show
  1. agent.py +26 -26
  2. app.py +1 -1
agent.py CHANGED
@@ -1,4 +1,6 @@
1
- from smolagents import CodeAgent, OpenAIServerModel, tool, Tool
 
 
2
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
3
 
4
 
@@ -13,22 +15,11 @@ def visual_qa(image_url: str, question: str) -> str:
13
  Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.
14
 
15
  Args:
16
- image_url: str
17
- A URL pointing to the location of the image to be analyzed. The URL
18
- should be accessible and point to a valid image file.
19
- question: str
20
- A natural language string containing the question to be answered
21
- based on the provided image.
22
 
23
  Returns:
24
- str
25
- The model-generated answer to the provided question based on the
26
- analysis of the image.
27
-
28
- Raises:
29
- Exception
30
- If there is any issue with the API request, such as connection
31
- errors or invalid inputs.
32
  """
33
  from openai import OpenAI
34
  client = OpenAI()
@@ -52,16 +43,23 @@ def visual_qa(image_url: str, question: str) -> str:
52
  return response.choices[0].message.content
53
 
54
 
55
- class FinalAnswerTool(Tool):
56
- name = "final_answer"
57
- description = "Provides a final answer to the given problem."
58
- inputs = {"answer": {"type": "any", "description": "The final answer to the problem"}}
59
- output_type = "any"
 
 
60
 
61
- def forward(self, answer: str) -> str:
62
- if "final answer:" in answer.lower():
63
- return answer.lower().split("final answer:")[1].strip()
64
- return answer
 
 
 
 
 
65
 
66
 
67
  class GAIAAgent:
@@ -74,6 +72,7 @@ class GAIAAgent:
74
  PythonInterpreterTool(),
75
  FinalAnswerTool(),
76
  visual_qa,
 
77
  ],
78
  model=OpenAIServerModel(model_id='gpt-4o-mini', max_tokens=4096, temperature=0),
79
  add_base_tools=False,
@@ -81,8 +80,9 @@ class GAIAAgent:
81
  )
82
  self.prompt = get_prompt()
83
 
84
- def __call__(self, question: str) -> str:
85
- return self.agent.run(self.prompt, additional_args={"question": question})
 
86
 
87
 
88
  if __name__ == '__main__':
 
1
+ import requests
2
+ from openai import OpenAI
3
+ from smolagents import CodeAgent, OpenAIServerModel, tool, Tool, FinalAnswerTool
4
  from smolagents import WikipediaSearchTool, GoogleSearchTool, VisitWebpageTool, PythonInterpreterTool
5
 
6
 
 
15
  Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.
16
 
17
  Args:
18
+ image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
19
+ question (str): A natural language string containing the question to be answered based on the provided image.
 
 
 
 
20
 
21
  Returns:
22
+ str: The model-generated answer to the provided question based on the analysis of the image.
 
 
 
 
 
 
 
23
  """
24
  from openai import OpenAI
25
  client = OpenAI()
 
43
  return response.choices[0].message.content
44
 
45
 
46
+ @tool
47
+ def transcribe_audio(audio_url):
48
+ """
49
+ Provides functionality to perform audio transcription.
50
+
51
+ Args:
52
+ audio_url (str): A URL pointing to the location of the audio to be analyzed.
53
 
54
+ Returns:
55
+ str: Audio transcription.
56
+ """
57
+ client = OpenAI()
58
+ r = client.audio.transcriptions.create(
59
+ model="gpt-4o-mini-transcribe",
60
+ file=requests.get(audio_url).content,
61
+ response_format="text",
62
+ )
63
 
64
 
65
  class GAIAAgent:
 
72
  PythonInterpreterTool(),
73
  FinalAnswerTool(),
74
  visual_qa,
75
+ transcribe_audio,
76
  ],
77
  model=OpenAIServerModel(model_id='gpt-4o-mini', max_tokens=4096, temperature=0),
78
  add_base_tools=False,
 
80
  )
81
  self.prompt = get_prompt()
82
 
83
+ def __call__(self, question: str, source: str = None) -> str:
84
+ args = {"question": question, "task_file_source": source} if source else {"question": question, }
85
+ return self.agent.run(self.prompt, additional_args=args)
86
 
87
 
88
  if __name__ == '__main__':
app.py CHANGED
@@ -75,7 +75,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
75
  continue
76
  try:
77
  if data_url:
78
- submitted_answer = agent(question_text + f"Task file source URL: {data_url}.")
79
  else:
80
  submitted_answer = agent(question_text)
81
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
75
  continue
76
  try:
77
  if data_url:
78
+ submitted_answer = agent(question_text, data_url)
79
  else:
80
  submitted_answer = agent(question_text)
81
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})