Agent_course_Final_Assignment

Sleeping

App Files Files Community

RCaz committed on Nov 9, 2025

Commit

982aacc

verified ·

1 Parent(s): faa5fe0

2 more LLMs for audio and video processing

Browse files

Files changed (1) hide show

agent.py +83 -18

agent.py CHANGED Viewed

@@ -1,22 +1,38 @@
 import math
 from typing import Optional, Tuple, Literal
 from smolagents import tool
 @tool
 def extract_text_from_audio(file_path: str) -> str:
     """
-    Extract and return text transcription from an audio file using speech recognition.
-    This tool uses Google's speech recognition API to convert spoken audio content
-    into text. It supports various audio formats including WAV, AIFF, and FLAC
-    (formats supported by the SpeechRecognition library).
     Args:
-        file_path (str): Path to the audio file to be transcribed. The file should
-                        be in a format compatible with the SpeechRecognition library.
     Returns:
         str: The extracted text content from the audio file.
@@ -32,18 +48,65 @@ def extract_text_from_audio(file_path: str) -> str:
         "Could you please introduce yourself and your background?"
     """
-    import speech_recognition as sr
-    r = sr.Recognizer()
-    try:
-        with sr.AudioFile(file_path) as source:
-            # listen for the data (load audio to memory)
-            audio_data = r.record(source)
-            # recognize (convert from speech to text)
-            text = r.recognize_google(audio_data)
-        return text
-    except Exception as e:
-        return e
 class TestAgent:
     def __init__(self):
@@ -88,6 +151,8 @@ class TestAgent:
         prompt_for_guidance = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
         self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")

 import math
 from typing import Optional, Tuple, Literal
 from smolagents import tool
+import base64
+from openai import OpenAI
@tool
def download_and_get_path_for_provided_file(path: str) -> str:
    """
    Download a file from the GAIA benchmark dataset and return the path of the locally cached copy.

    Args:
        path (str): Path of the file inside the "gaia-benchmark/GAIA" dataset repository, for example "2023/test/063800f6-8832-4856-972b-17b877612533.png". When empty, that example file is downloaded.

    Returns:
        str: Local filesystem path of the cached file.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
    """
    # Local import: the module-level import block visible for this file does
    # not include `os`, so bring it into scope here.
    import os

    # NOTE(review): `hf_hub_download` is expected to be available at module
    # level (it comes from huggingface_hub) -- confirm the import exists.
    default_filename = "2023/test/063800f6-8832-4856-972b-17b877612533.png"

    file_path = hf_hub_download(
        repo_id="gaia-benchmark/GAIA",
        # Honour the requested file. Previously `path` was ignored and the
        # same hard-coded PNG was returned for every call.
        filename=path or default_filename,
        repo_type="dataset",
        token=os.environ['HF_TOKEN'],
    )
    return file_path
 @tool
 def extract_text_from_audio(file_path: str) -> str:
     """
+    Extract and return text transcription from an audio file.
     Args:
+        file_path (str): Path to the audio file to be transcribed.
     Returns:
         str: The extracted text content from the audio file.
         "Could you please introduce yourself and your background?"
     """
+    client = OpenAI()
+    audio_file = open(file_path, "rb")
+    transcription = client.audio.transcriptions.create(
+        model="gpt-4o-transcribe",
+        file=audio_file,
+        response_format="text"
+    )
+    return transcription
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image.

    The image is base64-encoded, embedded in a data URL and sent together
    with the request text to the OpenAI Responses API ("gpt-4.1").

    Args:
        request (str): The information to retrieve from the image.
        file_path (str): Path to the image file to be analysed.

    Returns:
        str: The model's text answer about the image.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"
        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    # NOTE(review): unlike the sibling helpers this function carries no @tool
    # decorator in the visible code -- confirm how it is registered with the agent.
    import mimetypes

    client = OpenAI()

    # Read the raw bytes and encode them for embedding in a data URL.
    with open(file_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    # Label the payload with the file's real image type instead of always
    # claiming JPEG (the PNG in the example above was sent as image/jpeg).
    # Fall back to the former default when the type cannot be determined.
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    response = client.responses.create(
        model="gpt-4.1",
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": request},
                    {
                        "type": "input_image",
                        "image_url": f"data:{mime_type};base64,{base64_image}",
                    },
                ],
            }
        ],
    )
    return response.output_text
 class TestAgent:
     def __init__(self):
         prompt_for_guidance = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
         self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
+        # V4. use prompt from the paper ?
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")