zaldivards committed on
Commit
aa94df3
·
1 Parent(s): 694a9b2

Refactor tools

Browse files

- Update image transcriber
- Add YouTube video descriptor tool

Files changed (2) hide show
  1. requirements.txt +7 -6
  2. tools.py +107 -26
requirements.txt CHANGED
@@ -3,8 +3,8 @@ aiofiles==24.1.0 ; python_version >= '3.8'
3
  annotated-types==0.7.0 ; python_version >= '3.8'
4
  anyio==4.9.0 ; python_version >= '3.9'
5
  beautifulsoup4==4.13.4 ; python_full_version >= '3.7.0'
6
- boto3==1.38.23
7
- botocore==1.38.23 ; python_version >= '3.9'
8
  certifi==2025.4.26 ; python_version >= '3.6'
9
  charset-normalizer==3.4.2 ; python_version >= '3.7'
10
  click==8.2.1 ; python_version >= '3.10'
@@ -23,14 +23,14 @@ h11==0.16.0 ; python_version >= '3.8'
23
  hf-xet==1.1.2 ; platform_machine == 'x86_64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'aarch64'
24
  httpcore==1.0.9 ; python_version >= '3.8'
25
  httpx==0.28.1 ; python_version >= '3.8'
26
- huggingface-hub==0.32.1 ; python_full_version >= '3.8.0'
27
  idna==3.10 ; python_version >= '3.6'
28
  jinja2==3.1.6 ; python_version >= '3.7'
29
  jiter==0.10.0 ; python_version >= '3.9'
30
  jmespath==1.0.1 ; python_version >= '3.7'
31
  jsonpatch==1.33 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
32
  jsonpointer==3.0.0 ; python_version >= '3.7'
33
- langchain-core==0.3.61 ; python_version >= '3.9'
34
  langchain-openai==0.3.18
35
  langgraph==0.4.7
36
  langgraph-checkpoint==2.0.26 ; python_version >= '3.9'
@@ -44,6 +44,7 @@ markupsafe==3.0.2 ; python_version >= '3.9'
44
  mdurl==0.1.2 ; python_version >= '3.7'
45
  numpy==2.2.6 ; python_version >= '3.10'
46
  openai==1.82.0
 
47
  openpyxl==3.1.5
48
  orjson==3.10.18 ; python_version >= '3.9'
49
  ormsgpack==1.10.0 ; python_version >= '3.9'
@@ -60,7 +61,7 @@ python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in
60
  python-dotenv==1.1.0
61
  python-multipart==0.0.20 ; python_version >= '3.8'
62
  pytube==15.0.0
63
- pytubefix==9.0.1
64
  pytz==2025.2
65
  pyyaml==6.0.2 ; python_version >= '3.8'
66
  regex==2024.11.6 ; python_version >= '3.8'
@@ -73,7 +74,7 @@ safehttpx==0.1.6 ; python_version >= '3.10'
73
  semantic-version==2.10.0 ; python_version >= '2.7'
74
  shellingham==1.5.4 ; python_version >= '3.7'
75
  six==1.17.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
76
- smolagents==1.16.1
77
  sniffio==1.3.1 ; python_version >= '3.7'
78
  soupsieve==2.7 ; python_version >= '3.8'
79
  starlette==0.46.2 ; sys_platform != 'emscripten'
 
3
  annotated-types==0.7.0 ; python_version >= '3.8'
4
  anyio==4.9.0 ; python_version >= '3.9'
5
  beautifulsoup4==4.13.4 ; python_full_version >= '3.7.0'
6
+ boto3==1.38.24
7
+ botocore==1.38.24 ; python_version >= '3.9'
8
  certifi==2025.4.26 ; python_version >= '3.6'
9
  charset-normalizer==3.4.2 ; python_version >= '3.7'
10
  click==8.2.1 ; python_version >= '3.10'
 
23
  hf-xet==1.1.2 ; platform_machine == 'x86_64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'aarch64'
24
  httpcore==1.0.9 ; python_version >= '3.8'
25
  httpx==0.28.1 ; python_version >= '3.8'
26
+ huggingface-hub==0.32.2 ; python_full_version >= '3.8.0'
27
  idna==3.10 ; python_version >= '3.6'
28
  jinja2==3.1.6 ; python_version >= '3.7'
29
  jiter==0.10.0 ; python_version >= '3.9'
30
  jmespath==1.0.1 ; python_version >= '3.7'
31
  jsonpatch==1.33 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
32
  jsonpointer==3.0.0 ; python_version >= '3.7'
33
+ langchain-core==0.3.62 ; python_version >= '3.9'
34
  langchain-openai==0.3.18
35
  langgraph==0.4.7
36
  langgraph-checkpoint==2.0.26 ; python_version >= '3.9'
 
44
  mdurl==0.1.2 ; python_version >= '3.7'
45
  numpy==2.2.6 ; python_version >= '3.10'
46
  openai==1.82.0
47
+ opencv-python==4.11.0.86
48
  openpyxl==3.1.5
49
  orjson==3.10.18 ; python_version >= '3.9'
50
  ormsgpack==1.10.0 ; python_version >= '3.9'
 
61
  python-dotenv==1.1.0
62
  python-multipart==0.0.20 ; python_version >= '3.8'
63
  pytube==15.0.0
64
+ pytubefix==9.1.1
65
  pytz==2025.2
66
  pyyaml==6.0.2 ; python_version >= '3.8'
67
  regex==2024.11.6 ; python_version >= '3.8'
 
74
  semantic-version==2.10.0 ; python_version >= '2.7'
75
  shellingham==1.5.4 ; python_version >= '3.7'
76
  six==1.17.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
77
+ smolagents==1.17.0
78
  sniffio==1.3.1 ; python_version >= '3.7'
79
  soupsieve==2.7 ; python_version >= '3.8'
80
  starlette==0.46.2 ; sys_platform != 'emscripten'
tools.py CHANGED
@@ -3,11 +3,14 @@ import ast
3
  import json
4
  import os
5
  import base64
 
6
  from io import BytesIO
7
  from time import sleep
 
8
  from uuid import uuid4
9
 
10
  import boto3
 
11
  import fitz
12
  import requests
13
  from bs4 import BeautifulSoup
@@ -20,7 +23,7 @@ from requests.exceptions import HTTPError
20
  from urllib3.exceptions import ReadTimeoutError
21
 
22
  from definitions import TranscriptionJob
23
- from utils import get_file, s3_upload_file, s3_download_file, bedrock_runtime, BEDROCK_MODEL_ID
24
 
25
 
26
  @tool
@@ -159,43 +162,32 @@ class AudioTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
159
 
160
 
161
  @tool
162
- def image_transcriber(text_prompt: str, task_id: str, file_name: str) -> str:
163
- """Transcribes text from an image file
164
 
165
  Args:
166
- text_prompt (str): The text prompt to guide the transcription.
167
  task_id (str): The ID of the task associated with the image file.
168
  file_name (str): The name of the image file to transcribe.
169
  """
170
  try:
171
  file_content = get_file(task_id)
172
  base64_image = base64.b64encode(file_content.getvalue()).decode("utf-8")
173
- response = bedrock_runtime.invoke_model(
174
- modelId=BEDROCK_MODEL_ID,
175
- body=json.dumps(
176
  {
177
- "anthropic_version": "bedrock-2023-05-31",
178
- "max_tokens": 4096,
179
- "messages": [
180
  {
181
- "role": "user",
182
- "content": [
183
- {
184
- "type": "image",
185
- "source": {
186
- "type": "base64",
187
- "media_type": f"image/{file_name.split('.')[-1]}",
188
- "data": base64_image,
189
- },
190
- },
191
- {"type": "text", "text": text_prompt},
192
- ],
193
- }
194
  ],
195
  }
196
- ),
197
- )["body"].read()
198
- return json.loads(response)["content"][0]["text"]
199
  except Exception as e:
200
  return f"Error processing image file {file_name}: {e}"
201
 
@@ -279,3 +271,92 @@ class YoutubeTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
279
  return transcription
280
  except Exception as e:
281
  return f"Error starting transcription job for {file_name}: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import os
5
  import base64
6
+ import tempfile
7
  from io import BytesIO
8
  from time import sleep
9
+ from typing import Generator
10
  from uuid import uuid4
11
 
12
  import boto3
13
+ import cv2 # type: ignore
14
  import fitz
15
  import requests
16
  from bs4 import BeautifulSoup
 
23
  from urllib3.exceptions import ReadTimeoutError
24
 
25
  from definitions import TranscriptionJob
26
+ from utils import get_file, s3_upload_file, s3_download_file, invoke_bedrock_model, invoke_openai_model
27
 
28
 
29
  @tool
 
162
 
163
 
164
  @tool
165
+ def image_analyzer(task: str, task_id: str, file_name: str) -> str:
166
+ """Analyzes an image file and returns a response based on the task provided.
167
 
168
  Args:
169
+ task (str): The description of the information to extract from the image.
170
  task_id (str): The ID of the task associated with the image file.
171
  file_name (str): The name of the image file to transcribe.
172
  """
173
  try:
174
  file_content = get_file(task_id)
175
  base64_image = base64.b64encode(file_content.getvalue()).decode("utf-8")
176
+ response = invoke_openai_model(
177
+ [
 
178
  {
179
+ "role": "user",
180
+ "content": [
 
181
  {
182
+ "type": "input_image",
183
+ "image_url": f"data:image/{file_name.split('.')[-1]};base64,{base64_image}",
184
+ },
185
+ {"type": "input_text", "text": task},
 
 
 
 
 
 
 
 
 
186
  ],
187
  }
188
+ ]
189
+ )
190
+ return response
191
  except Exception as e:
192
  return f"Error processing image file {file_name}: {e}"
193
 
 
271
  return transcription
272
  except Exception as e:
273
  return f"Error starting transcription job for {file_name}: {e}"
274
+
275
+
276
+ class YoutubeVideoDescriptorTool(Tool): # pylint: disable=C0115
277
+ name = "YoutubeVideoDescriptor"
278
+ description = (
279
+ "Describe a youtube video based on the video. Use this tool for tasks like video understanding,"
280
+ "not for audio transcription. Example: 'What is in the video?'"
281
+ )
282
+ inputs = {
283
+ "youtube_url": {
284
+ "type": "string",
285
+ "description": "The URL of the YouTube video to get the description from.",
286
+ },
287
+ "task": {
288
+ "type": "string",
289
+ "description": "The task to perform on the video, e.g., 'Describe the video content'.",
290
+ },
291
+ }
292
+ output_type = "string"
293
+
294
+ # pylint: disable=E1101
295
+ def _base64_frames(self, video_buffer: BytesIO, target_fps: int = 10) -> Generator[list[str], None, None]:
296
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as input_temp:
297
+ input_temp.write(video_buffer.getvalue())
298
+ input_temp_path = input_temp.name
299
+
300
+ cap = cv2.VideoCapture(input_temp_path)
301
+ orig_fps = cap.get(cv2.CAP_PROP_FPS)
302
+ frame_interval = int(round(orig_fps / target_fps))
303
+
304
+ frames = []
305
+ i = 0
306
+ while cap.isOpened():
307
+ ret, frame = cap.read()
308
+ if not ret:
309
+ break
310
+
311
+ # Keep every Nth frame to reduce to target_fps
312
+ if i % frame_interval == 0:
313
+ frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # store frame in memory (RGB)
314
+
315
+ i += 1
316
+
317
+ cap.release()
318
+
319
+ base64_frames = []
320
+ for frame in frames:
321
+ _, buffer = cv2.imencode(".jpg", frame)
322
+ encoded_buffer = base64.b64encode(buffer).decode("utf-8")
323
+ base64_frames.append(encoded_buffer)
324
+ if len(base64_frames) == 20: # yield every 20 frames
325
+ yield base64_frames
326
+ base64_frames = []
327
+
328
+ def forward(self, task: str, youtube_url: str) -> str: # pylint: disable=W0221
329
+ file_name = f"{uuid4()}.mp4"
330
+ buffer = BytesIO()
331
+ try:
332
+ youtube_obj = YouTube(youtube_url, on_progress_callback=on_progress)
333
+ youtube_obj.streams.filter(progressive=True).first().stream_to_buffer(buffer)
334
+ except Exception as e:
335
+ return f"Error fetching YouTube video {youtube_url}: {e}"
336
+ try:
337
+ vision_messages = []
338
+ responses = []
339
+ for base64_frame_chunk in self._base64_frames(buffer, target_fps=1):
340
+ vision_messages = [
341
+ {"type": "input_image", "image_url": f"data:image/jpeg;base64,{base64_frame}"}
342
+ for base64_frame in base64_frame_chunk
343
+ ]
344
+ response = invoke_openai_model(
345
+ [{"role": "user", "content": [*vision_messages, {"type": "input_text", "text": task}]}]
346
+ )
347
+ responses.append(response)
348
+ response = "\n".join(responses)
349
+ final_response = invoke_bedrock_model(
350
+ [
351
+ {
352
+ "role": "user",
353
+ "content": [
354
+ {"type": "text", "text": response},
355
+ {"type": "text", "text": "Please summarize the above text shortly."},
356
+ ],
357
+ }
358
+ ]
359
+ )
360
+ return final_response
361
+ except Exception as e:
362
+ return f"Error starting transcription job for {file_name}: {e}"