AgentsCourseFinalAssignment

Sleeping

App Files Files Community

zaldivards committed on May 27, 2025

Commit

741f470

1 Parent(s): 7bfc491

Add youtube transcription tool

Browse files

- Refactor AudioTranscriber for improved structure and error handling

Files changed (3) hide show

requirements.txt +2 -0
tools.py +64 -18
utils.py +1 -6

requirements.txt CHANGED Viewed

@@ -59,6 +59,8 @@ pymupdf==1.26.0
 python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
 python-dotenv==1.1.0
 python-multipart==0.0.20 ; python_version >= '3.8'
 pytz==2025.2
 pyyaml==6.0.2 ; python_version >= '3.8'
 regex==2024.11.6 ; python_version >= '3.8'

 python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
 python-dotenv==1.1.0
 python-multipart==0.0.20 ; python_version >= '3.8'
+pytube==15.0.0
+pytubefix==9.0.1
 pytz==2025.2
 pyyaml==6.0.2 ; python_version >= '3.8'
 regex==2024.11.6 ; python_version >= '3.8'

tools.py CHANGED Viewed

@@ -3,6 +3,7 @@ import ast
 import json
 import os
 import base64
 from time import sleep
 from uuid import uuid4
@@ -12,6 +13,8 @@ import requests
 from bs4 import BeautifulSoup
 from googlesearch import search
 from pandas import read_excel
 from smolagents import tool, Tool
 from requests.exceptions import HTTPError
 from urllib3.exceptions import ReadTimeoutError
@@ -85,23 +88,10 @@ def pdf_reader(task_id: str, file_name: str) -> str:
         return f"Error reading PDF file {file_name}: {e}"
-class AudioTranscriber(Tool):  # pylint: disable=C0115
-    name = "AudioTranscriber"
-    description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
-    inputs = {
-        "task_id": {
-            "type": "string",
-            "description": "The ID of the task associated with the audio file.",
-        },
-        "file_name": {
-            "type": "string",
-            "description": "The name of the audio file to transcribe.",
-        },
-    }
-    output_type = "string"
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
         region = os.getenv("AWS_REGION", "us-east-1")
         self.client = boto3.client("transcribe", region_name=region)
@@ -126,7 +116,7 @@ class AudioTranscriber(Tool):  # pylint: disable=C0115
         try:
             bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
             transcription_data = json.loads(bytes_result.read().decode("utf-8"))
-            return transcription_data["transcripts"][0]["transcript"]
         except json.JSONDecodeError as e:
             print(f"Error decoding transcription JSON: {e}")
             raise
@@ -134,6 +124,26 @@ class AudioTranscriber(Tool):  # pylint: disable=C0115
             print(f"Error downloading or processing transcription file: {e}")
             raise
     def forward(self, task_id: str, file_name: str) -> str:  # pylint: disable=W0221
         try:
             file_content = get_file(task_id)
@@ -213,7 +223,9 @@ def search_engine(search_term: str) -> str:
         try:
             html_content = BeautifulSoup(_get_content(url), "html.parser")
             # Remove headers and footers
-            for tag in html_content.find_all(["header", "footer", "nav", "aside"]):
                 tag.decompose()
         except (ReadTimeoutError, HTTPError) as ex:
             print("Got HTTP error when requesting %s. Error %s", url, ex)
@@ -233,3 +245,37 @@ def search_engine(search_term: str) -> str:
         return html_text.replace("\n", "")
     return "Could not retrieve any content from the search results."

 import json
 import os
 import base64
+from io import BytesIO
 from time import sleep
 from uuid import uuid4
 from bs4 import BeautifulSoup
 from googlesearch import search
 from pandas import read_excel
+from pytubefix import YouTube
+from pytubefix.cli import on_progress
 from smolagents import tool, Tool
 from requests.exceptions import HTTPError
 from urllib3.exceptions import ReadTimeoutError
         return f"Error reading PDF file {file_name}: {e}"
+class AudioTranscriber:
+    """A class to handle audio transcription using AWS Transcribe."""
+    def __init__(self):
         region = os.getenv("AWS_REGION", "us-east-1")
         self.client = boto3.client("transcribe", region_name=region)
         try:
             bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
             transcription_data = json.loads(bytes_result.read().decode("utf-8"))
+            return transcription_data["results"]["transcripts"][0]["transcript"]
         except json.JSONDecodeError as e:
             print(f"Error decoding transcription JSON: {e}")
             raise
             print(f"Error downloading or processing transcription file: {e}")
             raise
+class AudioTranscriberTool(Tool, AudioTranscriber):  # pylint: disable=C0115
+    name = "AudioTranscriber"
+    description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
+    inputs = {
+        "task_id": {
+            "type": "string",
+            "description": "The ID of the task associated with the audio file.",
+        },
+        "file_name": {
+            "type": "string",
+            "description": "The name of the audio file to transcribe.",
+        },
+    }
+    output_type = "string"
+    def __init__(self, *args, **kwargs):
+        Tool.__init__(self, *args, **kwargs)
+        AudioTranscriber.__init__(self, *args, **kwargs)
     def forward(self, task_id: str, file_name: str) -> str:  # pylint: disable=W0221
         try:
             file_content = get_file(task_id)
         try:
             html_content = BeautifulSoup(_get_content(url), "html.parser")
             # Remove headers and footers
+            for tag in html_content.find_all(
+                ["header", "footer", "nav", "aside", "script", "style", "noscript", "form"]
+            ):
                 tag.decompose()
         except (ReadTimeoutError, HTTPError) as ex:
             print("Got HTTP error when requesting %s. Error %s", url, ex)
         return html_text.replace("\n", "")
     return "Could not retrieve any content from the search results."
+class YoutubeTranscriberTool(Tool, AudioTranscriber):  # pylint: disable=C0115
+    """Tool that returns a text transcription of a YouTube video.
+
+    The video is downloaded into an in-memory buffer with pytubefix, uploaded
+    to the S3 bucket named by the ``SOURCE_BUCKET`` environment variable, and
+    transcribed via ``self._transcribe_audio`` / ``self._get_transcription``
+    (presumably inherited from ``AudioTranscriber`` -- not visible here).
+    """
+    name = "YoutubeTranscriber"
+    description = "Extract text from YouTube videos, do not work for video understanding."
+    inputs = {
+        "youtube_url": {
+            "type": "string",
+            "description": "The URL of the YouTube video to transcribe.",
+        },
+    }
+    output_type = "string"
+    def __init__(self, *args, **kwargs):
+        """Initialize both bases explicitly.
+
+        ``Tool`` receives the caller's arguments. ``AudioTranscriber.__init__``
+        is declared without parameters, so it is called with none; forwarding
+        ``*args``/``**kwargs`` to it would raise ``TypeError`` whenever the
+        tool is constructed with any argument.
+        """
+        Tool.__init__(self, *args, **kwargs)
+        AudioTranscriber.__init__(self)
+    def forward(self, youtube_url: str) -> str:  # pylint: disable=W0221
+        """Download the video at ``youtube_url`` and return its transcript.
+
+        Args:
+            youtube_url: URL of the YouTube video to transcribe.
+
+        Returns:
+            The transcription text on success, otherwise a human-readable
+            error message. Failures are reported as strings, not raised.
+        """
+        # A random object key is used for the uploaded media file.
+        file_name = f"{uuid4()}.mp4"
+        buffer = BytesIO()
+        try:
+            youtube_obj = YouTube(youtube_url, on_progress_callback=on_progress)
+            stream = youtube_obj.streams.filter(progressive=True).first()
+            # ``first()`` yields None when no stream matches the filter; fail
+            # with a clear message instead of an AttributeError on None.
+            if stream is None:
+                raise ValueError("no progressive stream available")
+            stream.stream_to_buffer(buffer)
+        except Exception as e:
+            return f"Error fetching YouTube video {youtube_url}: {e}"
+        try:
+            s3_upload_file(buffer, os.getenv("SOURCE_BUCKET"), file_name)
+            media_uri = f"s3://{os.getenv('SOURCE_BUCKET')}/{file_name}"
+            # Job name: a fresh UUID plus the file name without its extension.
+            job_name = f"{uuid4()}-{file_name.split('.', maxsplit=1)[0]}"
+            self._transcribe_audio(job_name, media_uri)
+            transcription = self._get_transcription(job_name)
+            return transcription
+        except Exception as e:
+            return f"Error starting transcription job for {file_name}: {e}"

utils.py CHANGED Viewed

@@ -62,12 +62,7 @@ def s3_upload_file(file_content: BytesIO, bucket_name: str, object_name: str) ->
     """
     try:
         s3_client = boto3.client("s3")
-        s3_client.put_object(
-            Bucket=bucket_name,
-            Key=object_name,
-            Body=file_content.getvalue(),
-            ContentType="application/octet-stream",
-        )
     except Exception as e:
         print(f"Error uploading file to S3: {e}")
         raise

     """
     try:
         s3_client = boto3.client("s3")
+        s3_client.put_object(Bucket=bucket_name, Key=object_name, Body=file_content.getvalue())
     except Exception as e:
         print(f"Error uploading file to S3: {e}")
         raise