Commit
·
741f470
1
Parent(s):
7bfc491
Add YouTube transcription tool
Browse files
- Refactor AudioTranscriber for improved structure and error handling
- requirements.txt +2 -0
- tools.py +64 -18
- utils.py +1 -6
requirements.txt
CHANGED
|
@@ -59,6 +59,8 @@ pymupdf==1.26.0
|
|
| 59 |
python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
| 60 |
python-dotenv==1.1.0
|
| 61 |
python-multipart==0.0.20 ; python_version >= '3.8'
|
|
|
|
|
|
|
| 62 |
pytz==2025.2
|
| 63 |
pyyaml==6.0.2 ; python_version >= '3.8'
|
| 64 |
regex==2024.11.6 ; python_version >= '3.8'
|
|
|
|
| 59 |
python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
| 60 |
python-dotenv==1.1.0
|
| 61 |
python-multipart==0.0.20 ; python_version >= '3.8'
|
| 62 |
+
pytube==15.0.0
|
| 63 |
+
pytubefix==9.0.1
|
| 64 |
pytz==2025.2
|
| 65 |
pyyaml==6.0.2 ; python_version >= '3.8'
|
| 66 |
regex==2024.11.6 ; python_version >= '3.8'
|
tools.py
CHANGED
|
@@ -3,6 +3,7 @@ import ast
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import base64
|
|
|
|
| 6 |
from time import sleep
|
| 7 |
from uuid import uuid4
|
| 8 |
|
|
@@ -12,6 +13,8 @@ import requests
|
|
| 12 |
from bs4 import BeautifulSoup
|
| 13 |
from googlesearch import search
|
| 14 |
from pandas import read_excel
|
|
|
|
|
|
|
| 15 |
from smolagents import tool, Tool
|
| 16 |
from requests.exceptions import HTTPError
|
| 17 |
from urllib3.exceptions import ReadTimeoutError
|
|
@@ -85,23 +88,10 @@ def pdf_reader(task_id: str, file_name: str) -> str:
|
|
| 85 |
return f"Error reading PDF file {file_name}: {e}"
|
| 86 |
|
| 87 |
|
| 88 |
-
class AudioTranscriber
|
| 89 |
-
|
| 90 |
-
description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
|
| 91 |
-
inputs = {
|
| 92 |
-
"task_id": {
|
| 93 |
-
"type": "string",
|
| 94 |
-
"description": "The ID of the task associated with the audio file.",
|
| 95 |
-
},
|
| 96 |
-
"file_name": {
|
| 97 |
-
"type": "string",
|
| 98 |
-
"description": "The name of the audio file to transcribe.",
|
| 99 |
-
},
|
| 100 |
-
}
|
| 101 |
-
output_type = "string"
|
| 102 |
|
| 103 |
-
def __init__(self
|
| 104 |
-
super().__init__(*args, **kwargs)
|
| 105 |
region = os.getenv("AWS_REGION", "us-east-1")
|
| 106 |
self.client = boto3.client("transcribe", region_name=region)
|
| 107 |
|
|
@@ -126,7 +116,7 @@ class AudioTranscriber(Tool): # pylint: disable=C0115
|
|
| 126 |
try:
|
| 127 |
bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
|
| 128 |
transcription_data = json.loads(bytes_result.read().decode("utf-8"))
|
| 129 |
-
return transcription_data["transcripts"][0]["transcript"]
|
| 130 |
except json.JSONDecodeError as e:
|
| 131 |
print(f"Error decoding transcription JSON: {e}")
|
| 132 |
raise
|
|
@@ -134,6 +124,26 @@ class AudioTranscriber(Tool): # pylint: disable=C0115
|
|
| 134 |
print(f"Error downloading or processing transcription file: {e}")
|
| 135 |
raise
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
def forward(self, task_id: str, file_name: str) -> str: # pylint: disable=W0221
|
| 138 |
try:
|
| 139 |
file_content = get_file(task_id)
|
|
@@ -213,7 +223,9 @@ def search_engine(search_term: str) -> str:
|
|
| 213 |
try:
|
| 214 |
html_content = BeautifulSoup(_get_content(url), "html.parser")
|
| 215 |
# Remove headers and footers
|
| 216 |
-
for tag in html_content.find_all(
|
|
|
|
|
|
|
| 217 |
tag.decompose()
|
| 218 |
except (ReadTimeoutError, HTTPError) as ex:
|
| 219 |
print("Got HTTP error when requesting %s. Error %s", url, ex)
|
|
@@ -233,3 +245,37 @@ def search_engine(search_term: str) -> str:
|
|
| 233 |
return html_text.replace("\n", "")
|
| 234 |
|
| 235 |
return "Could not retrieve any content from the search results."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import base64
|
| 6 |
+
from io import BytesIO
|
| 7 |
from time import sleep
|
| 8 |
from uuid import uuid4
|
| 9 |
|
|
|
|
| 13 |
from bs4 import BeautifulSoup
|
| 14 |
from googlesearch import search
|
| 15 |
from pandas import read_excel
|
| 16 |
+
from pytubefix import YouTube
|
| 17 |
+
from pytubefix.cli import on_progress
|
| 18 |
from smolagents import tool, Tool
|
| 19 |
from requests.exceptions import HTTPError
|
| 20 |
from urllib3.exceptions import ReadTimeoutError
|
|
|
|
| 88 |
return f"Error reading PDF file {file_name}: {e}"
|
| 89 |
|
| 90 |
|
| 91 |
+
class AudioTranscriber:
|
| 92 |
+
"""A class to handle audio transcription using AWS Transcribe."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
def __init__(self):
|
|
|
|
| 95 |
region = os.getenv("AWS_REGION", "us-east-1")
|
| 96 |
self.client = boto3.client("transcribe", region_name=region)
|
| 97 |
|
|
|
|
| 116 |
try:
|
| 117 |
bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
|
| 118 |
transcription_data = json.loads(bytes_result.read().decode("utf-8"))
|
| 119 |
+
return transcription_data["results"]["transcripts"][0]["transcript"]
|
| 120 |
except json.JSONDecodeError as e:
|
| 121 |
print(f"Error decoding transcription JSON: {e}")
|
| 122 |
raise
|
|
|
|
| 124 |
print(f"Error downloading or processing transcription file: {e}")
|
| 125 |
raise
|
| 126 |
|
| 127 |
+
|
| 128 |
+
class AudioTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
|
| 129 |
+
name = "AudioTranscriber"
|
| 130 |
+
description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
|
| 131 |
+
inputs = {
|
| 132 |
+
"task_id": {
|
| 133 |
+
"type": "string",
|
| 134 |
+
"description": "The ID of the task associated with the audio file.",
|
| 135 |
+
},
|
| 136 |
+
"file_name": {
|
| 137 |
+
"type": "string",
|
| 138 |
+
"description": "The name of the audio file to transcribe.",
|
| 139 |
+
},
|
| 140 |
+
}
|
| 141 |
+
output_type = "string"
|
| 142 |
+
|
| 143 |
+
def __init__(self, *args, **kwargs):
|
| 144 |
+
Tool.__init__(self, *args, **kwargs)
|
| 145 |
+
AudioTranscriber.__init__(self, *args, **kwargs)
|
| 146 |
+
|
| 147 |
def forward(self, task_id: str, file_name: str) -> str: # pylint: disable=W0221
|
| 148 |
try:
|
| 149 |
file_content = get_file(task_id)
|
|
|
|
| 223 |
try:
|
| 224 |
html_content = BeautifulSoup(_get_content(url), "html.parser")
|
| 225 |
# Remove headers and footers
|
| 226 |
+
for tag in html_content.find_all(
|
| 227 |
+
["header", "footer", "nav", "aside", "script", "style", "noscript", "form"]
|
| 228 |
+
):
|
| 229 |
tag.decompose()
|
| 230 |
except (ReadTimeoutError, HTTPError) as ex:
|
| 231 |
print("Got HTTP error when requesting %s. Error %s", url, ex)
|
|
|
|
| 245 |
return html_text.replace("\n", "")
|
| 246 |
|
| 247 |
return "Could not retrieve any content from the search results."
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
class YoutubeTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
|
| 251 |
+
name = "YoutubeTranscriber"
|
| 252 |
+
description = "Extract text from YouTube videos, do not work for video understanding."
|
| 253 |
+
inputs = {
|
| 254 |
+
"youtube_url": {
|
| 255 |
+
"type": "string",
|
| 256 |
+
"description": "The URL of the YouTube video to transcribe.",
|
| 257 |
+
},
|
| 258 |
+
}
|
| 259 |
+
output_type = "string"
|
| 260 |
+
|
| 261 |
+
def __init__(self, *args, **kwargs):
|
| 262 |
+
Tool.__init__(self, *args, **kwargs)
|
| 263 |
+
AudioTranscriber.__init__(self, *args, **kwargs)
|
| 264 |
+
|
| 265 |
+
def forward(self, youtube_url: str) -> str: # pylint: disable=W0221
|
| 266 |
+
file_name = f"{uuid4()}.mp4"
|
| 267 |
+
buffer = BytesIO()
|
| 268 |
+
try:
|
| 269 |
+
youtube_obj = YouTube(youtube_url, on_progress_callback=on_progress)
|
| 270 |
+
youtube_obj.streams.filter(progressive=True).first().stream_to_buffer(buffer)
|
| 271 |
+
except Exception as e:
|
| 272 |
+
return f"Error fetching YouTube video {youtube_url}: {e}"
|
| 273 |
+
try:
|
| 274 |
+
s3_upload_file(buffer, os.getenv("SOURCE_BUCKET"), file_name)
|
| 275 |
+
media_uri = f"s3://{os.getenv('SOURCE_BUCKET')}/{file_name}"
|
| 276 |
+
job_name = f"{uuid4()}-{file_name.split('.', maxsplit=1)[0]}"
|
| 277 |
+
self._transcribe_audio(job_name, media_uri)
|
| 278 |
+
transcription = self._get_transcription(job_name)
|
| 279 |
+
return transcription
|
| 280 |
+
except Exception as e:
|
| 281 |
+
return f"Error starting transcription job for {file_name}: {e}"
|
utils.py
CHANGED
|
@@ -62,12 +62,7 @@ def s3_upload_file(file_content: BytesIO, bucket_name: str, object_name: str) ->
|
|
| 62 |
"""
|
| 63 |
try:
|
| 64 |
s3_client = boto3.client("s3")
|
| 65 |
-
s3_client.put_object(
|
| 66 |
-
Bucket=bucket_name,
|
| 67 |
-
Key=object_name,
|
| 68 |
-
Body=file_content.getvalue(),
|
| 69 |
-
ContentType="application/octet-stream",
|
| 70 |
-
)
|
| 71 |
except Exception as e:
|
| 72 |
print(f"Error uploading file to S3: {e}")
|
| 73 |
raise
|
|
|
|
| 62 |
"""
|
| 63 |
try:
|
| 64 |
s3_client = boto3.client("s3")
|
| 65 |
+
s3_client.put_object(Bucket=bucket_name, Key=object_name, Body=file_content.getvalue())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
except Exception as e:
|
| 67 |
print(f"Error uploading file to S3: {e}")
|
| 68 |
raise
|