zaldivards committed on
Commit
741f470
·
1 Parent(s): 7bfc491

Add youtube transcription tool

Browse files

- Refactor AudioTranscriber for improved structure and error handling

Files changed (3) hide show
  1. requirements.txt +2 -0
  2. tools.py +64 -18
  3. utils.py +1 -6
requirements.txt CHANGED
@@ -59,6 +59,8 @@ pymupdf==1.26.0
59
  python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
60
  python-dotenv==1.1.0
61
  python-multipart==0.0.20 ; python_version >= '3.8'
 
 
62
  pytz==2025.2
63
  pyyaml==6.0.2 ; python_version >= '3.8'
64
  regex==2024.11.6 ; python_version >= '3.8'
 
59
  python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
60
  python-dotenv==1.1.0
61
  python-multipart==0.0.20 ; python_version >= '3.8'
62
+ pytube==15.0.0
63
+ pytubefix==9.0.1
64
  pytz==2025.2
65
  pyyaml==6.0.2 ; python_version >= '3.8'
66
  regex==2024.11.6 ; python_version >= '3.8'
tools.py CHANGED
@@ -3,6 +3,7 @@ import ast
3
  import json
4
  import os
5
  import base64
 
6
  from time import sleep
7
  from uuid import uuid4
8
 
@@ -12,6 +13,8 @@ import requests
12
  from bs4 import BeautifulSoup
13
  from googlesearch import search
14
  from pandas import read_excel
 
 
15
  from smolagents import tool, Tool
16
  from requests.exceptions import HTTPError
17
  from urllib3.exceptions import ReadTimeoutError
@@ -85,23 +88,10 @@ def pdf_reader(task_id: str, file_name: str) -> str:
85
  return f"Error reading PDF file {file_name}: {e}"
86
 
87
 
88
- class AudioTranscriber(Tool): # pylint: disable=C0115
89
- name = "AudioTranscriber"
90
- description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
91
- inputs = {
92
- "task_id": {
93
- "type": "string",
94
- "description": "The ID of the task associated with the audio file.",
95
- },
96
- "file_name": {
97
- "type": "string",
98
- "description": "The name of the audio file to transcribe.",
99
- },
100
- }
101
- output_type = "string"
102
 
103
- def __init__(self, *args, **kwargs):
104
- super().__init__(*args, **kwargs)
105
  region = os.getenv("AWS_REGION", "us-east-1")
106
  self.client = boto3.client("transcribe", region_name=region)
107
 
@@ -126,7 +116,7 @@ class AudioTranscriber(Tool): # pylint: disable=C0115
126
  try:
127
  bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
128
  transcription_data = json.loads(bytes_result.read().decode("utf-8"))
129
- return transcription_data["transcripts"][0]["transcript"]
130
  except json.JSONDecodeError as e:
131
  print(f"Error decoding transcription JSON: {e}")
132
  raise
@@ -134,6 +124,26 @@ class AudioTranscriber(Tool): # pylint: disable=C0115
134
  print(f"Error downloading or processing transcription file: {e}")
135
  raise
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  def forward(self, task_id: str, file_name: str) -> str: # pylint: disable=W0221
138
  try:
139
  file_content = get_file(task_id)
@@ -213,7 +223,9 @@ def search_engine(search_term: str) -> str:
213
  try:
214
  html_content = BeautifulSoup(_get_content(url), "html.parser")
215
  # Remove headers and footers
216
- for tag in html_content.find_all(["header", "footer", "nav", "aside"]):
 
 
217
  tag.decompose()
218
  except (ReadTimeoutError, HTTPError) as ex:
219
  print("Got HTTP error when requesting %s. Error %s", url, ex)
@@ -233,3 +245,37 @@ def search_engine(search_term: str) -> str:
233
  return html_text.replace("\n", "")
234
 
235
  return "Could not retrieve any content from the search results."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import os
5
  import base64
6
+ from io import BytesIO
7
  from time import sleep
8
  from uuid import uuid4
9
 
 
13
  from bs4 import BeautifulSoup
14
  from googlesearch import search
15
  from pandas import read_excel
16
+ from pytubefix import YouTube
17
+ from pytubefix.cli import on_progress
18
  from smolagents import tool, Tool
19
  from requests.exceptions import HTTPError
20
  from urllib3.exceptions import ReadTimeoutError
 
88
  return f"Error reading PDF file {file_name}: {e}"
89
 
90
 
91
+ class AudioTranscriber:
92
+ """A class to handle audio transcription using AWS Transcribe."""
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ def __init__(self):
 
95
  region = os.getenv("AWS_REGION", "us-east-1")
96
  self.client = boto3.client("transcribe", region_name=region)
97
 
 
116
  try:
117
  bytes_result = s3_download_file(os.getenv("TARGET_BUCKET"), transcript_url.split("/")[-1])
118
  transcription_data = json.loads(bytes_result.read().decode("utf-8"))
119
+ return transcription_data["results"]["transcripts"][0]["transcript"]
120
  except json.JSONDecodeError as e:
121
  print(f"Error decoding transcription JSON: {e}")
122
  raise
 
124
  print(f"Error downloading or processing transcription file: {e}")
125
  raise
126
 
127
+
128
+ class AudioTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
129
+ name = "AudioTranscriber"
130
+ description = "Extract text from audio files, such as MP3, MP4, WAV, etc."
131
+ inputs = {
132
+ "task_id": {
133
+ "type": "string",
134
+ "description": "The ID of the task associated with the audio file.",
135
+ },
136
+ "file_name": {
137
+ "type": "string",
138
+ "description": "The name of the audio file to transcribe.",
139
+ },
140
+ }
141
+ output_type = "string"
142
+
143
+ def __init__(self, *args, **kwargs):
144
+ Tool.__init__(self, *args, **kwargs)
145
+ AudioTranscriber.__init__(self, *args, **kwargs)
146
+
147
  def forward(self, task_id: str, file_name: str) -> str: # pylint: disable=W0221
148
  try:
149
  file_content = get_file(task_id)
 
223
  try:
224
  html_content = BeautifulSoup(_get_content(url), "html.parser")
225
  # Remove headers and footers
226
+ for tag in html_content.find_all(
227
+ ["header", "footer", "nav", "aside", "script", "style", "noscript", "form"]
228
+ ):
229
  tag.decompose()
230
  except (ReadTimeoutError, HTTPError) as ex:
231
  print("Got HTTP error when requesting %s. Error %s", url, ex)
 
245
  return html_text.replace("\n", "")
246
 
247
  return "Could not retrieve any content from the search results."
248
+
249
+
250
class YoutubeTranscriberTool(Tool, AudioTranscriber):  # pylint: disable=C0115
    """Tool that fetches a YouTube video and returns its transcription.

    The first progressive stream of the video is written into an in-memory
    buffer, uploaded to the bucket named by the ``SOURCE_BUCKET`` environment
    variable, and then handed to ``_transcribe_audio`` / ``_get_transcription``
    (presumably inherited from ``AudioTranscriber`` -- confirm against the
    full class definition).
    """

    name = "YoutubeTranscriber"
    description = "Extract text from YouTube videos, do not work for video understanding."
    inputs = {
        "youtube_url": {
            "type": "string",
            "description": "The URL of the YouTube video to transcribe.",
        },
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Initialise both bases; extra arguments are forwarded to ``Tool`` only."""
        Tool.__init__(self, *args, **kwargs)
        # AudioTranscriber.__init__ accepts no arguments besides ``self``, so
        # forwarding *args/**kwargs to it would raise TypeError as soon as the
        # tool is constructed with any argument.
        AudioTranscriber.__init__(self)

    def forward(self, youtube_url: str) -> str:  # pylint: disable=W0221
        """Download the video at ``youtube_url`` and return its transcript.

        Args:
            youtube_url: URL of the YouTube video to transcribe.

        Returns:
            The transcription text on success, otherwise a human-readable
            error message; failures are reported as strings, not raised.
        """
        file_name = f"{uuid4()}.mp4"
        buffer = BytesIO()
        try:
            youtube_obj = YouTube(youtube_url, on_progress_callback=on_progress)
            stream = youtube_obj.streams.filter(progressive=True).first()
            if stream is None:
                # Without this guard the failure shows up as an opaque
                # "'NoneType' object has no attribute ..." message.
                return f"Error fetching YouTube video {youtube_url}: no progressive stream available"
            stream.stream_to_buffer(buffer)
        except Exception as e:  # pylint: disable=W0718
            return f"Error fetching YouTube video {youtube_url}: {e}"
        try:
            # Read the bucket once so the upload target and the media URI
            # cannot disagree.
            source_bucket = os.getenv("SOURCE_BUCKET")
            s3_upload_file(buffer, source_bucket, file_name)
            media_uri = f"s3://{source_bucket}/{file_name}"
            job_name = f"{uuid4()}-{file_name.split('.', maxsplit=1)[0]}"
            self._transcribe_audio(job_name, media_uri)
            return self._get_transcription(job_name)
        except Exception as e:  # pylint: disable=W0718
            return f"Error starting transcription job for {file_name}: {e}"
utils.py CHANGED
@@ -62,12 +62,7 @@ def s3_upload_file(file_content: BytesIO, bucket_name: str, object_name: str) ->
62
  """
63
  try:
64
  s3_client = boto3.client("s3")
65
- s3_client.put_object(
66
- Bucket=bucket_name,
67
- Key=object_name,
68
- Body=file_content.getvalue(),
69
- ContentType="application/octet-stream",
70
- )
71
  except Exception as e:
72
  print(f"Error uploading file to S3: {e}")
73
  raise
 
62
  """
63
  try:
64
  s3_client = boto3.client("s3")
65
+ s3_client.put_object(Bucket=bucket_name, Key=object_name, Body=file_content.getvalue())
 
 
 
 
 
66
  except Exception as e:
67
  print(f"Error uploading file to S3: {e}")
68
  raise