zaldivards committed on
Commit
aa94df3
·
1 Parent(s): 694a9b2

Refactor tools

Browse files

- Update image transcriber
- Add YouTube video descriptor tool

Files changed (2) hide show
  1. requirements.txt +7 -6
  2. tools.py +107 -26
requirements.txt CHANGED
@@ -3,8 +3,8 @@ aiofiles==24.1.0 ; python_version >= '3.8'
3
  annotated-types==0.7.0 ; python_version >= '3.8'
4
  anyio==4.9.0 ; python_version >= '3.9'
5
  beautifulsoup4==4.13.4 ; python_full_version >= '3.7.0'
6
- boto3==1.38.23
7
- botocore==1.38.23 ; python_version >= '3.9'
8
  certifi==2025.4.26 ; python_version >= '3.6'
9
  charset-normalizer==3.4.2 ; python_version >= '3.7'
10
  click==8.2.1 ; python_version >= '3.10'
@@ -23,14 +23,14 @@ h11==0.16.0 ; python_version >= '3.8'
23
  hf-xet==1.1.2 ; platform_machine == 'x86_64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'aarch64'
24
  httpcore==1.0.9 ; python_version >= '3.8'
25
  httpx==0.28.1 ; python_version >= '3.8'
26
- huggingface-hub==0.32.1 ; python_full_version >= '3.8.0'
27
  idna==3.10 ; python_version >= '3.6'
28
  jinja2==3.1.6 ; python_version >= '3.7'
29
  jiter==0.10.0 ; python_version >= '3.9'
30
  jmespath==1.0.1 ; python_version >= '3.7'
31
  jsonpatch==1.33 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
32
  jsonpointer==3.0.0 ; python_version >= '3.7'
33
- langchain-core==0.3.61 ; python_version >= '3.9'
34
  langchain-openai==0.3.18
35
  langgraph==0.4.7
36
  langgraph-checkpoint==2.0.26 ; python_version >= '3.9'
@@ -44,6 +44,7 @@ markupsafe==3.0.2 ; python_version >= '3.9'
44
  mdurl==0.1.2 ; python_version >= '3.7'
45
  numpy==2.2.6 ; python_version >= '3.10'
46
  openai==1.82.0
 
47
  openpyxl==3.1.5
48
  orjson==3.10.18 ; python_version >= '3.9'
49
  ormsgpack==1.10.0 ; python_version >= '3.9'
@@ -60,7 +61,7 @@ python-dateutil==2.9.0.post0 ; python_version >= '2.7' and python_version not in
60
  python-dotenv==1.1.0
61
  python-multipart==0.0.20 ; python_version >= '3.8'
62
  pytube==15.0.0
63
- pytubefix==9.0.1
64
  pytz==2025.2
65
  pyyaml==6.0.2 ; python_version >= '3.8'
66
  regex==2024.11.6 ; python_version >= '3.8'
@@ -73,7 +74,7 @@ safehttpx==0.1.6 ; python_version >= '3.10'
73
  semantic-version==2.10.0 ; python_version >= '2.7'
74
  shellingham==1.5.4 ; python_version >= '3.7'
75
  six==1.17.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
76
- smolagents==1.16.1
77
  sniffio==1.3.1 ; python_version >= '3.7'
78
  soupsieve==2.7 ; python_version >= '3.8'
79
  starlette==0.46.2 ; sys_platform != 'emscripten'
 
3
  annotated-types==0.7.0 ; python_version >= '3.8'
4
  anyio==4.9.0 ; python_version >= '3.9'
5
  beautifulsoup4==4.13.4 ; python_full_version >= '3.7.0'
6
+ boto3==1.38.24
7
+ botocore==1.38.24 ; python_version >= '3.9'
8
  certifi==2025.4.26 ; python_version >= '3.6'
9
  charset-normalizer==3.4.2 ; python_version >= '3.7'
10
  click==8.2.1 ; python_version >= '3.10'
 
23
  hf-xet==1.1.2 ; platform_machine == 'x86_64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'aarch64'
24
  httpcore==1.0.9 ; python_version >= '3.8'
25
  httpx==0.28.1 ; python_version >= '3.8'
26
+ huggingface-hub==0.32.2 ; python_full_version >= '3.8.0'
27
  idna==3.10 ; python_version >= '3.6'
28
  jinja2==3.1.6 ; python_version >= '3.7'
29
  jiter==0.10.0 ; python_version >= '3.9'
30
  jmespath==1.0.1 ; python_version >= '3.7'
31
  jsonpatch==1.33 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
32
  jsonpointer==3.0.0 ; python_version >= '3.7'
33
+ langchain-core==0.3.62 ; python_version >= '3.9'
34
  langchain-openai==0.3.18
35
  langgraph==0.4.7
36
  langgraph-checkpoint==2.0.26 ; python_version >= '3.9'
 
44
  mdurl==0.1.2 ; python_version >= '3.7'
45
  numpy==2.2.6 ; python_version >= '3.10'
46
  openai==1.82.0
47
+ opencv-python==4.11.0.86
48
  openpyxl==3.1.5
49
  orjson==3.10.18 ; python_version >= '3.9'
50
  ormsgpack==1.10.0 ; python_version >= '3.9'
 
61
  python-dotenv==1.1.0
62
  python-multipart==0.0.20 ; python_version >= '3.8'
63
  pytube==15.0.0
64
+ pytubefix==9.1.1
65
  pytz==2025.2
66
  pyyaml==6.0.2 ; python_version >= '3.8'
67
  regex==2024.11.6 ; python_version >= '3.8'
 
74
  semantic-version==2.10.0 ; python_version >= '2.7'
75
  shellingham==1.5.4 ; python_version >= '3.7'
76
  six==1.17.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
77
+ smolagents==1.17.0
78
  sniffio==1.3.1 ; python_version >= '3.7'
79
  soupsieve==2.7 ; python_version >= '3.8'
80
  starlette==0.46.2 ; sys_platform != 'emscripten'
tools.py CHANGED
@@ -3,11 +3,14 @@ import ast
3
  import json
4
  import os
5
  import base64
 
6
  from io import BytesIO
7
  from time import sleep
 
8
  from uuid import uuid4
9
 
10
  import boto3
 
11
  import fitz
12
  import requests
13
  from bs4 import BeautifulSoup
@@ -20,7 +23,7 @@ from requests.exceptions import HTTPError
20
  from urllib3.exceptions import ReadTimeoutError
21
 
22
  from definitions import TranscriptionJob
23
- from utils import get_file, s3_upload_file, s3_download_file, bedrock_runtime, BEDROCK_MODEL_ID
24
 
25
 
26
  @tool
@@ -159,43 +162,32 @@ class AudioTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
159
 
160
 
161
  @tool
162
- def image_transcriber(text_prompt: str, task_id: str, file_name: str) -> str:
163
- """Transcribes text from an image file
164
 
165
  Args:
166
- text_prompt (str): The text prompt to guide the transcription.
167
  task_id (str): The ID of the task associated with the image file.
168
  file_name (str): The name of the image file to transcribe.
169
  """
170
  try:
171
  file_content = get_file(task_id)
172
  base64_image = base64.b64encode(file_content.getvalue()).decode("utf-8")
173
- response = bedrock_runtime.invoke_model(
174
- modelId=BEDROCK_MODEL_ID,
175
- body=json.dumps(
176
  {
177
- "anthropic_version": "bedrock-2023-05-31",
178
- "max_tokens": 4096,
179
- "messages": [
180
  {
181
- "role": "user",
182
- "content": [
183
- {
184
- "type": "image",
185
- "source": {
186
- "type": "base64",
187
- "media_type": f"image/{file_name.split('.')[-1]}",
188
- "data": base64_image,
189
- },
190
- },
191
- {"type": "text", "text": text_prompt},
192
- ],
193
- }
194
  ],
195
  }
196
- ),
197
- )["body"].read()
198
- return json.loads(response)["content"][0]["text"]
199
  except Exception as e:
200
  return f"Error processing image file {file_name}: {e}"
201
 
@@ -279,3 +271,92 @@ class YoutubeTranscriberTool(Tool, AudioTranscriber): # pylint: disable=C0115
279
  return transcription
280
  except Exception as e:
281
  return f"Error starting transcription job for {file_name}: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import json
4
  import os
5
  import base64
6
+ import tempfile
7
  from io import BytesIO
8
  from time import sleep
9
+ from typing import Generator
10
  from uuid import uuid4
11
 
12
  import boto3
13
+ import cv2 # type: ignore
14
  import fitz
15
  import requests
16
  from bs4 import BeautifulSoup
 
23
  from urllib3.exceptions import ReadTimeoutError
24
 
25
  from definitions import TranscriptionJob
26
+ from utils import get_file, s3_upload_file, s3_download_file, invoke_bedrock_model, invoke_openai_model
27
 
28
 
29
  @tool
 
162
 
163
 
164
  @tool
165
+ def image_analyzer(task: str, task_id: str, file_name: str) -> str:
166
+ """Analyzes an image file and returns a response based on the task provided.
167
 
168
  Args:
169
+ task (str): The description of the information to extract from the image.
170
  task_id (str): The ID of the task associated with the image file.
171
  file_name (str): The name of the image file to transcribe.
172
  """
173
  try:
174
  file_content = get_file(task_id)
175
  base64_image = base64.b64encode(file_content.getvalue()).decode("utf-8")
176
+ response = invoke_openai_model(
177
+ [
 
178
  {
179
+ "role": "user",
180
+ "content": [
 
181
  {
182
+ "type": "input_image",
183
+ "image_url": f"data:image/{file_name.split('.')[-1]};base64,{base64_image}",
184
+ },
185
+ {"type": "input_text", "text": task},
 
 
 
 
 
 
 
 
 
186
  ],
187
  }
188
+ ]
189
+ )
190
+ return response
191
  except Exception as e:
192
  return f"Error processing image file {file_name}: {e}"
193
 
 
271
  return transcription
272
  except Exception as e:
273
  return f"Error starting transcription job for {file_name}: {e}"
274
+
275
+
276
+ class YoutubeVideoDescriptorTool(Tool): # pylint: disable=C0115
277
+ name = "YoutubeVideoDescriptor"
278
+ description = (
279
+ "Describe a youtube video based on the video. Use this tool for tasks like video understanding,"
280
+ "not for audio transcription. Example: 'What is in the video?'"
281
+ )
282
+ inputs = {
283
+ "youtube_url": {
284
+ "type": "string",
285
+ "description": "The URL of the YouTube video to get the description from.",
286
+ },
287
+ "task": {
288
+ "type": "string",
289
+ "description": "The task to perform on the video, e.g., 'Describe the video content'.",
290
+ },
291
+ }
292
+ output_type = "string"
293
+
294
+ # pylint: disable=E1101
295
+ def _base64_frames(self, video_buffer: BytesIO, target_fps: int = 10) -> Generator[list[str], None, None]:
296
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as input_temp:
297
+ input_temp.write(video_buffer.getvalue())
298
+ input_temp_path = input_temp.name
299
+
300
+ cap = cv2.VideoCapture(input_temp_path)
301
+ orig_fps = cap.get(cv2.CAP_PROP_FPS)
302
+ frame_interval = int(round(orig_fps / target_fps))
303
+
304
+ frames = []
305
+ i = 0
306
+ while cap.isOpened():
307
+ ret, frame = cap.read()
308
+ if not ret:
309
+ break
310
+
311
+ # Keep every Nth frame to reduce to target_fps
312
+ if i % frame_interval == 0:
313
+ frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # store frame in memory (RGB)
314
+
315
+ i += 1
316
+
317
+ cap.release()
318
+
319
+ base64_frames = []
320
+ for frame in frames:
321
+ _, buffer = cv2.imencode(".jpg", frame)
322
+ encoded_buffer = base64.b64encode(buffer).decode("utf-8")
323
+ base64_frames.append(encoded_buffer)
324
+ if len(base64_frames) == 20: # yield every 20 frames
325
+ yield base64_frames
326
+ base64_frames = []
327
+
328
+ def forward(self, task: str, youtube_url: str) -> str: # pylint: disable=W0221
329
+ file_name = f"{uuid4()}.mp4"
330
+ buffer = BytesIO()
331
+ try:
332
+ youtube_obj = YouTube(youtube_url, on_progress_callback=on_progress)
333
+ youtube_obj.streams.filter(progressive=True).first().stream_to_buffer(buffer)
334
+ except Exception as e:
335
+ return f"Error fetching YouTube video {youtube_url}: {e}"
336
+ try:
337
+ vision_messages = []
338
+ responses = []
339
+ for base64_frame_chunk in self._base64_frames(buffer, target_fps=1):
340
+ vision_messages = [
341
+ {"type": "input_image", "image_url": f"data:image/jpeg;base64,{base64_frame}"}
342
+ for base64_frame in base64_frame_chunk
343
+ ]
344
+ response = invoke_openai_model(
345
+ [{"role": "user", "content": [*vision_messages, {"type": "input_text", "text": task}]}]
346
+ )
347
+ responses.append(response)
348
+ response = "\n".join(responses)
349
+ final_response = invoke_bedrock_model(
350
+ [
351
+ {
352
+ "role": "user",
353
+ "content": [
354
+ {"type": "text", "text": response},
355
+ {"type": "text", "text": "Please summarize the above text shortly."},
356
+ ],
357
+ }
358
+ ]
359
+ )
360
+ return final_response
361
+ except Exception as e:
362
+ return f"Error starting transcription job for {file_name}: {e}"