giulia-fontanella committed on
Commit
8234b7a
·
verified ·
1 Parent(s): 27a61fa

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +75 -18
tools.py CHANGED
@@ -3,8 +3,9 @@ import pandas as pd
3
  from langchain_core.messages import HumanMessage
4
  from langchain.tools import tool
5
  from langchain_community.tools.tavily_search import TavilySearchResults
6
- from langchain_community.document_loaders import WikipediaLoader
7
- from langchain_community.document_loaders import ArxivLoader
 
8
 
9
 
10
  @tool
@@ -49,8 +50,8 @@ def read_python(file_path: str) -> str:
49
 
50
 
51
  class ExtractTextFromImage:
52
- def __init__(self, vision_llm):
53
- self.vision_llm = vision_llm
54
 
55
  def __call__(self, img_path: str) -> str:
56
  """
@@ -92,7 +93,7 @@ class ExtractTextFromImage:
92
  ]
93
 
94
  # Call the vision-capable model
95
- response = self.vision_llm.invoke(message)
96
 
97
  # Append extracted text
98
  all_text += response.content + "\n\n"
@@ -105,10 +106,10 @@ class ExtractTextFromImage:
105
 
106
 
107
  class DescribeImage:
108
- def __init__(self, vision_llm):
109
- self.vision_llm = vision_llm
110
 
111
- def __call__(self, img_path: str) -> str:
112
  """
113
  Generate a detailed description of an image.
114
  This function reads a image from an url, encodes it, and sends it to a
@@ -148,7 +149,7 @@ class DescribeImage:
148
  ]
149
  )
150
  ]
151
- response = self.vision_llm.invoke(message)
152
  return response.content.strip()
153
 
154
  except Exception as e:
@@ -158,16 +159,10 @@ class DescribeImage:
158
 
159
 
160
  class TranscribeAudio:
161
- def __init__(self, audio_llm):
162
- """
163
- Initialize with a LangChain-compatible vision+audio GPT-4o model.
164
-
165
- Args:
166
- audio_llm: A LangChain Runnable for GPT-4o (must support audio inputs).
167
- """
168
- self.audio_llm = audio_llm
169
 
170
- def __call__(self, audio_path: str) -> str:
171
  """
172
  Transcribe an MP3 file.
173
 
@@ -212,6 +207,68 @@ class TranscribeAudio:
212
  print(error_msg)
213
  return ""
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  @tool
216
  def wiki_search(query: str) -> str:
217
  """Search Wikipedia for a query and return maximum 2 results.
 
3
  from langchain_core.messages import HumanMessage
4
  from langchain.tools import tool
5
  from langchain_community.tools.tavily_search import TavilySearchResults
6
+ from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
7
+ from langchain_core.video import VideoFile
8
+ import yt_dlp
9
 
10
 
11
  @tool
 
50
 
51
 
52
  class ExtractTextFromImage:
53
+ def __init__(self, multimodal_model):
54
+ self.multimodal_model = multimodal_model
55
 
56
  def __call__(self, img_path: str) -> str:
57
  """
 
93
  ]
94
 
95
  # Call the vision-capable model
96
+ response = self.multimodal_model.invoke(message)
97
 
98
  # Append extracted text
99
  all_text += response.content + "\n\n"
 
106
 
107
 
108
  class DescribeImage:
109
+ def __init__(self, multimodal_model):
110
+ self.multimodal_model = multimodal_model
111
 
112
+ def __call__(self, img_path: str, query: str) -> str:
113
  """
114
  Generate a detailed description of an image.
115
  This function reads a image from an url, encodes it, and sends it to a
 
149
  ]
150
  )
151
  ]
152
+ response = self.multimodal_model.invoke(message)
153
  return response.content.strip()
154
 
155
  except Exception as e:
 
159
 
160
 
161
  class TranscribeAudio:
162
+ def __init__(self, multimodal_model):
163
+ self.multimodal_model = multimodal_model
 
 
 
 
 
 
164
 
165
+ def __call__(self, audio_path: str, query:str) -> str:
166
  """
167
  Transcribe an MP3 file.
168
 
 
207
  print(error_msg)
208
  return ""
209
 
210
+
211
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file (the same ``output_path`` that was passed in).
    """
    # Prefer a separate MP4 video + M4A audio pair (merged into a single MP4);
    # fall back to the best single-file MP4, then to whatever format exists.
    download_options = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,  # suppress yt_dlp's progress output
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([youtube_url])
    return output_path
231
+
232
+
233
class AnalyzeVideo:
    """Answer a free-form query about a local video file using a multimodal LLM."""

    def __init__(self, multimodal_model):
        """
        Args:
            multimodal_model: A LangChain Runnable chat model that accepts
                video content blocks (e.g. a Gemini model via
                langchain-google-genai).
        """
        self.multimodal_model = multimodal_model

    def __call__(self, video_path: str, query: str) -> str:
        """
        Send the video together with the query to the multimodal model.

        Args:
            video_path: Path to a local MP4 file.
            query: The question/request to answer about the video.

        Returns:
            The model's answer with surrounding whitespace stripped, or ""
            on any error (the error is printed, mirroring the other tools
            in this module).
        """
        import base64  # local import: only this code path needs it

        try:
            with open(video_path, "rb") as video_file:
                video_bytes = video_file.read()

            # FIX: `langchain_core` provides no `VideoFile` type (the
            # `from langchain_core.video import VideoFile` import fails), so
            # inline the video as a base64-encoded media content block — the
            # format Gemini-style multimodal chat models accept via LangChain,
            # and consistent with how this module's image tools encode data.
            video_b64 = base64.b64encode(video_bytes).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                f"In relation to this video, answer the following request: {query} "
                            ),
                        },
                        {
                            "type": "media",
                            "mime_type": "video/mp4",
                            "data": video_b64,
                        },
                    ]
                )
            ]

            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            # Best-effort tool: report and return an empty answer rather
            # than propagate, matching the other tools in this file.
            print(f"Error analyzing video: {str(e)}")
            return ""
270
+
271
+
272
  @tool
273
  def wiki_search(query: str) -> str:
274
  """Search Wikipedia for a query and return maximum 2 results.