nikhmr1235 committed on
Commit
e92eb81
·
verified ·
1 Parent(s): 590f46b

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +102 -4
helper.py CHANGED
@@ -127,7 +127,7 @@ import requests
127
  from langchain.tools import Tool
128
 
129
  def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit to ~2500 tokens
130
- """Downloads content from a URL, truncating if it exceeds max_chars."""
131
  try:
132
  with requests.get(url, stream=True, timeout=10) as response:
133
  response.raise_for_status()
@@ -159,13 +159,111 @@ def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit t
159
  except Exception as e:
160
  return f"Error processing content from {url}: {e}"
161
 
162
- web_downloader_limited_tool = Tool(
163
- name="web_downloader_limited",
164
  description="""
165
- Downloads content from a URL, automatically truncating it to save tokens.
166
  Useful when you need information from a web page but want to avoid
167
  exceeding token limits by downloading excessively large content.
168
  Input should be a single, valid URL.
 
 
 
169
  """,
170
  func=download_limited_content,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  )
 
127
  from langchain.tools import Tool
128
 
129
  def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit to ~2500 tokens
130
+ """Downloads text content from a URL, truncating if it exceeds max_chars."""
131
  try:
132
  with requests.get(url, stream=True, timeout=10) as response:
133
  response.raise_for_status()
 
159
  except Exception as e:
160
  return f"Error processing content from {url}: {e}"
161
 
162
# LangChain tool that exposes download_limited_content to an agent.
# The description is built from adjacent string literals (the same style used
# for audio_transcriber_tool) so the prompt text shown to the model carries no
# leading newline or source-code indentation.
text_downloader_limited_tool = Tool(
    name="text_downloader_limited_tool",
    description=(
        "Downloads text content from a URL, automatically truncating it to save tokens. "
        "Useful when you need information from a web page but want to avoid "
        "exceeding token limits by downloading excessively large content. "
        "Input should be a single, valid URL. "
        "NOTE: use this tool only for text-based-content URLs "
        "(e.g., articles, documentation, python code file). "
        "The content will be truncated to approximately 10,000 characters (~2500 tokens). "
        "If the content is larger, it will be cut off with a note indicating truncation."
    ),
    func=download_limited_content,
)
175
+
176
+ import speech_recognition as sr
177
+ from pydub import AudioSegment
178
+ import os
179
+ import requests # Needed for downloading the URL content
180
+
181
def transcribe_audio_from_path_or_url(audio_source: str, language: str = "en-US") -> str:
    """
    Transcribe audio from a local file path or a URL into text.

    If ``audio_source`` starts with ``http://`` or ``https://`` it is downloaded
    to a temporary file first. The audio is then converted to WAV with pydub and
    sent to the Google Web Speech API through SpeechRecognition.

    Errors are reported through the return value rather than raised, so the
    function can be used directly as an agent tool.

    Args:
        audio_source (str): Local path to an audio file (e.g., "my_recording.mp3")
            OR a direct URL to an audio file (e.g., "https://example.com/audio.wav").
        language (str, optional): Language code of the spoken audio
            (e.g., "en-US", "es-ES"). Defaults to "en-US".

    Returns:
        str: The transcribed text with surrounding whitespace stripped, or a
        human-readable error message if downloading, decoding, or
        transcription fails.
    """
    # Local import: tempfile is not among the imports visible for this file.
    import tempfile

    recognizer = sr.Recognizer()
    # Both paths start as None so the cleanup in `finally` is safe no matter
    # where a failure happens (previously temp_wav_path could be unbound there,
    # turning a handled download error into an UnboundLocalError).
    temp_download_path = None
    temp_wav_path = None
    transcribed_text = ""

    try:
        if audio_source.startswith(("http://", "https://")):
            # Take the extension from the URL (ignoring the query string);
            # fall back to .mp3 when the URL has none.
            ext = os.path.splitext(audio_source.split("?")[0])[-1] or ".mp3"

            # The context manager releases the streamed connection even if
            # the download is interrupted.
            with requests.get(audio_source, stream=True, timeout=30) as response:
                response.raise_for_status()
                # A unique temp file avoids collisions between concurrent calls
                # and never overwrites a file in the working directory.
                with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as download_file:
                    temp_download_path = download_file.name
                    for chunk in response.iter_content(chunk_size=8192):
                        download_file.write(chunk)
            current_audio_path = temp_download_path
        else:
            current_audio_path = audio_source

        # SpeechRecognition's AudioFile reads WAV, so normalise the input first.
        wav_fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(wav_fd)  # pydub reopens the path itself
        AudioSegment.from_file(current_audio_path).export(temp_wav_path, format="wav")

        with sr.AudioFile(temp_wav_path) as source:
            audio_data = recognizer.record(source)

        try:
            transcribed_text = recognizer.recognize_google(audio_data, language=language)
        except sr.UnknownValueError:
            return "Could not understand audio (speech not clear or too short)."
        except sr.RequestError as e:
            return f"Could not request results from Google Speech Recognition service; {e}"

    except FileNotFoundError:
        return f"Error: Audio file not found at '{audio_source}'."
    except requests.exceptions.RequestException as e:
        return f"Error downloading audio from URL '{audio_source}': {e}"
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"An unexpected error occurred during audio processing or transcription: {e}"
    finally:
        # Remove only the temporary files this call created.
        for temp_path in (temp_download_path, temp_wav_path):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

    return transcribed_text.strip()
254
+
255
+ # Get your audio_transcriber tool
256
+ from langchain.tools import Tool
257
+
258
# LangChain tool that exposes transcribe_audio_from_path_or_url to an agent.
# `Tool` is single-input: the agent's whole input string is passed as
# `audio_source`, so the description must not advertise a separate `language`
# argument (the function's default, "en-US", is always used through this tool).
audio_transcriber_tool = Tool(
    name="audio_transcriber_tool",
    description=(
        "Converts an audio file (local path or URL) to a text transcript. "
        "This tool is useful for extracting spoken information from audio recordings. "
        "Input should be a single string: either a local file path (e.g., 'path/to/audio.mp3') "
        "or a direct URL to an audio file (e.g., 'https://example.com/speech.wav'). "
        "Do not add any other arguments; the audio is transcribed as US English ('en-US'). "
        "Returns the transcribed text or an error message if transcription fails."
    ),
    func=transcribe_audio_from_path_or_url,
)