Spaces:
Runtime error
Runtime error
Update helper.py
Browse files
helper.py
CHANGED
|
@@ -127,7 +127,7 @@ import requests
|
|
| 127 |
from langchain.tools import Tool
|
| 128 |
|
| 129 |
def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit to ~2500 tokens
|
| 130 |
-
"""Downloads content from a URL, truncating if it exceeds max_chars."""
|
| 131 |
try:
|
| 132 |
with requests.get(url, stream=True, timeout=10) as response:
|
| 133 |
response.raise_for_status()
|
|
@@ -159,13 +159,111 @@ def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit t
|
|
| 159 |
except Exception as e:
|
| 160 |
return f"Error processing content from {url}: {e}"
|
| 161 |
|
| 162 |
-
|
| 163 |
-
name="
|
| 164 |
description="""
|
| 165 |
-
Downloads content from a URL, automatically truncating it to save tokens.
|
| 166 |
Useful when you need information from a web page but want to avoid
|
| 167 |
exceeding token limits by downloading excessively large content.
|
| 168 |
Input should be a single, valid URL.
|
|
|
|
|
|
|
|
|
|
| 169 |
""",
|
| 170 |
func=download_limited_content,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
)
|
|
|
|
| 127 |
from langchain.tools import Tool
|
| 128 |
|
| 129 |
def download_limited_content(url: str, max_chars: int = 10000) -> str: # Limit to ~2500 tokens
|
| 130 |
+
"""Downloads text content from a URL, truncating if it exceeds max_chars."""
|
| 131 |
try:
|
| 132 |
with requests.get(url, stream=True, timeout=10) as response:
|
| 133 |
response.raise_for_status()
|
|
|
|
| 159 |
except Exception as e:
|
| 160 |
return f"Error processing content from {url}: {e}"
|
| 161 |
|
| 162 |
+
# LangChain tool wrapper that exposes download_limited_content to an agent.
# The agent passes a single URL string; the 10,000-character limit quoted in
# the description corresponds to download_limited_content's default max_chars.
text_downloader_limited_tool = Tool(
    name="text_downloader_limited_tool",
    description="""
Downloads text content from a URL, automatically truncating it to save tokens.
Useful when you need information from a web page but want to avoid
exceeding token limits by downloading excessively large content.
Input should be a single, valid URL.
NOTE: use this tool only for text-based-content URLs (e.g., articles, documentation, python code file).
The content will be truncated to approximately 10,000 characters (~2500 tokens).
If the content is larger, it will be cut off with a note indicating truncation.
""",
    func=download_limited_content,
)
|
| 175 |
+
|
| 176 |
+
import speech_recognition as sr
|
| 177 |
+
from pydub import AudioSegment
|
| 178 |
+
import os
|
| 179 |
+
import requests # Needed for downloading the URL content
|
| 180 |
+
|
| 181 |
+
def transcribe_audio_from_path_or_url(audio_source: str, language: str = "en-US") -> str:
    """
    Transcribes audio content from a local file path or a URL to a text string.

    If ``audio_source`` starts with ``http://`` or ``https://`` the audio is
    downloaded to a temporary file first. The audio is then converted to WAV
    with pydub and sent to the Google Speech Recognition backend of the
    ``speech_recognition`` package. All temporary files are removed before
    the function returns, whether it succeeds or fails.

    Args:
        audio_source (str): The local file path to the audio (e.g., "my_recording.mp3")
                            OR a direct URL to an audio file (e.g., "https://example.com/audio.wav").
        language (str, optional): The spoken language in the audio. Defaults to "en-US".
                                  Refer to Google Speech Recognition language codes for options.

    Returns:
        str: The transcribed text (stripped of surrounding whitespace), or an
             informative error message if downloading, decoding or
             transcription fails. Errors are reported through the return
             value; the function does not raise for them.
    """
    # Function-scope stdlib imports so this block does not depend on edits to
    # the file-level import section.
    import tempfile
    from urllib.parse import urlparse

    r = sr.Recognizer()
    # Both temp paths are initialised up front so the ``finally`` clause can
    # always inspect them, even when the failure happens during the download.
    temp_download_path = None
    temp_wav_path = None
    transcribed_text = ""

    try:
        # Determine if the input is a URL or a local path
        if audio_source.startswith(("http://", "https://")):
            # Take the extension from the URL *path* only, so query strings,
            # fragments and dots in the host name are ignored.
            ext = os.path.splitext(urlparse(audio_source).path)[-1] or ".mp3"

            # The context manager releases the streamed connection.
            with requests.get(audio_source, stream=True, timeout=30) as response:
                response.raise_for_status()  # Raise an exception for bad status codes

                # Unique temp file: avoids clashes between concurrent calls
                # and never overwrites a file in the working directory.
                fd, temp_download_path = tempfile.mkstemp(
                    prefix="temp_download_audio", suffix=ext
                )
                with os.fdopen(fd, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            current_audio_path = temp_download_path
        else:
            # It's a local file path
            current_audio_path = audio_source

        # Convert to WAV (SpeechRecognition's AudioFile is read from a WAV here)
        audio = AudioSegment.from_file(current_audio_path)
        fd, temp_wav_path = tempfile.mkstemp(
            prefix="temp_audio_to_transcribe", suffix=".wav"
        )
        os.close(fd)  # pydub reopens the path itself when exporting
        audio.export(temp_wav_path, format="wav")

        # Transcribe the audio
        with sr.AudioFile(temp_wav_path) as source:
            audio_listened = r.record(source)
            try:
                transcribed_text = r.recognize_google(audio_listened, language=language)
            except sr.UnknownValueError:
                return "Could not understand audio (speech not clear or too short)."
            except sr.RequestError as e:
                return f"Could not request results from Google Speech Recognition service; {e}"

    except FileNotFoundError:
        return f"Error: Audio file not found at '{audio_source}'."
    except requests.exceptions.RequestException as e:
        return f"Error downloading audio from URL '{audio_source}': {e}"
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"An unexpected error occurred during audio processing or transcription: {e}"
    finally:
        # Clean up temporary files; either path may still be None if the
        # failure occurred before the file was created.
        for temp_path in (temp_download_path, temp_wav_path):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

    return transcribed_text.strip()
|
| 254 |
+
|
| 255 |
+
# Expose the audio transcription function as a LangChain tool
|
| 256 |
+
from langchain.tools import Tool
|
| 257 |
+
|
| 258 |
+
# LangChain tool wrapper that exposes transcribe_audio_from_path_or_url to an agent.
# NOTE(review): the description advertises an optional 'language' parameter, but
# `Tool` presumably forwards only a single string input to `func`, in which case
# `language` would always stay at its default ("en-US") — confirm against the
# LangChain Tool documentation.
audio_transcriber_tool = Tool(
    name="audio_transcriber_tool",
    description=(
        "Converts an audio file (local path or URL) to a text transcript. "
        "This tool is useful for extracting spoken information from audio recordings. "
        "Input should be either a local file path (e.g., 'path/to/audio.mp3') "
        "or a direct URL to an audio file (e.g., 'https://example.com/speech.wav'). "
        "Optionally, provide the 'language' parameter (e.g., 'en-US', 'es-ES') for better accuracy. "
        "Returns the transcribed text or an error message if transcription fails."
    ),
    func=transcribe_audio_from_path_or_url,
)
|