Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -288,53 +288,41 @@ def segment_audio_from_video(video_path):
|
|
| 288 |
|
| 289 |
segment_result, speech_audio_path = segment_background_audio(audio_path)
|
| 290 |
print(f"Saved non-speech (background) audio to local")
|
| 291 |
-
|
| 292 |
-
# Set up device
|
| 293 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 294 |
logger.info(f"Using device: {device}")
|
| 295 |
-
|
| 296 |
try:
|
| 297 |
-
# Load a medium model with float32 for broader compatibility
|
| 298 |
model = whisperx.load_model("large-v3", device=device, compute_type="float32")
|
| 299 |
logger.info("WhisperX model loaded")
|
| 300 |
-
|
| 301 |
-
# Transcribe
|
| 302 |
-
result = model.transcribe(speech_audio_path, chunk_size=4, print_progress = True)
|
| 303 |
logger.info("Audio transcription completed")
|
| 304 |
-
|
| 305 |
except Exception as e:
|
| 306 |
logger.error(f"❌ WhisperX pipeline failed: {e}")
|
|
|
|
| 307 |
|
| 308 |
-
#
|
| 309 |
transcript_with_speakers = [
|
| 310 |
{
|
| 311 |
"start": segment["start"],
|
| 312 |
"end": segment["end"]
|
| 313 |
}
|
| 314 |
for segment in result["segments"]
|
|
|
|
| 315 |
]
|
| 316 |
|
| 317 |
return audio_path, transcript_with_speakers
|
| 318 |
|
| 319 |
-
def
|
| 320 |
-
"""
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
| 323 |
|
| 324 |
-
|
| 325 |
-
full_audio_path (str): The path to the full extracted audio file.
|
| 326 |
-
segments (list): A list of dictionaries, where each dictionary
|
| 327 |
-
represents a segment with 'start' and 'end' timestamps in seconds.
|
| 328 |
-
|
| 329 |
-
Returns:
|
| 330 |
-
tuple: A tuple containing:
|
| 331 |
-
- transcribed_segments (list): A list of dictionaries, where each dictionary
|
| 332 |
-
represents a transcribed segment with 'start', 'end', and 'text'.
|
| 333 |
-
- detected_language (str): The language detected by the API (e.g., "en", "es").
|
| 334 |
-
- error_message (str, optional): An error message if transcription fails.
|
| 335 |
-
"""
|
| 336 |
transcribed_segments = []
|
| 337 |
-
detected_language = "unknown"
|
| 338 |
error_message = None
|
| 339 |
|
| 340 |
if not os.path.exists(full_audio_path):
|
|
@@ -342,94 +330,63 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
|
|
| 342 |
|
| 343 |
try:
|
| 344 |
audio_clip = AudioFileClip(full_audio_path)
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
}
|
| 349 |
-
data = {
|
| 350 |
-
"model_id": "scribe_v1",
|
| 351 |
-
}
|
| 352 |
-
# Explicitly set diarize to false, as it's not needed.
|
| 353 |
-
params = {
|
| 354 |
-
"diarize": "false",
|
| 355 |
-
}
|
| 356 |
|
| 357 |
logger.info(f"Starting transcription of {len(segments)} segments with ElevenLabs Scribe...")
|
| 358 |
|
| 359 |
for i, segment in enumerate(segments):
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
# Ensure segment duration is positive
|
| 364 |
-
if segment_end <= segment_start:
|
| 365 |
-
logger.warning(f"Skipping segment {i} due to invalid duration: {segment_start:.2f}s -> {segment_end:.2f}s")
|
| 366 |
continue
|
| 367 |
|
| 368 |
temp_segment_audio_path = f"temp_segment_{i}.wav"
|
| 369 |
try:
|
| 370 |
-
|
| 371 |
-
sub_clip = audio_clip.subclip(segment_start, segment_end)
|
| 372 |
-
# Save as 16-bit PCM WAV for Scribe API compatibility
|
| 373 |
sub_clip.write_audiofile(temp_segment_audio_path, codec='pcm_s16le')
|
| 374 |
-
|
| 375 |
-
logger.info(f"Transcribing segment {i+1}/{len(segments)}: {segment_start:.2f}s - {segment_end:.2f}s")
|
| 376 |
|
| 377 |
with open(temp_segment_audio_path, "rb") as audio_file:
|
| 378 |
-
files = {
|
| 379 |
-
"file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")
|
| 380 |
-
}
|
| 381 |
response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
|
| 382 |
response.raise_for_status()
|
| 383 |
scribe_result = response.json()
|
| 384 |
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
if segment_text:
|
| 393 |
transcribed_segments.append({
|
| 394 |
-
"start":
|
| 395 |
-
"end":
|
| 396 |
-
"text":
|
| 397 |
"speaker": "SPEAKER_00"
|
| 398 |
})
|
| 399 |
else:
|
| 400 |
-
logger.
|
| 401 |
|
| 402 |
-
# Update detected language from the first successful transcription
|
| 403 |
if "language_code" in scribe_result and detected_language == "unknown":
|
| 404 |
detected_language = scribe_result["language_code"]
|
| 405 |
|
| 406 |
-
except requests.exceptions.HTTPError as http_err:
|
| 407 |
-
error_message = f"HTTP error for segment {i+1}: {http_err} - {response.text}"
|
| 408 |
-
logger.error(error_message)
|
| 409 |
-
# Continue to next segment even if one fails
|
| 410 |
-
except requests.exceptions.RequestException as req_err:
|
| 411 |
-
error_message = f"Request error for segment {i+1}: {req_err}"
|
| 412 |
-
logger.error(error_message)
|
| 413 |
-
# Continue to next segment
|
| 414 |
except Exception as e:
|
| 415 |
-
|
| 416 |
-
logger.error(error_message)
|
| 417 |
-
# Continue to next segment
|
| 418 |
finally:
|
| 419 |
if os.path.exists(temp_segment_audio_path):
|
| 420 |
os.remove(temp_segment_audio_path)
|
| 421 |
-
|
| 422 |
logger.info("All segments processed by ElevenLabs Scribe.")
|
| 423 |
|
| 424 |
except Exception as e:
|
| 425 |
-
error_message = f"An error occurred
|
| 426 |
logger.error(error_message)
|
| 427 |
finally:
|
| 428 |
-
if 'audio_clip' in locals()
|
| 429 |
audio_clip.close()
|
| 430 |
|
| 431 |
return transcribed_segments, detected_language, error_message
|
| 432 |
-
|
| 433 |
|
| 434 |
# Function to get the appropriate translation model based on target language
|
| 435 |
def get_translation_model(source_language, target_language):
|
|
|
|
| 288 |
|
| 289 |
segment_result, speech_audio_path = segment_background_audio(audio_path)
|
| 290 |
print(f"Saved non-speech (background) audio to local")
|
| 291 |
+
|
|
|
|
| 292 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 293 |
logger.info(f"Using device: {device}")
|
| 294 |
+
|
| 295 |
try:
|
|
|
|
| 296 |
model = whisperx.load_model("large-v3", device=device, compute_type="float32")
|
| 297 |
logger.info("WhisperX model loaded")
|
| 298 |
+
result = model.transcribe(speech_audio_path, chunk_size=4, print_progress=True)
|
|
|
|
|
|
|
| 299 |
logger.info("Audio transcription completed")
|
|
|
|
| 300 |
except Exception as e:
|
| 301 |
logger.error(f"❌ WhisperX pipeline failed: {e}")
|
| 302 |
+
return audio_path, []
|
| 303 |
|
| 304 |
+
# Return segment boundaries (only timestamps, not text)
|
| 305 |
transcript_with_speakers = [
|
| 306 |
{
|
| 307 |
"start": segment["start"],
|
| 308 |
"end": segment["end"]
|
| 309 |
}
|
| 310 |
for segment in result["segments"]
|
| 311 |
+
if segment["end"] > segment["start"]
|
| 312 |
]
|
| 313 |
|
| 314 |
return audio_path, transcript_with_speakers
|
| 315 |
|
| 316 |
+
def clean_transcribed_text(text: str) -> str:
    """Strip transcription artifacts while keeping ordinary punctuation.

    Two kinds of noise are removed:

    * runs of two or more symbol characters (e.g. ``...``, ``---``, ``?!``),
      which are replaced by a single space;
    * whitespace-delimited tokens that consist solely of a symbol character
      (e.g. a lone ``-`` or ``.``).

    A single punctuation mark attached to a word ("Hello,", "3.14",
    "well-known") is preserved, so sentence structure and numbers survive.

    Args:
        text: Raw transcription text. Empty or ``None`` input yields ``""``.

    Returns:
        The cleaned text with whitespace collapsed to single spaces and
        leading/trailing whitespace removed. Returns ``""`` when nothing
        but artifacts remains, which callers use to discard the segment.
    """
    if not text:
        return ""

    # Same symbol set the file has always treated as potential artifacts.
    symbol_class = r"[_,.~`^•·。!?!?,,\.\/\\\-–—=+]"

    # Repeated symbols are artifacts (or ellipses); a single one is normal
    # punctuation and must not be touched here.
    collapsed = re.sub(symbol_class + r"{2,}", " ", text)

    # After collapsing, a symbol-only token can only be a stray single mark
    # surrounded by whitespace; drop it. split()/join also normalizes spaces.
    kept = [
        token
        for token in collapsed.split()
        if not re.fullmatch(symbol_class + r"+", token)
    ]
    return " ".join(kept)
|
| 322 |
|
| 323 |
+
def transcribe_segments_with_scribe(full_audio_path, segments):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
transcribed_segments = []
|
| 325 |
+
detected_language = "unknown"
|
| 326 |
error_message = None
|
| 327 |
|
| 328 |
if not os.path.exists(full_audio_path):
|
|
|
|
| 330 |
|
| 331 |
try:
|
| 332 |
audio_clip = AudioFileClip(full_audio_path)
|
| 333 |
+
headers = {"xi-api-key": ELEVENLABS_API_KEY}
|
| 334 |
+
data = {"model_id": "scribe_v1"}
|
| 335 |
+
params = {"diarize": "false"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
logger.info(f"Starting transcription of {len(segments)} segments with ElevenLabs Scribe...")
|
| 338 |
|
| 339 |
for i, segment in enumerate(segments):
|
| 340 |
+
start, end = segment["start"], segment["end"]
|
| 341 |
+
if end <= start:
|
| 342 |
+
logger.warning(f"Skipping invalid segment {i}: {start:.2f}s → {end:.2f}s")
|
|
|
|
|
|
|
|
|
|
| 343 |
continue
|
| 344 |
|
| 345 |
temp_segment_audio_path = f"temp_segment_{i}.wav"
|
| 346 |
try:
|
| 347 |
+
sub_clip = audio_clip.subclip(start, end)
|
|
|
|
|
|
|
| 348 |
sub_clip.write_audiofile(temp_segment_audio_path, codec='pcm_s16le')
|
|
|
|
|
|
|
| 349 |
|
| 350 |
with open(temp_segment_audio_path, "rb") as audio_file:
|
| 351 |
+
files = {"file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")}
|
|
|
|
|
|
|
| 352 |
response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
|
| 353 |
response.raise_for_status()
|
| 354 |
scribe_result = response.json()
|
| 355 |
|
| 356 |
+
raw_text = scribe_result.get("text") or " ".join(
|
| 357 |
+
[w.get("text", "") for w in scribe_result.get("words", []) if w.get("type") == "word"]
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
cleaned_text = clean_transcribed_text(raw_text)
|
| 361 |
+
if cleaned_text:
|
|
|
|
|
|
|
| 362 |
transcribed_segments.append({
|
| 363 |
+
"start": start,
|
| 364 |
+
"end": end,
|
| 365 |
+
"text": cleaned_text,
|
| 366 |
"speaker": "SPEAKER_00"
|
| 367 |
})
|
| 368 |
else:
|
| 369 |
+
logger.info(f"Segment {i+1} discarded: cleaned text is empty.")
|
| 370 |
|
|
|
|
| 371 |
if "language_code" in scribe_result and detected_language == "unknown":
|
| 372 |
detected_language = scribe_result["language_code"]
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
except Exception as e:
|
| 375 |
+
logger.error(f"Error processing segment {i+1}: {e}")
|
|
|
|
|
|
|
| 376 |
finally:
|
| 377 |
if os.path.exists(temp_segment_audio_path):
|
| 378 |
os.remove(temp_segment_audio_path)
|
| 379 |
+
|
| 380 |
logger.info("All segments processed by ElevenLabs Scribe.")
|
| 381 |
|
| 382 |
except Exception as e:
|
| 383 |
+
error_message = f"An error occurred: {e}"
|
| 384 |
logger.error(error_message)
|
| 385 |
finally:
|
| 386 |
+
if 'audio_clip' in locals():
|
| 387 |
audio_clip.close()
|
| 388 |
|
| 389 |
return transcribed_segments, detected_language, error_message
|
|
|
|
| 390 |
|
| 391 |
# Function to get the appropriate translation model based on target language
|
| 392 |
def get_translation_model(source_language, target_language):
|