Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -313,6 +313,7 @@ def segment_audio_from_video(video_path):
|
|
| 313 |
]
|
| 314 |
|
| 315 |
return audio_path, transcript_with_speakers
|
|
|
|
| 316 |
def clean_transcribed_text(text: str) -> str:
|
| 317 |
"""
|
| 318 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
|
@@ -390,6 +391,104 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
|
|
| 390 |
|
| 391 |
return transcribed_segments, detected_language, error_message
|
| 392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
# Function to get the appropriate translation model based on target language
|
| 394 |
def get_translation_model(source_language, target_language):
|
| 395 |
"""
|
|
|
|
| 313 |
]
|
| 314 |
|
| 315 |
return audio_path, transcript_with_speakers
|
| 316 |
+
|
| 317 |
def clean_transcribed_text(text: str) -> str:
|
| 318 |
"""
|
| 319 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
|
|
|
| 391 |
|
| 392 |
return transcribed_segments, detected_language, error_message
|
| 393 |
|
| 394 |
+
from collections import Counter
|
| 395 |
+
|
| 396 |
+
def process_scribe_output(scribe_response, max_line_length=50):
    """
    Process the Scribe API response: clean the word stream and group words
    into subtitle lines with line-level timestamps.

    Args:
        scribe_response (dict): Raw response dictionary from the Scribe API.
            Expected to hold a 'words' list of dicts with 'text', 'start',
            'end', 'type' and optionally 'speaker_id'. A missing 'words'
            key is treated as an empty transcript.
        max_line_length (int): Approximate maximum number of characters per
            line before a new line is started.

    Returns:
        list: One dict per line with keys 'original' (line text), 'start'
        and 'end' (timestamps taken from the first/last word of the line)
        and 'speaker' (majority speaker id among the line's words, or None
        when no word in the line carries one).
    """
    # Pass 1: drop audio-event tags and collapse consecutive spacing tokens.
    cleaned_words = []
    for word_info in scribe_response.get('words', []):
        text = word_info['text']
        start = word_info['start']
        end = word_info['end']
        word_type = word_info['type']
        speaker_id = word_info.get('speaker_id', None)

        if word_type == 'audio_event':
            continue  # Remove audio event tags like [背景音]
        elif word_type == 'spacing':
            # Avoid emitting two spaces in a row.
            if cleaned_words and cleaned_words[-1]['text'].endswith(' '):
                continue
            text = ' '

        cleaned_words.append({
            'text': text,
            'start': start,
            'end': end,
            'speaker_id': speaker_id
        })

    # Pass 2: accumulate words into lines, breaking on length, end of input,
    # or sentence-ending punctuation.
    lines = []
    current_line_words = []
    current_line_start_time = None

    for i, word_info in enumerate(cleaned_words):
        if not current_line_words:
            current_line_start_time = word_info['start']

        current_line_words.append(word_info)

        current_line_text = "".join(w['text'] for w in current_line_words).strip()

        line_should_end = (
            len(current_line_text) >= max_line_length or
            i == len(cleaned_words) - 1 or
            word_info['text'].endswith(('。', '?', '!'))
        )

        if line_should_end:
            line_end_time = word_info['end']

            # Attribute the line to the majority speaker among its words;
            # spacing tokens carry speaker_id=None and are excluded.
            speaker_ids = [w['speaker_id'] for w in current_line_words if w['speaker_id'] is not None]
            speaker_id = Counter(speaker_ids).most_common(1)[0][0] if speaker_ids else None

            lines.append({
                'original': current_line_text,
                'start': current_line_start_time,
                'end': line_end_time,
                'speaker': speaker_id
            })

            current_line_words = []
            current_line_start_time = None

    return lines
|
| 468 |
+
|
| 469 |
+
def transcribe_with_scribe(full_audio_path):
    """
    Send the full audio file to the ElevenLabs Scribe API for transcription
    with speaker diarization enabled.

    Args:
        full_audio_path (str): Path to the audio file (uploaded as audio/wav).

    Returns:
        dict: The raw Scribe API JSON response on success.
        tuple: ([], "unknown", error_message) when the audio file does not
            exist.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.

    NOTE(review): the error path returns a 3-tuple while the success path
    returns the raw response dict — callers must handle both shapes; consider
    unifying this contract.
    """
    if not os.path.exists(full_audio_path):
        # Preserved legacy error shape, mirroring the
        # (segments, language, error) tuple used by
        # transcribe_segments_with_scribe.
        return [], "unknown", f"Full audio file not found at {full_audio_path}"

    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "model_id": "scribe_v1",
        # string form, as sent in the multipart form body
        "diarize": "true"
    }

    logger.info(f"Starting transcription for full audio: {full_audio_path}")

    with open(full_audio_path, "rb") as audio_file:
        files = {"file": (os.path.basename(full_audio_path), audio_file, "audio/wav")}
        response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data)
        response.raise_for_status()
        return response.json()
|
| 491 |
+
|
| 492 |
# Function to get the appropriate translation model based on target language
|
| 493 |
def get_translation_model(source_language, target_language):
|
| 494 |
"""
|