Update app.py
Browse files
app.py
CHANGED
|
@@ -408,7 +408,7 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
| 408 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 409 |
return None
|
| 410 |
|
| 411 |
-
def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
|
| 412 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 413 |
error_message = None
|
| 414 |
|
|
@@ -424,13 +424,15 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
|
|
| 424 |
try:
|
| 425 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
| 426 |
desired_duration = entry["end"] - entry["start"]
|
| 427 |
-
|
| 428 |
-
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 429 |
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
-
|
| 433 |
-
|
| 434 |
|
| 435 |
if not output_path or not os.path.exists(segment_audio_path):
|
| 436 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
|
@@ -453,28 +455,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover,
|
|
| 453 |
|
| 454 |
return i, txt_clip, audio_segment, error_message
|
| 455 |
|
| 456 |
-
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
|
| 457 |
video = VideoFileClip(video_path)
|
| 458 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 459 |
|
| 460 |
text_clips = []
|
| 461 |
audio_segments = []
|
| 462 |
error_messages = []
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
|
| 476 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 477 |
-
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path,
|
| 478 |
for i, entry in enumerate(translated_json)]
|
| 479 |
|
| 480 |
results = []
|
|
@@ -526,26 +528,56 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
|
|
| 526 |
|
| 527 |
return error_messages
|
| 528 |
|
| 529 |
-
def
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
| 538 |
msg = f"❌ Speaker audio not found: {speaker_wav_path}"
|
| 539 |
logger.error(msg)
|
| 540 |
return None, msg, msg
|
| 541 |
|
| 542 |
-
speed_tts = calibrated_speed(full_text, desired_duration)
|
| 543 |
tts_model.tts_to_file(
|
| 544 |
text=full_text,
|
| 545 |
speaker_wav=speaker_wav_path,
|
| 546 |
language=target_language,
|
| 547 |
file_path=output_audio_path,
|
| 548 |
-
speed=
|
| 549 |
split_sentences=True
|
| 550 |
)
|
| 551 |
|
|
@@ -584,7 +616,6 @@ def calibrated_speed(text, desired_duration):
|
|
| 584 |
slope = (2 - 1.0) / (30 - 14)
|
| 585 |
return 1.0 + slope * (cps - 14)
|
| 586 |
|
| 587 |
-
|
| 588 |
def upload_and_manage(file, target_language, mode="transcription"):
|
| 589 |
if file is None:
|
| 590 |
logger.info("No file uploaded. Please upload a video/audio file.")
|
|
@@ -702,6 +733,7 @@ def build_interface():
|
|
| 702 |
return demo
|
| 703 |
|
| 704 |
tts_model = None
|
|
|
|
| 705 |
# Launch the Gradio interface
|
| 706 |
demo = build_interface()
|
| 707 |
demo.launch()
|
|
|
|
| 408 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 409 |
return None
|
| 410 |
|
| 411 |
+
def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths=None):
|
| 412 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 413 |
error_message = None
|
| 414 |
|
|
|
|
| 424 |
try:
|
| 425 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
| 426 |
desired_duration = entry["end"] - entry["start"]
|
| 427 |
+
desired_speed = calibrated_speed(entry['translated'], desired_duration)
|
|
|
|
| 428 |
|
| 429 |
+
if use_clone:
|
| 430 |
+
speaker = entry.get("speaker", "default")
|
| 431 |
+
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 432 |
+
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 433 |
|
| 434 |
+
else:
|
| 435 |
+
generate_voiceover_OpenAI(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 436 |
|
| 437 |
if not output_path or not os.path.exists(segment_audio_path):
|
| 438 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
|
|
|
| 455 |
|
| 456 |
return i, txt_clip, audio_segment, error_message
|
| 457 |
|
| 458 |
+
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None, use_clone=False):
|
| 459 |
video = VideoFileClip(video_path)
|
| 460 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 461 |
|
| 462 |
text_clips = []
|
| 463 |
audio_segments = []
|
| 464 |
error_messages = []
|
| 465 |
+
|
| 466 |
+
if use_clone:
|
| 467 |
+
if tts_model is None:
|
| 468 |
+
try:
|
| 469 |
+
print("π Loading XTTS model...")
|
| 470 |
+
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 471 |
+
print("✅ XTTS model loaded successfully.")
|
| 472 |
+
except Exception as e:
|
| 473 |
+
print("❌ Error loading XTTS model:")
|
| 474 |
+
traceback.print_exc()
|
| 475 |
+
return f"Error loading XTTS model: {e}"
|
| 476 |
+
## Need to implement backup option.
|
| 477 |
|
| 478 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 479 |
+
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, use_clone, speaker_sample_paths)
|
| 480 |
for i, entry in enumerate(translated_json)]
|
| 481 |
|
| 482 |
results = []
|
|
|
|
| 528 |
|
| 529 |
return error_messages
|
| 530 |
|
| 531 |
+
def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
|
| 532 |
+
"""
|
| 533 |
+
Generate voiceover from translated text for a given language using OpenAI TTS API.
|
| 534 |
+
"""
|
| 535 |
+
# Define the voice based on the language (for now, use 'alloy' as default)
|
| 536 |
+
voice = "alloy" # Adjust based on language if needed
|
| 537 |
+
|
| 538 |
+
# Define the model (use tts-1 for real-time applications)
|
| 539 |
+
model = "tts-1"
|
| 540 |
+
|
| 541 |
+
max_retries = 3
|
| 542 |
+
retry_count = 0
|
| 543 |
+
|
| 544 |
+
while retry_count < max_retries:
|
| 545 |
+
try:
|
| 546 |
+
# Create the speech using OpenAI TTS API
|
| 547 |
+
response = client.audio.speech.create(
|
| 548 |
+
model=model,
|
| 549 |
+
voice=voice,
|
| 550 |
+
input=full_text,
|
| 551 |
+
speed=desired_speed
|
| 552 |
+
)
|
| 553 |
+
# Save the audio to the specified path
|
| 554 |
+
with open(output_audio_path, 'wb') as f:
|
| 555 |
+
for chunk in response.iter_bytes():
|
| 556 |
+
f.write(chunk)
|
| 557 |
+
logging.info(f"Voiceover generated successfully for {output_audio_path}")
|
| 558 |
+
break
|
| 559 |
|
| 560 |
+
except Exception as e:
|
| 561 |
+
retry_count += 1
|
| 562 |
+
logging.error(f"Error generating voiceover (retry {retry_count}/{max_retries}): {e}")
|
| 563 |
+
time.sleep(5) # Wait 5 seconds before retrying
|
| 564 |
+
|
| 565 |
+
if retry_count == max_retries:
|
| 566 |
+
raise ValueError(f"Failed to generate voiceover after {max_retries} retries.")
|
| 567 |
+
|
| 568 |
+
def generate_voiceover_clone(full_text, tts_model, desired_speed, target_language, speaker_wav_path, output_audio_path, use_clone=False):
|
| 569 |
+
try:
|
| 570 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
| 571 |
msg = f"❌ Speaker audio not found: {speaker_wav_path}"
|
| 572 |
logger.error(msg)
|
| 573 |
return None, msg, msg
|
| 574 |
|
|
|
|
| 575 |
tts_model.tts_to_file(
|
| 576 |
text=full_text,
|
| 577 |
speaker_wav=speaker_wav_path,
|
| 578 |
language=target_language,
|
| 579 |
file_path=output_audio_path,
|
| 580 |
+
speed=desired_speed,
|
| 581 |
split_sentences=True
|
| 582 |
)
|
| 583 |
|
|
|
|
| 616 |
slope = (2 - 1.0) / (30 - 14)
|
| 617 |
return 1.0 + slope * (cps - 14)
|
| 618 |
|
|
|
|
| 619 |
def upload_and_manage(file, target_language, mode="transcription"):
|
| 620 |
if file is None:
|
| 621 |
logger.info("No file uploaded. Please upload a video/audio file.")
|
|
|
|
| 733 |
return demo
|
| 734 |
|
| 735 |
tts_model = None
|
| 736 |
+
global tts_model
|
| 737 |
# Launch the Gradio interface
|
| 738 |
demo = build_interface()
|
| 739 |
demo.launch()
|