Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import numpy as np
|
|
|
|
| 2 |
import re
|
| 3 |
import concurrent.futures
|
| 4 |
import gradio as gr
|
|
@@ -382,6 +383,46 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
| 382 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 383 |
return None
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 386 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 387 |
error_message = None
|
|
@@ -394,6 +435,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 394 |
txt_clip = None
|
| 395 |
|
| 396 |
audio_segment = None
|
|
|
|
| 397 |
if process_mode > 1:
|
| 398 |
try:
|
| 399 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
@@ -402,10 +444,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 402 |
|
| 403 |
speaker = entry.get("speaker", "default")
|
| 404 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 405 |
-
|
| 406 |
-
# Assume this is the list of supported languages for the TTS model
|
| 407 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 408 |
-
|
| 409 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 410 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 411 |
else:
|
|
@@ -415,14 +456,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 415 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 416 |
|
| 417 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 418 |
-
|
| 419 |
|
| 420 |
-
|
| 421 |
-
silence_duration = desired_duration - audio_clip.duration
|
| 422 |
-
audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
|
| 423 |
-
logger.info(f"Padded audio with {silence_duration} seconds of silence.")
|
| 424 |
-
|
| 425 |
-
audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
|
| 426 |
|
| 427 |
except Exception as e:
|
| 428 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
@@ -430,28 +466,31 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 430 |
error_message = error_message + " | " + err if error_message else err
|
| 431 |
audio_segment = None
|
| 432 |
|
| 433 |
-
return i, txt_clip, audio_segment, error_message
|
| 434 |
-
|
|
|
|
| 435 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
|
|
|
| 436 |
video = VideoFileClip(video_path)
|
| 437 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 438 |
|
| 439 |
text_clips = []
|
| 440 |
audio_segments = []
|
|
|
|
| 441 |
error_messages = []
|
| 442 |
-
|
| 443 |
if process_mode == 3:
|
| 444 |
global tts_model
|
| 445 |
if tts_model is None:
|
| 446 |
try:
|
| 447 |
print("🔄 Loading XTTS model...")
|
|
|
|
| 448 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 449 |
print("✅ XTTS model loaded successfully.")
|
| 450 |
except Exception as e:
|
| 451 |
print("❌ Error loading XTTS model:")
|
| 452 |
traceback.print_exc()
|
| 453 |
return f"Error loading XTTS model: {e}"
|
| 454 |
-
## Need to implement backup option.
|
| 455 |
|
| 456 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 457 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
@@ -460,50 +499,47 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
|
|
| 460 |
results = []
|
| 461 |
for future in concurrent.futures.as_completed(futures):
|
| 462 |
try:
|
| 463 |
-
i, txt_clip, audio_segment, error = future.result()
|
| 464 |
-
results.append((i, txt_clip, audio_segment))
|
| 465 |
if error:
|
| 466 |
error_messages.append(f"[Entry {i}] {error}")
|
| 467 |
except Exception as e:
|
| 468 |
err = f"❌ Unexpected error in future result: {e}"
|
| 469 |
-
logger.error(err)
|
| 470 |
error_messages.append(err)
|
| 471 |
|
| 472 |
-
# Sort by entry index to ensure order
|
| 473 |
results.sort(key=lambda x: x[0])
|
| 474 |
-
text_clips = [clip for _, clip, _ in results if clip]
|
| 475 |
-
if
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
|
| 478 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 479 |
|
| 480 |
-
if process_mode>1 and audio_segments:
|
| 481 |
try:
|
| 482 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 483 |
|
| 484 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 485 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 486 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 487 |
-
# final_audio = voice_audio
|
| 488 |
-
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 489 |
else:
|
| 490 |
final_audio = voice_audio
|
| 491 |
-
logger.info("⚠️ No background audio found. Using voiceover only.")
|
| 492 |
|
| 493 |
final_video = final_video.set_audio(final_audio)
|
| 494 |
|
| 495 |
except Exception as e:
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
logger.info(f"Saving the final video to: {output_path}")
|
| 499 |
-
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 500 |
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
if error_messages:
|
| 504 |
-
logger.warning("⚠️ Errors encountered during processing:")
|
| 505 |
-
for msg in error_messages:
|
| 506 |
-
logger.warning(msg)
|
| 507 |
|
| 508 |
return error_messages
|
| 509 |
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
+
import cvxpy as cp
|
| 3 |
import re
|
| 4 |
import concurrent.futures
|
| 5 |
import gradio as gr
|
|
|
|
| 383 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 384 |
return None
|
| 385 |
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
    """Align generated speech segments to the original timeline.

    Solves a least-squares problem (via cvxpy) that keeps each generated
    segment centered on its original segment's midpoint while enforcing
    ordering, non-overlap, and an exact fit within ``total_duration``.
    If the solver is unavailable or fails, falls back to a greedy
    back-to-back layout starting at t=0.

    Args:
        original_segments: list of dicts with numeric 'start' and 'end'
            keys; modified in place.
        generated_durations: per-segment durations (seconds) of the
            generated audio; assumed same length as original_segments —
            a mismatch will surface as an error from the fallback loop.
        total_duration: total timeline length in seconds.

    Returns:
        The same list with 'start'/'end' updated (rounded to 3 decimals).
    """
    N = len(original_segments)
    # Nothing to align: bail out early instead of letting s[0]/s[N-1]
    # indexing raise inside the try and print a spurious failure warning.
    if N == 0:
        return original_segments

    d = np.array(generated_durations)
    # Midpoint of each original segment: the optimizer tries to keep the
    # generated audio centered where the original speech occurred.
    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])

    try:
        s = cp.Variable(N)  # optimized start time of each segment
        # Keep each segment's center (s_i + d_i/2) close to the original midpoint.
        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))

        constraints = [s[0] >= 0]
        for i in range(N - 1):
            # Segments must stay in order and must not overlap.
            constraints.append(s[i] + d[i] <= s[i + 1])
        # The last segment ends exactly at the end of the timeline.
        constraints.append(s[N - 1] + d[N - 1] == total_duration)

        problem = cp.Problem(objective, constraints)
        problem.solve()

        if s.value is None:
            raise ValueError("Solver failed")

        for i in range(N):
            original_segments[i]['start'] = round(s.value[i], 3)
            original_segments[i]['end'] = round(s.value[i] + d[i], 3)

    except Exception as e:
        # Broad catch is deliberate: any solver error (infeasible problem,
        # missing backend, etc.) degrades to a simple sequential layout
        # rather than aborting the whole pipeline.
        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")

        # NOTE(review): the greedy layout ignores total_duration, so if
        # sum(generated_durations) > total_duration the last segments will
        # extend past the video end — confirm downstream clipping handles this.
        current_time = 0.0
        for i in range(N):
            original_segments[i]['start'] = round(current_time, 3)
            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
            current_time += generated_durations[i]

    return original_segments
|
| 426 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 427 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 428 |
error_message = None
|
|
|
|
| 435 |
txt_clip = None
|
| 436 |
|
| 437 |
audio_segment = None
|
| 438 |
+
actual_duration = 0.0
|
| 439 |
if process_mode > 1:
|
| 440 |
try:
|
| 441 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
|
|
| 444 |
|
| 445 |
speaker = entry.get("speaker", "default")
|
| 446 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 447 |
+
|
|
|
|
| 448 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 449 |
+
|
| 450 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 451 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 452 |
else:
|
|
|
|
| 456 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 457 |
|
| 458 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 459 |
+
actual_duration = audio_clip.duration
|
| 460 |
|
| 461 |
+
audio_segment = audio_clip # Do not set start here, alignment happens later
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
except Exception as e:
|
| 464 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
|
|
| 466 |
error_message = error_message + " | " + err if error_message else err
|
| 467 |
audio_segment = None
|
| 468 |
|
| 469 |
+
return i, txt_clip, audio_segment, actual_duration, error_message
|
| 470 |
+
|
| 471 |
+
|
| 472 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
| 473 |
+
|
| 474 |
video = VideoFileClip(video_path)
|
| 475 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 476 |
|
| 477 |
text_clips = []
|
| 478 |
audio_segments = []
|
| 479 |
+
actual_durations = []
|
| 480 |
error_messages = []
|
| 481 |
+
|
| 482 |
if process_mode == 3:
|
| 483 |
global tts_model
|
| 484 |
if tts_model is None:
|
| 485 |
try:
|
| 486 |
print("🔄 Loading XTTS model...")
|
| 487 |
+
from TTS.api import TTS
|
| 488 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 489 |
print("✅ XTTS model loaded successfully.")
|
| 490 |
except Exception as e:
|
| 491 |
print("❌ Error loading XTTS model:")
|
| 492 |
traceback.print_exc()
|
| 493 |
return f"Error loading XTTS model: {e}"
|
|
|
|
| 494 |
|
| 495 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 496 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
|
|
| 499 |
results = []
|
| 500 |
for future in concurrent.futures.as_completed(futures):
|
| 501 |
try:
|
| 502 |
+
i, txt_clip, audio_segment, actual_duration, error = future.result()
|
| 503 |
+
results.append((i, txt_clip, audio_segment, actual_duration))
|
| 504 |
if error:
|
| 505 |
error_messages.append(f"[Entry {i}] {error}")
|
| 506 |
except Exception as e:
|
| 507 |
err = f"❌ Unexpected error in future result: {e}"
|
|
|
|
| 508 |
error_messages.append(err)
|
| 509 |
|
|
|
|
| 510 |
results.sort(key=lambda x: x[0])
|
| 511 |
+
text_clips = [clip for _, clip, _, _ in results if clip]
|
| 512 |
+
generated_durations = [dur for _, _, _, dur in results if dur > 0]
|
| 513 |
+
|
| 514 |
+
# Align using optimization (modifies translated_json in-place)
|
| 515 |
+
translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
|
| 516 |
+
|
| 517 |
+
# Set aligned timings
|
| 518 |
+
audio_segments = []
|
| 519 |
+
for i, entry in enumerate(translated_json):
|
| 520 |
+
segment = results[i][2] # AudioFileClip
|
| 521 |
+
if segment:
|
| 522 |
+
segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
|
| 523 |
+
audio_segments.append(segment)
|
| 524 |
|
| 525 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 526 |
|
| 527 |
+
if process_mode > 1 and audio_segments:
|
| 528 |
try:
|
| 529 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 530 |
|
| 531 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 532 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 533 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
|
|
|
|
|
|
| 534 |
else:
|
| 535 |
final_audio = voice_audio
|
|
|
|
| 536 |
|
| 537 |
final_video = final_video.set_audio(final_audio)
|
| 538 |
|
| 539 |
except Exception as e:
|
| 540 |
+
print(f"❌ Failed to set audio: {e}")
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
+
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
return error_messages
|
| 545 |
|