Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import numpy as np
|
| 2 |
-
import cvxpy as cp
|
| 3 |
import re
|
| 4 |
import concurrent.futures
|
| 5 |
import gradio as gr
|
|
@@ -79,29 +78,24 @@ css = """
|
|
| 79 |
.dataframe-container tr {
|
| 80 |
height: 50px !important;
|
| 81 |
}
|
| 82 |
-
|
| 83 |
/* Ensure text wrapping and prevent overflow */
|
| 84 |
.dataframe-container td {
|
| 85 |
white-space: normal !important;
|
| 86 |
word-break: break-word !important;
|
| 87 |
}
|
| 88 |
-
|
| 89 |
/* Set column widths */
|
| 90 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
|
| 91 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
|
| 92 |
width: 6%; /* Start column */
|
| 93 |
}
|
| 94 |
-
|
| 95 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
|
| 96 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
|
| 97 |
width: 47%; /* Original text */
|
| 98 |
}
|
| 99 |
-
|
| 100 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
|
| 101 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
|
| 102 |
width: 47%; /* Translated text */
|
| 103 |
}
|
| 104 |
-
|
| 105 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
|
| 106 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
|
| 107 |
display: none !important;
|
|
@@ -173,7 +167,7 @@ def transcribe_video_with_speakers(video_path):
|
|
| 173 |
logger.info("WhisperX model loaded")
|
| 174 |
|
| 175 |
# Transcribe
|
| 176 |
-
result = model.transcribe(audio_path, chunk_size=
|
| 177 |
logger.info("Audio transcription completed")
|
| 178 |
|
| 179 |
# Get the detected language
|
|
@@ -238,7 +232,6 @@ def transcribe_video_with_speakers(video_path):
|
|
| 238 |
def get_translation_model(source_language, target_language):
|
| 239 |
"""
|
| 240 |
Get the translation model based on the source and target language.
|
| 241 |
-
|
| 242 |
Parameters:
|
| 243 |
- target_language (str): The language to translate the content into (e.g., 'es', 'fr').
|
| 244 |
- source_language (str): The language of the input content (e.g., 'en' for English).
|
|
@@ -383,44 +376,6 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
| 383 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 384 |
return None
|
| 385 |
|
| 386 |
-
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
|
| 387 |
-
"""
|
| 388 |
-
Robust version: Aligns generated speech segments, falls back to greedy allocation if solver fails.
|
| 389 |
-
Modifies and returns the translated_json with updated 'start' and 'end'.
|
| 390 |
-
"""
|
| 391 |
-
N = len(original_segments)
|
| 392 |
-
d = np.array(generated_durations)
|
| 393 |
-
m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
|
| 394 |
-
|
| 395 |
-
try:
|
| 396 |
-
s = cp.Variable(N)
|
| 397 |
-
objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
|
| 398 |
-
|
| 399 |
-
constraints = [s[0] >= 0]
|
| 400 |
-
for i in range(N - 1):
|
| 401 |
-
constraints.append(s[i] + d[i] <= s[i + 1])
|
| 402 |
-
constraints.append(s[N - 1] + d[N - 1] == total_duration)
|
| 403 |
-
|
| 404 |
-
problem = cp.Problem(objective, constraints)
|
| 405 |
-
problem.solve()
|
| 406 |
-
|
| 407 |
-
if s.value is None:
|
| 408 |
-
raise ValueError("Solver failed")
|
| 409 |
-
|
| 410 |
-
for i in range(N):
|
| 411 |
-
original_segments[i]['start'] = round(s.value[i], 3)
|
| 412 |
-
original_segments[i]['end'] = round(s.value[i] + d[i], 3)
|
| 413 |
-
|
| 414 |
-
except Exception as e:
|
| 415 |
-
print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
|
| 416 |
-
|
| 417 |
-
current_time = 0.0
|
| 418 |
-
for i in range(N):
|
| 419 |
-
original_segments[i]['start'] = round(current_time, 3)
|
| 420 |
-
original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
|
| 421 |
-
current_time += generated_durations[i]
|
| 422 |
-
|
| 423 |
-
return original_segments
|
| 424 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 425 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 426 |
error_message = None
|
|
@@ -433,7 +388,6 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 433 |
txt_clip = None
|
| 434 |
|
| 435 |
audio_segment = None
|
| 436 |
-
actual_duration = 0.0
|
| 437 |
if process_mode > 1:
|
| 438 |
try:
|
| 439 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
@@ -442,9 +396,10 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 442 |
|
| 443 |
speaker = entry.get("speaker", "default")
|
| 444 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 445 |
-
|
|
|
|
| 446 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 447 |
-
|
| 448 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 449 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 450 |
else:
|
|
@@ -454,9 +409,14 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 454 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 455 |
|
| 456 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
-
audio_segment = audio_clip
|
| 460 |
|
| 461 |
except Exception as e:
|
| 462 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
@@ -464,31 +424,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 464 |
error_message = error_message + " | " + err if error_message else err
|
| 465 |
audio_segment = None
|
| 466 |
|
| 467 |
-
return i, txt_clip, audio_segment,
|
| 468 |
-
|
| 469 |
-
|
| 470 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
| 471 |
-
|
| 472 |
video = VideoFileClip(video_path)
|
| 473 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 474 |
|
| 475 |
text_clips = []
|
| 476 |
audio_segments = []
|
| 477 |
-
actual_durations = []
|
| 478 |
error_messages = []
|
| 479 |
-
|
| 480 |
if process_mode == 3:
|
| 481 |
global tts_model
|
| 482 |
if tts_model is None:
|
| 483 |
try:
|
| 484 |
print("🔄 Loading XTTS model...")
|
| 485 |
-
from TTS.api import TTS
|
| 486 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 487 |
print("✅ XTTS model loaded successfully.")
|
| 488 |
except Exception as e:
|
| 489 |
print("❌ Error loading XTTS model:")
|
| 490 |
traceback.print_exc()
|
| 491 |
return f"Error loading XTTS model: {e}"
|
|
|
|
| 492 |
|
| 493 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 494 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
@@ -497,48 +454,51 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
|
|
| 497 |
results = []
|
| 498 |
for future in concurrent.futures.as_completed(futures):
|
| 499 |
try:
|
| 500 |
-
i, txt_clip, audio_segment,
|
| 501 |
-
results.append((i, txt_clip, audio_segment
|
| 502 |
if error:
|
| 503 |
error_messages.append(f"[Entry {i}] {error}")
|
| 504 |
except Exception as e:
|
| 505 |
err = f"❌ Unexpected error in future result: {e}"
|
|
|
|
| 506 |
error_messages.append(err)
|
| 507 |
|
|
|
|
| 508 |
results.sort(key=lambda x: x[0])
|
| 509 |
-
text_clips = [clip for _, clip, _
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
# Align using optimization (modifies translated_json in-place)
|
| 513 |
-
translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
|
| 514 |
-
|
| 515 |
-
# Set aligned timings
|
| 516 |
-
audio_segments = []
|
| 517 |
-
for i, entry in enumerate(translated_json):
|
| 518 |
-
segment = results[i][2] # AudioFileClip
|
| 519 |
-
if segment:
|
| 520 |
-
segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
|
| 521 |
-
audio_segments.append(segment)
|
| 522 |
|
| 523 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 524 |
|
| 525 |
-
if process_mode
|
| 526 |
try:
|
| 527 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 528 |
|
| 529 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 530 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 531 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
|
|
|
|
|
|
| 532 |
else:
|
| 533 |
final_audio = voice_audio
|
|
|
|
| 534 |
|
| 535 |
final_video = final_video.set_audio(final_audio)
|
| 536 |
|
| 537 |
except Exception as e:
|
| 538 |
-
|
| 539 |
-
|
|
|
|
| 540 |
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
return error_messages
|
| 543 |
|
| 544 |
def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
|
|
@@ -737,5 +697,4 @@ def build_interface():
|
|
| 737 |
|
| 738 |
tts_model = None
|
| 739 |
# Launch the Gradio interface
|
| 740 |
-
demo = build_interface()
|
| 741 |
-
demo.launch()
|
|
|
|
| 1 |
import numpy as np
|
|
|
|
| 2 |
import re
|
| 3 |
import concurrent.futures
|
| 4 |
import gradio as gr
|
|
|
|
| 78 |
.dataframe-container tr {
|
| 79 |
height: 50px !important;
|
| 80 |
}
|
|
|
|
| 81 |
/* Ensure text wrapping and prevent overflow */
|
| 82 |
.dataframe-container td {
|
| 83 |
white-space: normal !important;
|
| 84 |
word-break: break-word !important;
|
| 85 |
}
|
|
|
|
| 86 |
/* Set column widths */
|
| 87 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
|
| 88 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
|
| 89 |
width: 6%; /* Start column */
|
| 90 |
}
|
|
|
|
| 91 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
|
| 92 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
|
| 93 |
width: 47%; /* Original text */
|
| 94 |
}
|
|
|
|
| 95 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
|
| 96 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
|
| 97 |
width: 47%; /* Translated text */
|
| 98 |
}
|
|
|
|
| 99 |
[data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
|
| 100 |
[data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
|
| 101 |
display: none !important;
|
|
|
|
| 167 |
logger.info("WhisperX model loaded")
|
| 168 |
|
| 169 |
# Transcribe
|
| 170 |
+
result = model.transcribe(audio_path, chunk_size=10, print_progress = True)
|
| 171 |
logger.info("Audio transcription completed")
|
| 172 |
|
| 173 |
# Get the detected language
|
|
|
|
| 232 |
def get_translation_model(source_language, target_language):
|
| 233 |
"""
|
| 234 |
Get the translation model based on the source and target language.
|
|
|
|
| 235 |
Parameters:
|
| 236 |
- target_language (str): The language to translate the content into (e.g., 'es', 'fr').
|
| 237 |
- source_language (str): The language of the input content (e.g., 'en' for English).
|
|
|
|
| 376 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 377 |
return None
|
| 378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 380 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 381 |
error_message = None
|
|
|
|
| 388 |
txt_clip = None
|
| 389 |
|
| 390 |
audio_segment = None
|
|
|
|
| 391 |
if process_mode > 1:
|
| 392 |
try:
|
| 393 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
|
|
| 396 |
|
| 397 |
speaker = entry.get("speaker", "default")
|
| 398 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 399 |
+
|
| 400 |
+
# Assume this is the list of supported languages for the TTS model
|
| 401 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 402 |
+
|
| 403 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 404 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 405 |
else:
|
|
|
|
| 409 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 410 |
|
| 411 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 412 |
+
logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
|
| 413 |
+
|
| 414 |
+
if audio_clip.duration < desired_duration:
|
| 415 |
+
silence_duration = desired_duration - audio_clip.duration
|
| 416 |
+
audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
|
| 417 |
+
logger.info(f"Padded audio with {silence_duration} seconds of silence.")
|
| 418 |
|
| 419 |
+
audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
|
| 420 |
|
| 421 |
except Exception as e:
|
| 422 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
|
|
| 424 |
error_message = error_message + " | " + err if error_message else err
|
| 425 |
audio_segment = None
|
| 426 |
|
| 427 |
+
return i, txt_clip, audio_segment, error_message
|
| 428 |
+
|
|
|
|
| 429 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
|
|
|
| 430 |
video = VideoFileClip(video_path)
|
| 431 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 432 |
|
| 433 |
text_clips = []
|
| 434 |
audio_segments = []
|
|
|
|
| 435 |
error_messages = []
|
| 436 |
+
|
| 437 |
if process_mode == 3:
|
| 438 |
global tts_model
|
| 439 |
if tts_model is None:
|
| 440 |
try:
|
| 441 |
print("🔄 Loading XTTS model...")
|
|
|
|
| 442 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 443 |
print("✅ XTTS model loaded successfully.")
|
| 444 |
except Exception as e:
|
| 445 |
print("❌ Error loading XTTS model:")
|
| 446 |
traceback.print_exc()
|
| 447 |
return f"Error loading XTTS model: {e}"
|
| 448 |
+
## Need to implement a backup option.
|
| 449 |
|
| 450 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 451 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
|
|
| 454 |
results = []
|
| 455 |
for future in concurrent.futures.as_completed(futures):
|
| 456 |
try:
|
| 457 |
+
i, txt_clip, audio_segment, error = future.result()
|
| 458 |
+
results.append((i, txt_clip, audio_segment))
|
| 459 |
if error:
|
| 460 |
error_messages.append(f"[Entry {i}] {error}")
|
| 461 |
except Exception as e:
|
| 462 |
err = f"❌ Unexpected error in future result: {e}"
|
| 463 |
+
logger.error(err)
|
| 464 |
error_messages.append(err)
|
| 465 |
|
| 466 |
+
# Sort by entry index to ensure order
|
| 467 |
results.sort(key=lambda x: x[0])
|
| 468 |
+
text_clips = [clip for _, clip, _ in results if clip]
|
| 469 |
+
if process_mode>1:
|
| 470 |
+
audio_segments = [segment for _, _, segment in results if segment]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 473 |
|
| 474 |
+
if process_mode>1 and audio_segments:
|
| 475 |
try:
|
| 476 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 477 |
|
| 478 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 479 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 480 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 481 |
+
# final_audio = voice_audio
|
| 482 |
+
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 483 |
else:
|
| 484 |
final_audio = voice_audio
|
| 485 |
+
logger.info("⚠️ No background audio found. Using voiceover only.")
|
| 486 |
|
| 487 |
final_video = final_video.set_audio(final_audio)
|
| 488 |
|
| 489 |
except Exception as e:
|
| 490 |
+
logger.error(f"❌ Failed to set audio: {e}")
|
| 491 |
+
|
| 492 |
+
logger.info(f"Saving the final video to: {output_path}")
|
| 493 |
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 494 |
|
| 495 |
+
logger.info("Video processing completed successfully.")
|
| 496 |
+
|
| 497 |
+
if error_messages:
|
| 498 |
+
logger.warning("⚠️ Errors encountered during processing:")
|
| 499 |
+
for msg in error_messages:
|
| 500 |
+
logger.warning(msg)
|
| 501 |
+
|
| 502 |
return error_messages
|
| 503 |
|
| 504 |
def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
|
|
|
|
| 697 |
|
| 698 |
tts_model = None
|
| 699 |
# Launch the Gradio interface
|
| 700 |
+
demo = build_interface()
|
|
|