Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import numpy as np
|
|
|
|
| 2 |
import re
|
| 3 |
import concurrent.futures
|
| 4 |
import gradio as gr
|
|
@@ -382,6 +383,46 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
|
|
| 382 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 383 |
return None
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 386 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 387 |
error_message = None
|
|
@@ -394,6 +435,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 394 |
txt_clip = None
|
| 395 |
|
| 396 |
audio_segment = None
|
|
|
|
| 397 |
if process_mode > 1:
|
| 398 |
try:
|
| 399 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
@@ -402,10 +444,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 402 |
|
| 403 |
speaker = entry.get("speaker", "default")
|
| 404 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 405 |
-
|
| 406 |
-
# Assume this is the list of supported languages for the TTS model
|
| 407 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 408 |
-
|
| 409 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 410 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 411 |
else:
|
|
@@ -415,14 +456,9 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 415 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 416 |
|
| 417 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 418 |
-
|
| 419 |
|
| 420 |
-
|
| 421 |
-
silence_duration = desired_duration - audio_clip.duration
|
| 422 |
-
audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
|
| 423 |
-
logger.info(f"Padded audio with {silence_duration} seconds of silence.")
|
| 424 |
-
|
| 425 |
-
audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
|
| 426 |
|
| 427 |
except Exception as e:
|
| 428 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
@@ -430,28 +466,31 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 430 |
error_message = error_message + " | " + err if error_message else err
|
| 431 |
audio_segment = None
|
| 432 |
|
| 433 |
-
return i, txt_clip, audio_segment, error_message
|
| 434 |
-
|
|
|
|
| 435 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
|
|
|
| 436 |
video = VideoFileClip(video_path)
|
| 437 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 438 |
|
| 439 |
text_clips = []
|
| 440 |
audio_segments = []
|
|
|
|
| 441 |
error_messages = []
|
| 442 |
-
|
| 443 |
if process_mode == 3:
|
| 444 |
global tts_model
|
| 445 |
if tts_model is None:
|
| 446 |
try:
|
| 447 |
print("🔄 Loading XTTS model...")
|
|
|
|
| 448 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 449 |
print("✅ XTTS model loaded successfully.")
|
| 450 |
except Exception as e:
|
| 451 |
print("❌ Error loading XTTS model:")
|
| 452 |
traceback.print_exc()
|
| 453 |
return f"Error loading XTTS model: {e}"
|
| 454 |
-
## Need to implement backup option.
|
| 455 |
|
| 456 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 457 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
@@ -460,50 +499,47 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
|
|
| 460 |
results = []
|
| 461 |
for future in concurrent.futures.as_completed(futures):
|
| 462 |
try:
|
| 463 |
-
i, txt_clip, audio_segment, error = future.result()
|
| 464 |
-
results.append((i, txt_clip, audio_segment))
|
| 465 |
if error:
|
| 466 |
error_messages.append(f"[Entry {i}] {error}")
|
| 467 |
except Exception as e:
|
| 468 |
err = f"❌ Unexpected error in future result: {e}"
|
| 469 |
-
logger.error(err)
|
| 470 |
error_messages.append(err)
|
| 471 |
|
| 472 |
-
# Sort by entry index to ensure order
|
| 473 |
results.sort(key=lambda x: x[0])
|
| 474 |
-
text_clips = [clip for _, clip, _ in results if clip]
|
| 475 |
-
if
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
|
| 478 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 479 |
|
| 480 |
-
if process_mode>1 and audio_segments:
|
| 481 |
try:
|
| 482 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 483 |
|
| 484 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 485 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 486 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 487 |
-
# final_audio = voice_audio
|
| 488 |
-
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 489 |
else:
|
| 490 |
final_audio = voice_audio
|
| 491 |
-
logger.info("⚠️ No background audio found. Using voiceover only.")
|
| 492 |
|
| 493 |
final_video = final_video.set_audio(final_audio)
|
| 494 |
|
| 495 |
except Exception as e:
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
logger.info(f"Saving the final video to: {output_path}")
|
| 499 |
-
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
| 500 |
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
if error_messages:
|
| 504 |
-
logger.warning("⚠️ Errors encountered during processing:")
|
| 505 |
-
for msg in error_messages:
|
| 506 |
-
logger.warning(msg)
|
| 507 |
|
| 508 |
return error_messages
|
| 509 |
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
+
import cvxpy as cp
|
| 3 |
import re
|
| 4 |
import concurrent.futures
|
| 5 |
import gradio as gr
|
|
|
|
| 383 |
logger.error(f"\u274c Failed to create subtitle clip: {e}")
|
| 384 |
return None
|
| 385 |
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
def solve_optimal_alignment(original_segments, generated_durations, total_duration):
    """Align generated speech segments to the original timeline.

    Solves a least-squares problem (via cvxpy) that keeps each generated
    segment centered on its original segment's midpoint while enforcing
    ordering, non-overlap, and an exact fit within ``total_duration``.
    If the solver is unavailable or fails, falls back to a greedy
    back-to-back layout starting at t=0.

    Args:
        original_segments: list of dicts with numeric 'start' and 'end'
            keys; modified in place.
        generated_durations: per-segment durations (seconds) of the
            generated audio; assumed same length as original_segments —
            a mismatch will surface as an error from the fallback loop.
        total_duration: total timeline length in seconds.

    Returns:
        The same list with 'start'/'end' updated (rounded to 3 decimals).
    """
    N = len(original_segments)
    # Nothing to align: bail out early instead of letting s[0]/s[N-1]
    # indexing raise inside the try and print a spurious failure warning.
    if N == 0:
        return original_segments

    d = np.array(generated_durations)
    # Midpoint of each original segment: the optimizer tries to keep the
    # generated audio centered where the original speech occurred.
    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])

    try:
        s = cp.Variable(N)  # optimized start time of each segment
        # Keep each segment's center (s_i + d_i/2) close to the original midpoint.
        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))

        constraints = [s[0] >= 0]
        for i in range(N - 1):
            # Segments must stay in order and must not overlap.
            constraints.append(s[i] + d[i] <= s[i + 1])
        # The last segment ends exactly at the end of the timeline.
        constraints.append(s[N - 1] + d[N - 1] == total_duration)

        problem = cp.Problem(objective, constraints)
        problem.solve()

        if s.value is None:
            raise ValueError("Solver failed")

        for i in range(N):
            original_segments[i]['start'] = round(s.value[i], 3)
            original_segments[i]['end'] = round(s.value[i] + d[i], 3)

    except Exception as e:
        # Broad catch is deliberate: any solver error (infeasible problem,
        # missing backend, etc.) degrades to a simple sequential layout
        # rather than aborting the whole pipeline.
        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")

        # NOTE(review): the greedy layout ignores total_duration, so if
        # sum(generated_durations) > total_duration the last segments will
        # extend past the video end — confirm downstream clipping handles this.
        current_time = 0.0
        for i in range(N):
            original_segments[i]['start'] = round(current_time, 3)
            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
            current_time += generated_durations[i]

    return original_segments
|
| 426 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
| 427 |
logger.debug(f"Processing entry {i}: {entry}")
|
| 428 |
error_message = None
|
|
|
|
| 435 |
txt_clip = None
|
| 436 |
|
| 437 |
audio_segment = None
|
| 438 |
+
actual_duration = 0.0
|
| 439 |
if process_mode > 1:
|
| 440 |
try:
|
| 441 |
segment_audio_path = f"segment_{i}_voiceover.wav"
|
|
|
|
| 444 |
|
| 445 |
speaker = entry.get("speaker", "default")
|
| 446 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 447 |
+
|
|
|
|
| 448 |
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 449 |
+
|
| 450 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 451 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 452 |
else:
|
|
|
|
| 456 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
| 457 |
|
| 458 |
audio_clip = AudioFileClip(segment_audio_path)
|
| 459 |
+
actual_duration = audio_clip.duration
|
| 460 |
|
| 461 |
+
audio_segment = audio_clip # Do not set start here, alignment happens later
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
except Exception as e:
|
| 464 |
err = f"❌ Failed to generate audio segment for entry {i}: {e}"
|
|
|
|
| 466 |
error_message = error_message + " | " + err if error_message else err
|
| 467 |
audio_segment = None
|
| 468 |
|
| 469 |
+
return i, txt_clip, audio_segment, actual_duration, error_message
|
| 470 |
+
|
| 471 |
+
|
| 472 |
def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
|
| 473 |
+
|
| 474 |
video = VideoFileClip(video_path)
|
| 475 |
font_path = "./NotoSansSC-Regular.ttf"
|
| 476 |
|
| 477 |
text_clips = []
|
| 478 |
audio_segments = []
|
| 479 |
+
actual_durations = []
|
| 480 |
error_messages = []
|
| 481 |
+
|
| 482 |
if process_mode == 3:
|
| 483 |
global tts_model
|
| 484 |
if tts_model is None:
|
| 485 |
try:
|
| 486 |
print("🔄 Loading XTTS model...")
|
| 487 |
+
from TTS.api import TTS
|
| 488 |
tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
|
| 489 |
print("✅ XTTS model loaded successfully.")
|
| 490 |
except Exception as e:
|
| 491 |
print("❌ Error loading XTTS model:")
|
| 492 |
traceback.print_exc()
|
| 493 |
return f"Error loading XTTS model: {e}"
|
|
|
|
| 494 |
|
| 495 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 496 |
futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
|
|
|
|
| 499 |
results = []
|
| 500 |
for future in concurrent.futures.as_completed(futures):
|
| 501 |
try:
|
| 502 |
+
i, txt_clip, audio_segment, actual_duration, error = future.result()
|
| 503 |
+
results.append((i, txt_clip, audio_segment, actual_duration))
|
| 504 |
if error:
|
| 505 |
error_messages.append(f"[Entry {i}] {error}")
|
| 506 |
except Exception as e:
|
| 507 |
err = f"❌ Unexpected error in future result: {e}"
|
|
|
|
| 508 |
error_messages.append(err)
|
| 509 |
|
|
|
|
| 510 |
results.sort(key=lambda x: x[0])
|
| 511 |
+
text_clips = [clip for _, clip, _, _ in results if clip]
|
| 512 |
+
generated_durations = [dur for _, _, _, dur in results if dur > 0]
|
| 513 |
+
|
| 514 |
+
# Align using optimization (modifies translated_json in-place)
|
| 515 |
+
translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
|
| 516 |
+
|
| 517 |
+
# Set aligned timings
|
| 518 |
+
audio_segments = []
|
| 519 |
+
for i, entry in enumerate(translated_json):
|
| 520 |
+
segment = results[i][2] # AudioFileClip
|
| 521 |
+
if segment:
|
| 522 |
+
segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
|
| 523 |
+
audio_segments.append(segment)
|
| 524 |
|
| 525 |
final_video = CompositeVideoClip([video] + text_clips)
|
| 526 |
|
| 527 |
+
if process_mode > 1 and audio_segments:
|
| 528 |
try:
|
| 529 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 530 |
|
| 531 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 532 |
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 533 |
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
|
|
|
|
|
|
| 534 |
else:
|
| 535 |
final_audio = voice_audio
|
|
|
|
| 536 |
|
| 537 |
final_video = final_video.set_audio(final_audio)
|
| 538 |
|
| 539 |
except Exception as e:
|
| 540 |
+
print(f"❌ Failed to set audio: {e}")
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
+
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
return error_messages
|
| 545 |
|