Spaces:

Curify
/

studio_V1

Sleeping

App Files Files Community

qqwjq1981 committed on Apr 8, 2025

Commit

40e96a3

verified ·

1 Parent(s): bfa4d77

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -78

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import numpy as np
-import cvxpy as cp
 import re
 import concurrent.futures
 import gradio as gr
@@ -79,29 +78,24 @@ css = """
 .dataframe-container tr {
     height: 50px !important;
 }
 /* Ensure text wrapping and prevent overflow */
 .dataframe-container td {
     white-space: normal !important;
     word-break: break-word !important;
 }
 /* Set column widths */
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
     width: 6%; /* Start column */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
     width: 47%; /* Original text */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
     width: 47%; /* Translated text */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
     display: none !important;
@@ -173,7 +167,7 @@ def transcribe_video_with_speakers(video_path):
         logger.info("WhisperX model loaded")
         # Transcribe
-        result = model.transcribe(audio_path, chunk_size=6, print_progress = True)
         logger.info("Audio transcription completed")
         # Get the detected language
@@ -238,7 +232,6 @@ def transcribe_video_with_speakers(video_path):
 def get_translation_model(source_language, target_language):
     """
     Get the translation model based on the source and target language.
     Parameters:
     - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
     - source_language (str): The language of the input content (default is 'en' for English).
@@ -383,44 +376,6 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None
-def solve_optimal_alignment(original_segments, generated_durations, total_duration):
-    """
-    Robust version: Aligns generated speech segments, falls back to greedy allocation if solver fails.
-    Modifies and returns the translated_json with updated 'start' and 'end'.
-    """
-    N = len(original_segments)
-    d = np.array(generated_durations)
-    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
-    try:
-        s = cp.Variable(N)
-        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
-        constraints = [s[0] >= 0]
-        for i in range(N - 1):
-            constraints.append(s[i] + d[i] <= s[i + 1])
-        constraints.append(s[N - 1] + d[N - 1] == total_duration)
-        problem = cp.Problem(objective, constraints)
-        problem.solve()
-        if s.value is None:
-            raise ValueError("Solver failed")
-        for i in range(N):
-            original_segments[i]['start'] = round(s.value[i], 3)
-            original_segments[i]['end'] = round(s.value[i] + d[i], 3)
-    except Exception as e:
-        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
-        current_time = 0.0
-        for i in range(N):
-            original_segments[i]['start'] = round(current_time, 3)
-            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
-            current_time += generated_durations[i]
-    return original_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
@@ -433,7 +388,6 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
         txt_clip = None
     audio_segment = None
-    actual_duration = 0.0
     if process_mode > 1:
         try:
             segment_audio_path = f"segment_{i}_voiceover.wav"
@@ -442,9 +396,10 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
             speaker = entry.get("speaker", "default")
             speaker_wav_path = f"speaker_{speaker}_sample.wav"
             supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
             if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
                 generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
             else:
@@ -454,9 +409,14 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
                 raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
             audio_clip = AudioFileClip(segment_audio_path)
-            actual_duration = audio_clip.duration
-            audio_segment = audio_clip  # Do not set start here, alignment happens later
         except Exception as e:
             err = f"❌ Failed to generate audio segment for entry {i}: {e}"
@@ -464,31 +424,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
             error_message = error_message + " | " + err if error_message else err
             audio_segment = None
-    return i, txt_clip, audio_segment, actual_duration, error_message
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
     video = VideoFileClip(video_path)
     font_path = "./NotoSansSC-Regular.ttf"
     text_clips = []
     audio_segments = []
-    actual_durations = []
     error_messages = []
     if process_mode == 3:
         global tts_model
         if tts_model is None:
             try:
                 print("🔄 Loading XTTS model...")
-                from TTS.api import TTS
                 tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
                 print("✅ XTTS model loaded successfully.")
             except Exception as e:
                 print("❌ Error loading XTTS model:")
                 traceback.print_exc()
                 return f"Error loading XTTS model: {e}"
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
@@ -497,48 +454,51 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
         results = []
         for future in concurrent.futures.as_completed(futures):
             try:
-                i, txt_clip, audio_segment, actual_duration, error = future.result()
-                results.append((i, txt_clip, audio_segment, actual_duration))
                 if error:
                     error_messages.append(f"[Entry {i}] {error}")
             except Exception as e:
                 err = f"❌ Unexpected error in future result: {e}"
                 error_messages.append(err)
     results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _, _ in results if clip]
-    generated_durations = [dur for _, _, _, dur in results if dur > 0]
-    # Align using optimization (modifies translated_json in-place)
-    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-    # Set aligned timings
-    audio_segments = []
-    for i, entry in enumerate(translated_json):
-        segment = results[i][2]  # AudioFileClip
-        if segment:
-            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
-            audio_segments.append(segment)
     final_video = CompositeVideoClip([video] + text_clips)
-    if process_mode > 1 and audio_segments:
         try:
             voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
             if background_audio_path and os.path.exists(background_audio_path):
                 background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
                 final_audio = CompositeAudioClip([voice_audio, background_audio])
             else:
                 final_audio = voice_audio
             final_video = final_video.set_audio(final_audio)
         except Exception as e:
-            print(f"❌ Failed to set audio: {e}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
     return error_messages
 def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
@@ -737,5 +697,4 @@ def build_interface():
 tts_model = None
 # Launch the Gradio interface
-demo = build_interface()
-demo.launch()

 import numpy as np
 import re
 import concurrent.futures
 import gradio as gr
 .dataframe-container tr {
     height: 50px !important;
 }
 /* Ensure text wrapping and prevent overflow */
 .dataframe-container td {
     white-space: normal !important;
     word-break: break-word !important;
 }
 /* Set column widths */
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
     width: 6%; /* Start column */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
     width: 47%; /* Original text */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
     width: 47%; /* Translated text */
 }
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
     display: none !important;
         logger.info("WhisperX model loaded")
         # Transcribe
+        result = model.transcribe(audio_path, chunk_size=10, print_progress = True)
         logger.info("Audio transcription completed")
         # Get the detected language
 def get_translation_model(source_language, target_language):
     """
     Get the translation model based on the source and target language.
     Parameters:
     - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
     - source_language (str): The language of the input content (default is 'en' for English).
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
         txt_clip = None
     audio_segment = None
     if process_mode > 1:
         try:
             segment_audio_path = f"segment_{i}_voiceover.wav"
             speaker = entry.get("speaker", "default")
             speaker_wav_path = f"speaker_{speaker}_sample.wav"
+            # Assume this is the list of supported languages for the TTS model
             supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
             if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
                 generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
             else:
                 raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
             audio_clip = AudioFileClip(segment_audio_path)
+            logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
+            if audio_clip.duration < desired_duration:
+                silence_duration = desired_duration - audio_clip.duration
+                audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
+                logger.info(f"Padded audio with {silence_duration} seconds of silence.")
+            audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
         except Exception as e:
             err = f"❌ Failed to generate audio segment for entry {i}: {e}"
             error_message = error_message + " | " + err if error_message else err
             audio_segment = None
+    return i, txt_clip, audio_segment, error_message
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
     video = VideoFileClip(video_path)
     font_path = "./NotoSansSC-Regular.ttf"
     text_clips = []
     audio_segments = []
     error_messages = []
     if process_mode == 3:
         global tts_model
         if tts_model is None:
             try:
                 print("🔄 Loading XTTS model...")
                 tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
                 print("✅ XTTS model loaded successfully.")
             except Exception as e:
                 print("❌ Error loading XTTS model:")
                 traceback.print_exc()
                 return f"Error loading XTTS model: {e}"
+                ## Need to implement backup option.
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
         results = []
         for future in concurrent.futures.as_completed(futures):
             try:
+                i, txt_clip, audio_segment, error = future.result()
+                results.append((i, txt_clip, audio_segment))
                 if error:
                     error_messages.append(f"[Entry {i}] {error}")
             except Exception as e:
                 err = f"❌ Unexpected error in future result: {e}"
+                logger.error(err)
                 error_messages.append(err)
+    # Sort by entry index to ensure order
     results.sort(key=lambda x: x[0])
+    text_clips = [clip for _, clip, _ in results if clip]
+    if process_mode>1:
+        audio_segments = [segment for _, _, segment in results if segment]
     final_video = CompositeVideoClip([video] + text_clips)
+    if process_mode>1 and audio_segments:
         try:
             voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
             if background_audio_path and os.path.exists(background_audio_path):
                 background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
                 final_audio = CompositeAudioClip([voice_audio, background_audio])
+                # final_audio = voice_audio
+                logger.info("✅ Background audio loaded and merged with voiceover.")
             else:
                 final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")
             final_video = final_video.set_audio(final_audio)
         except Exception as e:
+            logger.error(f"❌ Failed to set audio: {e}")
+    logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
+    logger.info("Video processing completed successfully.")
+    if error_messages:
+        logger.warning("⚠️ Errors encountered during processing:")
+        for msg in error_messages:
+            logger.warning(msg)
     return error_messages
 def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
 tts_model = None
 # Launch the Gradio interface
+demo = build_interface()