Spaces:

Curify
/

studio_V1

Sleeping

App Files Files Community

qqwjq1981 committed on Mar 18, 2025

Commit

e1b0b64

verified ·

1 Parent(s): 0eccd9c

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -33

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import moviepy
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 from moviepy.editor import (
     VideoFileClip,
     TextClip,
     CompositeVideoClip,
@@ -16,6 +17,7 @@ from moviepy.editor import (
     concatenate_videoclips,
     concatenate_audioclips
 )
 from moviepy.audio.AudioClip import AudioArrayClip
 import subprocess
 import speech_recognition as sr
@@ -306,47 +308,83 @@ def update_translations(file, edited_table, mode):
     except Exception as e:
         raise ValueError(f"Error updating translations: {e}")
 def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
-    # Purpose: build the subtitle clip and, when add_voiceover is set, the voice-over
-    # audio segment for one entry; the result is returned as (i, txt_clip, audio_segment).
-    # Subtitle overlay: yellow text with a black stroke, font size video_height // 20,
-    # placed at the bottom for the entry's start-to-end interval at 0.8 opacity.
-    txt_clip = TextClip(
-        txt=entry["translated"],
-        font="./NotoSansSC-Regular.ttf",
-        color='yellow',
-        stroke_color='black',
-        stroke_width=2,
-        fontsize=int(video_height // 20),
-        method='label',
-    ).with_start(entry["start"]).with_duration(entry["end"] - entry["start"]).with_position(('bottom')).with_opacity(0.8)
     audio_segment = None
     if add_voiceover:
-        segment_audio_path = f"segment_{i}_voiceover.wav"
-        desired_duration = entry["end"] - entry["start"]
-        speaker_id = entry["speaker"]  # Speaker ID for this entry; raises KeyError if the key is absent.
-        speaker_wav_path = f"speaker_{speaker_id}_sample.wav" # Sample path derived from the speaker ID; the speaker_sample_paths argument is not consulted here.
-        generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
-        audio_clip = AudioFileClip(segment_audio_path)
-        # Debug aid: log every attribute name that dir() reports for the loaded clip.
-        logger.info("Methods in AudioFileClip:")
-        for method in dir(audio_clip):
-            logger.info(method)
-        # Log the actual clip duration next to the subtitle interval length for debugging.
-        logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
-        if audio_clip.duration < desired_duration:
-            # The generated audio is shorter than the subtitle interval: compute the gap to fill.
-            silence_duration = desired_duration - audio_clip.duration
-            # Append a silent clip of that length after the generated audio.
-            audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
-            logger.info(f"Padded audio with {silence_duration} seconds of silence.")
-        # Schedule the clip at the entry's start time and pin its duration to the subtitle interval.
-        audio_segment = audio_clip.with_start(entry["start"]).with_duration(desired_duration)
     return i, txt_clip, audio_segment

 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 from moviepy.editor import (
+    ImageClip,
     VideoFileClip,
     TextClip,
     CompositeVideoClip,
     concatenate_videoclips,
     concatenate_audioclips
 )
+from PIL import Image, ImageDraw, ImageFont
 from moviepy.audio.AudioClip import AudioArrayClip
 import subprocess
 import speech_recognition as sr
     except Exception as e:
         raise ValueError(f"Error updating translations: {e}")
+def create_subtitle_clip_pil(entry, video_width, video_height, font_path="./NotoSansSC-Regular.ttf"):
+    """
+    Creates a PIL-based ImageClip for subtitle text (no ImageMagick needed).
+    """
+    subtitle_font_size = int(video_height // 20)
+    subtitle_width = int(video_width * 0.8)
+    text = entry["translated"]
+    try:
+        font = ImageFont.truetype(font_path, subtitle_font_size)
+    except Exception as e:
+        print(f"⚠️ Could not load font from {font_path}, using default font: {e}")
+        font = ImageFont.load_default()
+    # Estimate text height using multiline
+    dummy_img = Image.new("RGBA", (subtitle_width, 1), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(dummy_img)
+    lines = []
+    line = ""
+    for word in text.split():
+        test_line = f"{line} {word}".strip()
+        w, _ = draw.textsize(test_line, font=font)
+        if w <= subtitle_width - 10:
+            line = test_line
+        else:
+            lines.append(line)
+            line = word
+    lines.append(line)
+    line_height = subtitle_font_size + 4
+    total_height = len(lines) * line_height + 10
+    img = Image.new("RGBA", (subtitle_width, total_height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(img)
+    for idx, l in enumerate(lines):
+        draw.text((5, 5 + idx * line_height), l, font=font, fill=(255, 255, 0, 255))
+    np_img = np.array(img)
+    txt_clip = ImageClip(np_img, ismask=False).set_position(("center", "bottom")) \
+        .set_start(entry["start"]).set_duration(entry["end"] - entry["start"]).set_opacity(0.8)
+    return txt_clip
 def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
+    try:
+        # Subtitle clip via PIL (robust, no ImageMagick needed)
+        txt_clip = create_subtitle_clip_pil(entry, video_width, video_height)
+    except Exception as e:
+        logger.error(f"❌ Failed to create subtitle clip for entry {i}: {e}")
+        txt_clip = None
     audio_segment = None
     if add_voiceover:
+        try:
+            segment_audio_path = f"segment_{i}_voiceover.wav"
+            desired_duration = entry["end"] - entry["start"]
+            speaker_id = entry.get("speaker", "default")
+            speaker_wav_path = speaker_sample_paths.get(speaker_id, None) if speaker_sample_paths else None
+            generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
+            audio_clip = AudioFileClip(segment_audio_path)
+            logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
+            if audio_clip.duration < desired_duration:
+                silence_duration = desired_duration - audio_clip.duration
+                audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
+                logger.info(f"Padded audio with {silence_duration:.2f}s silence.")
+            audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
+        except Exception as e:
+            logger.error(f"❌ Failed to generate audio segment for entry {i}: {e}")
+            audio_segment = None
     return i, txt_clip, audio_segment