Spaces:

vclmax
/

Element-8-Video

Running on Zero

App Files Files Community

Vicente Alvarez committed 5 days ago

Commit

cc800d1

1 Parent(s): 3b38a35

Add Whisper subtitles (elegant animated) + PNG watermark support - all CPU work, free

Browse files

Files changed (2) hide show

app.py +153 -6
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -396,6 +396,131 @@ def loop_clips_with_audio_track(clip_paths: list[str], audio_path: str) -> str:
         return clip_paths[0] if clip_paths else None
 @spaces.GPU(duration=90)
 @torch.inference_mode()
 def generate_video(
@@ -531,6 +656,8 @@ def full_generation_process(
     negative_prompt: str,
     blur_amount: int,
     remove_music: bool,
     audio_track,
     progress=gr.Progress(track_tqdm=True),
 ):
@@ -557,13 +684,30 @@ def full_generation_process(
     if audio_track and len(clips) > 1:
         print("[CPU] Looping clips to match audio duration...")
         final_video = loop_clips_with_audio_track(clips, audio_track)
-        return final_video, final_seed
     elif len(clips) == 1:
-        # Single clip - return it directly
-        return clips[0], final_seed
     else:
-        # Multiple clips, no audio - return first clip (could be gallery in future)
-        return clips[0], final_seed
 with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # cleanup: check every 1h, delete files >2h old
@@ -614,6 +758,9 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # c
                 with gr.Row():
                     blur_amount = gr.Number(label="Blur (0=off, 36=heavy)", value=0, precision=0)
                     remove_music = gr.Checkbox(label="Remove Music", value=False)
                 negative_prompt = gr.Textbox(
                     label="Negative Prompt",
                     value=DEFAULT_NEGATIVE_PROMPT,
@@ -674,7 +821,7 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # c
         inputs=[
             first_image, last_image, prompt1, prompt2, prompt3, duration, enhance_prompt,
             seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
-            audio_track,
         ],
         outputs=[output_video, seed],
     )

         return clip_paths[0] if clip_paths else None
def transcribe_with_whisper(video_path: str, model_size: str = "small") -> list[dict]:
    """Transcribe a media file's audio with Whisper.

    This is best-effort: any failure (the ``whisper`` package being
    unavailable, the model failing to load, the transcription itself
    failing) is logged with a traceback and reported as an empty list, so
    the caller can simply skip subtitles.

    Args:
        video_path: Path passed to ``model.transcribe``.
        model_size: Model name passed to ``whisper.load_model``.

    Returns:
        The ``"segments"`` list of the transcription result, or ``[]`` when
        transcription failed or produced no segments.
    """
    try:
        # Imported inside the try block so that a missing or broken install
        # degrades to "no subtitles" like every other failure here, instead
        # of raising out of a function that otherwise never raises.
        import whisper

        print(f"[whisper] Loading {model_size} model...")
        model = whisper.load_model(model_size)
        print("[whisper] Transcribing audio...")
        result = model.transcribe(video_path, word_timestamps=True)
        segments = result.get("segments") or []
        print(f"[whisper] Transcription complete: {len(segments)} segments")
        return segments
    except Exception as e:
        print(f"[whisper] Error: {e}")
        import traceback
        traceback.print_exc()
        return []
def _escape_ass_text(text: str) -> str:
    """Make free-form text safe to place in the Text field of an ASS Dialogue line."""
    # A literal "{" would open an override-tag block; libass accepts "\{" and
    # "\}" as escapes for the literal characters.
    text = text.replace("{", "\\{").replace("}", "\\}")
    # A raw newline would terminate the Dialogue line; "\N" is the ASS hard
    # line break.
    return text.replace("\r\n", "\\N").replace("\n", "\\N").replace("\r", "\\N")


def create_beautiful_ass_subtitles(segments: list[dict], output_path: str, video_width: int, video_height: int):
    """Write an ASS subtitle file with one fading Dialogue event per segment.

    The Montserrat SemiBold font is downloaded to ``/tmp`` on first use
    (best-effort). If it is not available afterwards, the style falls back
    to ``Arial``.

    Args:
        segments: Dicts with ``'start'`` and ``'end'`` (seconds) and
            ``'text'`` keys. Segments whose text is empty after stripping
            are skipped.
        output_path: Destination of the ``.ass`` file (written as UTF-8).
        video_width: Used as ``PlayResX``.
        video_height: Used as ``PlayResY``; font size and vertical margin are
            derived from it.
    """
    import http.client
    import urllib.request

    font_url = "https://github.com/JulietaUla/Montserrat/raw/master/fonts/ttf/Montserrat-SemiBold.ttf"
    font_path = "/tmp/Montserrat-SemiBold.ttf"
    if not os.path.exists(font_path):
        # Download to a side file and rename, so an interrupted transfer
        # never leaves a truncated font at the final path.
        partial_path = f"{font_path}.part"
        try:
            with urllib.request.urlopen(font_url, timeout=15) as response:
                font_bytes = response.read()
            with open(partial_path, "wb") as font_file:
                font_file.write(font_bytes)
            os.replace(partial_path, font_path)
        except (OSError, http.client.HTTPException) as exc:
            print(f"[subtitles] Font download failed, falling back to Arial: {exc}")
            if os.path.exists(partial_path):
                try:
                    os.remove(partial_path)
                except OSError:
                    pass
    font_name = "Montserrat SemiBold" if os.path.exists(font_path) else "Arial"

    # ASS subtitle header with styling scaled to the video height
    header = f"""[Script Info]
Title: Elegant Subtitles
ScriptType: v4.00+
WrapStyle: 0
PlayResX: {video_width}
PlayResY: {video_height}
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font_name},{int(video_height * 0.05)},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,1,5,10,10,{int(video_height * 0.42)},1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    fade_duration = 200  # ms, used for both fade-in and fade-out
    events = []
    for seg in segments:
        text = _escape_ass_text(seg['text'].strip())
        if not text:
            continue
        start_time = format_ass_time(seg['start'])
        end_time = format_ass_time(seg['end'])
        animated_text = f"{{\\fad({fade_duration},{fade_duration})}}{text}"
        events.append(f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{animated_text}\n")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header + "".join(events))
    print(f"[subtitles] Created ASS file with {len(events)} segments")
def format_ass_time(seconds: float) -> str:
    """Convert seconds to an ASS timestamp (``h:mm:ss.cc``).

    The value is rounded to the nearest centisecond as a whole before being
    split into fields, so float representation error cannot drop a
    centisecond (e.g. ``0.29`` stays ``.29``) and a carry propagates into
    seconds/minutes/hours. Negative input is clamped to zero.

    Args:
        seconds: Time offset in seconds.

    Returns:
        The timestamp with unpadded hours and two-digit minutes, seconds and
        centiseconds.
    """
    total_centisecs = max(0, int(round(seconds * 100)))
    total_secs, centisecs = divmod(total_centisecs, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
def _escape_ffmpeg_filter_value(value: str) -> str:
    """Escape a string for use as an option value inside an FFmpeg filtergraph."""
    # First level: the filter's own option parser (backslash must go first so
    # the escapes added for the other characters are not escaped again).
    for ch in ("\\", ":", "'"):
        value = value.replace(ch, "\\" + ch)
    # Second level: the filtergraph parser.
    for ch in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(ch, "\\" + ch)
    return value


def burn_subtitles_and_watermark(video_path: str, output_path: str, subtitle_path: str = None, watermark_path: str = None):
    """Burn subtitles and/or a watermark into a video using FFmpeg.

    A subtitle or watermark path that is ``None``/empty or does not exist on
    disk is ignored. When both are present the subtitles are rendered first
    and the watermark is overlaid on top. Audio is stream-copied.

    Args:
        video_path: Input video.
        output_path: Where FFmpeg writes the result (overwritten if present).
        subtitle_path: Optional subtitle file for the ``subtitles`` filter.
        watermark_path: Optional image overlaid at its native size in the
            bottom-right corner with a 10 px margin.

    Returns:
        ``True`` if FFmpeg exited successfully, ``False`` on any failure
        (the error is printed; nothing is raised).
    """
    import subprocess
    try:
        has_subtitles = bool(subtitle_path) and os.path.exists(subtitle_path)
        has_watermark = bool(watermark_path) and os.path.exists(watermark_path)

        cmd = ['ffmpeg', '-y', '-i', video_path]
        if has_watermark:
            cmd.extend(['-i', watermark_path])

        # Build the filter chain stage by stage; each stage consumes the
        # previous stage's labelled output, so any combination of subtitles
        # and watermark (including watermark alone) is wired up correctly.
        stages = []
        video_label = "[0:v]"
        if has_subtitles:
            escaped_path = _escape_ffmpeg_filter_value(subtitle_path)
            stages.append(
                f"{video_label}subtitles={escaped_path}:force_style='FontName=Montserrat SemiBold'[subbed]"
            )
            video_label = "[subbed]"
        if has_watermark:
            stages.append(f"{video_label}[1:v]overlay=W-w-10:H-h-10[marked]")
            video_label = "[marked]"

        if stages:
            cmd.extend(['-filter_complex', ";".join(stages)])
            # Map explicitly: the filtered video plus the source audio, if
            # any ("?" makes the audio mapping optional).
            cmd.extend(['-map', video_label, '-map', '0:a?'])
        cmd.extend(['-c:a', 'copy', output_path])

        print("[burn] Burning subtitles/watermark...")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"[burn] Error: Burn failed: {result.stderr[-200:]}")
            return False
        print("[burn] Successfully burned subtitles/watermark")
        return True
    except Exception as e:
        # Best-effort boundary: the caller falls back to the unprocessed
        # video when this returns False.
        print(f"[burn] Error: {e}")
        import traceback
        traceback.print_exc()
        return False
 @spaces.GPU(duration=90)
 @torch.inference_mode()
 def generate_video(
     negative_prompt: str,
     blur_amount: int,
     remove_music: bool,
+    add_subtitles: bool,
+    watermark,
     audio_track,
     progress=gr.Progress(track_tqdm=True),
 ):
     if audio_track and len(clips) > 1:
         print("[CPU] Looping clips to match audio duration...")
         final_video = loop_clips_with_audio_track(clips, audio_track)
     elif len(clips) == 1:
+        final_video = clips[0]
     else:
+        final_video = clips[0]
+    # Phase 3: CPU work (free) - add subtitles and/or watermark
+    if add_subtitles or watermark:
+        print("[CPU] Adding subtitles/watermark...")
+        # Transcribe if subtitles requested
+        subtitle_file = None
+        if add_subtitles:
+            segments = transcribe_with_whisper(final_video, model_size="small")
+            if segments:
+                subtitle_file = tempfile.mktemp(suffix=".ass")
+                create_beautiful_ass_subtitles(segments, subtitle_file, int(width), int(height))
+        # Burn subtitles and/or watermark
+        output_with_extras = tempfile.mktemp(suffix=".mp4")
+        success = burn_subtitles_and_watermark(final_video, output_with_extras, subtitle_file, watermark)
+        if success:
+            final_video = output_with_extras
+    return final_video, final_seed
 with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # cleanup: check every 1h, delete files >2h old
                 with gr.Row():
                     blur_amount = gr.Number(label="Blur (0=off, 36=heavy)", value=0, precision=0)
                     remove_music = gr.Checkbox(label="Remove Music", value=False)
+                with gr.Row():
+                    add_subtitles = gr.Checkbox(label="Add Subtitles (Whisper)", value=False)
+                    watermark = gr.Image(label="Watermark (PNG)", type="filepath", sources=["upload"])
                 negative_prompt = gr.Textbox(
                     label="Negative Prompt",
                     value=DEFAULT_NEGATIVE_PROMPT,
         inputs=[
             first_image, last_image, prompt1, prompt2, prompt3, duration, enhance_prompt,
             seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
+            add_subtitles, watermark, audio_track,
         ],
         outputs=[output_video, seed],
     )

requirements.txt CHANGED Viewed

@@ -12,4 +12,5 @@ flashpack==0.1.2
 torchaudio==2.8.0
 demucs
 soundfile
-pydub

 torchaudio==2.8.0
 demucs
 soundfile
+pydub
+openai-whisper