Vicente Alvarez committed on
Commit
cc800d1
·
1 Parent(s): 3b38a35

Add Whisper subtitles (elegant animated) + PNG watermark support - all CPU work, free

Browse files
Files changed (2) hide show
  1. app.py +153 -6
  2. requirements.txt +2 -1
app.py CHANGED
@@ -396,6 +396,131 @@ def loop_clips_with_audio_track(clip_paths: list[str], audio_path: str) -> str:
396
  return clip_paths[0] if clip_paths else None
397
 
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  @spaces.GPU(duration=90)
400
  @torch.inference_mode()
401
  def generate_video(
@@ -531,6 +656,8 @@ def full_generation_process(
531
  negative_prompt: str,
532
  blur_amount: int,
533
  remove_music: bool,
 
 
534
  audio_track,
535
  progress=gr.Progress(track_tqdm=True),
536
  ):
@@ -557,13 +684,30 @@ def full_generation_process(
557
  if audio_track and len(clips) > 1:
558
  print("[CPU] Looping clips to match audio duration...")
559
  final_video = loop_clips_with_audio_track(clips, audio_track)
560
- return final_video, final_seed
561
  elif len(clips) == 1:
562
- # Single clip - return it directly
563
- return clips[0], final_seed
564
  else:
565
- # Multiple clips, no audio - return first clip (could be gallery in future)
566
- return clips[0], final_seed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
 
569
  with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo: # cleanup: check every 1h, delete files >2h old
@@ -614,6 +758,9 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo: # c
614
  with gr.Row():
615
  blur_amount = gr.Number(label="Blur (0=off, 36=heavy)", value=0, precision=0)
616
  remove_music = gr.Checkbox(label="Remove Music", value=False)
 
 
 
617
  negative_prompt = gr.Textbox(
618
  label="Negative Prompt",
619
  value=DEFAULT_NEGATIVE_PROMPT,
@@ -674,7 +821,7 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo: # c
674
  inputs=[
675
  first_image, last_image, prompt1, prompt2, prompt3, duration, enhance_prompt,
676
  seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
677
- audio_track,
678
  ],
679
  outputs=[output_video, seed],
680
  )
 
396
  return clip_paths[0] if clip_paths else None
397
 
398
 
399
def transcribe_with_whisper(video_path: str, model_size: str = "small") -> list[dict]:
    """Run Whisper over a media file and return its transcription segments.

    Args:
        video_path: Path handed directly to the Whisper model's ``transcribe``.
        model_size: Name of the Whisper model to load (defaults to "small").

    Returns:
        The ``'segments'`` entry of the transcription result, or an empty
        list if loading the model or transcribing raised any exception.
    """
    import whisper

    try:
        print(f"[whisper] Loading {model_size} model...")
        asr_model = whisper.load_model(model_size)

        print("[whisper] Transcribing audio...")
        transcription = asr_model.transcribe(video_path, word_timestamps=True)

        found_segments = transcription['segments']
        print(f"[whisper] Transcription complete: {len(found_segments)} segments")
    except Exception as e:
        # Best-effort: the caller treats an empty list as "no subtitles".
        print(f"[whisper] Error: {e}")
        import traceback
        traceback.print_exc()
        return []

    return found_segments
417
+
418
+
419
def create_beautiful_ass_subtitles(segments: list[dict], output_path: str, video_width: int, video_height: int):
    """Write an ASS subtitle file in which every segment fades in and out.

    Args:
        segments: Dicts providing ``'start'`` and ``'end'`` (seconds, passed to
            ``format_ass_time``) and ``'text'``.
        output_path: Destination of the ``.ass`` file (written as UTF-8).
        video_width: Used as ``PlayResX`` in the script header.
        video_height: Used as ``PlayResY``; the font size (5%) and vertical
            margin (42%) of the default style are derived from it.

    Side effects:
        Tries to download the Montserrat SemiBold font to ``/tmp`` if it is not
        already there. A failed download is logged and otherwise ignored.
    """
    import urllib.request

    font_url = "https://github.com/JulietaUla/Montserrat/raw/master/fonts/ttf/Montserrat-SemiBold.ttf"
    font_path = "/tmp/Montserrat-SemiBold.ttf"

    # Best-effort font download. Fetch into a side file and rename so that an
    # interrupted download never leaves a truncated font at the final path
    # (which the existence check would otherwise treat as valid forever).
    if not os.path.exists(font_path):
        partial_path = font_path + ".part"
        try:
            urllib.request.urlretrieve(font_url, partial_path)
            os.replace(partial_path, font_path)
        except OSError as e:  # URLError/HTTPError and file errors are OSError subclasses
            print(f"[subtitles] Font download failed, continuing without it: {e}")

    font_size = int(video_height * 0.05)
    margin_v = int(video_height * 0.42)

    # ASS header: script info, one "Default" style, and the event column layout.
    header_lines = [
        "[Script Info]",
        "Title: Elegant Subtitles",
        "ScriptType: v4.00+",
        "WrapStyle: 0",
        f"PlayResX: {video_width}",
        f"PlayResY: {video_height}",
        "ScaledBorderAndShadow: yes",
        "",
        "[V4+ Styles]",
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
        f"Style: Default,Montserrat SemiBold,{font_size},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,1,5,10,10,{margin_v},1",
        "",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    ]

    fade_duration = 200  # ms, applied to both fade-in and fade-out

    event_lines = []
    for seg in segments:
        start_time = format_ass_time(seg['start'])
        end_time = format_ass_time(seg['end'])

        # Each Dialogue event must stay on one physical line; turn any raw
        # line break in the text into the ASS hard line break "\N".
        text = seg['text'].strip()
        text = text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "\\N")

        animated_text = f"{{\\fad({fade_duration},{fade_duration})}}{text}"
        event_lines.append(
            f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{animated_text}"
        )

    ass_content = "\n".join(header_lines + event_lines) + "\n"

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(ass_content)

    print(f"[subtitles] Created ASS file with {len(segments)} segments")
466
+
467
+
468
def format_ass_time(seconds: float) -> str:
    """Convert seconds to ASS timestamp format (h:mm:ss.cc).

    The value is rounded to the nearest centisecond before being split into
    fields, so binary floating-point error cannot drop a centisecond
    (e.g. 0.29 s renders as ``.29``, not ``.28``). Negative input is clamped
    to zero.

    Args:
        seconds: Time offset in seconds.

    Returns:
        The timestamp as ``H:MM:SS.CC``.
    """
    total_centisecs = max(0, int(round(seconds * 100)))
    total_secs, centisecs = divmod(total_centisecs, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
475
+
476
+
477
def _escape_ffmpeg_filter_value(value: str) -> str:
    """Escape a string for use as an option value inside an FFmpeg filtergraph."""
    # Level 1: filter-option escaping ("\", "'" and the ":" option separator).
    for ch in ("\\", "'", ":"):
        value = value.replace(ch, "\\" + ch)
    # Level 2: filtergraph escaping ("\", "'" and the graph separators).
    for ch in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(ch, "\\" + ch)
    return value


def burn_subtitles_and_watermark(video_path: str, output_path: str, subtitle_path: str = None, watermark_path: str = None):
    """Burn subtitles and/or a watermark image into a video with FFmpeg.

    Args:
        video_path: Input video (FFmpeg input 0).
        output_path: Where FFmpeg writes the result (overwritten, ``-y``).
        subtitle_path: Optional subtitle file; used only if it exists on disk.
        watermark_path: Optional image overlaid 10 px from the bottom-right
            corner; used only if it exists on disk. The image is not scaled.

    Returns:
        True if FFmpeg exited with status 0, False on any failure (the error
        and traceback are printed, nothing is raised).
    """
    import subprocess

    try:
        has_subtitles = bool(subtitle_path) and os.path.exists(subtitle_path)
        has_watermark = bool(watermark_path) and os.path.exists(watermark_path)

        cmd = ['ffmpeg', '-y', '-i', video_path]
        if has_watermark:
            cmd.extend(['-i', watermark_path])  # becomes input 1 for the overlay

        overlay_filter = "overlay=W-w-10:H-h-10"
        subtitle_filter = None
        if has_subtitles:
            # NOTE(review): no fontsdir is passed, so the named font presumably
            # has to be discoverable by the system font lookup - confirm.
            escaped_path = _escape_ffmpeg_filter_value(subtitle_path)
            subtitle_filter = f"subtitles={escaped_path}:force_style='FontName=Montserrat SemiBold'"

        # Pick the filtergraph for each combination. The watermark-only case
        # needs its own graph; without one the second input is never overlaid.
        if has_subtitles and has_watermark:
            filter_complex = f"[0:v]{subtitle_filter}[v];[v][1:v]{overlay_filter}"
        elif has_watermark:
            filter_complex = f"[0:v][1:v]{overlay_filter}"
        elif has_subtitles:
            filter_complex = subtitle_filter
        else:
            filter_complex = None

        if filter_complex:
            cmd.extend(['-filter_complex', filter_complex])

        cmd.extend(['-c:a', 'copy', output_path])

        print("[burn] Burning subtitles/watermark...")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Burn failed: {result.stderr[-200:]}")

        print("[burn] Successfully burned subtitles/watermark")
        return True

    except Exception as e:
        # Best-effort: the caller keeps the unmodified video when this fails.
        print(f"[burn] Error: {e}")
        import traceback
        traceback.print_exc()
        return False
522
+
523
+
524
  @spaces.GPU(duration=90)
525
  @torch.inference_mode()
526
  def generate_video(
 
656
  negative_prompt: str,
657
  blur_amount: int,
658
  remove_music: bool,
659
+ add_subtitles: bool,
660
+ watermark,
661
  audio_track,
662
  progress=gr.Progress(track_tqdm=True),
663
  ):
 
684
  if audio_track and len(clips) > 1:
685
  print("[CPU] Looping clips to match audio duration...")
686
  final_video = loop_clips_with_audio_track(clips, audio_track)
 
687
  elif len(clips) == 1:
688
+ final_video = clips[0]
 
689
  else:
690
+ final_video = clips[0]
691
+
692
+ # Phase 3: CPU work (free) - add subtitles and/or watermark
693
+ if add_subtitles or watermark:
694
+ print("[CPU] Adding subtitles/watermark...")
695
+
696
+ # Transcribe if subtitles requested
697
+ subtitle_file = None
698
+ if add_subtitles:
699
+ segments = transcribe_with_whisper(final_video, model_size="small")
700
+ if segments:
701
+ subtitle_file = tempfile.mktemp(suffix=".ass")
702
+ create_beautiful_ass_subtitles(segments, subtitle_file, int(width), int(height))
703
+
704
+ # Burn subtitles and/or watermark
705
+ output_with_extras = tempfile.mktemp(suffix=".mp4")
706
+ success = burn_subtitles_and_watermark(final_video, output_with_extras, subtitle_file, watermark)
707
+ if success:
708
+ final_video = output_with_extras
709
+
710
+ return final_video, final_seed
711
 
712
 
713
  with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo: # cleanup: check every 1h, delete files >2h old
 
758
  with gr.Row():
759
  blur_amount = gr.Number(label="Blur (0=off, 36=heavy)", value=0, precision=0)
760
  remove_music = gr.Checkbox(label="Remove Music", value=False)
761
+ with gr.Row():
762
+ add_subtitles = gr.Checkbox(label="Add Subtitles (Whisper)", value=False)
763
+ watermark = gr.Image(label="Watermark (PNG)", type="filepath", sources=["upload"])
764
  negative_prompt = gr.Textbox(
765
  label="Negative Prompt",
766
  value=DEFAULT_NEGATIVE_PROMPT,
 
821
  inputs=[
822
  first_image, last_image, prompt1, prompt2, prompt3, duration, enhance_prompt,
823
  seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
824
+ add_subtitles, watermark, audio_track,
825
  ],
826
  outputs=[output_video, seed],
827
  )
requirements.txt CHANGED
@@ -12,4 +12,5 @@ flashpack==0.1.2
12
  torchaudio==2.8.0
13
  demucs
14
  soundfile
15
- pydub
 
 
12
  torchaudio==2.8.0
13
  demucs
14
  soundfile
15
+ pydub
16
+ openai-whisper