Vicente Alvarez Claude Sonnet 4.5 committed on
Commit
127cda9
·
1 Parent(s): d1b769c

Fix Whisper transcription: use audio_track instead of video clip

Browse files

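Whisper was transcribing the generated video clip, which has no audio yet, so it returned 0 segments. It now transcribes the uploaded audio_track file instead; if subtitles are requested but no audio track is provided, transcription is skipped and a warning is printed.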

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -567,6 +567,7 @@ def generate_video(
567
  blur_amount: int = 0,
568
  remove_music: bool = False,
569
  add_subtitles: bool = False,
 
570
  progress=gr.Progress(track_tqdm=True),
571
  ):
572
  try:
@@ -664,11 +665,13 @@ def generate_video(
664
 
665
  # Transcribe with Whisper if requested (still within GPU context)
666
  subtitle_segments = []
667
- if add_subtitles and generated_clips:
668
- print("[GPU] Transcribing with Whisper...")
669
- # Transcribe the first clip (or you could transcribe all clips)
670
- subtitle_segments = transcribe_with_whisper_gpu(generated_clips[0], model_size="small")
671
  log_memory("after whisper")
 
 
672
 
673
  # Return all generated clips and subtitle segments
674
  return generated_clips, subtitle_segments, base_seed
@@ -713,7 +716,7 @@ def full_generation_process(
713
  clips, subtitle_segments, final_seed = generate_video(
714
  first_image, last_image, prompts, duration, enhance_prompt,
715
  seed, randomize_seed, height, width, negative_prompt,
716
- blur_amount, remove_music, add_subtitles, progress
717
  )
718
 
719
  if not clips:
 
567
  blur_amount: int = 0,
568
  remove_music: bool = False,
569
  add_subtitles: bool = False,
570
+ audio_track = None,
571
  progress=gr.Progress(track_tqdm=True),
572
  ):
573
  try:
 
665
 
666
  # Transcribe with Whisper if requested (still within GPU context)
667
  subtitle_segments = []
668
+ if add_subtitles and audio_track:
669
+ print("[GPU] Transcribing audio track with Whisper...")
670
+ # Transcribe the audio track file, not the generated video (which has no audio yet)
671
+ subtitle_segments = transcribe_with_whisper_gpu(audio_track, model_size="small")
672
  log_memory("after whisper")
673
+ elif add_subtitles and not audio_track:
674
+ print("[GPU] Warning: Subtitles requested but no audio track provided - skipping transcription")
675
 
676
  # Return all generated clips and subtitle segments
677
  return generated_clips, subtitle_segments, base_seed
 
716
  clips, subtitle_segments, final_seed = generate_video(
717
  first_image, last_image, prompts, duration, enhance_prompt,
718
  seed, randomize_seed, height, width, negative_prompt,
719
+ blur_amount, remove_music, add_subtitles, audio_track, progress
720
  )
721
 
722
  if not clips: