Spaces:

vclmax
/

Element-8-Video

Running on Zero

Vicente Alvarez Claude Sonnet 4.5 committed 2 days ago

Commit

127cda9

1 Parent(s): d1b769c

Fix Whisper transcription: use audio_track instead of video clip

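Whisper was transcribing the generated video (which has no audio yet), resulting in 0 segments. It now transcribes the uploaded audio_track file instead. If subtitles are requested but no audio track is provided, transcription is skipped and a warning is printed.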

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show

app.py +8 -5

app.py CHANGED Viewed

@@ -567,6 +567,7 @@ def generate_video(
     blur_amount: int = 0,
     remove_music: bool = False,
     add_subtitles: bool = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
@@ -664,11 +665,13 @@ def generate_video(
         # Transcribe with Whisper if requested (still within GPU context)
         subtitle_segments = []
-        if add_subtitles and generated_clips:
-            print("[GPU] Transcribing with Whisper...")
-            # Transcribe the first clip (or you could transcribe all clips)
-            subtitle_segments = transcribe_with_whisper_gpu(generated_clips[0], model_size="small")
             log_memory("after whisper")
         # Return all generated clips and subtitle segments
         return generated_clips, subtitle_segments, base_seed
@@ -713,7 +716,7 @@ def full_generation_process(
     clips, subtitle_segments, final_seed = generate_video(
         first_image, last_image, prompts, duration, enhance_prompt,
         seed, randomize_seed, height, width, negative_prompt,
-        blur_amount, remove_music, add_subtitles, progress
     )
     if not clips:

     blur_amount: int = 0,
     remove_music: bool = False,
     add_subtitles: bool = False,
+    audio_track = None,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
         # Transcribe with Whisper if requested (still within GPU context)
         subtitle_segments = []
+        if add_subtitles and audio_track:
+            print("[GPU] Transcribing audio track with Whisper...")
+            # Transcribe the audio track file, not the generated video (which has no audio yet)
+            subtitle_segments = transcribe_with_whisper_gpu(audio_track, model_size="small")
             log_memory("after whisper")
+        elif add_subtitles and not audio_track:
+            print("[GPU] Warning: Subtitles requested but no audio track provided - skipping transcription")
         # Return all generated clips and subtitle segments
         return generated_clips, subtitle_segments, base_seed
     clips, subtitle_segments, final_seed = generate_video(
         first_image, last_image, prompts, duration, enhance_prompt,
         seed, randomize_seed, height, width, negative_prompt,
+        blur_amount, remove_music, add_subtitles, audio_track, progress
     )
     if not clips: