Spaces:
Running on Zero
Running on Zero
Vicente Alvarez Claude Sonnet 4.5 committed on
Commit ·
127cda9
1
Parent(s): d1b769c
Fix Whisper transcription: use audio_track instead of video clip
Browse files
Whisper was transcribing the generated video (which has no audio yet), resulting in 0 segments. It now transcribes the uploaded audio_track file instead.
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -567,6 +567,7 @@ def generate_video(
|
|
| 567 |
blur_amount: int = 0,
|
| 568 |
remove_music: bool = False,
|
| 569 |
add_subtitles: bool = False,
|
|
|
|
| 570 |
progress=gr.Progress(track_tqdm=True),
|
| 571 |
):
|
| 572 |
try:
|
|
@@ -664,11 +665,13 @@ def generate_video(
|
|
| 664 |
|
| 665 |
# Transcribe with Whisper if requested (still within GPU context)
|
| 666 |
subtitle_segments = []
|
| 667 |
-
if add_subtitles and
|
| 668 |
-
print("[GPU] Transcribing with Whisper...")
|
| 669 |
-
# Transcribe the
|
| 670 |
-
subtitle_segments = transcribe_with_whisper_gpu(
|
| 671 |
log_memory("after whisper")
|
|
|
|
|
|
|
| 672 |
|
| 673 |
# Return all generated clips and subtitle segments
|
| 674 |
return generated_clips, subtitle_segments, base_seed
|
|
@@ -713,7 +716,7 @@ def full_generation_process(
|
|
| 713 |
clips, subtitle_segments, final_seed = generate_video(
|
| 714 |
first_image, last_image, prompts, duration, enhance_prompt,
|
| 715 |
seed, randomize_seed, height, width, negative_prompt,
|
| 716 |
-
blur_amount, remove_music, add_subtitles, progress
|
| 717 |
)
|
| 718 |
|
| 719 |
if not clips:
|
|
|
|
| 567 |
blur_amount: int = 0,
|
| 568 |
remove_music: bool = False,
|
| 569 |
add_subtitles: bool = False,
|
| 570 |
+
audio_track = None,
|
| 571 |
progress=gr.Progress(track_tqdm=True),
|
| 572 |
):
|
| 573 |
try:
|
|
|
|
| 665 |
|
| 666 |
# Transcribe with Whisper if requested (still within GPU context)
|
| 667 |
subtitle_segments = []
|
| 668 |
+
if add_subtitles and audio_track:
|
| 669 |
+
print("[GPU] Transcribing audio track with Whisper...")
|
| 670 |
+
# Transcribe the audio track file, not the generated video (which has no audio yet)
|
| 671 |
+
subtitle_segments = transcribe_with_whisper_gpu(audio_track, model_size="small")
|
| 672 |
log_memory("after whisper")
|
| 673 |
+
elif add_subtitles and not audio_track:
|
| 674 |
+
print("[GPU] Warning: Subtitles requested but no audio track provided - skipping transcription")
|
| 675 |
|
| 676 |
# Return all generated clips and subtitle segments
|
| 677 |
return generated_clips, subtitle_segments, base_seed
|
|
|
|
| 716 |
clips, subtitle_segments, final_seed = generate_video(
|
| 717 |
first_image, last_image, prompts, duration, enhance_prompt,
|
| 718 |
seed, randomize_seed, height, width, negative_prompt,
|
| 719 |
+
blur_amount, remove_music, add_subtitles, audio_track, progress
|
| 720 |
)
|
| 721 |
|
| 722 |
if not clips:
|