BoxOfColors committed on
Commit
a32944b
·
1 Parent(s): a4f4e65

Fix MMAudio and HunyuanFoley video output retaining original audio

Browse files

Both models were passing video_file (with original audio) to ffmpeg
when slicing segments and muxing the final output. Added strip_audio_from_video
call at the start of both functions, matching TARO's existing approach.

Files changed (1) hide show
  1. app.py +14 -6
app.py CHANGED
@@ -466,6 +466,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
466
  tmp_dir = tempfile.mkdtemp()
467
  outputs = []
468
 
 
 
 
 
469
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
470
  # with a crossfade overlap and stitch the results into a full-length track.
471
  total_dur_s = get_video_duration(video_file)
@@ -488,7 +492,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
488
  seg_dur = seg_end - seg_start
489
  # Trim a clean video clip for this segment
490
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
491
- ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
492
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
493
  ).run(overwrite_output=True, quiet=True)
494
 
@@ -538,7 +542,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
538
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
539
 
540
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
541
- mux_video_audio(video_file, audio_path, video_path)
542
  outputs.append((video_path, audio_path))
543
 
544
  return _pad_outputs(outputs)
@@ -619,16 +623,20 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
619
  tmp_dir = tempfile.mkdtemp()
620
  outputs = []
621
 
 
 
 
 
622
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
623
  # input into overlapping segments, generate audio for each, then crossfade-
624
  # stitch the results into a single full-length audio track.
625
- total_dur_s = get_video_duration(video_file)
626
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
627
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
628
 
629
  # Pre-encode text features once (same for every segment)
630
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
631
- ffmpeg.input(video_file, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
632
  _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
633
  ).run(overwrite_output=True, quiet=True)
634
  _, text_feats, _ = feature_process(
@@ -647,7 +655,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
647
  for seg_i, (seg_start, seg_end) in enumerate(segments):
648
  seg_dur = seg_end - seg_start
649
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
650
- ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
651
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
652
  ).run(overwrite_output=True, quiet=True)
653
 
@@ -695,7 +703,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
695
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
696
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
697
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
698
- merge_audio_video(audio_path, video_file, video_path)
699
  outputs.append((video_path, audio_path))
700
 
701
  return _pad_outputs(outputs)
 
466
  tmp_dir = tempfile.mkdtemp()
467
  outputs = []
468
 
469
+ # Strip original audio so the muxed output only contains the generated track
470
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
471
+ strip_audio_from_video(video_file, silent_video)
472
+
473
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
474
  # with a crossfade overlap and stitch the results into a full-length track.
475
  total_dur_s = get_video_duration(video_file)
 
492
  seg_dur = seg_end - seg_start
493
  # Trim a clean video clip for this segment
494
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
495
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
496
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
497
  ).run(overwrite_output=True, quiet=True)
498
 
 
542
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
543
 
544
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
545
+ mux_video_audio(silent_video, audio_path, video_path)
546
  outputs.append((video_path, audio_path))
547
 
548
  return _pad_outputs(outputs)
 
623
  tmp_dir = tempfile.mkdtemp()
624
  outputs = []
625
 
626
+ # Strip original audio so the muxed output only contains the generated track
627
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
628
+ strip_audio_from_video(video_file, silent_video)
629
+
630
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
631
  # input into overlapping segments, generate audio for each, then crossfade-
632
  # stitch the results into a single full-length audio track.
633
+ total_dur_s = get_video_duration(silent_video)
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
  # Pre-encode text features once (same for every segment)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
+ ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
  _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
 
655
  for seg_i, (seg_start, seg_end) in enumerate(segments):
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
660
  ).run(overwrite_output=True, quiet=True)
661
 
 
703
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
704
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
705
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
706
+ merge_audio_video(audio_path, silent_video, video_path)
707
  outputs.append((video_path, audio_path))
708
 
709
  return _pad_outputs(outputs)