BoxOfColors committed on
Commit
a32944b
·
1 Parent(s): a4f4e65

Fix MMAudio and HunyuanFoley video output retaining original audio

Browse files

Both models were passing video_file (with original audio) to ffmpeg
when slicing segments and muxing the final output. Added strip_audio_from_video
call at the start of both functions, matching TARO's existing approach.

Files changed (1) hide show
  1. app.py +14 -6
app.py CHANGED
@@ -466,6 +466,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
466
  tmp_dir = tempfile.mkdtemp()
467
  outputs = []
468
 
 
 
 
 
469
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
470
  # with a crossfade overlap and stitch the results into a full-length track.
471
  total_dur_s = get_video_duration(video_file)
@@ -488,7 +492,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
488
  seg_dur = seg_end - seg_start
489
  # Trim a clean video clip for this segment
490
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
491
- ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
492
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
493
  ).run(overwrite_output=True, quiet=True)
494
 
@@ -538,7 +542,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
538
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
539
 
540
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
541
- mux_video_audio(video_file, audio_path, video_path)
542
  outputs.append((video_path, audio_path))
543
 
544
  return _pad_outputs(outputs)
@@ -619,16 +623,20 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
619
  tmp_dir = tempfile.mkdtemp()
620
  outputs = []
621
 
 
 
 
 
622
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
623
  # input into overlapping segments, generate audio for each, then crossfade-
624
  # stitch the results into a single full-length audio track.
625
- total_dur_s = get_video_duration(video_file)
626
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
627
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
628
 
629
  # Pre-encode text features once (same for every segment)
630
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
631
- ffmpeg.input(video_file, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
632
  _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
633
  ).run(overwrite_output=True, quiet=True)
634
  _, text_feats, _ = feature_process(
@@ -647,7 +655,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
647
  for seg_i, (seg_start, seg_end) in enumerate(segments):
648
  seg_dur = seg_end - seg_start
649
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
650
- ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
651
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
652
  ).run(overwrite_output=True, quiet=True)
653
 
@@ -695,7 +703,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
695
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
696
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
697
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
698
- merge_audio_video(audio_path, video_file, video_path)
699
  outputs.append((video_path, audio_path))
700
 
701
  return _pad_outputs(outputs)
 
466
  tmp_dir = tempfile.mkdtemp()
467
  outputs = []
468
 
469
+ # Strip original audio so the muxed output only contains the generated track
470
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
471
+ strip_audio_from_video(video_file, silent_video)
472
+
473
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
474
  # with a crossfade overlap and stitch the results into a full-length track.
475
  total_dur_s = get_video_duration(video_file)
 
492
  seg_dur = seg_end - seg_start
493
  # Trim a clean video clip for this segment
494
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
495
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
496
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
497
  ).run(overwrite_output=True, quiet=True)
498
 
 
542
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
543
 
544
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
545
+ mux_video_audio(silent_video, audio_path, video_path)
546
  outputs.append((video_path, audio_path))
547
 
548
  return _pad_outputs(outputs)
 
623
  tmp_dir = tempfile.mkdtemp()
624
  outputs = []
625
 
626
+ # Strip original audio so the muxed output only contains the generated track
627
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
628
+ strip_audio_from_video(video_file, silent_video)
629
+
630
  # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
631
  # input into overlapping segments, generate audio for each, then crossfade-
632
  # stitch the results into a single full-length audio track.
633
+ total_dur_s = get_video_duration(silent_video)
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
  # Pre-encode text features once (same for every segment)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
+ ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
  _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
 
655
  for seg_i, (seg_start, seg_end) in enumerate(segments):
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
  seg_path, vcodec="libx264", acodec="aac", strict="experimental"
660
  ).run(overwrite_output=True, quiet=True)
661
 
 
703
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
704
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
705
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
706
+ merge_audio_video(audio_path, silent_video, video_path)
707
  outputs.append((video_path, audio_path))
708
 
709
  return _pad_outputs(outputs)