Commit ·
a32944b
1
Parent(s): a4f4e65
Fix MMAudio and HunyuanFoley video output retaining original audio
Both models were passing video_file (with original audio) to ffmpeg
when slicing segments and muxing the final output. Added strip_audio_from_video
call at the start of both functions, matching TARO's existing approach.
app.py
CHANGED
|
@@ -466,6 +466,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 466 |
tmp_dir = tempfile.mkdtemp()
|
| 467 |
outputs = []
|
| 468 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 470 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 471 |
total_dur_s = get_video_duration(video_file)
|
|
@@ -488,7 +492,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 488 |
seg_dur = seg_end - seg_start
|
| 489 |
# Trim a clean video clip for this segment
|
| 490 |
seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
|
| 491 |
-
ffmpeg.input(
|
| 492 |
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 493 |
).run(overwrite_output=True, quiet=True)
|
| 494 |
|
|
@@ -538,7 +542,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 538 |
torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
|
| 539 |
|
| 540 |
video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
|
| 541 |
-
mux_video_audio(
|
| 542 |
outputs.append((video_path, audio_path))
|
| 543 |
|
| 544 |
return _pad_outputs(outputs)
|
|
@@ -619,16 +623,20 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 619 |
tmp_dir = tempfile.mkdtemp()
|
| 620 |
outputs = []
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 623 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 624 |
# stitch the results into a single full-length audio track.
|
| 625 |
-
total_dur_s = get_video_duration(
|
| 626 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 627 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 628 |
|
| 629 |
# Pre-encode text features once (same for every segment)
|
| 630 |
_dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
|
| 631 |
-
ffmpeg.input(
|
| 632 |
_dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 633 |
).run(overwrite_output=True, quiet=True)
|
| 634 |
_, text_feats, _ = feature_process(
|
|
@@ -647,7 +655,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 647 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 648 |
seg_dur = seg_end - seg_start
|
| 649 |
seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
|
| 650 |
-
ffmpeg.input(
|
| 651 |
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 652 |
).run(overwrite_output=True, quiet=True)
|
| 653 |
|
|
@@ -695,7 +703,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 695 |
audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
|
| 696 |
torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
|
| 697 |
video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
|
| 698 |
-
merge_audio_video(audio_path,
|
| 699 |
outputs.append((video_path, audio_path))
|
| 700 |
|
| 701 |
return _pad_outputs(outputs)
|
|
|
|
| 466 |
tmp_dir = tempfile.mkdtemp()
|
| 467 |
outputs = []
|
| 468 |
|
| 469 |
+
# Strip original audio so the muxed output only contains the generated track
|
| 470 |
+
silent_video = os.path.join(tmp_dir, "silent_input.mp4")
|
| 471 |
+
strip_audio_from_video(video_file, silent_video)
|
| 472 |
+
|
| 473 |
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 474 |
# with a crossfade overlap and stitch the results into a full-length track.
|
| 475 |
total_dur_s = get_video_duration(video_file)
|
|
|
|
| 492 |
seg_dur = seg_end - seg_start
|
| 493 |
# Trim a clean video clip for this segment
|
| 494 |
seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
|
| 495 |
+
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 496 |
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 497 |
).run(overwrite_output=True, quiet=True)
|
| 498 |
|
|
|
|
| 542 |
torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
|
| 543 |
|
| 544 |
video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
|
| 545 |
+
mux_video_audio(silent_video, audio_path, video_path)
|
| 546 |
outputs.append((video_path, audio_path))
|
| 547 |
|
| 548 |
return _pad_outputs(outputs)
|
|
|
|
| 623 |
tmp_dir = tempfile.mkdtemp()
|
| 624 |
outputs = []
|
| 625 |
|
| 626 |
+
# Strip original audio so the muxed output only contains the generated track
|
| 627 |
+
silent_video = os.path.join(tmp_dir, "silent_input.mp4")
|
| 628 |
+
strip_audio_from_video(video_file, silent_video)
|
| 629 |
+
|
| 630 |
# HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
|
| 631 |
# input into overlapping segments, generate audio for each, then crossfade-
|
| 632 |
# stitch the results into a single full-length audio track.
|
| 633 |
+
total_dur_s = get_video_duration(silent_video)
|
| 634 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 635 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 636 |
|
| 637 |
# Pre-encode text features once (same for every segment)
|
| 638 |
_dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
|
| 639 |
+
ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
|
| 640 |
_dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 641 |
).run(overwrite_output=True, quiet=True)
|
| 642 |
_, text_feats, _ = feature_process(
|
|
|
|
| 655 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 656 |
seg_dur = seg_end - seg_start
|
| 657 |
seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
|
| 658 |
+
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 659 |
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 660 |
).run(overwrite_output=True, quiet=True)
|
| 661 |
|
|
|
|
| 703 |
audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
|
| 704 |
torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(full_wav)), sr)
|
| 705 |
video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
|
| 706 |
+
merge_audio_video(audio_path, silent_video, video_path)
|
| 707 |
outputs.append((video_path, audio_path))
|
| 708 |
|
| 709 |
return _pad_outputs(outputs)
|