Spaces:
Running on Zero
Running on Zero
Commit ·
bdf9957
1
Parent(s): 2b2a599
MMAudio: sliding-window segmentation for videos longer than 8 s; remove duration slider
Browse files
load_video(video_file, duration) hard-caps to the duration param, so
generation was silently truncated to 8 s for any longer video. Fix:
segment the input with ffmpeg into overlapping <=8 s clips, run
generate() on each, and crossfade-stitch into a full-length track.
Also remove the Duration slider from the UI — window size is fixed at
8 s (MMAudio's native window) and segmentation handles long videos.
app.py
CHANGED
|
@@ -353,9 +353,11 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 353 |
# ================================================================== #
|
| 354 |
|
| 355 |
@spaces.GPU(duration=600)
|
|
|
|
|
|
|
| 356 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 357 |
-
cfg_strength, num_steps,
|
| 358 |
-
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s
|
| 359 |
# MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
|
| 360 |
import sys as _sys, os as _os
|
| 361 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
|
@@ -370,7 +372,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 370 |
|
| 371 |
seed_val = int(seed_val)
|
| 372 |
num_samples = int(num_samples)
|
| 373 |
-
duration = float(duration)
|
| 374 |
|
| 375 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 376 |
dtype = torch.bfloat16
|
|
@@ -404,6 +405,30 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 404 |
tmp_dir = tempfile.mkdtemp()
|
| 405 |
outputs = []
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
for sample_idx in range(num_samples):
|
| 408 |
rng = torch.Generator(device=device)
|
| 409 |
if seed_val >= 0:
|
|
@@ -411,38 +436,65 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 411 |
else:
|
| 412 |
rng.seed()
|
| 413 |
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
|
| 442 |
-
torchaudio.save(audio_path,
|
| 443 |
|
| 444 |
video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
|
| 445 |
-
|
| 446 |
outputs.append((video_path, audio_path))
|
| 447 |
|
| 448 |
return _pad_outputs(outputs)
|
|
@@ -705,7 +757,6 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 705 |
mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
|
| 706 |
mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 707 |
mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
|
| 708 |
-
mma_dur = gr.Slider(label="Duration (s)", minimum=1, maximum=10, value=8, step=0.5)
|
| 709 |
mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 710 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 711 |
|
|
@@ -725,8 +776,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 725 |
outputs=mma_slot_grps,
|
| 726 |
)
|
| 727 |
|
| 728 |
-
def _run_mmaudio(video, prompt, neg, seed, cfg, steps,
|
| 729 |
-
flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps,
|
| 730 |
n = int(n)
|
| 731 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 732 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
@@ -736,7 +787,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
|
|
| 736 |
mma_btn.click(
|
| 737 |
fn=_run_mmaudio,
|
| 738 |
inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
|
| 739 |
-
mma_cfg, mma_steps,
|
| 740 |
outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
|
| 741 |
)
|
| 742 |
|
|
|
|
| 353 |
# ================================================================== #
|
| 354 |
|
| 355 |
@spaces.GPU(duration=600)
|
| 356 |
+
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 357 |
+
|
| 358 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 359 |
+
cfg_strength, num_steps, num_samples):
|
| 360 |
+
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 361 |
# MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
|
| 362 |
import sys as _sys, os as _os
|
| 363 |
_mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
|
|
|
|
| 372 |
|
| 373 |
seed_val = int(seed_val)
|
| 374 |
num_samples = int(num_samples)
|
|
|
|
| 375 |
|
| 376 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 377 |
dtype = torch.bfloat16
|
|
|
|
| 405 |
tmp_dir = tempfile.mkdtemp()
|
| 406 |
outputs = []
|
| 407 |
|
| 408 |
+
# MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
|
| 409 |
+
# with a 1 s crossfade overlap and stitch the results into a full-length track.
|
| 410 |
+
total_dur_s = get_video_duration(video_file)
|
| 411 |
+
MMA_CF_S = 1.0 # crossfade seconds between segments
|
| 412 |
+
MMA_CF_DB = 3.0
|
| 413 |
+
|
| 414 |
+
def _mma_build_segments(total_s, cf_s):
|
| 415 |
+
if total_s <= MMAUDIO_WINDOW:
|
| 416 |
+
return [(0.0, total_s)]
|
| 417 |
+
step_s = MMAUDIO_WINDOW - cf_s
|
| 418 |
+
segs, t = [], 0.0
|
| 419 |
+
while True:
|
| 420 |
+
if t + MMAUDIO_WINDOW >= total_s:
|
| 421 |
+
segs.append((max(0.0, total_s - MMAUDIO_WINDOW), total_s))
|
| 422 |
+
break
|
| 423 |
+
segs.append((t, t + MMAUDIO_WINDOW))
|
| 424 |
+
t += step_s
|
| 425 |
+
return segs
|
| 426 |
+
|
| 427 |
+
segments = _mma_build_segments(total_dur_s, MMA_CF_S)
|
| 428 |
+
print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
|
| 429 |
+
|
| 430 |
+
sr = seq_cfg.sampling_rate # 44100
|
| 431 |
+
|
| 432 |
for sample_idx in range(num_samples):
|
| 433 |
rng = torch.Generator(device=device)
|
| 434 |
if seed_val >= 0:
|
|
|
|
| 436 |
else:
|
| 437 |
rng.seed()
|
| 438 |
|
| 439 |
+
seg_audios = [] # list of (channels, samples) numpy arrays
|
| 440 |
+
|
| 441 |
+
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 442 |
+
seg_dur = seg_end - seg_start
|
| 443 |
+
# Trim a clean video clip for this segment
|
| 444 |
+
seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
|
| 445 |
+
ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
|
| 446 |
+
seg_path, vcodec="libx264", acodec="aac", strict="experimental"
|
| 447 |
+
).run(overwrite_output=True, quiet=True)
|
| 448 |
+
|
| 449 |
+
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 450 |
+
video_info = load_video(seg_path, seg_dur)
|
| 451 |
+
clip_frames = video_info.clip_frames.unsqueeze(0)
|
| 452 |
+
sync_frames = video_info.sync_frames.unsqueeze(0)
|
| 453 |
+
actual_dur = video_info.duration_sec
|
| 454 |
+
|
| 455 |
+
seq_cfg.duration = actual_dur
|
| 456 |
+
net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
|
| 457 |
+
|
| 458 |
+
print(f"[MMAudio] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
|
| 459 |
+
f"{seg_start:.1f}–{seg_end:.1f}s | dur={actual_dur:.2f}s | prompt='{prompt}'")
|
| 460 |
+
|
| 461 |
+
with torch.no_grad():
|
| 462 |
+
audios = generate(
|
| 463 |
+
clip_frames,
|
| 464 |
+
sync_frames,
|
| 465 |
+
[prompt],
|
| 466 |
+
negative_text=[negative_prompt] if negative_prompt else None,
|
| 467 |
+
feature_utils=feature_utils,
|
| 468 |
+
net=net,
|
| 469 |
+
fm=fm,
|
| 470 |
+
rng=rng,
|
| 471 |
+
cfg_strength=float(cfg_strength),
|
| 472 |
+
)
|
| 473 |
+
wav = audios.float().cpu()[0].numpy() # (C, T)
|
| 474 |
+
seg_samples = int(round(seg_dur * sr))
|
| 475 |
+
wav = wav[:, :seg_samples]
|
| 476 |
+
seg_audios.append(wav)
|
| 477 |
+
|
| 478 |
+
# Crossfade-stitch all segments
|
| 479 |
+
def _cf_join(a, b, cf_s):
|
| 480 |
+
cf = int(round(cf_s * sr))
|
| 481 |
+
cf = min(cf, a.shape[1], b.shape[1])
|
| 482 |
+
if cf <= 0:
|
| 483 |
+
return np.concatenate([a, b], axis=1)
|
| 484 |
+
gain = 10 ** (MMA_CF_DB / 20.0)
|
| 485 |
+
overlap = a[:, -cf:] * gain + b[:, :cf] * gain
|
| 486 |
+
return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
|
| 487 |
+
|
| 488 |
+
full_wav = seg_audios[0]
|
| 489 |
+
for nw in seg_audios[1:]:
|
| 490 |
+
full_wav = _cf_join(full_wav, nw, MMA_CF_S)
|
| 491 |
+
full_wav = full_wav[:, : int(round(total_dur_s * sr))]
|
| 492 |
|
| 493 |
audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
|
| 494 |
+
torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
|
| 495 |
|
| 496 |
video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
|
| 497 |
+
mux_video_audio(video_file, audio_path, video_path)
|
| 498 |
outputs.append((video_path, audio_path))
|
| 499 |
|
| 500 |
return _pad_outputs(outputs)
|
|
|
|
| 757 |
mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
|
| 758 |
mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
|
| 759 |
mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
|
|
|
|
| 760 |
mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
|
| 761 |
mma_btn = gr.Button("Generate", variant="primary")
|
| 762 |
|
|
|
|
| 776 |
outputs=mma_slot_grps,
|
| 777 |
)
|
| 778 |
|
| 779 |
+
def _run_mmaudio(video, prompt, neg, seed, cfg, steps, n):
|
| 780 |
+
flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n)
|
| 781 |
n = int(n)
|
| 782 |
grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
|
| 783 |
vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
|
|
|
|
| 787 |
mma_btn.click(
|
| 788 |
fn=_run_mmaudio,
|
| 789 |
inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
|
| 790 |
+
mma_cfg, mma_steps, mma_samples],
|
| 791 |
outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
|
| 792 |
)
|
| 793 |
|