BoxOfColors committed on
Commit
bdf9957
·
1 Parent(s): 2b2a599

MMAudio: sliding-window segmentation for videos longer than 8 s; remove duration slider

Browse files

load_video(video_file, duration) hard-caps to the duration param, so
generation was silently truncated to 8 s for any longer video. Fix:
segment the input with ffmpeg into overlapping <=8 s clips, run
generate() on each, and crossfade-stitch into a full-length track.
Also remove the Duration slider from the UI — window size is fixed at
8 s (MMAudio's native window) and segmentation handles long videos.

Files changed (1) hide show
  1. app.py +86 -35
app.py CHANGED
@@ -353,9 +353,11 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
353
  # ================================================================== #
354
 
355
  @spaces.GPU(duration=600)
 
 
356
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
357
- cfg_strength, num_steps, duration, num_samples):
358
- """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s window, text-guided."""
359
  # MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
360
  import sys as _sys, os as _os
361
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
@@ -370,7 +372,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
370
 
371
  seed_val = int(seed_val)
372
  num_samples = int(num_samples)
373
- duration = float(duration)
374
 
375
  device = "cuda" if torch.cuda.is_available() else "cpu"
376
  dtype = torch.bfloat16
@@ -404,6 +405,30 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
404
  tmp_dir = tempfile.mkdtemp()
405
  outputs = []
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  for sample_idx in range(num_samples):
408
  rng = torch.Generator(device=device)
409
  if seed_val >= 0:
@@ -411,38 +436,65 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
411
  else:
412
  rng.seed()
413
 
414
- fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
415
-
416
- # load_video() resamples to 8 fps (CLIP) and 25 fps (Synchformer) on the fly
417
- video_info = load_video(video_file, duration)
418
- clip_frames = video_info.clip_frames.unsqueeze(0) # (1, T_clip, C, H, W)
419
- sync_frames = video_info.sync_frames.unsqueeze(0) # (1, T_sync, C, H, W)
420
- actual_dur = video_info.duration_sec
421
-
422
- seq_cfg.duration = actual_dur
423
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
424
-
425
- print(f"[MMAudio] Sample {sample_idx+1} | duration={actual_dur:.2f}s | prompt='{prompt}'")
426
-
427
- with torch.no_grad():
428
- audios = generate(
429
- clip_frames,
430
- sync_frames,
431
- [prompt],
432
- negative_text=[negative_prompt] if negative_prompt else None,
433
- feature_utils=feature_utils,
434
- net=net,
435
- fm=fm,
436
- rng=rng,
437
- cfg_strength=float(cfg_strength),
438
- )
439
- audio = audios.float().cpu()[0] # (C, T)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
442
- torchaudio.save(audio_path, audio, seq_cfg.sampling_rate)
443
 
444
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
445
- make_video(video_info, video_path, audio, sampling_rate=seq_cfg.sampling_rate)
446
  outputs.append((video_path, audio_path))
447
 
448
  return _pad_outputs(outputs)
@@ -705,7 +757,6 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
705
  mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
706
  mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
707
  mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
708
- mma_dur = gr.Slider(label="Duration (s)", minimum=1, maximum=10, value=8, step=0.5)
709
  mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
710
  mma_btn = gr.Button("Generate", variant="primary")
711
 
@@ -725,8 +776,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
725
  outputs=mma_slot_grps,
726
  )
727
 
728
- def _run_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n):
729
- flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n)
730
  n = int(n)
731
  grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
732
  vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
@@ -736,7 +787,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
736
  mma_btn.click(
737
  fn=_run_mmaudio,
738
  inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
739
- mma_cfg, mma_steps, mma_dur, mma_samples],
740
  outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
741
  )
742
 
 
353
  # ================================================================== #
354
 
355
  @spaces.GPU(duration=600)
356
+ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
357
+
358
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
359
+ cfg_strength, num_steps, num_samples):
360
+ """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
361
  # MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
362
  import sys as _sys, os as _os
363
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
 
372
 
373
  seed_val = int(seed_val)
374
  num_samples = int(num_samples)
 
375
 
376
  device = "cuda" if torch.cuda.is_available() else "cpu"
377
  dtype = torch.bfloat16
 
405
  tmp_dir = tempfile.mkdtemp()
406
  outputs = []
407
 
408
+ # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
409
+ # with a 1 s crossfade overlap and stitch the results into a full-length track.
410
+ total_dur_s = get_video_duration(video_file)
411
+ MMA_CF_S = 1.0 # crossfade seconds between segments
412
+ MMA_CF_DB = 3.0
413
+
414
+ def _mma_build_segments(total_s, cf_s):
415
+ if total_s <= MMAUDIO_WINDOW:
416
+ return [(0.0, total_s)]
417
+ step_s = MMAUDIO_WINDOW - cf_s
418
+ segs, t = [], 0.0
419
+ while True:
420
+ if t + MMAUDIO_WINDOW >= total_s:
421
+ segs.append((max(0.0, total_s - MMAUDIO_WINDOW), total_s))
422
+ break
423
+ segs.append((t, t + MMAUDIO_WINDOW))
424
+ t += step_s
425
+ return segs
426
+
427
+ segments = _mma_build_segments(total_dur_s, MMA_CF_S)
428
+ print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
429
+
430
+ sr = seq_cfg.sampling_rate # 44100
431
+
432
  for sample_idx in range(num_samples):
433
  rng = torch.Generator(device=device)
434
  if seed_val >= 0:
 
436
  else:
437
  rng.seed()
438
 
439
+ seg_audios = [] # list of (channels, samples) numpy arrays
440
+
441
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
442
+ seg_dur = seg_end - seg_start
443
+ # Trim a clean video clip for this segment
444
+ seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
445
+ ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
446
+ seg_path, vcodec="libx264", acodec="aac", strict="experimental"
447
+ ).run(overwrite_output=True, quiet=True)
448
+
449
+ fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
450
+ video_info = load_video(seg_path, seg_dur)
451
+ clip_frames = video_info.clip_frames.unsqueeze(0)
452
+ sync_frames = video_info.sync_frames.unsqueeze(0)
453
+ actual_dur = video_info.duration_sec
454
+
455
+ seq_cfg.duration = actual_dur
456
+ net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
457
+
458
+ print(f"[MMAudio] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
459
+ f"{seg_start:.1f}–{seg_end:.1f}s | dur={actual_dur:.2f}s | prompt='{prompt}'")
460
+
461
+ with torch.no_grad():
462
+ audios = generate(
463
+ clip_frames,
464
+ sync_frames,
465
+ [prompt],
466
+ negative_text=[negative_prompt] if negative_prompt else None,
467
+ feature_utils=feature_utils,
468
+ net=net,
469
+ fm=fm,
470
+ rng=rng,
471
+ cfg_strength=float(cfg_strength),
472
+ )
473
+ wav = audios.float().cpu()[0].numpy() # (C, T)
474
+ seg_samples = int(round(seg_dur * sr))
475
+ wav = wav[:, :seg_samples]
476
+ seg_audios.append(wav)
477
+
478
+ # Crossfade-stitch all segments
479
+ def _cf_join(a, b, cf_s):
480
+ cf = int(round(cf_s * sr))
481
+ cf = min(cf, a.shape[1], b.shape[1])
482
+ if cf <= 0:
483
+ return np.concatenate([a, b], axis=1)
484
+ gain = 10 ** (MMA_CF_DB / 20.0)
485
+ overlap = a[:, -cf:] * gain + b[:, :cf] * gain
486
+ return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
487
+
488
+ full_wav = seg_audios[0]
489
+ for nw in seg_audios[1:]:
490
+ full_wav = _cf_join(full_wav, nw, MMA_CF_S)
491
+ full_wav = full_wav[:, : int(round(total_dur_s * sr))]
492
 
493
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
494
+ torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
495
 
496
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
497
+ mux_video_audio(video_file, audio_path, video_path)
498
  outputs.append((video_path, audio_path))
499
 
500
  return _pad_outputs(outputs)
 
757
  mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
758
  mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
759
  mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
 
760
  mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
761
  mma_btn = gr.Button("Generate", variant="primary")
762
 
 
776
  outputs=mma_slot_grps,
777
  )
778
 
779
+ def _run_mmaudio(video, prompt, neg, seed, cfg, steps, n):
780
+ flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n)
781
  n = int(n)
782
  grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
783
  vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
 
787
  mma_btn.click(
788
  fn=_run_mmaudio,
789
  inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
790
+ mma_cfg, mma_steps, mma_samples],
791
  outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
792
  )
793