BoxOfColors committed on
Commit
bdf9957
·
1 Parent(s): 2b2a599

MMAudio: sliding-window segmentation for videos longer than 8 s; remove duration slider

Browse files

load_video(video_file, duration) hard-caps to the duration param, so
generation was silently truncated to 8 s for any longer video. Fix:
segment the input with ffmpeg into overlapping <=8 s clips, run
generate() on each, and crossfade-stitch into a full-length track.
Also remove the Duration slider from the UI — window size is fixed at
8 s (MMAudio's native window) and segmentation handles long videos.

Files changed (1) hide show
  1. app.py +86 -35
app.py CHANGED
@@ -353,9 +353,11 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
353
  # ================================================================== #
354
 
355
  @spaces.GPU(duration=600)
 
 
356
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
357
- cfg_strength, num_steps, duration, num_samples):
358
- """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s window, text-guided."""
359
  # MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
360
  import sys as _sys, os as _os
361
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
@@ -370,7 +372,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
370
 
371
  seed_val = int(seed_val)
372
  num_samples = int(num_samples)
373
- duration = float(duration)
374
 
375
  device = "cuda" if torch.cuda.is_available() else "cpu"
376
  dtype = torch.bfloat16
@@ -404,6 +405,30 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
404
  tmp_dir = tempfile.mkdtemp()
405
  outputs = []
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  for sample_idx in range(num_samples):
408
  rng = torch.Generator(device=device)
409
  if seed_val >= 0:
@@ -411,38 +436,65 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
411
  else:
412
  rng.seed()
413
 
414
- fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
415
-
416
- # load_video() resamples to 8 fps (CLIP) and 25 fps (Synchformer) on the fly
417
- video_info = load_video(video_file, duration)
418
- clip_frames = video_info.clip_frames.unsqueeze(0) # (1, T_clip, C, H, W)
419
- sync_frames = video_info.sync_frames.unsqueeze(0) # (1, T_sync, C, H, W)
420
- actual_dur = video_info.duration_sec
421
-
422
- seq_cfg.duration = actual_dur
423
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
424
-
425
- print(f"[MMAudio] Sample {sample_idx+1} | duration={actual_dur:.2f}s | prompt='{prompt}'")
426
-
427
- with torch.no_grad():
428
- audios = generate(
429
- clip_frames,
430
- sync_frames,
431
- [prompt],
432
- negative_text=[negative_prompt] if negative_prompt else None,
433
- feature_utils=feature_utils,
434
- net=net,
435
- fm=fm,
436
- rng=rng,
437
- cfg_strength=float(cfg_strength),
438
- )
439
- audio = audios.float().cpu()[0] # (C, T)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
442
- torchaudio.save(audio_path, audio, seq_cfg.sampling_rate)
443
 
444
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
445
- make_video(video_info, video_path, audio, sampling_rate=seq_cfg.sampling_rate)
446
  outputs.append((video_path, audio_path))
447
 
448
  return _pad_outputs(outputs)
@@ -705,7 +757,6 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
705
  mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
706
  mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
707
  mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
708
- mma_dur = gr.Slider(label="Duration (s)", minimum=1, maximum=10, value=8, step=0.5)
709
  mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
710
  mma_btn = gr.Button("Generate", variant="primary")
711
 
@@ -725,8 +776,8 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
725
  outputs=mma_slot_grps,
726
  )
727
 
728
- def _run_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n):
729
- flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, dur, n)
730
  n = int(n)
731
  grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
732
  vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
@@ -736,7 +787,7 @@ with gr.Blocks(title="Generate Audio for Video") as demo:
736
  mma_btn.click(
737
  fn=_run_mmaudio,
738
  inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
739
- mma_cfg, mma_steps, mma_dur, mma_samples],
740
  outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
741
  )
742
 
 
353
  # ================================================================== #
354
 
355
  @spaces.GPU(duration=600)
356
+ MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
357
+
358
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
359
+ cfg_strength, num_steps, num_samples):
360
+ """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
361
  # MMAudio is a local package in ./MMAudio/ — add it to sys.path so imports work.
362
  import sys as _sys, os as _os
363
  _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
 
372
 
373
  seed_val = int(seed_val)
374
  num_samples = int(num_samples)
 
375
 
376
  device = "cuda" if torch.cuda.is_available() else "cpu"
377
  dtype = torch.bfloat16
 
405
  tmp_dir = tempfile.mkdtemp()
406
  outputs = []
407
 
408
+ # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
409
+ # with a 1 s crossfade overlap and stitch the results into a full-length track.
410
+ total_dur_s = get_video_duration(video_file)
411
+ MMA_CF_S = 1.0 # crossfade seconds between segments
412
+ MMA_CF_DB = 3.0
413
+
414
+ def _mma_build_segments(total_s, cf_s):
415
+ if total_s <= MMAUDIO_WINDOW:
416
+ return [(0.0, total_s)]
417
+ step_s = MMAUDIO_WINDOW - cf_s
418
+ segs, t = [], 0.0
419
+ while True:
420
+ if t + MMAUDIO_WINDOW >= total_s:
421
+ segs.append((max(0.0, total_s - MMAUDIO_WINDOW), total_s))
422
+ break
423
+ segs.append((t, t + MMAUDIO_WINDOW))
424
+ t += step_s
425
+ return segs
426
+
427
+ segments = _mma_build_segments(total_dur_s, MMA_CF_S)
428
+ print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
429
+
430
+ sr = seq_cfg.sampling_rate # 44100
431
+
432
  for sample_idx in range(num_samples):
433
  rng = torch.Generator(device=device)
434
  if seed_val >= 0:
 
436
  else:
437
  rng.seed()
438
 
439
+ seg_audios = [] # list of (channels, samples) numpy arrays
440
+
441
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
442
+ seg_dur = seg_end - seg_start
443
+ # Trim a clean video clip for this segment
444
+ seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
445
+ ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
446
+ seg_path, vcodec="libx264", acodec="aac", strict="experimental"
447
+ ).run(overwrite_output=True, quiet=True)
448
+
449
+ fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
450
+ video_info = load_video(seg_path, seg_dur)
451
+ clip_frames = video_info.clip_frames.unsqueeze(0)
452
+ sync_frames = video_info.sync_frames.unsqueeze(0)
453
+ actual_dur = video_info.duration_sec
454
+
455
+ seq_cfg.duration = actual_dur
456
+ net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
457
+
458
+ print(f"[MMAudio] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
459
+ f"{seg_start:.1f}–{seg_end:.1f}s | dur={actual_dur:.2f}s | prompt='{prompt}'")
460
+
461
+ with torch.no_grad():
462
+ audios = generate(
463
+ clip_frames,
464
+ sync_frames,
465
+ [prompt],
466
+ negative_text=[negative_prompt] if negative_prompt else None,
467
+ feature_utils=feature_utils,
468
+ net=net,
469
+ fm=fm,
470
+ rng=rng,
471
+ cfg_strength=float(cfg_strength),
472
+ )
473
+ wav = audios.float().cpu()[0].numpy() # (C, T)
474
+ seg_samples = int(round(seg_dur * sr))
475
+ wav = wav[:, :seg_samples]
476
+ seg_audios.append(wav)
477
+
478
+ # Crossfade-stitch all segments
479
+ def _cf_join(a, b, cf_s):
480
+ cf = int(round(cf_s * sr))
481
+ cf = min(cf, a.shape[1], b.shape[1])
482
+ if cf <= 0:
483
+ return np.concatenate([a, b], axis=1)
484
+ gain = 10 ** (MMA_CF_DB / 20.0)
485
+ overlap = a[:, -cf:] * gain + b[:, :cf] * gain
486
+ return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
487
+
488
+ full_wav = seg_audios[0]
489
+ for nw in seg_audios[1:]:
490
+ full_wav = _cf_join(full_wav, nw, MMA_CF_S)
491
+ full_wav = full_wav[:, : int(round(total_dur_s * sr))]
492
 
493
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
494
+ torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
495
 
496
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
497
+ mux_video_audio(video_file, audio_path, video_path)
498
  outputs.append((video_path, audio_path))
499
 
500
  return _pad_outputs(outputs)
 
757
  mma_seed = gr.Number(label="Seed (-1 = random)", value=get_random_seed(), precision=0)
758
  mma_cfg = gr.Slider(label="CFG Strength", minimum=1, maximum=10, value=4.5, step=0.5)
759
  mma_steps = gr.Slider(label="Steps", minimum=10, maximum=50, value=25, step=1)
 
760
  mma_samples = gr.Slider(label="Generations", minimum=1, maximum=MAX_SLOTS, value=1, step=1)
761
  mma_btn = gr.Button("Generate", variant="primary")
762
 
 
776
  outputs=mma_slot_grps,
777
  )
778
 
779
+ def _run_mmaudio(video, prompt, neg, seed, cfg, steps, n):
780
+ flat = generate_mmaudio(video, prompt, neg, seed, cfg, steps, n)
781
  n = int(n)
782
  grp_upd = [gr.update(visible=(i < n)) for i in range(MAX_SLOTS)]
783
  vid_upd = [gr.update(value=flat[i * 2]) for i in range(MAX_SLOTS)]
 
787
  mma_btn.click(
788
  fn=_run_mmaudio,
789
  inputs=[mma_video, mma_prompt, mma_neg, mma_seed,
790
+ mma_cfg, mma_steps, mma_samples],
791
  outputs=mma_slot_grps + mma_slot_vids + mma_slot_auds,
792
  )
793