JackIsNotInTheBox committed on
Commit
eaedb53
·
1 Parent(s): 0429f8a

Add crossfade duration/dB controls, inference caching, share=True

Browse files
Files changed (1) hide show
  1. app.py +172 -154
app.py CHANGED
@@ -30,6 +30,14 @@ onset_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="onset_model.ckpt",
30
  taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache_dir=CACHE_DIR)
31
  print("Checkpoints downloaded.")
32
 
 
 
 
 
 
 
 
 
33
 
34
  def set_global_seed(seed):
35
  np.random.seed(seed % (2**32))
@@ -56,22 +64,22 @@ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
56
  cfg_scale, num_steps, mode,
57
  euler_sampler, euler_maruyama_sampler):
58
  """
59
- Run one model inference pass for the video window [seg_start_s, seg_start_s + model_dur].
60
- Returns a numpy float32 wav array of exactly round(model_dur * sr) samples,
61
- trimmed to the actual segment length (seg_end_s - seg_start_s) when shorter.
62
  """
63
- # -- CAVP features: 4 fps --
64
  cavp_start = int(round(seg_start_s * fps))
65
- cavp_end = cavp_start + truncate_frame
66
- cavp_slice = cavp_feats_full[cavp_start:cavp_end]
67
- # pad if near end of video
68
  if cavp_slice.shape[0] < truncate_frame:
69
- pad = np.zeros((truncate_frame - cavp_slice.shape[0],) + cavp_slice.shape[1:], dtype=cavp_slice.dtype)
 
 
 
70
  cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
71
  video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
72
 
73
- # -- Onset features: truncate_onset frames per model_dur --
74
- onset_fps = truncate_onset / model_dur # frames per second of onset feats
75
  onset_start = int(round(seg_start_s * onset_fps))
76
  onset_slice = onset_feats_full[onset_start : onset_start + truncate_onset]
77
  if onset_slice.shape[0] < truncate_onset:
@@ -79,7 +87,6 @@ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
79
  onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
80
  onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
81
 
82
- # -- Diffusion --
83
  z = torch.randn(1, model.in_channels, 204, 16, device=device).to(weight_dtype)
84
  sampling_kwargs = dict(
85
  model=model,
@@ -102,171 +109,180 @@ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
102
  samples = vae.decode(samples / latents_scale).sample
103
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
104
 
105
- # Trim to actual segment length
106
  seg_samples = int(round((seg_end_s - seg_start_s) * sr))
107
  return wav[:seg_samples]
108
 
109
 
110
- def crossfade_join(wav_a, wav_b, crossfade_s, sr):
111
  """
112
- Join wav_a and wav_b with a 2-second equal-power (+3 dB) crossfade.
113
-
114
- wav_a contains 1 s of 'extra' audio at its tail (the overlap region starts
115
- 1 s before its end). wav_b contains 1 s of 'extra' audio at its head.
116
- The crossfade window is crossfade_s wide; the midpoint sits at (crossfade_s/2)
117
- into the window, where each gain = sqrt(0.5) -3 dB ... wait, we want +3 dB
118
- at midpoint meaning both signals are at *full* amplitude there.
119
-
120
- Equal-power (sqrt) ramps: at the midpoint t=0.5 the fade-out = sqrt(0.5) and
121
- fade-in = sqrt(0.5), so combined power = 0.5+0.5 = 1.0 (+0 dB).
122
- For a +3 dB bump at midpoint we use *linear* ramps instead:
123
- fade_out = 1 - t, fade_in = t (t: 0->1 across window)
124
- At t=0.5: both = 0.5, sum = 1.0 amplitude = +6 dB power... that is not right.
125
-
126
- DaVinci Resolve "+3 dB" crossfade means the combined level at the midpoint
127
- is +3 dB above either source, which equals the behaviour where each signal
128
- is kept at full gain (1.0) across the entire overlap and the two are simply
129
- summed — then the overlap region has 6 dB of headroom risk, but the *perceived*
130
- loudness boost at the centre is +3 dB (sqrt(2) in amplitude).
131
-
132
- Implementation: keep both signals at unity gain in the crossfade window and
133
- sum them. Outside the window use the respective signal only.
134
  """
135
  cf_samples = int(round(crossfade_s * sr))
136
 
137
- # The crossfade sits at the junction: last cf_samples of wav_a overlap with
138
- # first cf_samples of wav_b.
139
- tail_a = wav_a[-cf_samples:] # 1s before end of a
140
- head_b = wav_b[:cf_samples] # 1s after start of b
141
- overlap = tail_a + head_b # +3 dB sum at centre (unity + unity)
142
 
143
- result = np.concatenate([
144
- wav_a[:-cf_samples], # body of a (before crossfade)
145
- overlap, # crossfade region
146
- wav_b[cf_samples:], # body of b (after crossfade)
 
 
 
 
 
 
147
  ])
148
- return result
149
 
150
 
151
- @spaces.GPU(duration=300)
152
- def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
153
- seed_val = int(seed_val)
154
- if seed_val < 0:
155
- seed_val = random.randint(0, 2**32 - 1)
156
- set_global_seed(seed_val)
157
- torch.set_grad_enabled(False)
158
- device = "cuda" if torch.cuda.is_available() else "cpu"
159
- weight_dtype = torch.bfloat16
 
 
160
 
161
- from cavp_util import Extract_CAVP_Features
162
- from onset_util import VideoOnsetNet, extract_onset
163
- from models import MMDiT
164
- from samplers import euler_sampler, euler_maruyama_sampler
165
- from diffusers import AudioLDM2Pipeline
166
 
167
- extract_cavp = Extract_CAVP_Features(
168
- device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path
169
- )
 
170
 
171
- state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
172
- new_state_dict = {}
173
- for key, value in state_dict.items():
174
- if "model.net.model" in key:
175
- new_key = key.replace("model.net.model", "net.model")
176
- elif "model.fc." in key:
177
- new_key = key.replace("model.fc", "fc")
178
- else:
179
- new_key = key
180
- new_state_dict[new_key] = value
181
- onset_model = VideoOnsetNet(False).to(device)
182
- onset_model.load_state_dict(new_state_dict)
183
- onset_model.eval()
184
-
185
- model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
186
- ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
187
- model.load_state_dict(ckpt)
188
- model.eval()
189
- model.to(weight_dtype)
190
-
191
- model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
192
- vae = model_audioldm.vae.to(device)
193
- vae.eval()
194
- vocoder = model_audioldm.vocoder.to(device)
195
-
196
- tmp_dir = tempfile.mkdtemp()
197
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
198
- strip_audio_from_video(video_file, silent_video)
199
-
200
- cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
201
- onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
202
 
203
  sr = 16000
204
  truncate = 131072
205
  fps = 4
206
- truncate_frame = int(fps * truncate / sr) # 32 cavp frames per segment
207
- truncate_onset = 120 # onset frames per segment
208
- model_dur = truncate / sr # 8.192 s
209
- crossfade_s = 2.0 # 2-second crossfade window
210
- # Each segment starts (model_dur - crossfade_s) later than the previous,
211
- # so the tails overlap by crossfade_s giving 1 s of extra audio on each side.
212
- step_s = model_dur - crossfade_s # 6.192 s
213
-
214
- latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
215
-
216
- # Total video duration from cavp features
217
- total_frames = cavp_feats.shape[0]
218
- total_dur_s = total_frames / fps
219
-
220
- # ------------------------------------------------------------------ #
221
- # Build segment list: each entry is (seg_start_s, seg_end_s) #
222
- # seg_end_s is the actual content end (clipped to video length), #
223
- # but we always run the model for a full model_dur window. #
224
- # ------------------------------------------------------------------ #
225
- segments = []
226
- seg_start = 0.0
227
- while True:
228
- seg_end = min(seg_start + model_dur, total_dur_s)
229
- segments.append((seg_start, seg_end))
230
- if seg_end >= total_dur_s:
231
- break
232
- seg_start += step_s
233
-
234
- # ------------------------------------------------------------------ #
235
- # Run inference for every segment #
236
- # ------------------------------------------------------------------ #
237
- wavs = []
238
- for seg_start_s, seg_end_s in segments:
239
- print(f"Inferring segment {seg_start_s:.2f}s – {seg_end_s:.2f}s ...")
240
- wav = infer_segment(
241
- model, vae, vocoder,
242
- cavp_feats, onset_feats,
243
- seg_start_s, seg_end_s,
244
- sr, fps, truncate_frame, truncate_onset, model_dur,
245
- latents_scale, device, weight_dtype,
246
- cfg_scale, num_steps, mode,
247
- euler_sampler, euler_maruyama_sampler,
248
- )
249
- wavs.append(wav)
250
-
251
- # ------------------------------------------------------------------ #
252
- # Stitch with crossfades #
253
- # Single segment: no crossfade needed #
254
- # ------------------------------------------------------------------ #
255
- if len(wavs) == 1:
256
- final_wav = wavs[0]
257
  else:
258
- final_wav = wavs[0]
259
- for next_wav in wavs[1:]:
260
- final_wav = crossfade_join(final_wav, next_wav, crossfade_s, sr)
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- # Clip to exact video duration
263
- target_samples = int(round(total_dur_s * sr))
264
- final_wav = final_wav[:target_samples]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
- audio_path = os.path.join(tmp_dir, "output.wav")
267
  sf.write(audio_path, final_wav, sr)
268
 
269
- # Mux original silent video (full length) with generated audio
270
  output_video = os.path.join(tmp_dir, "output.mp4")
271
  input_v = ffmpeg.input(silent_video)
272
  input_a = ffmpeg.input(audio_path)
@@ -292,12 +308,14 @@ demo = gr.Interface(
292
  gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
293
  gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
294
  gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde"),
 
 
295
  ],
296
  outputs=[
297
  gr.Video(label="Output Video with Audio"),
298
  gr.Audio(label="Generated Audio"),
299
  ],
300
  title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
301
- description="Upload a video and generate synchronized audio using TARO. Optimal duration is 8.2s.",
302
  )
303
- demo.queue().launch()
 
30
  taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache_dir=CACHE_DIR)
31
  print("Checkpoints downloaded.")
32
 
33
+ # ------------------------------------------------------------------ #
34
+ # Inference cache: keyed by (video_path, seed, cfg_scale, #
35
+ # num_steps, mode, crossfade_s) #
36
+ # Stores the raw per-segment wavs so that only the dB value can be #
37
+ # changed without re-running the model. #
38
+ # ------------------------------------------------------------------ #
39
+ _INFERENCE_CACHE = {} # key -> {"wavs": [...], "sr": int}
40
+
41
 
42
  def set_global_seed(seed):
43
  np.random.seed(seed % (2**32))
 
64
  cfg_scale, num_steps, mode,
65
  euler_sampler, euler_maruyama_sampler):
66
  """
67
+ Run one model inference pass for the video window starting at seg_start_s.
68
+ Returns a numpy float32 wav array trimmed to (seg_end_s - seg_start_s).
 
69
  """
70
+ # CAVP features at fps (4 fps)
71
  cavp_start = int(round(seg_start_s * fps))
72
+ cavp_slice = cavp_feats_full[cavp_start : cavp_start + truncate_frame]
 
 
73
  if cavp_slice.shape[0] < truncate_frame:
74
+ pad = np.zeros(
75
+ (truncate_frame - cavp_slice.shape[0],) + cavp_slice.shape[1:],
76
+ dtype=cavp_slice.dtype,
77
+ )
78
  cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
79
  video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
80
 
81
+ # Onset features at truncate_onset / model_dur frames per second
82
+ onset_fps = truncate_onset / model_dur
83
  onset_start = int(round(seg_start_s * onset_fps))
84
  onset_slice = onset_feats_full[onset_start : onset_start + truncate_onset]
85
  if onset_slice.shape[0] < truncate_onset:
 
87
  onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
88
  onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
89
 
 
90
  z = torch.randn(1, model.in_channels, 204, 16, device=device).to(weight_dtype)
91
  sampling_kwargs = dict(
92
  model=model,
 
109
  samples = vae.decode(samples / latents_scale).sample
110
  wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
111
 
 
112
  seg_samples = int(round((seg_end_s - seg_start_s) * sr))
113
  return wav[:seg_samples]
114
 
115
 
116
def crossfade_join(wav_a, wav_b, crossfade_s, db_boost, sr):
    """Join two wav arrays by overlapping and summing their boundary regions.

    The last ``crossfade_s`` seconds of ``wav_a`` are overlapped with the
    first ``crossfade_s`` seconds of ``wav_b``; both overlap slices are
    scaled by ``gain = 10 ** (db_boost / 20)`` and summed sample-wise.

    Args:
        wav_a: 1-D numpy array, the earlier signal.
        wav_b: 1-D numpy array, the later signal.
        crossfade_s: Overlap duration in seconds. If the window exceeds
            either input's length it is shrunk to fit; a non-positive
            window degenerates to plain concatenation.
        db_boost: Gain in dB applied to BOTH signals inside the overlap.
            Note that with two equal-amplitude, correlated signals even
            0 dB (gain 1.0) already doubles the amplitude (+6 dB) at the
            seam, since the unity-gain slices are simply summed; db_boost
            scales that blend level up or down from there.
        sr: Sample rate in Hz.

    Returns:
        A numpy array of length ``len(wav_a) + len(wav_b) - overlap``.

    NOTE(review): no amplitude ramp is applied across the window — both
    signals enter and leave the overlap at full (scaled) level, which can
    produce an audible step at the window edges when gain != 1.0 or the
    two signals differ at the seam. Confirm this overlap-and-sum blend
    (rather than an equal-power ramped crossfade) is intended.
    """
    cf_samples = int(round(crossfade_s * sr))

    # Shrink the window if either signal is shorter than it; with no
    # usable window, fall back to a plain join.
    cf_samples = min(cf_samples, len(wav_a), len(wav_b))
    if cf_samples <= 0:
        return np.concatenate([wav_a, wav_b])

    gain = 10 ** (db_boost / 20.0)

    tail_a = wav_a[-cf_samples:] * gain
    head_b = wav_b[:cf_samples] * gain
    overlap = tail_a + head_b

    return np.concatenate([
        wav_a[:-cf_samples],
        overlap,
        wav_b[cf_samples:],
    ])
 
147
 
148
 
149
def stitch_wavs(wavs, crossfade_s, db_boost, sr, total_dur_s):
    """Stitch per-segment wav arrays into one track and clip it to length.

    Consecutive segments are joined left-to-right with ``crossfade_join``
    (overlap-and-sum, blend level controlled by ``db_boost``), then the
    result is truncated to ``total_dur_s`` seconds.

    Args:
        wavs: Non-empty list of 1-D numpy wav arrays, in playback order.
        crossfade_s: Crossfade window in seconds, forwarded to crossfade_join.
        db_boost: Overlap gain in dB, forwarded to crossfade_join.
        sr: Sample rate in Hz.
        total_dur_s: Target duration in seconds; excess samples are dropped
            (a shorter result is returned as-is, not padded).

    Returns:
        A single numpy wav array of at most ``round(total_dur_s * sr)`` samples.

    Raises:
        ValueError: If ``wavs`` is empty.
    """
    if not wavs:
        raise ValueError("stitch_wavs() requires at least one wav segment")

    # A single segment needs no joining; the fold below covers both the
    # one-segment and multi-segment cases (the loop body never runs for one).
    final_wav = wavs[0]
    for next_wav in wavs[1:]:
        final_wav = crossfade_join(final_wav, next_wav, crossfade_s, db_boost, sr)

    target_samples = int(round(total_dur_s * sr))
    return final_wav[:target_samples]
160
 
 
 
 
 
 
161
 
162
+ @spaces.GPU(duration=300)
163
+ def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode,
164
+ crossfade_s, crossfade_db):
165
+ global _INFERENCE_CACHE
166
 
167
+ seed_val = int(seed_val)
168
+ crossfade_s = float(crossfade_s)
169
+ crossfade_db = float(crossfade_db)
170
+
171
+ if seed_val < 0:
172
+ seed_val = random.randint(0, 2**32 - 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  sr = 16000
175
  truncate = 131072
176
  fps = 4
177
+ truncate_frame = int(fps * truncate / sr)
178
+ truncate_onset = 120
179
+ model_dur = truncate / sr # 8.192 s
180
+ step_s = model_dur - crossfade_s
181
+
182
+ # Cache key covers everything that affects segmentation and inference
183
+ cache_key = (video_file, seed_val, float(cfg_scale), int(num_steps), mode,
184
+ crossfade_s)
185
+
186
+ if cache_key in _INFERENCE_CACHE:
187
+ print("Cache hit skipping inference, re-stitching with new dB value.")
188
+ cached = _INFERENCE_CACHE[cache_key]
189
+ wavs = cached["wavs"]
190
+ total_dur_s = cached["total_dur_s"]
191
+ tmp_dir = cached["tmp_dir"]
192
+ silent_video = cached["silent_video"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  else:
194
+ set_global_seed(seed_val)
195
+ torch.set_grad_enabled(False)
196
+ device = "cuda" if torch.cuda.is_available() else "cpu"
197
+ weight_dtype = torch.bfloat16
198
+
199
+ from cavp_util import Extract_CAVP_Features
200
+ from onset_util import VideoOnsetNet, extract_onset
201
+ from models import MMDiT
202
+ from samplers import euler_sampler, euler_maruyama_sampler
203
+ from diffusers import AudioLDM2Pipeline
204
+
205
+ extract_cavp = Extract_CAVP_Features(
206
+ device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path
207
+ )
208
 
209
+ state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
210
+ new_state_dict = {}
211
+ for key, value in state_dict.items():
212
+ if "model.net.model" in key:
213
+ new_key = key.replace("model.net.model", "net.model")
214
+ elif "model.fc." in key:
215
+ new_key = key.replace("model.fc", "fc")
216
+ else:
217
+ new_key = key
218
+ new_state_dict[new_key] = value
219
+ onset_model = VideoOnsetNet(False).to(device)
220
+ onset_model.load_state_dict(new_state_dict)
221
+ onset_model.eval()
222
+
223
+ model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
224
+ ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
225
+ model.load_state_dict(ckpt)
226
+ model.eval()
227
+ model.to(weight_dtype)
228
+
229
+ model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
230
+ vae = model_audioldm.vae.to(device)
231
+ vae.eval()
232
+ vocoder = model_audioldm.vocoder.to(device)
233
+
234
+ tmp_dir = tempfile.mkdtemp()
235
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
236
+ strip_audio_from_video(video_file, silent_video)
237
+
238
+ cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
239
+ onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
240
+
241
+ latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
242
+
243
+ total_frames = cavp_feats.shape[0]
244
+ total_dur_s = total_frames / fps
245
+
246
+ # Build segment list
247
+ segments = []
248
+ seg_start = 0.0
249
+ while True:
250
+ seg_end = min(seg_start + model_dur, total_dur_s)
251
+ segments.append((seg_start, seg_end))
252
+ if seg_end >= total_dur_s:
253
+ break
254
+ seg_start += step_s
255
+
256
+ # Run inference for every segment
257
+ wavs = []
258
+ for seg_start_s, seg_end_s in segments:
259
+ print(f"Inferring segment {seg_start_s:.2f}s – {seg_end_s:.2f}s ...")
260
+ wav = infer_segment(
261
+ model, vae, vocoder,
262
+ cavp_feats, onset_feats,
263
+ seg_start_s, seg_end_s,
264
+ sr, fps, truncate_frame, truncate_onset, model_dur,
265
+ latents_scale, device, weight_dtype,
266
+ cfg_scale, num_steps, mode,
267
+ euler_sampler, euler_maruyama_sampler,
268
+ )
269
+ wavs.append(wav)
270
+
271
+ # Store in cache
272
+ _INFERENCE_CACHE[cache_key] = {
273
+ "wavs": wavs,
274
+ "total_dur_s": total_dur_s,
275
+ "tmp_dir": tmp_dir,
276
+ "silent_video": silent_video,
277
+ }
278
+
279
+ # Stitch with current crossfade params
280
+ device = "cuda" if torch.cuda.is_available() else "cpu"
281
+ final_wav = stitch_wavs(wavs, crossfade_s, crossfade_db, sr, total_dur_s)
282
 
283
+ audio_path = os.path.join(tmp_dir, "output.wav")
284
  sf.write(audio_path, final_wav, sr)
285
 
 
286
  output_video = os.path.join(tmp_dir, "output.mp4")
287
  input_v = ffmpeg.input(silent_video)
288
  input_a = ffmpeg.input(audio_path)
 
308
  gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
309
  gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
310
  gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde"),
311
+ gr.Slider(label="Crossfade Duration (s)", minimum=0, maximum=8, value=2, step=0.1),
312
+ gr.Textbox(label="Crossfade Boost (dB)", value="3"),
313
  ],
314
  outputs=[
315
  gr.Video(label="Output Video with Audio"),
316
  gr.Audio(label="Generated Audio"),
317
  ],
318
  title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
319
+ description="Upload a video and generate synchronized audio using TARO. Optimal clip duration is 8.2s. Longer videos are automatically split into overlapping segments and stitched with a crossfade.",
320
  )
321
+ demo.queue().launch(share=True)