JackIsNotInTheBox committed on
Commit
0429f8a
·
1 Parent(s): d0e121d

Support videos longer than 8.2s via overlapping inference + 2s +3dB crossfade stitching

Browse files
Files changed (1) hide show
  1. app.py +162 -57
app.py CHANGED
@@ -49,6 +49,105 @@ def strip_audio_from_video(video_path, output_path):
49
  )
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  @spaces.GPU(duration=300)
53
  def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
54
  seed_val = int(seed_val)
@@ -101,74 +200,80 @@ def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
101
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
102
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
103
 
104
- sr = 16000
105
- truncate = 131072
106
- fps = 4
107
- truncate_frame = int(fps * truncate / sr)
108
- truncate_onset = 120
 
 
 
 
 
109
 
110
  latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
111
 
112
- video_feats = torch.from_numpy(cavp_feats[:truncate_frame]).unsqueeze(0).to(device).to(weight_dtype)
113
-
114
- # Slice onset features and pad to truncate_onset if the video is shorter than expected
115
- onset_feats_sliced = onset_feats[:truncate_onset]
116
- actual_onset_len = onset_feats_sliced.shape[0]
117
- if actual_onset_len < truncate_onset:
118
- pad_len = truncate_onset - actual_onset_len
119
- onset_feats_sliced = np.pad(
120
- onset_feats_sliced,
121
- ((0, pad_len),),
122
- mode="constant",
123
- constant_values=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  )
125
- onset_feats_t = torch.from_numpy(onset_feats_sliced).unsqueeze(0).to(device).to(weight_dtype)
126
-
127
- z = torch.randn(len(video_feats), model.in_channels, 204, 16, device=device).to(weight_dtype)
128
-
129
- sampling_kwargs = dict(
130
- model=model,
131
- latents=z,
132
- y=onset_feats_t,
133
- context=video_feats,
134
- num_steps=int(num_steps),
135
- heun=False,
136
- cfg_scale=float(cfg_scale),
137
- guidance_low=0.0,
138
- guidance_high=0.7,
139
- path_type="linear",
140
- )
141
-
142
- with torch.no_grad():
143
- if mode == "sde":
144
- samples = euler_maruyama_sampler(**sampling_kwargs)
145
- else:
146
- samples = euler_sampler(**sampling_kwargs)
147
-
148
- samples = vae.decode(samples / latents_scale).sample
149
-
150
- # Cast to float32 before vocoder (HiFi-GAN requires float32)
151
- wav_samples = vocoder(samples.squeeze().float()).detach().cpu().numpy()
152
 
153
  audio_path = os.path.join(tmp_dir, "output.wav")
154
- sf.write(audio_path, wav_samples, sr)
155
 
156
- duration = truncate / sr
157
- trimmed_video = os.path.join(tmp_dir, "trimmed.mp4")
158
  output_video = os.path.join(tmp_dir, "output.mp4")
159
-
160
- (
161
- ffmpeg
162
- .input(silent_video, ss=0, t=duration)
163
- .output(trimmed_video, vcodec="libx264", an=None)
164
- .run(overwrite_output=True, quiet=True)
165
- )
166
-
167
- input_v = ffmpeg.input(trimmed_video)
168
  input_a = ffmpeg.input(audio_path)
169
  (
170
  ffmpeg
171
- .output(input_v, input_a, output_video, vcodec="libx264", acodec="aac", strict="experimental")
 
172
  .run(overwrite_output=True, quiet=True)
173
  )
174
 
 
49
  )
50
 
51
 
52
+ def infer_segment(model, vae, vocoder, cavp_feats_full, onset_feats_full,
53
+ seg_start_s, seg_end_s,
54
+ sr, fps, truncate_frame, truncate_onset, model_dur,
55
+ latents_scale, device, weight_dtype,
56
+ cfg_scale, num_steps, mode,
57
+ euler_sampler, euler_maruyama_sampler):
58
+ """
59
+ Run one model inference pass for the video window [seg_start_s, seg_start_s + model_dur].
60
+ Returns a numpy float32 wav array of exactly round(model_dur * sr) samples,
61
+ trimmed to the actual segment length (seg_end_s - seg_start_s) when shorter.
62
+ """
63
+ # -- CAVP features: 4 fps --
64
+ cavp_start = int(round(seg_start_s * fps))
65
+ cavp_end = cavp_start + truncate_frame
66
+ cavp_slice = cavp_feats_full[cavp_start:cavp_end]
67
+ # pad if near end of video
68
+ if cavp_slice.shape[0] < truncate_frame:
69
+ pad = np.zeros((truncate_frame - cavp_slice.shape[0],) + cavp_slice.shape[1:], dtype=cavp_slice.dtype)
70
+ cavp_slice = np.concatenate([cavp_slice, pad], axis=0)
71
+ video_feats = torch.from_numpy(cavp_slice).unsqueeze(0).to(device).to(weight_dtype)
72
+
73
+ # -- Onset features: truncate_onset frames per model_dur --
74
+ onset_fps = truncate_onset / model_dur # frames per second of onset feats
75
+ onset_start = int(round(seg_start_s * onset_fps))
76
+ onset_slice = onset_feats_full[onset_start : onset_start + truncate_onset]
77
+ if onset_slice.shape[0] < truncate_onset:
78
+ pad_len = truncate_onset - onset_slice.shape[0]
79
+ onset_slice = np.pad(onset_slice, ((0, pad_len),), mode="constant", constant_values=0)
80
+ onset_feats_t = torch.from_numpy(onset_slice).unsqueeze(0).to(device).to(weight_dtype)
81
+
82
+ # -- Diffusion --
83
+ z = torch.randn(1, model.in_channels, 204, 16, device=device).to(weight_dtype)
84
+ sampling_kwargs = dict(
85
+ model=model,
86
+ latents=z,
87
+ y=onset_feats_t,
88
+ context=video_feats,
89
+ num_steps=int(num_steps),
90
+ heun=False,
91
+ cfg_scale=float(cfg_scale),
92
+ guidance_low=0.0,
93
+ guidance_high=0.7,
94
+ path_type="linear",
95
+ )
96
+ with torch.no_grad():
97
+ if mode == "sde":
98
+ samples = euler_maruyama_sampler(**sampling_kwargs)
99
+ else:
100
+ samples = euler_sampler(**sampling_kwargs)
101
+
102
+ samples = vae.decode(samples / latents_scale).sample
103
+ wav = vocoder(samples.squeeze().float()).detach().cpu().numpy()
104
+
105
+ # Trim to actual segment length
106
+ seg_samples = int(round((seg_end_s - seg_start_s) * sr))
107
+ return wav[:seg_samples]
108
+
109
+
110
+ def crossfade_join(wav_a, wav_b, crossfade_s, sr):
111
+ """
112
+ Join wav_a and wav_b with a 2-second equal-power (+3 dB) crossfade.
113
+
114
+ wav_a contains 1 s of 'extra' audio at its tail (the overlap region starts
115
+ 1 s before its end). wav_b contains 1 s of 'extra' audio at its head.
116
+ The crossfade window is crossfade_s wide; the midpoint sits at (crossfade_s/2)
117
+ into the window, where each gain = sqrt(0.5) ≈ -3 dB ... wait, we want +3 dB
118
+ at midpoint meaning both signals are at *full* amplitude there.
119
+
120
+ Equal-power (sqrt) ramps: at the midpoint t=0.5 the fade-out = sqrt(0.5) and
121
+ fade-in = sqrt(0.5), so combined power = 0.5+0.5 = 1.0 (+0 dB).
122
+ For a +3 dB bump at midpoint we use *linear* ramps instead:
123
+ fade_out = 1 - t, fade_in = t (t: 0->1 across window)
124
+ At t=0.5: both = 0.5, sum = 1.0 amplitude = +6 dB power... that is not right.
125
+
126
+ DaVinci Resolve "+3 dB" crossfade means the combined level at the midpoint
127
+ is +3 dB above either source, which equals the behaviour where each signal
128
+ is kept at full gain (1.0) across the entire overlap and the two are simply
129
+ summed — then the overlap region has 6 dB of headroom risk, but the *perceived*
130
+ loudness boost at the centre is +3 dB (sqrt(2) in amplitude).
131
+
132
+ Implementation: keep both signals at unity gain in the crossfade window and
133
+ sum them. Outside the window use the respective signal only.
134
+ """
135
+ cf_samples = int(round(crossfade_s * sr))
136
+
137
+ # The crossfade sits at the junction: last cf_samples of wav_a overlap with
138
+ # first cf_samples of wav_b.
139
+ tail_a = wav_a[-cf_samples:] # 1s before end of a
140
+ head_b = wav_b[:cf_samples] # 1s after start of b
141
+ overlap = tail_a + head_b # +3 dB sum at centre (unity + unity)
142
+
143
+ result = np.concatenate([
144
+ wav_a[:-cf_samples], # body of a (before crossfade)
145
+ overlap, # crossfade region
146
+ wav_b[cf_samples:], # body of b (after crossfade)
147
+ ])
148
+ return result
149
+
150
+
151
  @spaces.GPU(duration=300)
152
  def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
153
  seed_val = int(seed_val)
 
200
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
201
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
202
 
203
+ sr = 16000
204
+ truncate = 131072
205
+ fps = 4
206
+ truncate_frame = int(fps * truncate / sr) # 32 cavp frames per segment
207
+ truncate_onset = 120 # onset frames per segment
208
+ model_dur = truncate / sr # 8.192 s
209
+ crossfade_s = 2.0 # 2-second crossfade window
210
+ # Each segment starts (model_dur - crossfade_s) later than the previous,
211
+ # so the tails overlap by crossfade_s giving 1 s of extra audio on each side.
212
+ step_s = model_dur - crossfade_s # 6.192 s
213
 
214
  latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
215
 
216
+ # Total video duration from cavp features
217
+ total_frames = cavp_feats.shape[0]
218
+ total_dur_s = total_frames / fps
219
+
220
+ # ------------------------------------------------------------------ #
221
+ # Build segment list: each entry is (seg_start_s, seg_end_s) #
222
+ # seg_end_s is the actual content end (clipped to video length), #
223
+ # but we always run the model for a full model_dur window. #
224
+ # ------------------------------------------------------------------ #
225
+ segments = []
226
+ seg_start = 0.0
227
+ while True:
228
+ seg_end = min(seg_start + model_dur, total_dur_s)
229
+ segments.append((seg_start, seg_end))
230
+ if seg_end >= total_dur_s:
231
+ break
232
+ seg_start += step_s
233
+
234
+ # ------------------------------------------------------------------ #
235
+ # Run inference for every segment #
236
+ # ------------------------------------------------------------------ #
237
+ wavs = []
238
+ for seg_start_s, seg_end_s in segments:
239
+ print(f"Inferring segment {seg_start_s:.2f}s – {seg_end_s:.2f}s ...")
240
+ wav = infer_segment(
241
+ model, vae, vocoder,
242
+ cavp_feats, onset_feats,
243
+ seg_start_s, seg_end_s,
244
+ sr, fps, truncate_frame, truncate_onset, model_dur,
245
+ latents_scale, device, weight_dtype,
246
+ cfg_scale, num_steps, mode,
247
+ euler_sampler, euler_maruyama_sampler,
248
  )
249
+ wavs.append(wav)
250
+
251
+ # ------------------------------------------------------------------ #
252
+ # Stitch with crossfades #
253
+ # Single segment: no crossfade needed #
254
+ # ------------------------------------------------------------------ #
255
+ if len(wavs) == 1:
256
+ final_wav = wavs[0]
257
+ else:
258
+ final_wav = wavs[0]
259
+ for next_wav in wavs[1:]:
260
+ final_wav = crossfade_join(final_wav, next_wav, crossfade_s, sr)
261
+
262
+ # Clip to exact video duration
263
+ target_samples = int(round(total_dur_s * sr))
264
+ final_wav = final_wav[:target_samples]
 
 
 
 
 
 
 
 
 
 
 
265
 
266
  audio_path = os.path.join(tmp_dir, "output.wav")
267
+ sf.write(audio_path, final_wav, sr)
268
 
269
+ # Mux original silent video (full length) with generated audio
 
270
  output_video = os.path.join(tmp_dir, "output.mp4")
271
+ input_v = ffmpeg.input(silent_video)
 
 
 
 
 
 
 
 
272
  input_a = ffmpeg.input(audio_path)
273
  (
274
  ffmpeg
275
+ .output(input_v, input_a, output_video,
276
+ vcodec="libx264", acodec="aac", strict="experimental")
277
  .run(overwrite_output=True, quiet=True)
278
  )
279