BoxOfColors committed on
Commit
2b2a599
·
1 Parent(s): f8b59b5

HunyuanFoley: sliding-window segmentation for videos longer than 15 s

Browse files

The model is hard-limited to 15 s per pass (MAX_VIDEO_DURATION_SECONDS=15
in constants.py, enforced in get_frames_av). For longer videos, slice the
input with ffmpeg into overlapping ≤15 s segments, run feature_process +
denoise_process on each, then crossfade-stitch all segment WAV files into a
single full-length audio track — the same strategy used for TARO. Text
features are encoded once from the first segment and reused for all segments.

Files changed (1) hide show
  1. app.py +71 -20
app.py CHANGED
@@ -507,33 +507,84 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
507
  tmp_dir = tempfile.mkdtemp()
508
  outputs = []
509
 
510
- # feature_process() extracts SigLIP2 visual features + Synchformer sync features
511
- # + CLAP text embeddings exactly as in HunyuanVideo-Foley/gradio_app.py
512
- visual_feats, text_feats, audio_len_in_s = feature_process(
513
- video_file,
 
 
 
 
 
 
 
 
 
 
 
 
514
  prompt if prompt else "",
515
  model_dict,
516
  cfg,
517
  neg_prompt=negative_prompt if negative_prompt else None,
518
  )
519
- print(f"[HunyuanFoley] Audio length: {audio_len_in_s:.2f}s | generating {num_samples} sample(s)")
520
-
521
- # denoise_process() runs the flow-matching diffusion loop and decodes with DAC-VAE
522
- # batch_size=num_samples generates all samples in one pass
523
- audio, sample_rate = denoise_process(
524
- visual_feats,
525
- text_feats,
526
- audio_len_in_s,
527
- model_dict,
528
- cfg,
529
- guidance_scale=float(guidance_scale),
530
- num_inference_steps=int(num_steps),
531
- batch_size=num_samples,
532
- )
533
- # audio shape: (batch, channels, samples)
534
  for sample_idx in range(num_samples):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
536
- torchaudio.save(audio_path, audio[sample_idx], sample_rate)
537
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
538
  merge_audio_video(audio_path, video_file, video_path)
539
  outputs.append((video_path, audio_path))
 
507
  tmp_dir = tempfile.mkdtemp()
508
  outputs = []
509
 
510
+ # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
511
+ # input into overlapping segments, generate audio for each, then crossfade-
512
+ # stitch the results into a single full-length audio track.
513
+ total_dur_s = get_video_duration(video_file)
514
+ CF_S = 2.0 # crossfade seconds between segments
515
+ CF_DB = 3.0 # crossfade boost in dB
516
+ segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
517
+ print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
518
+
519
+ # Pre-encode text features once (same for every segment)
520
+ _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
521
+ ffmpeg.input(video_file, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
522
+ _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
523
+ ).run(overwrite_output=True, quiet=True)
524
+ _, text_feats, _ = feature_process(
525
+ _dummy_seg_path,
526
  prompt if prompt else "",
527
  model_dict,
528
  cfg,
529
  neg_prompt=negative_prompt if negative_prompt else None,
530
  )
531
+
532
+ # Generate audio per segment, then stitch
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  for sample_idx in range(num_samples):
534
+ seg_wavs = []
535
+ sr = 48000 # HunyuanFoley always outputs 48 kHz
536
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
537
+ seg_dur = seg_end - seg_start
538
+ seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
539
+ ffmpeg.input(video_file, ss=seg_start, t=seg_dur).output(
540
+ seg_path, vcodec="libx264", acodec="aac", strict="experimental"
541
+ ).run(overwrite_output=True, quiet=True)
542
+
543
+ visual_feats, _, seg_audio_len = feature_process(
544
+ seg_path,
545
+ prompt if prompt else "",
546
+ model_dict,
547
+ cfg,
548
+ neg_prompt=negative_prompt if negative_prompt else None,
549
+ )
550
+ print(f"[HunyuanFoley] Sample {sample_idx+1} | seg {seg_i+1}/{len(segments)} "
551
+ f"{seg_start:.1f}–{seg_end:.1f}s → {seg_audio_len:.2f}s audio")
552
+
553
+ audio_batch, sr = denoise_process(
554
+ visual_feats,
555
+ text_feats,
556
+ seg_audio_len,
557
+ model_dict,
558
+ cfg,
559
+ guidance_scale=float(guidance_scale),
560
+ num_inference_steps=int(num_steps),
561
+ batch_size=1,
562
+ )
563
+ # audio_batch shape: (1, channels, samples) — take first (and only) sample
564
+ wav = audio_batch[0].float().cpu().numpy() # (channels, samples)
565
+ # Trim to exact segment length in samples
566
+ seg_samples = int(round(seg_dur * sr))
567
+ wav = wav[:, :seg_samples]
568
+ seg_wavs.append(wav)
569
+
570
+ # Stitch segments with crossfade (operates on (channels, samples) arrays)
571
+ def _cf_join_stereo(a, b, cf_s, db):
572
+ cf = int(round(cf_s * sr))
573
+ cf = min(cf, a.shape[1], b.shape[1])
574
+ if cf <= 0:
575
+ return np.concatenate([a, b], axis=1)
576
+ gain = 10 ** (db / 20.0)
577
+ overlap = a[:, -cf:] * gain + b[:, :cf] * gain
578
+ return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
579
+
580
+ full_wav = seg_wavs[0]
581
+ for nw in seg_wavs[1:]:
582
+ full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB)
583
+ # Trim to exact video duration
584
+ full_wav = full_wav[:, : int(round(total_dur_s * sr))]
585
+
586
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
587
+ torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
588
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
589
  merge_audio_video(audio_path, video_file, video_path)
590
  outputs.append((video_path, audio_path))