BoxOfColors committed on
Commit
72afd74
·
1 Parent(s): 95c5c55

Refactor: consolidate segmentation and crossfade into shared helpers

Browse files

- Add _build_segments(total, window_s, crossfade_s) as universal segmentation
helper used by all three models (TARO via thin wrapper, MMAudio and
HunyuanFoley directly with their respective window constants)
- Add _cf_join_stereo(a, b, cf_s, db, sr) as shared equal-power crossfade for
stereo (C, T) arrays; MMAudio and HunyuanFoley both call this instead of
duplicating the same inline closure
- Remove duplicate _mma_build_segments and inline _cf_join closures from
generate_mmaudio and generate_hunyuan
- _taro_build_segments now delegates to _build_segments; TARO keeps its own
mono _crossfade_join/_stitch_wavs since it outputs 1D not (C,T)

Files changed (1) hide show
  1. app.py +46 -58
app.py CHANGED
@@ -105,6 +105,43 @@ def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
105
  ).run(overwrite_output=True, quiet=True)
106
 
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # ================================================================== #
109
  # TARO #
110
  # ================================================================== #
@@ -128,19 +165,8 @@ _TARO_INFERENCE_CACHE: dict = {}
128
 
129
 
130
  def _taro_build_segments(total_dur_s: float, crossfade_s: float) -> list:
131
- """Sliding-window segmentation for videos longer than one TARO window."""
132
- if total_dur_s <= TARO_MODEL_DUR:
133
- return [(0.0, total_dur_s)]
134
- step_s = TARO_MODEL_DUR - crossfade_s
135
- segments, seg_start = [], 0.0
136
- while True:
137
- if seg_start + TARO_MODEL_DUR >= total_dur_s:
138
- seg_start = max(0.0, total_dur_s - TARO_MODEL_DUR)
139
- segments.append((seg_start, total_dur_s))
140
- break
141
- segments.append((seg_start, seg_start + TARO_MODEL_DUR))
142
- seg_start += step_s
143
- return segments
144
 
145
 
146
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
@@ -411,25 +437,11 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
411
  outputs = []
412
 
413
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
414
- # with a 1 s crossfade overlap and stitch the results into a full-length track.
415
  total_dur_s = get_video_duration(video_file)
416
  MMA_CF_S = float(crossfade_s)
417
  MMA_CF_DB = float(crossfade_db)
418
-
419
- def _mma_build_segments(total_s, cf_s):
420
- if total_s <= MMAUDIO_WINDOW:
421
- return [(0.0, total_s)]
422
- step_s = MMAUDIO_WINDOW - cf_s
423
- segs, t = [], 0.0
424
- while True:
425
- if t + MMAUDIO_WINDOW >= total_s:
426
- segs.append((max(0.0, total_s - MMAUDIO_WINDOW), total_s))
427
- break
428
- segs.append((t, t + MMAUDIO_WINDOW))
429
- t += step_s
430
- return segs
431
-
432
- segments = _mma_build_segments(total_dur_s, MMA_CF_S)
433
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
434
 
435
  sr = seq_cfg.sampling_rate # 44100
@@ -480,22 +492,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
480
  wav = wav[:, :seg_samples]
481
  seg_audios.append(wav)
482
 
483
- # Crossfade-stitch all segments (equal-power fade)
484
- def _cf_join(a, b, cf_s):
485
- cf = int(round(cf_s * sr))
486
- cf = min(cf, a.shape[1], b.shape[1])
487
- if cf <= 0:
488
- return np.concatenate([a, b], axis=1)
489
- gain = 10 ** (MMA_CF_DB / 20.0)
490
- t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
491
- fade_out = np.cos(t * np.pi / 2)
492
- fade_in = np.sin(t * np.pi / 2)
493
- overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
494
- return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
495
-
496
  full_wav = seg_audios[0]
497
  for nw in seg_audios[1:]:
498
- full_wav = _cf_join(full_wav, nw, MMA_CF_S)
499
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
500
 
501
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
@@ -574,7 +574,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
574
  total_dur_s = get_video_duration(video_file)
575
  CF_S = float(crossfade_s)
576
  CF_DB = float(crossfade_db)
577
- segments = _taro_build_segments(total_dur_s, CF_S) # reuse TARO helper
578
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
579
 
580
  # Pre-encode text features once (same for every segment)
@@ -628,22 +628,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
628
  wav = wav[:, :seg_samples]
629
  seg_wavs.append(wav)
630
 
631
- # Stitch segments with equal-power crossfade (operates on (channels, samples) arrays)
632
- def _cf_join_stereo(a, b, cf_s, db):
633
- cf = int(round(cf_s * sr))
634
- cf = min(cf, a.shape[1], b.shape[1])
635
- if cf <= 0:
636
- return np.concatenate([a, b], axis=1)
637
- gain = 10 ** (db / 20.0)
638
- t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
639
- fade_out = np.cos(t * np.pi / 2)
640
- fade_in = np.sin(t * np.pi / 2)
641
- overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
642
- return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
643
-
644
  full_wav = seg_wavs[0]
645
  for nw in seg_wavs[1:]:
646
- full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB)
647
  # Trim to exact video duration
648
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
649
 
 
105
  ).run(overwrite_output=True, quiet=True)
106
 
107
 
108
+ # ------------------------------------------------------------------ #
109
+ # Shared sliding-window segmentation and crossfade helpers #
110
+ # Used by all three models (TARO, MMAudio, HunyuanFoley). #
111
+ # ------------------------------------------------------------------ #
112
+
113
+ def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) -> list:
114
+ """Return list of (start, end) pairs covering *total_dur_s* with a sliding
115
+ window of *window_s* and *crossfade_s* overlap between consecutive segments."""
116
+ if total_dur_s <= window_s:
117
+ return [(0.0, total_dur_s)]
118
+ step_s = window_s - crossfade_s
119
+ segments, seg_start = [], 0.0
120
+ while True:
121
+ if seg_start + window_s >= total_dur_s:
122
+ seg_start = max(0.0, total_dur_s - window_s)
123
+ segments.append((seg_start, total_dur_s))
124
+ break
125
+ segments.append((seg_start, seg_start + window_s))
126
+ seg_start += step_s
127
+ return segments
128
+
129
+
130
+ def _cf_join_stereo(a: np.ndarray, b: np.ndarray,
131
+ crossfade_s: float, db_boost: float, sr: int) -> np.ndarray:
132
+ """Equal-power crossfade join for stereo (C, T) numpy arrays."""
133
+ cf = int(round(crossfade_s * sr))
134
+ cf = min(cf, a.shape[1], b.shape[1])
135
+ if cf <= 0:
136
+ return np.concatenate([a, b], axis=1)
137
+ gain = 10 ** (db_boost / 20.0)
138
+ t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
139
+ fade_out = np.cos(t * np.pi / 2) # 1 → 0
140
+ fade_in = np.sin(t * np.pi / 2) # 0 → 1
141
+ overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
142
+ return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
143
+
144
+
145
  # ================================================================== #
146
  # TARO #
147
  # ================================================================== #
 
165
 
166
 
167
def _taro_build_segments(total_dur_s: float, crossfade_s: float) -> list:
    """TARO-specific thin wrapper: segment *total_dur_s* via the shared
    helper, using the model's fixed window length (``TARO_MODEL_DUR``)."""
    return _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
 
 
 
 
 
 
 
 
 
 
 
170
 
171
 
172
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
 
437
  outputs = []
438
 
439
  # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
440
+ # with a crossfade overlap and stitch the results into a full-length track.
441
  total_dur_s = get_video_duration(video_file)
442
  MMA_CF_S = float(crossfade_s)
443
  MMA_CF_DB = float(crossfade_db)
444
+ segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, MMA_CF_S)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
446
 
447
  sr = seq_cfg.sampling_rate # 44100
 
492
  wav = wav[:, :seg_samples]
493
  seg_audios.append(wav)
494
 
495
+ # Crossfade-stitch all segments using shared equal-power helper
 
 
 
 
 
 
 
 
 
 
 
 
496
  full_wav = seg_audios[0]
497
  for nw in seg_audios[1:]:
498
+ full_wav = _cf_join_stereo(full_wav, nw, MMA_CF_S, MMA_CF_DB, sr)
499
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
500
 
501
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
 
574
  total_dur_s = get_video_duration(video_file)
575
  CF_S = float(crossfade_s)
576
  CF_DB = float(crossfade_db)
577
+ segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, CF_S)
578
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
579
 
580
  # Pre-encode text features once (same for every segment)
 
628
  wav = wav[:, :seg_samples]
629
  seg_wavs.append(wav)
630
 
631
+ # Crossfade-stitch all segments using shared equal-power helper
 
 
 
 
 
 
 
 
 
 
 
 
632
  full_wav = seg_wavs[0]
633
  for nw in seg_wavs[1:]:
634
+ full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB, sr)
635
  # Trim to exact video duration
636
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
637