BoxOfColors committed on
Commit
9f07d3f
·
1 Parent(s): d5399ac

Fix segment bleed: equal-spaced windows + contact-edge trimming

Browse files

_build_segments now places n equally-spaced full-window segments so every
overlap is identical and >= crossfade_s, eliminating the unequal last-
boundary overlap that caused raw bleed.

_stitch_wavs now accepts segments list and trims each generated wav to its
contact-edge window (midpoint of overlap +/- half_cf) before crossfade-join,
so the crossfade zone is always exactly crossfade_s wide at every boundary.

Files changed (1) hide show
  1. app.py +77 -19
app.py CHANGED
@@ -411,22 +411,34 @@ def mux_video_audio(silent_video: str, audio_path: str, output_path: str,
411
  # ------------------------------------------------------------------ #
412
 
413
  def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) -> list[tuple[float, float]]:
414
- """Return list of (start, end) pairs covering *total_dur_s* with a sliding
415
- window of *window_s* and *crossfade_s* overlap between consecutive segments."""
416
- # Safety: clamp crossfade to < half the window so step_s stays positive
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  crossfade_s = min(crossfade_s, window_s * 0.5)
418
  if total_dur_s <= window_s:
419
  return [(0.0, total_dur_s)]
420
- step_s = window_s - crossfade_s
421
- segments, seg_start = [], 0.0
422
- while True:
423
- if seg_start + window_s >= total_dur_s:
424
- seg_start = max(0.0, total_dur_s - window_s)
425
- segments.append((seg_start, total_dur_s))
426
- break
427
- segments.append((seg_start, seg_start + window_s))
428
- seg_start += step_s
429
- return segments
430
 
431
 
432
  def _cf_join(a: np.ndarray, b: np.ndarray,
@@ -687,12 +699,58 @@ def _upsample_taro(wav_16k: np.ndarray) -> np.ndarray:
687
 
688
 
689
  def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
690
- total_dur_s: float, sr: int) -> np.ndarray:
 
691
  """Crossfade-join a list of wav arrays and trim to *total_dur_s*.
692
- Works for both mono (T,) and stereo (C, T) arrays."""
693
- out = wavs[0]
694
- for nw in wavs[1:]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  out = _cf_join(out, nw, crossfade_s, db_boost, sr)
 
696
  n = int(round(total_dur_s * sr))
697
  return out[:, :n] if out.ndim == 2 else out[:n]
698
 
@@ -757,7 +815,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
757
  for sample_idx, result in enumerate(results):
758
  seg_wavs = result[0]
759
 
760
- full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr)
761
  audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
762
  _save_wav(audio_path, full_wav, sr)
763
  video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
@@ -1242,7 +1300,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1242
  segments = meta["segments"]
1243
  model = meta["model"]
1244
 
1245
- full_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, sr)
1246
 
1247
  # Save new audio — use a new timestamped filename so Gradio / the browser
1248
  # treats it as a genuinely different file and reloads the video player.
 
411
  # ------------------------------------------------------------------ #
412
 
413
  def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) -> list[tuple[float, float]]:
414
+ """Return list of (start, end) pairs covering *total_dur_s*.
415
+
416
+ Every segment uses the full *window_s* inference window. Segments are
417
+ equally spaced so every overlap is identical, guaranteeing the crossfade
418
+ setting is honoured at every boundary with no raw bleed.
419
+
420
+ Algorithm
421
+ ---------
422
+ 1. Clamp crossfade_s so the step stays positive.
423
+ 2. Find the minimum n such that n segments of *window_s* cover
424
+ *total_dur_s* with overlap ≥ crossfade_s at every boundary:
425
+ n = ceil((total_dur_s - crossfade_s) / (window_s - crossfade_s))
426
+ 3. Compute equal spacing: step = (total_dur_s - window_s) / (n - 1)
427
+ so that every gap is identical and the last segment ends exactly at
428
+ total_dur_s.
429
+ 4. Every segment is exactly *window_s* wide. The trailing audio of each
430
+ segment beyond its contact edge is discarded in _stitch_wavs.
431
+ """
432
  crossfade_s = min(crossfade_s, window_s * 0.5)
433
  if total_dur_s <= window_s:
434
  return [(0.0, total_dur_s)]
435
+ import math
436
+ step_min = window_s - crossfade_s # minimum step to honour crossfade
437
+ n = math.ceil((total_dur_s - crossfade_s) / step_min)
438
+ n = max(n, 2)
439
+ # Equal step so first seg starts at 0 and last seg ends at total_dur_s
440
+ step_s = (total_dur_s - window_s) / (n - 1)
441
+ return [(i * step_s, i * step_s + window_s) for i in range(n)]
 
 
 
442
 
443
 
444
  def _cf_join(a: np.ndarray, b: np.ndarray,
 
699
 
700
 
701
def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
                 total_dur_s: float, sr: int,
                 segments: list[tuple[float, float]] | None = None) -> np.ndarray:
    """Crossfade-join a list of wav arrays and trim to *total_dur_s*.
    Works for both mono (T,) and stereo (C, T) arrays.

    When *segments* is provided (list of (start, end) video-time pairs),
    each wav is trimmed to its contact-edge window before joining:

        contact_edge[i->i+1] = midpoint of overlap = (seg[i].end + seg[i+1].start) / 2
        half_cf = crossfade_s / 2

        seg i keep: [contact_edge[i-1->i] - half_cf, contact_edge[i->i+1] + half_cf]
        expressed as sample offsets into the generated audio for that segment.

    This guarantees every crossfade zone is exactly crossfade_s wide with no
    raw bleed regardless of how much the inference windows overlap.
    """
    def _trim(wav, start_s, end_s, seg_start_s):
        """Trim wav to [start_s, end_s] expressed in absolute video time,
        where the wav starts at seg_start_s in video time."""
        n_samples = wav.shape[1] if wav.ndim == 2 else len(wav)
        s = max(0, int(round((start_s - seg_start_s) * sr)))
        e = min(int(round((end_s - seg_start_s) * sr)), n_samples)
        e = max(e, s)  # guard: never request a negative-length slice
        return wav[:, s:e] if wav.ndim == 2 else wav[s:e]

    if segments is None or len(segments) == 1:
        # Legacy path: chain plain crossfade joins with no pre-trimming.
        out = wavs[0]
        for nw in wavs[1:]:
            out = _cf_join(out, nw, crossfade_s, db_boost, sr)
        n = int(round(total_dur_s * sr))
        return out[:, :n] if out.ndim == 2 else out[:n]

    half_cf = crossfade_s / 2.0

    # Contact edge between consecutive segments = midpoint of their overlap.
    contact_edges = [
        (segments[i][1] + segments[i + 1][0]) / 2.0
        for i in range(len(segments) - 1)
    ]

    # Trim each segment's audio to its keep window (edge +/- half crossfade).
    trimmed = []
    for i, (wav, (seg_start, seg_end)) in enumerate(zip(wavs, segments)):
        keep_start = (contact_edges[i - 1] - half_cf) if i > 0 else seg_start
        keep_end = (contact_edges[i] + half_cf) if i < len(segments) - 1 else total_dur_s
        trimmed.append(_trim(wav, keep_start, keep_end, seg_start))

    # Crossfade-join the trimmed segments; each junction is exactly
    # crossfade_s wide by construction.
    out = trimmed[0]
    for nw in trimmed[1:]:
        out = _cf_join(out, nw, crossfade_s, db_boost, sr)

    n = int(round(total_dur_s * sr))
    return out[:, :n] if out.ndim == 2 else out[:n]
756
 
 
815
  for sample_idx, result in enumerate(results):
816
  seg_wavs = result[0]
817
 
818
+ full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
819
  audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
820
  _save_wav(audio_path, full_wav, sr)
821
  video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
 
1300
  segments = meta["segments"]
1301
  model = meta["model"]
1302
 
1303
+ full_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
1304
 
1305
  # Save new audio — use a new timestamped filename so Gradio / the browser
1306
  # treats it as a genuinely different file and reloads the video player.