Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running

BoxOfColors Claude Sonnet 4.6 committed on Mar 31

Commit

b49a86d

1 Parent(s): 2e7f9a4

Fix xregen duration truncation: use full segment span, not model window

_xregen_clip_window was centering target_window_s on the segment midpoint,
returning clip_dur=target_window_s even when the original segment was longer.
_build_segments then produced only 1 sub-seg, generating e.g. 8s of MMAudio
audio for a 15s segment — causing _stitch_wavs to truncate the total output.

Fix: _xregen_clip_window now always returns the full original segment span
(seg_start..seg_end clamped to video bounds). _build_segments then splits
it into as many model-sized sub-windows as needed, exactly like initial
generation — so the stitched regen wav always covers the full segment.

Also reverts the incorrect silence-padding workaround from the prior commit.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +15 -42

app.py CHANGED Viewed

@@ -1744,28 +1744,18 @@ def _resolve_silent_video(meta: dict) -> str:
 def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
     """Compute the video clip window for a cross-model regen.
-    Centers *target_window_s* on the original segment's midpoint, clamped to
-    [0, total_dur_s].  Returns (clip_start, clip_end, clip_dur).
-    If the video is shorter than *target_window_s*, the full video is used
-    (suboptimal but never breaks).  If the segment span exceeds
-    *target_window_s*, the caller should run _build_segments on the span and
-    generate multiple sub-segments — but the clip window is still returned as
-    the full segment span so the caller can decide.
     """
     total_dur_s = float(meta["total_dur_s"])
     seg_start, seg_end = meta["segments"][seg_idx]
-    seg_mid   = (seg_start + seg_end) / 2.0
-    half_win  = target_window_s / 2.0
-    clip_start = max(0.0, seg_mid - half_win)
-    clip_end   = min(total_dur_s, seg_mid + half_win)
-    # If clamped at one end, extend the other to preserve full window if possible
-    if clip_start == 0.0:
-        clip_end = min(total_dur_s, target_window_s)
-    elif clip_end == total_dur_s:
-        clip_start = max(0.0, total_dur_s - target_window_s)
-    clip_dur = clip_end - clip_start
     return clip_start, clip_end, clip_dur
@@ -1785,35 +1775,18 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
     slot_wavs = _load_seg_wavs(meta["wav_paths"])
     new_wav   = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
-    # Align new_wav so sample index 0 corresponds to seg_start in video time,
-    # and the wav is long enough to cover the full original segment window.
-    #
-    # _stitch_wavs trims each wav relative to its seg_start, expecting the wav
-    # to cover the full segment window (seg_end - seg_start).  xregen models
-    # may generate a shorter clip (e.g. MMAudio 8 s on a 15 s segment), which
-    # causes _stitch_wavs to trim short and produce truncated output.
-    #
-    # Steps:
-    #   1. Prepend silence if the clip started after seg_start.
-    #   2. Append silence if the wav is still shorter than the full segment window.
     if clip_start_s is not None:
-        seg_start, seg_end = meta["segments"][seg_idx]
-        full_seg_samples = int(round((seg_end - seg_start) * slot_sr))
-        # Step 1: prepend silence to align to seg_start
-        offset_s = seg_start - clip_start_s   # negative when clip starts after seg_start
         if offset_s < 0:
             pad_samples = int(round(abs(offset_s) * slot_sr))
             silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
             new_wav = np.concatenate([silence, new_wav], axis=1)
-        # Step 2: append silence to fill the full segment window
-        current_samples = new_wav.shape[1]
-        if current_samples < full_seg_samples:
-            tail = np.zeros((new_wav.shape[0], full_seg_samples - current_samples),
-                            dtype=new_wav.dtype)
-            new_wav = np.concatenate([new_wav, tail], axis=1)
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
     )

 def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
     """Compute the video clip window for a cross-model regen.

     Always returns the full original segment span, clamped to
     [0, total_dur_s], rather than a single model-sized window.  The caller
     passes clip_dur to _build_segments, which splits it into model-sized
     sub-windows exactly as initial generation does, so the regen wav covers
     the full segment duration regardless of the model's window size.

     Args:
         meta: Slot metadata.  Reads meta["total_dur_s"] (video length in
             seconds) and meta["segments"] (indexable collection of
             (seg_start, seg_end) pairs in seconds).
         seg_idx: Index of the segment in meta["segments"] to regenerate.
         target_window_s: Unused; kept for call-site compatibility.

     Returns:
         (clip_start, clip_end, clip_dur) in seconds, satisfying
         0 <= clip_start <= clip_end <= total_dur_s and clip_dur >= 0.
     """
     total_dur_s = float(meta["total_dur_s"])
     seg_start, seg_end = meta["segments"][seg_idx]
     # Clamp BOTH ends into the video bounds.  Clamping the start only from
     # below and the end only from above lets a segment that lies (partly)
     # outside the video produce clip_start > clip_end, i.e. a negative
     # clip_dur.
     clip_start = min(max(0.0, seg_start), total_dur_s)
     # Never let the end fall before the start: a degenerate or out-of-range
     # segment collapses to a zero-length window instead of a negative one.
     clip_end = max(clip_start, min(total_dur_s, seg_end))
     clip_dur = clip_end - clip_start
     return clip_start, clip_end, clip_dur
     slot_wavs = _load_seg_wavs(meta["wav_paths"])
     new_wav   = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
+    # Align new_wav so sample index 0 corresponds to seg_start in video time.
+    # _stitch_wavs trims using seg_start as the time origin, so if the clip
+    # started AFTER seg_start (clip_start_s > seg_start), we prepend silence
+    # equal to (clip_start_s - seg_start) to shift the audio back to seg_start.
     if clip_start_s is not None:
+        seg_start = meta["segments"][seg_idx][0]
+        offset_s  = seg_start - clip_start_s   # negative when clip starts after seg_start
         if offset_s < 0:
             pad_samples = int(round(abs(offset_s) * slot_sr))
             silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
             new_wav = np.concatenate([silence, new_wav], axis=1)
     video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
         new_wav, seg_idx, meta, slot_id
     )