BoxOfColors Claude Sonnet 4.6 committed on
Commit
b49a86d
·
1 Parent(s): 2e7f9a4

Fix xregen duration truncation: use full segment span, not model window

Browse files

_xregen_clip_window was centering target_window_s on the segment midpoint,
returning clip_dur=target_window_s even when the original segment was longer.
_build_segments then produced only 1 sub-seg, generating e.g. 8s of MMAudio
audio for a 15s segment — causing _stitch_wavs to truncate the total output.

Fix: _xregen_clip_window now always returns the full original segment span
(seg_start..seg_end clamped to video bounds). _build_segments then splits
it into as many model-sized sub-windows as needed, exactly like initial
generation — so the stitched regen wav always covers the full segment.

Also reverts the incorrect silence-padding workaround from the prior commit.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +15 -42
app.py CHANGED
@@ -1744,28 +1744,18 @@ def _resolve_silent_video(meta: dict) -> str:
1744
  def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
1745
  """Compute the video clip window for a cross-model regen.
1746
 
1747
- Centers *target_window_s* on the original segment's midpoint, clamped to
1748
- [0, total_dur_s]. Returns (clip_start, clip_end, clip_dur).
1749
-
1750
- If the video is shorter than *target_window_s*, the full video is used
1751
- (suboptimal but never breaks). If the segment span exceeds
1752
- *target_window_s*, the caller should run _build_segments on the span and
1753
- generate multiple sub-segments — but the clip window is still returned as
1754
- the full segment span so the caller can decide.
1755
  """
1756
  total_dur_s = float(meta["total_dur_s"])
1757
  seg_start, seg_end = meta["segments"][seg_idx]
1758
- seg_mid = (seg_start + seg_end) / 2.0
1759
- half_win = target_window_s / 2.0
1760
-
1761
- clip_start = max(0.0, seg_mid - half_win)
1762
- clip_end = min(total_dur_s, seg_mid + half_win)
1763
- # If clamped at one end, extend the other to preserve full window if possible
1764
- if clip_start == 0.0:
1765
- clip_end = min(total_dur_s, target_window_s)
1766
- elif clip_end == total_dur_s:
1767
- clip_start = max(0.0, total_dur_s - target_window_s)
1768
- clip_dur = clip_end - clip_start
1769
  return clip_start, clip_end, clip_dur
1770
 
1771
 
@@ -1785,35 +1775,18 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
1785
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1786
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1787
 
1788
- # Align new_wav so sample index 0 corresponds to seg_start in video time,
1789
- # and the wav is long enough to cover the full original segment window.
1790
- #
1791
- # _stitch_wavs trims each wav relative to its seg_start, expecting the wav
1792
- # to cover the full segment window (seg_end - seg_start). xregen models
1793
- # may generate a shorter clip (e.g. MMAudio 8 s on a 15 s segment), which
1794
- # causes _stitch_wavs to trim short and produce truncated output.
1795
- #
1796
- # Steps:
1797
- # 1. Prepend silence if the clip started after seg_start.
1798
- # 2. Append silence if the wav is still shorter than the full segment window.
1799
  if clip_start_s is not None:
1800
- seg_start, seg_end = meta["segments"][seg_idx]
1801
- full_seg_samples = int(round((seg_end - seg_start) * slot_sr))
1802
-
1803
- # Step 1: prepend silence to align to seg_start
1804
- offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
1805
  if offset_s < 0:
1806
  pad_samples = int(round(abs(offset_s) * slot_sr))
1807
  silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
1808
  new_wav = np.concatenate([silence, new_wav], axis=1)
1809
 
1810
- # Step 2: append silence to fill the full segment window
1811
- current_samples = new_wav.shape[1]
1812
- if current_samples < full_seg_samples:
1813
- tail = np.zeros((new_wav.shape[0], full_seg_samples - current_samples),
1814
- dtype=new_wav.dtype)
1815
- new_wav = np.concatenate([new_wav, tail], axis=1)
1816
-
1817
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1818
  new_wav, seg_idx, meta, slot_id
1819
  )
 
1744
  def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
1745
  """Compute the video clip window for a cross-model regen.
1746
 
1747
+ Always returns the full original segment span (clamped to [0, total_dur_s]).
1748
+ The caller passes clip_dur to _build_segments, which splits it into
1749
+ model-sized sub-windows exactly as initial generation does — ensuring the
1750
+ regen wav covers the full segment duration regardless of model window size.
1751
+
1752
+ *target_window_s* is unused but kept for call-site compatibility.
 
 
1753
  """
1754
  total_dur_s = float(meta["total_dur_s"])
1755
  seg_start, seg_end = meta["segments"][seg_idx]
1756
+ clip_start = max(0.0, seg_start)
1757
+ clip_end = min(total_dur_s, seg_end)
1758
+ clip_dur = clip_end - clip_start
 
 
 
 
 
 
 
 
1759
  return clip_start, clip_end, clip_dur
1760
 
1761
 
 
1775
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1776
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1777
 
1778
+ # Align new_wav so sample index 0 corresponds to seg_start in video time.
1779
+ # _stitch_wavs trims using seg_start as the time origin, so if the clip
1780
+ # started AFTER seg_start (clip_start_s > seg_start), we prepend silence
1781
+ # equal to (clip_start_s - seg_start) to shift the audio back to seg_start.
 
 
 
 
 
 
 
1782
  if clip_start_s is not None:
1783
+ seg_start = meta["segments"][seg_idx][0]
1784
+ offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
 
 
 
1785
  if offset_s < 0:
1786
  pad_samples = int(round(abs(offset_s) * slot_sr))
1787
  silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
1788
  new_wav = np.concatenate([silence, new_wav], axis=1)
1789
 
 
 
 
 
 
 
 
1790
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1791
  new_wav, seg_idx, meta, slot_id
1792
  )