Commit ·
b49a86d
1
Parent(s): 2e7f9a4
Fix xregen duration truncation: use full segment span, not model window
Browse files
_xregen_clip_window was centering target_window_s on the segment midpoint,
returning clip_dur=target_window_s even when the original segment was longer.
_build_segments then produced only 1 sub-seg, generating e.g. 8s of MMAudio
audio for a 15s segment — causing _stitch_wavs to truncate the total output.
Fix: _xregen_clip_window now always returns the full original segment span
(seg_start..seg_end clamped to video bounds). _build_segments then splits
it into as many model-sized sub-windows as needed, exactly like initial
generation — so the stitched regen wav always covers the full segment.
Also reverts the incorrect silence-padding workaround from the prior commit.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -1744,28 +1744,18 @@ def _resolve_silent_video(meta: dict) -> str:
|
|
| 1744 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1745 |
"""Compute the video clip window for a cross-model regen.
|
| 1746 |
|
| 1747 |
-
|
| 1748 |
-
|
| 1749 |
-
|
| 1750 |
-
|
| 1751 |
-
|
| 1752 |
-
*target_window_s*
|
| 1753 |
-
generate multiple sub-segments — but the clip window is still returned as
|
| 1754 |
-
the full segment span so the caller can decide.
|
| 1755 |
"""
|
| 1756 |
total_dur_s = float(meta["total_dur_s"])
|
| 1757 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
clip_start = max(0.0, seg_mid - half_win)
|
| 1762 |
-
clip_end = min(total_dur_s, seg_mid + half_win)
|
| 1763 |
-
# If clamped at one end, extend the other to preserve full window if possible
|
| 1764 |
-
if clip_start == 0.0:
|
| 1765 |
-
clip_end = min(total_dur_s, target_window_s)
|
| 1766 |
-
elif clip_end == total_dur_s:
|
| 1767 |
-
clip_start = max(0.0, total_dur_s - target_window_s)
|
| 1768 |
-
clip_dur = clip_end - clip_start
|
| 1769 |
return clip_start, clip_end, clip_dur
|
| 1770 |
|
| 1771 |
|
|
@@ -1785,35 +1775,18 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
|
|
| 1785 |
slot_wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1786 |
new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
|
| 1787 |
|
| 1788 |
-
# Align new_wav so sample index 0 corresponds to seg_start in video time
|
| 1789 |
-
#
|
| 1790 |
-
#
|
| 1791 |
-
#
|
| 1792 |
-
# to cover the full segment window (seg_end - seg_start). xregen models
|
| 1793 |
-
# may generate a shorter clip (e.g. MMAudio 8 s on a 15 s segment), which
|
| 1794 |
-
# causes _stitch_wavs to trim short and produce truncated output.
|
| 1795 |
-
#
|
| 1796 |
-
# Steps:
|
| 1797 |
-
# 1. Prepend silence if the clip started after seg_start.
|
| 1798 |
-
# 2. Append silence if the wav is still shorter than the full segment window.
|
| 1799 |
if clip_start_s is not None:
|
| 1800 |
-
seg_start
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
# Step 1: prepend silence to align to seg_start
|
| 1804 |
-
offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
|
| 1805 |
if offset_s < 0:
|
| 1806 |
pad_samples = int(round(abs(offset_s) * slot_sr))
|
| 1807 |
silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
|
| 1808 |
new_wav = np.concatenate([silence, new_wav], axis=1)
|
| 1809 |
|
| 1810 |
-
# Step 2: append silence to fill the full segment window
|
| 1811 |
-
current_samples = new_wav.shape[1]
|
| 1812 |
-
if current_samples < full_seg_samples:
|
| 1813 |
-
tail = np.zeros((new_wav.shape[0], full_seg_samples - current_samples),
|
| 1814 |
-
dtype=new_wav.dtype)
|
| 1815 |
-
new_wav = np.concatenate([new_wav, tail], axis=1)
|
| 1816 |
-
|
| 1817 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1818 |
new_wav, seg_idx, meta, slot_id
|
| 1819 |
)
|
|
|
|
| 1744 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1745 |
"""Compute the video clip window for a cross-model regen.
|
| 1746 |
|
| 1747 |
+
Always returns the full original segment span (clamped to [0, total_dur_s]).
|
| 1748 |
+
The caller passes clip_dur to _build_segments, which splits it into
|
| 1749 |
+
model-sized sub-windows exactly as initial generation does — ensuring the
|
| 1750 |
+
regen wav covers the full segment duration regardless of model window size.
|
| 1751 |
+
|
| 1752 |
+
*target_window_s* is unused but kept for call-site compatibility.
|
|
|
|
|
|
|
| 1753 |
"""
|
| 1754 |
total_dur_s = float(meta["total_dur_s"])
|
| 1755 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1756 |
+
clip_start = max(0.0, seg_start)
|
| 1757 |
+
clip_end = min(total_dur_s, seg_end)
|
| 1758 |
+
clip_dur = clip_end - clip_start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1759 |
return clip_start, clip_end, clip_dur
|
| 1760 |
|
| 1761 |
|
|
|
|
| 1775 |
slot_wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1776 |
new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
|
| 1777 |
|
| 1778 |
+
# Align new_wav so sample index 0 corresponds to seg_start in video time.
|
| 1779 |
+
# _stitch_wavs trims using seg_start as the time origin, so if the clip
|
| 1780 |
+
# started AFTER seg_start (clip_start_s > seg_start), we prepend silence
|
| 1781 |
+
# equal to (clip_start_s - seg_start) to shift the audio back to seg_start.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1782 |
if clip_start_s is not None:
|
| 1783 |
+
seg_start = meta["segments"][seg_idx][0]
|
| 1784 |
+
offset_s = seg_start - clip_start_s # negative when clip starts after seg_start
|
|
|
|
|
|
|
|
|
|
| 1785 |
if offset_s < 0:
|
| 1786 |
pad_samples = int(round(abs(offset_s) * slot_sr))
|
| 1787 |
silence = np.zeros((new_wav.shape[0], pad_samples), dtype=new_wav.dtype)
|
| 1788 |
new_wav = np.concatenate([silence, new_wav], axis=1)
|
| 1789 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1790 |
video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
|
| 1791 |
new_wav, seg_idx, meta, slot_id
|
| 1792 |
)
|