Spaces:
Running on Zero
Running on Zero
Commit ·
93fa90c
1
Parent(s): 52e1535
Fix regen/xregen errors when silent_video tmp file is missing
Add _resolve_silent_video(meta) helper that falls back to re-stripping
the original Gradio upload (source_video) when silent_video has been
evicted from tmp or the Space restarted since generation.
- Store source_video (original video_file upload path) in seg_meta via
_build_seg_meta and _post_process_samples for all three models.
- Replace all meta["silent_video"] direct reads in regen/xregen paths
with _resolve_silent_video(meta) — covers _splice_and_save,
regen_taro/mmaudio/hunyuan, and xregen_taro/mmaudio/hunyuan.
app.py
CHANGED
|
@@ -780,15 +780,22 @@ def _log_inference_timing(label: str, elapsed: float, n_segs: int,
|
|
| 780 |
|
| 781 |
def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
|
| 782 |
silent_video, sr, model, crossfade_s, crossfade_db,
|
| 783 |
-
total_dur_s, **extras) -> dict:
|
| 784 |
"""Build the seg_meta dict shared by all three generate_* functions.
|
| 785 |
-
Model-specific keys are passed via **extras.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
meta = {
|
| 787 |
"segments": segments,
|
| 788 |
"wav_paths": wav_paths,
|
| 789 |
"audio_path": audio_path,
|
| 790 |
"video_path": video_path,
|
| 791 |
"silent_video": silent_video,
|
|
|
|
| 792 |
"sr": sr,
|
| 793 |
"model": model,
|
| 794 |
"crossfade_s": crossfade_s,
|
|
@@ -803,6 +810,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 803 |
silent_video: str, segments: list,
|
| 804 |
crossfade_s: float, crossfade_db: float,
|
| 805 |
total_dur_s: float, sr: int,
|
|
|
|
| 806 |
extra_meta_fn=None) -> list:
|
| 807 |
"""Shared CPU post-processing for all three generate_* wrappers.
|
| 808 |
|
|
@@ -832,7 +840,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 832 |
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 833 |
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 834 |
model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 835 |
-
total_dur_s=total_dur_s, **extras,
|
| 836 |
)
|
| 837 |
outputs.append((video_path, audio_path, seg_meta))
|
| 838 |
return outputs
|
|
@@ -974,6 +982,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 974 |
silent_video=silent_video, segments=segments,
|
| 975 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 976 |
total_dur_s=total_dur_s, sr=TARO_SR_OUT,
|
|
|
|
| 977 |
extra_meta_fn=_taro_extras,
|
| 978 |
)
|
| 979 |
return _pad_outputs(outputs)
|
|
@@ -1120,6 +1129,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1120 |
silent_video=silent_video, segments=segments,
|
| 1121 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1122 |
total_dur_s=total_dur_s, sr=TARGET_SR,
|
|
|
|
| 1123 |
)
|
| 1124 |
return _pad_outputs(outputs)
|
| 1125 |
|
|
@@ -1269,6 +1279,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1269 |
silent_video=silent_video, segments=segments,
|
| 1270 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1271 |
total_dur_s=total_dur_s, sr=48000,
|
|
|
|
| 1272 |
extra_meta_fn=_hunyuan_extras,
|
| 1273 |
)
|
| 1274 |
return _pad_outputs(outputs)
|
|
@@ -1295,7 +1306,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
|
| 1295 |
crossfade_db = float(meta["crossfade_db"])
|
| 1296 |
sr = int(meta["sr"])
|
| 1297 |
total_dur_s = float(meta["total_dur_s"])
|
| 1298 |
-
silent_video = meta
|
| 1299 |
segments = meta["segments"]
|
| 1300 |
model = meta["model"]
|
| 1301 |
|
|
@@ -1378,7 +1389,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1378 |
print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
|
| 1379 |
from TARO.onset_util import extract_onset
|
| 1380 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 1381 |
-
silent_video = meta
|
| 1382 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1383 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1384 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
|
@@ -1449,7 +1460,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1449 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1450 |
# This avoids any cross-process context passing that fails under ZeroGPU isolation.
|
| 1451 |
seg_path = _extract_segment_clip(
|
| 1452 |
-
meta
|
| 1453 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1454 |
)
|
| 1455 |
|
|
@@ -1531,7 +1542,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1531 |
|
| 1532 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1533 |
seg_path = _extract_segment_clip(
|
| 1534 |
-
meta
|
| 1535 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1536 |
)
|
| 1537 |
|
|
@@ -1620,6 +1631,30 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
|
|
| 1620 |
return wav
|
| 1621 |
|
| 1622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1623 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1624 |
"""Compute the video clip window for a cross-model regen.
|
| 1625 |
|
|
@@ -1718,7 +1753,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
|
|
| 1718 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
|
| 1719 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1720 |
clip_path = _extract_segment_clip(
|
| 1721 |
-
meta
|
| 1722 |
os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
|
| 1723 |
)
|
| 1724 |
# Build a minimal fake-video meta so generate_taro can run on clip_path
|
|
@@ -1755,7 +1790,7 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
| 1755 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1756 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1757 |
clip_path = _extract_segment_clip(
|
| 1758 |
-
meta
|
| 1759 |
os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
|
| 1760 |
)
|
| 1761 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
|
@@ -1787,7 +1822,7 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1787 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1788 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1789 |
clip_path = _extract_segment_clip(
|
| 1790 |
-
meta
|
| 1791 |
os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
|
| 1792 |
)
|
| 1793 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|
|
|
|
| 780 |
|
| 781 |
def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
|
| 782 |
silent_video, sr, model, crossfade_s, crossfade_db,
|
| 783 |
+
total_dur_s, source_video=None, **extras) -> dict:
|
| 784 |
"""Build the seg_meta dict shared by all three generate_* functions.
|
| 785 |
+
Model-specific keys are passed via **extras.
|
| 786 |
+
|
| 787 |
+
*source_video* is the original Gradio-managed upload path
|
| 788 |
+
(/tmp/gradio/...). It lives for the entire session and is used as a
|
| 789 |
+
fallback when *silent_video* (which lives in a managed tmp dir) has been
|
| 790 |
+
evicted or the Space has restarted since generation.
|
| 791 |
+
"""
|
| 792 |
meta = {
|
| 793 |
"segments": segments,
|
| 794 |
"wav_paths": wav_paths,
|
| 795 |
"audio_path": audio_path,
|
| 796 |
"video_path": video_path,
|
| 797 |
"silent_video": silent_video,
|
| 798 |
+
"source_video": source_video or video_path,
|
| 799 |
"sr": sr,
|
| 800 |
"model": model,
|
| 801 |
"crossfade_s": crossfade_s,
|
|
|
|
| 810 |
silent_video: str, segments: list,
|
| 811 |
crossfade_s: float, crossfade_db: float,
|
| 812 |
total_dur_s: float, sr: int,
|
| 813 |
+
source_video: str = None,
|
| 814 |
extra_meta_fn=None) -> list:
|
| 815 |
"""Shared CPU post-processing for all three generate_* wrappers.
|
| 816 |
|
|
|
|
| 840 |
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 841 |
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 842 |
model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 843 |
+
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 844 |
)
|
| 845 |
outputs.append((video_path, audio_path, seg_meta))
|
| 846 |
return outputs
|
|
|
|
| 982 |
silent_video=silent_video, segments=segments,
|
| 983 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 984 |
total_dur_s=total_dur_s, sr=TARO_SR_OUT,
|
| 985 |
+
source_video=video_file,
|
| 986 |
extra_meta_fn=_taro_extras,
|
| 987 |
)
|
| 988 |
return _pad_outputs(outputs)
|
|
|
|
| 1129 |
silent_video=silent_video, segments=segments,
|
| 1130 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1131 |
total_dur_s=total_dur_s, sr=TARGET_SR,
|
| 1132 |
+
source_video=video_file,
|
| 1133 |
)
|
| 1134 |
return _pad_outputs(outputs)
|
| 1135 |
|
|
|
|
| 1279 |
silent_video=silent_video, segments=segments,
|
| 1280 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1281 |
total_dur_s=total_dur_s, sr=48000,
|
| 1282 |
+
source_video=video_file,
|
| 1283 |
extra_meta_fn=_hunyuan_extras,
|
| 1284 |
)
|
| 1285 |
return _pad_outputs(outputs)
|
|
|
|
| 1306 |
crossfade_db = float(meta["crossfade_db"])
|
| 1307 |
sr = int(meta["sr"])
|
| 1308 |
total_dur_s = float(meta["total_dur_s"])
|
| 1309 |
+
silent_video = _resolve_silent_video(meta)
|
| 1310 |
segments = meta["segments"]
|
| 1311 |
model = meta["model"]
|
| 1312 |
|
|
|
|
| 1389 |
print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
|
| 1390 |
from TARO.onset_util import extract_onset
|
| 1391 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 1392 |
+
silent_video = _resolve_silent_video(meta)
|
| 1393 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1394 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1395 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
|
|
|
| 1460 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1461 |
# This avoids any cross-process context passing that fails under ZeroGPU isolation.
|
| 1462 |
seg_path = _extract_segment_clip(
|
| 1463 |
+
_resolve_silent_video(meta), seg_start, seg_dur,
|
| 1464 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1465 |
)
|
| 1466 |
|
|
|
|
| 1542 |
|
| 1543 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1544 |
seg_path = _extract_segment_clip(
|
| 1545 |
+
_resolve_silent_video(meta), seg_start, seg_dur,
|
| 1546 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1547 |
)
|
| 1548 |
|
|
|
|
| 1631 |
return wav
|
| 1632 |
|
| 1633 |
|
| 1634 |
+
def _resolve_silent_video(meta: dict) -> str:
|
| 1635 |
+
"""Return a valid silent (audio-stripped) video path for *meta*.
|
| 1636 |
+
|
| 1637 |
+
Prefers meta["silent_video"] if it still exists on disk. Falls back to
|
| 1638 |
+
re-stripping meta["source_video"] (the original Gradio upload path, which
|
| 1639 |
+
persists for the full session lifetime) into a fresh tmp file.
|
| 1640 |
+
This prevents xregen failures caused by tmp-dir eviction or Space restarts
|
| 1641 |
+
between initial generation and the regen call.
|
| 1642 |
+
"""
|
| 1643 |
+
sv = meta.get("silent_video", "")
|
| 1644 |
+
if sv and os.path.exists(sv):
|
| 1645 |
+
return sv
|
| 1646 |
+
source = meta.get("source_video") or meta.get("video_path", "")
|
| 1647 |
+
if not source or not os.path.exists(source):
|
| 1648 |
+
raise FileNotFoundError(
|
| 1649 |
+
f"Cannot locate source video for regen — "
|
| 1650 |
+
f"silent_video={sv!r}, source_video={source!r}"
|
| 1651 |
+
)
|
| 1652 |
+
out = os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "silent_input.mp4")
|
| 1653 |
+
print(f"[regen] silent_video missing, re-stripping from source_video: {source}")
|
| 1654 |
+
strip_audio_from_video(source, out)
|
| 1655 |
+
return out
|
| 1656 |
+
|
| 1657 |
+
|
| 1658 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1659 |
"""Compute the video clip window for a cross-model regen.
|
| 1660 |
|
|
|
|
| 1753 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
|
| 1754 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1755 |
clip_path = _extract_segment_clip(
|
| 1756 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1757 |
os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
|
| 1758 |
)
|
| 1759 |
# Build a minimal fake-video meta so generate_taro can run on clip_path
|
|
|
|
| 1790 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1791 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1792 |
clip_path = _extract_segment_clip(
|
| 1793 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1794 |
os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
|
| 1795 |
)
|
| 1796 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
|
|
|
| 1822 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1823 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1824 |
clip_path = _extract_segment_clip(
|
| 1825 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1826 |
os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
|
| 1827 |
)
|
| 1828 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|