BoxOfColors committed on
Commit
93fa90c
·
1 Parent(s): 52e1535

Fix regen/xregen errors when silent_video tmp file is missing

Browse files

Add _resolve_silent_video(meta) helper that falls back to re-stripping
the original Gradio upload (source_video) when silent_video has been
evicted from tmp or the Space restarted since generation.

- Store source_video (original video_file upload path) in seg_meta via
_build_seg_meta and _post_process_samples for all three models.
- Replace all meta["silent_video"] direct reads in regen/xregen paths
with _resolve_silent_video(meta) — covers _splice_and_save,
regen_taro/mmaudio/hunyuan, and xregen_taro/mmaudio/hunyuan.

Files changed (1) hide show
  1. app.py +45 -10
app.py CHANGED
@@ -780,15 +780,22 @@ def _log_inference_timing(label: str, elapsed: float, n_segs: int,
780
 
781
  def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
782
  silent_video, sr, model, crossfade_s, crossfade_db,
783
- total_dur_s, **extras) -> dict:
784
  """Build the seg_meta dict shared by all three generate_* functions.
785
- Model-specific keys are passed via **extras."""
 
 
 
 
 
 
786
  meta = {
787
  "segments": segments,
788
  "wav_paths": wav_paths,
789
  "audio_path": audio_path,
790
  "video_path": video_path,
791
  "silent_video": silent_video,
 
792
  "sr": sr,
793
  "model": model,
794
  "crossfade_s": crossfade_s,
@@ -803,6 +810,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
803
  silent_video: str, segments: list,
804
  crossfade_s: float, crossfade_db: float,
805
  total_dur_s: float, sr: int,
 
806
  extra_meta_fn=None) -> list:
807
  """Shared CPU post-processing for all three generate_* wrappers.
808
 
@@ -832,7 +840,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
832
  segments=segments, wav_paths=wav_paths, audio_path=audio_path,
833
  video_path=video_path, silent_video=silent_video, sr=sr,
834
  model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
835
- total_dur_s=total_dur_s, **extras,
836
  )
837
  outputs.append((video_path, audio_path, seg_meta))
838
  return outputs
@@ -974,6 +982,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
974
  silent_video=silent_video, segments=segments,
975
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
976
  total_dur_s=total_dur_s, sr=TARO_SR_OUT,
 
977
  extra_meta_fn=_taro_extras,
978
  )
979
  return _pad_outputs(outputs)
@@ -1120,6 +1129,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
1120
  silent_video=silent_video, segments=segments,
1121
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
1122
  total_dur_s=total_dur_s, sr=TARGET_SR,
 
1123
  )
1124
  return _pad_outputs(outputs)
1125
 
@@ -1269,6 +1279,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1269
  silent_video=silent_video, segments=segments,
1270
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
1271
  total_dur_s=total_dur_s, sr=48000,
 
1272
  extra_meta_fn=_hunyuan_extras,
1273
  )
1274
  return _pad_outputs(outputs)
@@ -1295,7 +1306,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1295
  crossfade_db = float(meta["crossfade_db"])
1296
  sr = int(meta["sr"])
1297
  total_dur_s = float(meta["total_dur_s"])
1298
- silent_video = meta["silent_video"]
1299
  segments = meta["segments"]
1300
  model = meta["model"]
1301
 
@@ -1378,7 +1389,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1378
  print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
1379
  from TARO.onset_util import extract_onset
1380
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
1381
- silent_video = meta["silent_video"]
1382
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1383
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1384
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
@@ -1449,7 +1460,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1449
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1450
  # This avoids any cross-process context passing that fails under ZeroGPU isolation.
1451
  seg_path = _extract_segment_clip(
1452
- meta["silent_video"], seg_start, seg_dur,
1453
  os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1454
  )
1455
 
@@ -1531,7 +1542,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1531
 
1532
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1533
  seg_path = _extract_segment_clip(
1534
- meta["silent_video"], seg_start, seg_dur,
1535
  os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1536
  )
1537
 
@@ -1620,6 +1631,30 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
1620
  return wav
1621
 
1622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1623
  def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
1624
  """Compute the video clip window for a cross-model regen.
1625
 
@@ -1718,7 +1753,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
1718
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
1719
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1720
  clip_path = _extract_segment_clip(
1721
- meta["silent_video"], clip_start, clip_dur,
1722
  os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
1723
  )
1724
  # Build a minimal fake-video meta so generate_taro can run on clip_path
@@ -1755,7 +1790,7 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1755
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1756
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1757
  clip_path = _extract_segment_clip(
1758
- meta["silent_video"], clip_start, clip_dur,
1759
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1760
  )
1761
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
@@ -1787,7 +1822,7 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1787
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1788
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1789
  clip_path = _extract_segment_clip(
1790
- meta["silent_video"], clip_start, clip_dur,
1791
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1792
  )
1793
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
 
780
 
781
  def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
782
  silent_video, sr, model, crossfade_s, crossfade_db,
783
+ total_dur_s, source_video=None, **extras) -> dict:
784
  """Build the seg_meta dict shared by all three generate_* functions.
785
+ Model-specific keys are passed via **extras.
786
+
787
+ *source_video* is the original Gradio-managed upload path
788
+ (/tmp/gradio/...). It lives for the entire session and is used as a
789
+ fallback when *silent_video* (which lives in a managed tmp dir) has been
790
+ evicted or the Space has restarted since generation.
791
+ """
792
  meta = {
793
  "segments": segments,
794
  "wav_paths": wav_paths,
795
  "audio_path": audio_path,
796
  "video_path": video_path,
797
  "silent_video": silent_video,
798
+ "source_video": source_video or video_path,
799
  "sr": sr,
800
  "model": model,
801
  "crossfade_s": crossfade_s,
 
810
  silent_video: str, segments: list,
811
  crossfade_s: float, crossfade_db: float,
812
  total_dur_s: float, sr: int,
813
+ source_video: str = None,
814
  extra_meta_fn=None) -> list:
815
  """Shared CPU post-processing for all three generate_* wrappers.
816
 
 
840
  segments=segments, wav_paths=wav_paths, audio_path=audio_path,
841
  video_path=video_path, silent_video=silent_video, sr=sr,
842
  model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
843
+ total_dur_s=total_dur_s, source_video=source_video, **extras,
844
  )
845
  outputs.append((video_path, audio_path, seg_meta))
846
  return outputs
 
982
  silent_video=silent_video, segments=segments,
983
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
984
  total_dur_s=total_dur_s, sr=TARO_SR_OUT,
985
+ source_video=video_file,
986
  extra_meta_fn=_taro_extras,
987
  )
988
  return _pad_outputs(outputs)
 
1129
  silent_video=silent_video, segments=segments,
1130
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
1131
  total_dur_s=total_dur_s, sr=TARGET_SR,
1132
+ source_video=video_file,
1133
  )
1134
  return _pad_outputs(outputs)
1135
 
 
1279
  silent_video=silent_video, segments=segments,
1280
  crossfade_s=crossfade_s, crossfade_db=crossfade_db,
1281
  total_dur_s=total_dur_s, sr=48000,
1282
+ source_video=video_file,
1283
  extra_meta_fn=_hunyuan_extras,
1284
  )
1285
  return _pad_outputs(outputs)
 
1306
  crossfade_db = float(meta["crossfade_db"])
1307
  sr = int(meta["sr"])
1308
  total_dur_s = float(meta["total_dur_s"])
1309
+ silent_video = _resolve_silent_video(meta)
1310
  segments = meta["segments"]
1311
  model = meta["model"]
1312
 
 
1389
  print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
1390
  from TARO.onset_util import extract_onset
1391
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
1392
+ silent_video = _resolve_silent_video(meta)
1393
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1394
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
1395
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
 
1460
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1461
  # This avoids any cross-process context passing that fails under ZeroGPU isolation.
1462
  seg_path = _extract_segment_clip(
1463
+ _resolve_silent_video(meta), seg_start, seg_dur,
1464
  os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1465
  )
1466
 
 
1542
 
1543
  # Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
1544
  seg_path = _extract_segment_clip(
1545
+ _resolve_silent_video(meta), seg_start, seg_dur,
1546
  os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
1547
  )
1548
 
 
1631
  return wav
1632
 
1633
 
1634
+ def _resolve_silent_video(meta: dict) -> str:
1635
+ """Return a valid silent (audio-stripped) video path for *meta*.
1636
+
1637
+ Prefers meta["silent_video"] if it still exists on disk. Falls back to
1638
+ re-stripping meta["source_video"] (the original Gradio upload path, which
1639
+ persists for the full session lifetime) into a fresh tmp file.
1640
+ This prevents xregen failures caused by tmp-dir eviction or Space restarts
1641
+ between initial generation and the regen call.
1642
+ """
1643
+ sv = meta.get("silent_video", "")
1644
+ if sv and os.path.exists(sv):
1645
+ return sv
1646
+ source = meta.get("source_video") or meta.get("video_path", "")
1647
+ if not source or not os.path.exists(source):
1648
+ raise FileNotFoundError(
1649
+ f"Cannot locate source video for regen — "
1650
+ f"silent_video={sv!r}, source_video={source!r}"
1651
+ )
1652
+ out = os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "silent_input.mp4")
1653
+ print(f"[regen] silent_video missing, re-stripping from source_video: {source}")
1654
+ strip_audio_from_video(source, out)
1655
+ return out
1656
+
1657
+
1658
  def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
1659
  """Compute the video clip window for a cross-model regen.
1660
 
 
1753
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
1754
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1755
  clip_path = _extract_segment_clip(
1756
+ _resolve_silent_video(meta), clip_start, clip_dur,
1757
  os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
1758
  )
1759
  # Build a minimal fake-video meta so generate_taro can run on clip_path
 
1790
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1791
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1792
  clip_path = _extract_segment_clip(
1793
+ _resolve_silent_video(meta), clip_start, clip_dur,
1794
  os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1795
  )
1796
  sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
 
1822
  clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1823
  tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1824
  clip_path = _extract_segment_clip(
1825
+ _resolve_silent_video(meta), clip_start, clip_dur,
1826
  os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1827
  )
1828
  sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))