Spaces:
Running on Zero
Running on Zero
Commit ·
93fa90c
1
Parent(s): 52e1535
Fix regen/xregen errors when silent_video tmp file is missing
Add _resolve_silent_video(meta) helper that falls back to re-stripping
the original Gradio upload (source_video) when silent_video has been
evicted from tmp or the Space restarted since generation.
- Store source_video (original video_file upload path) in seg_meta via
_build_seg_meta and _post_process_samples for all three models.
- Replace all meta["silent_video"] direct reads in regen/xregen paths
with _resolve_silent_video(meta) — covers _splice_and_save,
regen_taro/mmaudio/hunyuan, and xregen_taro/mmaudio/hunyuan.
app.py
CHANGED
|
@@ -780,15 +780,22 @@ def _log_inference_timing(label: str, elapsed: float, n_segs: int,
|
|
| 780 |
|
| 781 |
def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
|
| 782 |
silent_video, sr, model, crossfade_s, crossfade_db,
|
| 783 |
-
total_dur_s, **extras) -> dict:
|
| 784 |
"""Build the seg_meta dict shared by all three generate_* functions.
|
| 785 |
-
Model-specific keys are passed via **extras.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
meta = {
|
| 787 |
"segments": segments,
|
| 788 |
"wav_paths": wav_paths,
|
| 789 |
"audio_path": audio_path,
|
| 790 |
"video_path": video_path,
|
| 791 |
"silent_video": silent_video,
|
|
|
|
| 792 |
"sr": sr,
|
| 793 |
"model": model,
|
| 794 |
"crossfade_s": crossfade_s,
|
|
@@ -803,6 +810,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 803 |
silent_video: str, segments: list,
|
| 804 |
crossfade_s: float, crossfade_db: float,
|
| 805 |
total_dur_s: float, sr: int,
|
|
|
|
| 806 |
extra_meta_fn=None) -> list:
|
| 807 |
"""Shared CPU post-processing for all three generate_* wrappers.
|
| 808 |
|
|
@@ -832,7 +840,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 832 |
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 833 |
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 834 |
model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 835 |
-
total_dur_s=total_dur_s, **extras,
|
| 836 |
)
|
| 837 |
outputs.append((video_path, audio_path, seg_meta))
|
| 838 |
return outputs
|
|
@@ -974,6 +982,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 974 |
silent_video=silent_video, segments=segments,
|
| 975 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 976 |
total_dur_s=total_dur_s, sr=TARO_SR_OUT,
|
|
|
|
| 977 |
extra_meta_fn=_taro_extras,
|
| 978 |
)
|
| 979 |
return _pad_outputs(outputs)
|
|
@@ -1120,6 +1129,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 1120 |
silent_video=silent_video, segments=segments,
|
| 1121 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1122 |
total_dur_s=total_dur_s, sr=TARGET_SR,
|
|
|
|
| 1123 |
)
|
| 1124 |
return _pad_outputs(outputs)
|
| 1125 |
|
|
@@ -1269,6 +1279,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1269 |
silent_video=silent_video, segments=segments,
|
| 1270 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1271 |
total_dur_s=total_dur_s, sr=48000,
|
|
|
|
| 1272 |
extra_meta_fn=_hunyuan_extras,
|
| 1273 |
)
|
| 1274 |
return _pad_outputs(outputs)
|
|
@@ -1295,7 +1306,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
|
| 1295 |
crossfade_db = float(meta["crossfade_db"])
|
| 1296 |
sr = int(meta["sr"])
|
| 1297 |
total_dur_s = float(meta["total_dur_s"])
|
| 1298 |
-
silent_video = meta
|
| 1299 |
segments = meta["segments"]
|
| 1300 |
model = meta["model"]
|
| 1301 |
|
|
@@ -1378,7 +1389,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1378 |
print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
|
| 1379 |
from TARO.onset_util import extract_onset
|
| 1380 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 1381 |
-
silent_video = meta
|
| 1382 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1383 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1384 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
|
@@ -1449,7 +1460,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1449 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1450 |
# This avoids any cross-process context passing that fails under ZeroGPU isolation.
|
| 1451 |
seg_path = _extract_segment_clip(
|
| 1452 |
-
meta
|
| 1453 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1454 |
)
|
| 1455 |
|
|
@@ -1531,7 +1542,7 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1531 |
|
| 1532 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1533 |
seg_path = _extract_segment_clip(
|
| 1534 |
-
meta
|
| 1535 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1536 |
)
|
| 1537 |
|
|
@@ -1620,6 +1631,30 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
|
|
| 1620 |
return wav
|
| 1621 |
|
| 1622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1623 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1624 |
"""Compute the video clip window for a cross-model regen.
|
| 1625 |
|
|
@@ -1718,7 +1753,7 @@ def xregen_taro(seg_idx, state_json, slot_id,
|
|
| 1718 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
|
| 1719 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1720 |
clip_path = _extract_segment_clip(
|
| 1721 |
-
meta
|
| 1722 |
os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
|
| 1723 |
)
|
| 1724 |
# Build a minimal fake-video meta so generate_taro can run on clip_path
|
|
@@ -1755,7 +1790,7 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
| 1755 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1756 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1757 |
clip_path = _extract_segment_clip(
|
| 1758 |
-
meta
|
| 1759 |
os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
|
| 1760 |
)
|
| 1761 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
|
@@ -1787,7 +1822,7 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1787 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1788 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1789 |
clip_path = _extract_segment_clip(
|
| 1790 |
-
meta
|
| 1791 |
os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
|
| 1792 |
)
|
| 1793 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|
|
|
|
| 780 |
|
| 781 |
def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
|
| 782 |
silent_video, sr, model, crossfade_s, crossfade_db,
|
| 783 |
+
total_dur_s, source_video=None, **extras) -> dict:
|
| 784 |
"""Build the seg_meta dict shared by all three generate_* functions.
|
| 785 |
+
Model-specific keys are passed via **extras.
|
| 786 |
+
|
| 787 |
+
*source_video* is the original Gradio-managed upload path
|
| 788 |
+
(/tmp/gradio/...). It lives for the entire session and is used as a
|
| 789 |
+
fallback when *silent_video* (which lives in a managed tmp dir) has been
|
| 790 |
+
evicted or the Space has restarted since generation.
|
| 791 |
+
"""
|
| 792 |
meta = {
|
| 793 |
"segments": segments,
|
| 794 |
"wav_paths": wav_paths,
|
| 795 |
"audio_path": audio_path,
|
| 796 |
"video_path": video_path,
|
| 797 |
"silent_video": silent_video,
|
| 798 |
+
"source_video": source_video or video_path,
|
| 799 |
"sr": sr,
|
| 800 |
"model": model,
|
| 801 |
"crossfade_s": crossfade_s,
|
|
|
|
| 810 |
silent_video: str, segments: list,
|
| 811 |
crossfade_s: float, crossfade_db: float,
|
| 812 |
total_dur_s: float, sr: int,
|
| 813 |
+
source_video: str = None,
|
| 814 |
extra_meta_fn=None) -> list:
|
| 815 |
"""Shared CPU post-processing for all three generate_* wrappers.
|
| 816 |
|
|
|
|
| 840 |
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 841 |
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 842 |
model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 843 |
+
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 844 |
)
|
| 845 |
outputs.append((video_path, audio_path, seg_meta))
|
| 846 |
return outputs
|
|
|
|
| 982 |
silent_video=silent_video, segments=segments,
|
| 983 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 984 |
total_dur_s=total_dur_s, sr=TARO_SR_OUT,
|
| 985 |
+
source_video=video_file,
|
| 986 |
extra_meta_fn=_taro_extras,
|
| 987 |
)
|
| 988 |
return _pad_outputs(outputs)
|
|
|
|
| 1129 |
silent_video=silent_video, segments=segments,
|
| 1130 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1131 |
total_dur_s=total_dur_s, sr=TARGET_SR,
|
| 1132 |
+
source_video=video_file,
|
| 1133 |
)
|
| 1134 |
return _pad_outputs(outputs)
|
| 1135 |
|
|
|
|
| 1279 |
silent_video=silent_video, segments=segments,
|
| 1280 |
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1281 |
total_dur_s=total_dur_s, sr=48000,
|
| 1282 |
+
source_video=video_file,
|
| 1283 |
extra_meta_fn=_hunyuan_extras,
|
| 1284 |
)
|
| 1285 |
return _pad_outputs(outputs)
|
|
|
|
| 1306 |
crossfade_db = float(meta["crossfade_db"])
|
| 1307 |
sr = int(meta["sr"])
|
| 1308 |
total_dur_s = float(meta["total_dur_s"])
|
| 1309 |
+
silent_video = _resolve_silent_video(meta)
|
| 1310 |
segments = meta["segments"]
|
| 1311 |
model = meta["model"]
|
| 1312 |
|
|
|
|
| 1389 |
print("[TARO regen] Cache miss — re-extracting CAVP + onset features")
|
| 1390 |
from TARO.onset_util import extract_onset
|
| 1391 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 1392 |
+
silent_video = _resolve_silent_video(meta)
|
| 1393 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1394 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
| 1395 |
onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
|
|
|
|
| 1460 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1461 |
# This avoids any cross-process context passing that fails under ZeroGPU isolation.
|
| 1462 |
seg_path = _extract_segment_clip(
|
| 1463 |
+
_resolve_silent_video(meta), seg_start, seg_dur,
|
| 1464 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1465 |
)
|
| 1466 |
|
|
|
|
| 1542 |
|
| 1543 |
# Extract segment clip inside the GPU function — ffmpeg is CPU-only and safe here.
|
| 1544 |
seg_path = _extract_segment_clip(
|
| 1545 |
+
_resolve_silent_video(meta), seg_start, seg_dur,
|
| 1546 |
os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "regen_seg.mp4"),
|
| 1547 |
)
|
| 1548 |
|
|
|
|
| 1631 |
return wav
|
| 1632 |
|
| 1633 |
|
| 1634 |
+
def _resolve_silent_video(meta: dict) -> str:
|
| 1635 |
+
"""Return a valid silent (audio-stripped) video path for *meta*.
|
| 1636 |
+
|
| 1637 |
+
Prefers meta["silent_video"] if it still exists on disk. Falls back to
|
| 1638 |
+
re-stripping meta["source_video"] (the original Gradio upload path, which
|
| 1639 |
+
persists for the full session lifetime) into a fresh tmp file.
|
| 1640 |
+
This prevents xregen failures caused by tmp-dir eviction or Space restarts
|
| 1641 |
+
between initial generation and the regen call.
|
| 1642 |
+
"""
|
| 1643 |
+
sv = meta.get("silent_video", "")
|
| 1644 |
+
if sv and os.path.exists(sv):
|
| 1645 |
+
return sv
|
| 1646 |
+
source = meta.get("source_video") or meta.get("video_path", "")
|
| 1647 |
+
if not source or not os.path.exists(source):
|
| 1648 |
+
raise FileNotFoundError(
|
| 1649 |
+
f"Cannot locate source video for regen — "
|
| 1650 |
+
f"silent_video={sv!r}, source_video={source!r}"
|
| 1651 |
+
)
|
| 1652 |
+
out = os.path.join(_register_tmp_dir(tempfile.mkdtemp()), "silent_input.mp4")
|
| 1653 |
+
print(f"[regen] silent_video missing, re-stripping from source_video: {source}")
|
| 1654 |
+
strip_audio_from_video(source, out)
|
| 1655 |
+
return out
|
| 1656 |
+
|
| 1657 |
+
|
| 1658 |
def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
|
| 1659 |
"""Compute the video clip window for a cross-model regen.
|
| 1660 |
|
|
|
|
| 1753 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
|
| 1754 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1755 |
clip_path = _extract_segment_clip(
|
| 1756 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1757 |
os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
|
| 1758 |
)
|
| 1759 |
# Build a minimal fake-video meta so generate_taro can run on clip_path
|
|
|
|
| 1790 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
|
| 1791 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1792 |
clip_path = _extract_segment_clip(
|
| 1793 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1794 |
os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
|
| 1795 |
)
|
| 1796 |
sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
|
|
|
|
| 1822 |
clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
|
| 1823 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1824 |
clip_path = _extract_segment_clip(
|
| 1825 |
+
_resolve_silent_video(meta), clip_start, clip_dur,
|
| 1826 |
os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
|
| 1827 |
)
|
| 1828 |
sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
|