BoxOfColors committed on
Commit
09846c9
·
1 Parent(s): e0917de
Files changed (1) hide show
  1. app.py +25 -22
app.py CHANGED
@@ -9,9 +9,9 @@ Supported models
9
  """
10
 
11
  import os
 
12
  import tempfile
13
  import random
14
- from math import floor
15
  from pathlib import Path
16
 
17
  import time
@@ -79,7 +79,6 @@ def set_global_seed(seed: int):
79
  random.seed(seed)
80
  torch.manual_seed(seed)
81
  torch.cuda.manual_seed(seed)
82
- torch.backends.cudnn.deterministic = True
83
 
84
  def get_random_seed() -> int:
85
  return random.randint(0, 2**32 - 1)
@@ -90,18 +89,18 @@ def get_video_duration(video_path: str) -> float:
90
  return float(probe["format"]["duration"])
91
 
92
  def strip_audio_from_video(video_path: str, output_path: str):
93
- """Write a silent copy of *video_path* to *output_path*."""
94
- ffmpeg.input(video_path).output(output_path, vcodec="libx264", an=None).run(
95
  overwrite_output=True, quiet=True
96
  )
97
 
98
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
99
- """Mux a silent video with an audio file into *output_path*."""
100
  ffmpeg.output(
101
  ffmpeg.input(silent_video),
102
  ffmpeg.input(audio_path),
103
  output_path,
104
- vcodec="libx264", acodec="aac", strict="experimental",
105
  ).run(overwrite_output=True, quiet=True)
106
 
107
 
@@ -175,13 +174,14 @@ HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg
175
  HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
- _TARO_INFERENCE_CACHE: dict = {}
 
179
 
180
 
181
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
182
  n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
183
  time_per_seg = num_steps * TARO_SECS_PER_STEP
184
- max_s = floor(600.0 / (n_segs * time_per_seg))
185
  return max(1, min(max_s, MAX_SLOTS))
186
 
187
 
@@ -287,8 +287,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
287
 
288
  # TARO modules use bare imports (e.g. `from cavp_util import ...`) that
289
  # assume the TARO directory is on sys.path. Add it before importing.
290
- import sys, os as _os
291
- _taro_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "TARO")
292
  if _taro_dir not in sys.path:
293
  sys.path.insert(0, _taro_dir)
294
 
@@ -375,6 +374,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
375
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
376
  f"(current constant={TARO_SECS_PER_STEP})")
377
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
 
 
 
378
 
379
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
380
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
@@ -419,10 +421,9 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
419
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
420
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
421
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
422
- import sys as _sys, os as _os
423
- _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
424
- if _mmaudio_dir not in _sys.path:
425
- _sys.path.insert(0, _mmaudio_dir)
426
 
427
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
428
  from mmaudio.model.flow_matching import FlowMatching
@@ -490,10 +491,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
490
 
491
  for seg_i, (seg_start, seg_end) in enumerate(segments):
492
  seg_dur = seg_end - seg_start
493
- # Trim a clean video clip for this segment
494
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
495
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
496
- seg_path, vcodec="libx264", acodec="aac", strict="experimental"
497
  ).run(overwrite_output=True, quiet=True)
498
 
499
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
@@ -582,11 +583,10 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
582
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
583
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
584
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
585
- import sys as _sys
586
  # Ensure HunyuanVideo-Foley package is importable
587
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
588
- if _hf_path not in _sys.path:
589
- _sys.path.insert(0, _hf_path)
590
 
591
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
592
  from hunyuanvideo_foley.utils.feature_utils import feature_process
@@ -634,10 +634,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
- # Pre-encode text features once (same for every segment)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
  ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
- _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
643
  _dummy_seg_path,
@@ -656,9 +656,12 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
- seg_path, vcodec="libx264", acodec="aac", strict="experimental"
660
  ).run(overwrite_output=True, quiet=True)
661
 
 
 
 
662
  visual_feats, _, seg_audio_len = feature_process(
663
  seg_path,
664
  prompt if prompt else "",
 
9
  """
10
 
11
  import os
12
+ import sys
13
  import tempfile
14
  import random
 
15
  from pathlib import Path
16
 
17
  import time
 
79
  random.seed(seed)
80
  torch.manual_seed(seed)
81
  torch.cuda.manual_seed(seed)
 
82
 
83
  def get_random_seed() -> int:
84
  return random.randint(0, 2**32 - 1)
 
89
  return float(probe["format"]["duration"])
90
 
91
  def strip_audio_from_video(video_path: str, output_path: str):
92
+ """Write a silent copy of *video_path* to *output_path* (stream-copy, no re-encode)."""
93
+ ffmpeg.input(video_path).output(output_path, vcodec="copy", an=None).run(
94
  overwrite_output=True, quiet=True
95
  )
96
 
97
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
98
+ """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
99
  ffmpeg.output(
100
  ffmpeg.input(silent_video),
101
  ffmpeg.input(audio_path),
102
  output_path,
103
+ vcodec="copy", acodec="aac", strict="experimental",
104
  ).run(overwrite_output=True, quiet=True)
105
 
106
 
 
174
  HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
175
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
176
 
177
+ _TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
178
+ _TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
179
 
180
 
181
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
182
  n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
183
  time_per_seg = num_steps * TARO_SECS_PER_STEP
184
+ max_s = int(600.0 / (n_segs * time_per_seg))
185
  return max(1, min(max_s, MAX_SLOTS))
186
 
187
 
 
287
 
288
  # TARO modules use bare imports (e.g. `from cavp_util import ...`) that
289
  # assume the TARO directory is on sys.path. Add it before importing.
290
+ _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
 
291
  if _taro_dir not in sys.path:
292
  sys.path.insert(0, _taro_dir)
293
 
 
374
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
375
  f"(current constant={TARO_SECS_PER_STEP})")
376
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
377
+ # Evict oldest entries if cache exceeds max size
378
+ while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
379
+ _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
380
 
381
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
382
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
 
421
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
422
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
423
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
424
+ _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
425
+ if _mmaudio_dir not in sys.path:
426
+ sys.path.insert(0, _mmaudio_dir)
 
427
 
428
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
429
  from mmaudio.model.flow_matching import FlowMatching
 
491
 
492
  for seg_i, (seg_start, seg_end) in enumerate(segments):
493
  seg_dur = seg_end - seg_start
494
+ # Trim a clean video clip for this segment (stream-copy, no re-encode)
495
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
496
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
497
+ seg_path, vcodec="copy", an=None
498
  ).run(overwrite_output=True, quiet=True)
499
 
500
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
 
583
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
584
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
585
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
 
586
  # Ensure HunyuanVideo-Foley package is importable
587
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
588
+ if _hf_path not in sys.path:
589
+ sys.path.insert(0, _hf_path)
590
 
591
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
592
  from hunyuanvideo_foley.utils.feature_utils import feature_process
 
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
+ # Pre-extract text features once (same for every segment; stream-copy, no re-encode)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
  ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
+ _dummy_seg_path, vcodec="copy", an=None
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
643
  _dummy_seg_path,
 
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
+ seg_path, vcodec="copy", an=None
660
  ).run(overwrite_output=True, quiet=True)
661
 
662
+ # feature_process returns (visual_feats, text_feats, audio_len).
663
+ # We discard the returned text_feats (_) and use the pre-computed
664
+ # text_feats from above — text encoding runs once, not per segment.
665
  visual_feats, _, seg_audio_len = feature_process(
666
  seg_path,
667
  prompt if prompt else "",