Spaces:
Running on Zero
Running on Zero
Commit ·
09846c9
1
Parent(s): e0917de
app.py
CHANGED
|
@@ -9,9 +9,9 @@ Supported models
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
|
|
|
| 12 |
import tempfile
|
| 13 |
import random
|
| 14 |
-
from math import floor
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
import time
|
|
@@ -79,7 +79,6 @@ def set_global_seed(seed: int):
|
|
| 79 |
random.seed(seed)
|
| 80 |
torch.manual_seed(seed)
|
| 81 |
torch.cuda.manual_seed(seed)
|
| 82 |
-
torch.backends.cudnn.deterministic = True
|
| 83 |
|
| 84 |
def get_random_seed() -> int:
    """Draw a random seed in the inclusive range [0, 2**32 - 1]."""
    # 0xFFFFFFFF == 2**32 - 1, the largest unsigned 32-bit value.
    return random.randint(0, 0xFFFFFFFF)
|
|
@@ -90,18 +89,18 @@ def get_video_duration(video_path: str) -> float:
|
|
| 90 |
return float(probe["format"]["duration"])
|
| 91 |
|
| 92 |
def strip_audio_from_video(video_path: str, output_path: str):
|
| 93 |
-
"""Write a silent copy of *video_path* to *output_path*."""
|
| 94 |
-
ffmpeg.input(video_path).output(output_path, vcodec="
|
| 95 |
overwrite_output=True, quiet=True
|
| 96 |
)
|
| 97 |
|
| 98 |
def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
|
| 99 |
-
"""Mux a silent video with an audio file into *output_path*."""
|
| 100 |
ffmpeg.output(
|
| 101 |
ffmpeg.input(silent_video),
|
| 102 |
ffmpeg.input(audio_path),
|
| 103 |
output_path,
|
| 104 |
-
vcodec="
|
| 105 |
).run(overwrite_output=True, quiet=True)
|
| 106 |
|
| 107 |
|
|
@@ -175,13 +174,14 @@ HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg
|
|
| 175 |
HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
|
| 176 |
GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
|
| 177 |
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
|
| 182 |
n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
|
| 183 |
time_per_seg = num_steps * TARO_SECS_PER_STEP
|
| 184 |
-
max_s =
|
| 185 |
return max(1, min(max_s, MAX_SLOTS))
|
| 186 |
|
| 187 |
|
|
@@ -287,8 +287,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 287 |
|
| 288 |
# TARO modules use bare imports (e.g. `from cavp_util import ...`) that
|
| 289 |
# assume the TARO directory is on sys.path. Add it before importing.
|
| 290 |
-
|
| 291 |
-
_taro_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "TARO")
|
| 292 |
if _taro_dir not in sys.path:
|
| 293 |
sys.path.insert(0, _taro_dir)
|
| 294 |
|
|
@@ -375,6 +374,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 375 |
f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
|
| 376 |
f"(current constant={TARO_SECS_PER_STEP})")
|
| 377 |
_TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
| 380 |
audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
|
|
@@ -419,10 +421,9 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 419 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 420 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
| 421 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 422 |
-
|
| 423 |
-
_mmaudio_dir
|
| 424 |
-
|
| 425 |
-
_sys.path.insert(0, _mmaudio_dir)
|
| 426 |
|
| 427 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 428 |
from mmaudio.model.flow_matching import FlowMatching
|
|
@@ -490,10 +491,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 490 |
|
| 491 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 492 |
seg_dur = seg_end - seg_start
|
| 493 |
-
# Trim a clean video clip for this segment
|
| 494 |
seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
|
| 495 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 496 |
-
seg_path, vcodec="
|
| 497 |
).run(overwrite_output=True, quiet=True)
|
| 498 |
|
| 499 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
|
@@ -582,11 +583,10 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
|
|
| 582 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 583 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 584 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
| 585 |
-
import sys as _sys
|
| 586 |
# Ensure HunyuanVideo-Foley package is importable
|
| 587 |
_hf_path = str(Path("HunyuanVideo-Foley").resolve())
|
| 588 |
-
if _hf_path not in
|
| 589 |
-
|
| 590 |
|
| 591 |
from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
|
| 592 |
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
|
@@ -634,10 +634,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 634 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 635 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 636 |
|
| 637 |
-
# Pre-
|
| 638 |
_dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
|
| 639 |
ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
|
| 640 |
-
_dummy_seg_path, vcodec="
|
| 641 |
).run(overwrite_output=True, quiet=True)
|
| 642 |
_, text_feats, _ = feature_process(
|
| 643 |
_dummy_seg_path,
|
|
@@ -656,9 +656,12 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 656 |
seg_dur = seg_end - seg_start
|
| 657 |
seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
|
| 658 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 659 |
-
seg_path, vcodec="
|
| 660 |
).run(overwrite_output=True, quiet=True)
|
| 661 |
|
|
|
|
|
|
|
|
|
|
| 662 |
visual_feats, _, seg_audio_len = feature_process(
|
| 663 |
seg_path,
|
| 664 |
prompt if prompt else "",
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
| 12 |
+
import sys
|
| 13 |
import tempfile
|
| 14 |
import random
|
|
|
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
import time
|
|
|
|
| 79 |
random.seed(seed)
|
| 80 |
torch.manual_seed(seed)
|
| 81 |
torch.cuda.manual_seed(seed)
|
|
|
|
| 82 |
|
| 83 |
def get_random_seed() -> int:
    """Draw a random seed in the inclusive range [0, 2**32 - 1]."""
    # 0xFFFFFFFF == 2**32 - 1, the largest unsigned 32-bit value.
    return random.randint(0, 0xFFFFFFFF)
|
|
|
|
| 89 |
return float(probe["format"]["duration"])
|
| 90 |
|
| 91 |
def strip_audio_from_video(video_path: str, output_path: str):
    """Write an audio-free copy of *video_path* to *output_path*.

    The video stream is passed through with ``vcodec="copy"`` (no re-encode)
    and the ``an`` flag drops the audio. An existing *output_path* is
    overwritten.
    """
    source = ffmpeg.input(video_path)
    silent_copy = source.output(output_path, vcodec="copy", an=None)
    silent_copy.run(overwrite_output=True, quiet=True)
|
| 96 |
|
| 97 |
def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
    """Combine *silent_video* and *audio_path* into *output_path*.

    The video stream is copied as-is (``vcodec="copy"``) while the audio is
    encoded with the ``aac`` codec. An existing *output_path* is overwritten.
    """
    video_in = ffmpeg.input(silent_video)
    audio_in = ffmpeg.input(audio_path)
    muxed = ffmpeg.output(
        video_in,
        audio_in,
        output_path,
        vcodec="copy",
        acodec="aac",
        strict="experimental",
    )
    muxed.run(overwrite_output=True, quiet=True)
|
| 105 |
|
| 106 |
|
|
|
|
| 174 |
HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
|
| 175 |
GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
|
| 176 |
|
| 177 |
+
_TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
|
| 178 |
+
_TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
|
| 179 |
|
| 180 |
|
| 181 |
def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
    """Return how many TARO samples fit into a 600-second compute budget.

    The video is split with ``_build_segments`` and each segment is costed at
    ``num_steps * TARO_SECS_PER_STEP`` seconds; one sample costs that times
    the number of segments.

    Args:
        total_dur_s: Total video duration in seconds.
        num_steps: Number of inference steps per segment.
        crossfade_s: Crossfade length in seconds, forwarded to
            ``_build_segments``.

    Returns:
        The sample count, clamped to at least 1 and at most ``MAX_SLOTS``.
    """
    n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
    time_per_seg = num_steps * TARO_SECS_PER_STEP
    time_per_sample = n_segs * time_per_seg
    if time_per_sample <= 0:
        # Zero segments or zero steps would divide by zero below. Fall back to
        # the minimum, which is also what a negative cost clamps to.
        return 1
    # NOTE(review): the 600 s budget is a literal and differs from
    # GPU_DURATION_CAP — confirm this is intentional.
    max_s = int(600.0 / time_per_sample)
    return max(1, min(max_s, MAX_SLOTS))
|
| 186 |
|
| 187 |
|
|
|
|
| 287 |
|
| 288 |
# TARO modules use bare imports (e.g. `from cavp_util import ...`) that
|
| 289 |
# assume the TARO directory is on sys.path. Add it before importing.
|
| 290 |
+
_taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
|
|
|
|
| 291 |
if _taro_dir not in sys.path:
|
| 292 |
sys.path.insert(0, _taro_dir)
|
| 293 |
|
|
|
|
| 374 |
f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
|
| 375 |
f"(current constant={TARO_SECS_PER_STEP})")
|
| 376 |
_TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
|
| 377 |
+
# Evict oldest entries if cache exceeds max size
|
| 378 |
+
while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
|
| 379 |
+
_TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
|
| 380 |
|
| 381 |
final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
|
| 382 |
audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
|
|
|
|
| 421 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 422 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
| 423 |
"""MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
|
| 424 |
+
_mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
|
| 425 |
+
if _mmaudio_dir not in sys.path:
|
| 426 |
+
sys.path.insert(0, _mmaudio_dir)
|
|
|
|
| 427 |
|
| 428 |
from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
|
| 429 |
from mmaudio.model.flow_matching import FlowMatching
|
|
|
|
| 491 |
|
| 492 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 493 |
seg_dur = seg_end - seg_start
|
| 494 |
+
# Trim a clean video clip for this segment (stream-copy, no re-encode)
|
| 495 |
seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
|
| 496 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 497 |
+
seg_path, vcodec="copy", an=None
|
| 498 |
).run(overwrite_output=True, quiet=True)
|
| 499 |
|
| 500 |
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
|
|
|
| 583 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 584 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
| 585 |
"""HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
|
|
|
|
| 586 |
# Ensure HunyuanVideo-Foley package is importable
|
| 587 |
_hf_path = str(Path("HunyuanVideo-Foley").resolve())
|
| 588 |
+
if _hf_path not in sys.path:
|
| 589 |
+
sys.path.insert(0, _hf_path)
|
| 590 |
|
| 591 |
from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
|
| 592 |
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
|
|
|
| 634 |
segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
|
| 635 |
print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
|
| 636 |
|
| 637 |
+
# Pre-extract text features once (same for every segment; stream-copy, no re-encode)
|
| 638 |
_dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
|
| 639 |
ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
|
| 640 |
+
_dummy_seg_path, vcodec="copy", an=None
|
| 641 |
).run(overwrite_output=True, quiet=True)
|
| 642 |
_, text_feats, _ = feature_process(
|
| 643 |
_dummy_seg_path,
|
|
|
|
| 656 |
seg_dur = seg_end - seg_start
|
| 657 |
seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
|
| 658 |
ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
|
| 659 |
+
seg_path, vcodec="copy", an=None
|
| 660 |
).run(overwrite_output=True, quiet=True)
|
| 661 |
|
| 662 |
+
# feature_process returns (visual_feats, text_feats, audio_len).
|
| 663 |
+
# We discard the returned text_feats (_) and use the pre-computed
|
| 664 |
+
# text_feats from above — text encoding runs once, not per segment.
|
| 665 |
visual_feats, _, seg_audio_len = feature_process(
|
| 666 |
seg_path,
|
| 667 |
prompt if prompt else "",
|