BoxOfColors committed on
Commit
09846c9
·
1 Parent(s): e0917de
Files changed (1) hide show
  1. app.py +25 -22
app.py CHANGED
@@ -9,9 +9,9 @@ Supported models
9
  """
10
 
11
  import os
 
12
  import tempfile
13
  import random
14
- from math import floor
15
  from pathlib import Path
16
 
17
  import time
@@ -79,7 +79,6 @@ def set_global_seed(seed: int):
79
  random.seed(seed)
80
  torch.manual_seed(seed)
81
  torch.cuda.manual_seed(seed)
82
- torch.backends.cudnn.deterministic = True
83
 
84
  def get_random_seed() -> int:
85
  return random.randint(0, 2**32 - 1)
@@ -90,18 +89,18 @@ def get_video_duration(video_path: str) -> float:
90
  return float(probe["format"]["duration"])
91
 
92
  def strip_audio_from_video(video_path: str, output_path: str):
93
- """Write a silent copy of *video_path* to *output_path*."""
94
- ffmpeg.input(video_path).output(output_path, vcodec="libx264", an=None).run(
95
  overwrite_output=True, quiet=True
96
  )
97
 
98
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
99
- """Mux a silent video with an audio file into *output_path*."""
100
  ffmpeg.output(
101
  ffmpeg.input(silent_video),
102
  ffmpeg.input(audio_path),
103
  output_path,
104
- vcodec="libx264", acodec="aac", strict="experimental",
105
  ).run(overwrite_output=True, quiet=True)
106
 
107
 
@@ -175,13 +174,14 @@ HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg
175
  HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
176
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
177
 
178
- _TARO_INFERENCE_CACHE: dict = {}
 
179
 
180
 
181
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
182
  n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
183
  time_per_seg = num_steps * TARO_SECS_PER_STEP
184
- max_s = floor(600.0 / (n_segs * time_per_seg))
185
  return max(1, min(max_s, MAX_SLOTS))
186
 
187
 
@@ -287,8 +287,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
287
 
288
  # TARO modules use bare imports (e.g. `from cavp_util import ...`) that
289
  # assume the TARO directory is on sys.path. Add it before importing.
290
- import sys, os as _os
291
- _taro_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "TARO")
292
  if _taro_dir not in sys.path:
293
  sys.path.insert(0, _taro_dir)
294
 
@@ -375,6 +374,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
375
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
376
  f"(current constant={TARO_SECS_PER_STEP})")
377
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
 
 
 
378
 
379
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
380
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
@@ -419,10 +421,9 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
419
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
420
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
421
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
422
- import sys as _sys, os as _os
423
- _mmaudio_dir = _os.path.join(_os.path.dirname(_os.path.abspath(__file__)), "MMAudio")
424
- if _mmaudio_dir not in _sys.path:
425
- _sys.path.insert(0, _mmaudio_dir)
426
 
427
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
428
  from mmaudio.model.flow_matching import FlowMatching
@@ -490,10 +491,10 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
490
 
491
  for seg_i, (seg_start, seg_end) in enumerate(segments):
492
  seg_dur = seg_end - seg_start
493
- # Trim a clean video clip for this segment
494
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
495
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
496
- seg_path, vcodec="libx264", acodec="aac", strict="experimental"
497
  ).run(overwrite_output=True, quiet=True)
498
 
499
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
@@ -582,11 +583,10 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
582
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
583
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
584
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
585
- import sys as _sys
586
  # Ensure HunyuanVideo-Foley package is importable
587
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
588
- if _hf_path not in _sys.path:
589
- _sys.path.insert(0, _hf_path)
590
 
591
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
592
  from hunyuanvideo_foley.utils.feature_utils import feature_process
@@ -634,10 +634,10 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
- # Pre-encode text features once (same for every segment)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
  ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
- _dummy_seg_path, vcodec="libx264", acodec="aac", strict="experimental"
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
643
  _dummy_seg_path,
@@ -656,9 +656,12 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
- seg_path, vcodec="libx264", acodec="aac", strict="experimental"
660
  ).run(overwrite_output=True, quiet=True)
661
 
 
 
 
662
  visual_feats, _, seg_audio_len = feature_process(
663
  seg_path,
664
  prompt if prompt else "",
 
9
  """
10
 
11
  import os
12
+ import sys
13
  import tempfile
14
  import random
 
15
  from pathlib import Path
16
 
17
  import time
 
79
  random.seed(seed)
80
  torch.manual_seed(seed)
81
  torch.cuda.manual_seed(seed)
 
82
 
83
  def get_random_seed() -> int:
84
  return random.randint(0, 2**32 - 1)
 
89
  return float(probe["format"]["duration"])
90
 
91
  def strip_audio_from_video(video_path: str, output_path: str):
92
+ """Write a silent copy of *video_path* to *output_path* (stream-copy, no re-encode)."""
93
+ ffmpeg.input(video_path).output(output_path, vcodec="copy", an=None).run(
94
  overwrite_output=True, quiet=True
95
  )
96
 
97
  def mux_video_audio(silent_video: str, audio_path: str, output_path: str):
98
+ """Mux a silent video with an audio file into *output_path* (stream-copy video, encode audio)."""
99
  ffmpeg.output(
100
  ffmpeg.input(silent_video),
101
  ffmpeg.input(audio_path),
102
  output_path,
103
+ vcodec="copy", acodec="aac", strict="experimental",
104
  ).run(overwrite_output=True, quiet=True)
105
 
106
 
 
174
  HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
175
  GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
176
 
177
+ _TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
178
+ _TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
179
 
180
 
181
  def _taro_calc_max_samples(total_dur_s: float, num_steps: int, crossfade_s: float) -> int:
182
  n_segs = len(_build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s))
183
  time_per_seg = num_steps * TARO_SECS_PER_STEP
184
+ max_s = int(600.0 / (n_segs * time_per_seg))
185
  return max(1, min(max_s, MAX_SLOTS))
186
 
187
 
 
287
 
288
  # TARO modules use bare imports (e.g. `from cavp_util import ...`) that
289
  # assume the TARO directory is on sys.path. Add it before importing.
290
+ _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
 
291
  if _taro_dir not in sys.path:
292
  sys.path.insert(0, _taro_dir)
293
 
 
374
  f"{_t_infer_elapsed:.1f}s wall → {_secs_per_step:.3f}s/step "
375
  f"(current constant={TARO_SECS_PER_STEP})")
376
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
377
+ # Evict oldest entries if cache exceeds max size
378
+ while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
379
+ _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
380
 
381
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
382
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
 
421
  def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
422
  cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
423
  """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
424
+ _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
425
+ if _mmaudio_dir not in sys.path:
426
+ sys.path.insert(0, _mmaudio_dir)
 
427
 
428
  from mmaudio.eval_utils import all_model_cfg, generate, load_video, make_video
429
  from mmaudio.model.flow_matching import FlowMatching
 
491
 
492
  for seg_i, (seg_start, seg_end) in enumerate(segments):
493
  seg_dur = seg_end - seg_start
494
+ # Trim a clean video clip for this segment (stream-copy, no re-encode)
495
  seg_path = os.path.join(tmp_dir, f"mma_seg_{sample_idx}_{seg_i}.mp4")
496
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
497
+ seg_path, vcodec="copy", an=None
498
  ).run(overwrite_output=True, quiet=True)
499
 
500
  fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
 
583
  def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
584
  guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
585
  """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
 
586
  # Ensure HunyuanVideo-Foley package is importable
587
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
588
+ if _hf_path not in sys.path:
589
+ sys.path.insert(0, _hf_path)
590
 
591
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
592
  from hunyuanvideo_foley.utils.feature_utils import feature_process
 
634
  segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
635
  print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
636
 
637
+ # Pre-extract text features once (same for every segment; stream-copy, no re-encode)
638
  _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
639
  ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
640
+ _dummy_seg_path, vcodec="copy", an=None
641
  ).run(overwrite_output=True, quiet=True)
642
  _, text_feats, _ = feature_process(
643
  _dummy_seg_path,
 
656
  seg_dur = seg_end - seg_start
657
  seg_path = os.path.join(tmp_dir, f"seg_{sample_idx}_{seg_i}.mp4")
658
  ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
659
+ seg_path, vcodec="copy", an=None
660
  ).run(overwrite_output=True, quiet=True)
661
 
662
+ # feature_process returns (visual_feats, text_feats, audio_len).
663
+ # We discard the returned text_feats (_) and use the pre-computed
664
+ # text_feats from above — text encoding runs once, not per segment.
665
  visual_feats, _, seg_audio_len = feature_process(
666
  seg_path,
667
  prompt if prompt else "",