BoxOfColors committed on
Commit
8185e9f
·
1 Parent(s): 36c1b45

Add open_clip dep; remove tab descriptions; fix hunyuan .pt→.pth rename

Browse files
Files changed (2) hide show
  1. app.py +6 -22
  2. requirements.txt +1 -0
app.py CHANGED
@@ -57,7 +57,12 @@ HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
57
  HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
58
 
59
  print("Downloading HunyuanVideoFoley checkpoints…")
60
- hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 
 
 
 
 
61
  hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
62
  hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
63
  print("HunyuanVideoFoley checkpoints downloaded.")
@@ -586,12 +591,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
586
  # Tab 1 — TARO #
587
  # ---------------------------------------------------------- #
588
  with gr.Tab("TARO"):
589
- gr.Markdown(
590
- "**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed — "
591
- "sound is derived entirely from visual motion. "
592
- "Best for scenes with clear physics-driven events: footsteps, impacts, splashing water, "
593
- "crackling fire, rustling leaves, machinery. The model learns timing directly from the video."
594
- )
595
  with gr.Row():
596
  with gr.Column():
597
  taro_video = gr.Video(label="Input Video")
@@ -645,13 +644,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
645
  # Tab 2 — MMAudio #
646
  # ---------------------------------------------------------- #
647
  with gr.Tab("MMAudio"):
648
- gr.Markdown(
649
- "**MMAudio** — Multimodal flow-matching (CVPR 2025). "
650
- "Combines visual grounding with optional text guidance, making it the most flexible choice. "
651
- "Best for mixed or ambiguous scenes — busy environments, nature montages, abstract visuals — "
652
- "where a short prompt lets you steer which element of the scene to emphasise "
653
- "(e.g. *'heavy rain'* over a street scene to suppress traffic noise)."
654
- )
655
  with gr.Row():
656
  with gr.Column():
657
  mma_video = gr.Video(label="Input Video")
@@ -699,14 +691,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
699
  # Tab 3 — HunyuanVideoFoley #
700
  # ---------------------------------------------------------- #
701
  with gr.Tab("HunyuanFoley"):
702
- gr.Markdown(
703
- "**HunyuanVideo-Foley** (Tencent Hunyuan, 2025). "
704
- "Highest-fidelity model for cinematic and creative foley. "
705
- "Best for scenes that call for rich, layered sound design — dramatic SFX, "
706
- "complex environments (crowd + rain + distant thunder), or any clip where you have "
707
- "a clear creative vision you can describe in a prompt. "
708
- "Requires a text prompt."
709
- )
710
  with gr.Row():
711
  with gr.Column():
712
  hf_video = gr.Video(label="Input Video")
 
57
  HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
58
 
59
  print("Downloading HunyuanVideoFoley checkpoints…")
60
+ _hf_raw = hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pt", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
61
+ # model_utils.py expects the file named .pth — copy the downloaded .pt under that name if it is missing
62
+ _hf_pth = Path(_hf_raw).parent / "hunyuanvideo_foley.pth"
63
+ if not _hf_pth.exists():
64
+ import shutil as _shutil
65
+ _shutil.copy2(_hf_raw, _hf_pth)
66
  hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
67
  hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
68
  print("HunyuanVideoFoley checkpoints downloaded.")
 
591
  # Tab 1 — TARO #
592
  # ---------------------------------------------------------- #
593
  with gr.Tab("TARO"):
 
 
 
 
 
 
594
  with gr.Row():
595
  with gr.Column():
596
  taro_video = gr.Video(label="Input Video")
 
644
  # Tab 2 — MMAudio #
645
  # ---------------------------------------------------------- #
646
  with gr.Tab("MMAudio"):
 
 
 
 
 
 
 
647
  with gr.Row():
648
  with gr.Column():
649
  mma_video = gr.Video(label="Input Video")
 
691
  # Tab 3 — HunyuanVideoFoley #
692
  # ---------------------------------------------------------- #
693
  with gr.Tab("HunyuanFoley"):
 
 
 
 
 
 
 
 
694
  with gr.Row():
695
  with gr.Column():
696
  hf_video = gr.Video(label="Input Video")
requirements.txt CHANGED
@@ -19,6 +19,7 @@ av
19
  colorlog
20
  loguru
21
  torchdiffeq
 
22
  git+https://github.com/descriptinc/audiotools
23
  --extra-index-url https://download.pytorch.org/whl/cu124
24
  torchaudio==2.5.1+cu124
 
19
  colorlog
20
  loguru
21
  torchdiffeq
22
+ open_clip_torch
23
  git+https://github.com/descriptinc/audiotools
24
  --extra-index-url https://download.pytorch.org/whl/cu124
25
  torchaudio==2.5.1+cu124