Spaces:
Running on Zero
Running on Zero
Commit ·
8185e9f
1
Parent(s): 36c1b45
Add open_clip dep; remove tab descriptions; fix hunyuan .pt→.pth rename
Browse files
- app.py +6 -22
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -57,7 +57,12 @@ HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
|
|
| 57 |
HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 58 |
|
| 59 |
print("Downloading HunyuanVideoFoley checkpoints…")
|
| 60 |
-
hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 62 |
hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 63 |
print("HunyuanVideoFoley checkpoints downloaded.")
|
|
@@ -586,12 +591,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 586 |
# Tab 1 — TARO #
|
| 587 |
# ---------------------------------------------------------- #
|
| 588 |
with gr.Tab("TARO"):
|
| 589 |
-
gr.Markdown(
|
| 590 |
-
"**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed — "
|
| 591 |
-
"sound is derived entirely from visual motion. "
|
| 592 |
-
"Best for scenes with clear physics-driven events: footsteps, impacts, splashing water, "
|
| 593 |
-
"crackling fire, rustling leaves, machinery. The model learns timing directly from the video."
|
| 594 |
-
)
|
| 595 |
with gr.Row():
|
| 596 |
with gr.Column():
|
| 597 |
taro_video = gr.Video(label="Input Video")
|
|
@@ -645,13 +644,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 645 |
# Tab 2 — MMAudio #
|
| 646 |
# ---------------------------------------------------------- #
|
| 647 |
with gr.Tab("MMAudio"):
|
| 648 |
-
gr.Markdown(
|
| 649 |
-
"**MMAudio** — Multimodal flow-matching (CVPR 2025). "
|
| 650 |
-
"Combines visual grounding with optional text guidance, making it the most flexible choice. "
|
| 651 |
-
"Best for mixed or ambiguous scenes — busy environments, nature montages, abstract visuals — "
|
| 652 |
-
"where a short prompt lets you steer which element of the scene to emphasise "
|
| 653 |
-
"(e.g. *'heavy rain'* over a street scene to suppress traffic noise)."
|
| 654 |
-
)
|
| 655 |
with gr.Row():
|
| 656 |
with gr.Column():
|
| 657 |
mma_video = gr.Video(label="Input Video")
|
|
@@ -699,14 +691,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 699 |
# Tab 3 — HunyuanVideoFoley #
|
| 700 |
# ---------------------------------------------------------- #
|
| 701 |
with gr.Tab("HunyuanFoley"):
|
| 702 |
-
gr.Markdown(
|
| 703 |
-
"**HunyuanVideo-Foley** (Tencent Hunyuan, 2025). "
|
| 704 |
-
"Highest-fidelity model for cinematic and creative foley. "
|
| 705 |
-
"Best for scenes that call for rich, layered sound design — dramatic SFX, "
|
| 706 |
-
"complex environments (crowd + rain + distant thunder), or any clip where you have "
|
| 707 |
-
"a clear creative vision you can describe in a prompt. "
|
| 708 |
-
"Requires a text prompt."
|
| 709 |
-
)
|
| 710 |
with gr.Row():
|
| 711 |
with gr.Column():
|
| 712 |
hf_video = gr.Video(label="Input Video")
|
|
|
|
| 57 |
HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 58 |
|
| 59 |
print("Downloading HunyuanVideoFoley checkpoints…")
|
| 60 |
+
_hf_raw = hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pt", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 61 |
+
# model_utils.py expects the file named .pth — copy it under that name if it is missing
|
| 62 |
+
_hf_pth = Path(_hf_raw).parent / "hunyuanvideo_foley.pth"
|
| 63 |
+
if not _hf_pth.exists():
|
| 64 |
+
import shutil as _shutil
|
| 65 |
+
_shutil.copy2(_hf_raw, _hf_pth)
|
| 66 |
hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 67 |
hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 68 |
print("HunyuanVideoFoley checkpoints downloaded.")
|
|
|
|
| 591 |
# Tab 1 — TARO #
|
| 592 |
# ---------------------------------------------------------- #
|
| 593 |
with gr.Tab("TARO"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
with gr.Row():
|
| 595 |
with gr.Column():
|
| 596 |
taro_video = gr.Video(label="Input Video")
|
|
|
|
| 644 |
# Tab 2 — MMAudio #
|
| 645 |
# ---------------------------------------------------------- #
|
| 646 |
with gr.Tab("MMAudio"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
with gr.Row():
|
| 648 |
with gr.Column():
|
| 649 |
mma_video = gr.Video(label="Input Video")
|
|
|
|
| 691 |
# Tab 3 — HunyuanVideoFoley #
|
| 692 |
# ---------------------------------------------------------- #
|
| 693 |
with gr.Tab("HunyuanFoley"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
with gr.Row():
|
| 695 |
with gr.Column():
|
| 696 |
hf_video = gr.Video(label="Input Video")
|
requirements.txt
CHANGED
|
@@ -19,6 +19,7 @@ av
|
|
| 19 |
colorlog
|
| 20 |
loguru
|
| 21 |
torchdiffeq
|
|
|
|
| 22 |
git+https://github.com/descriptinc/audiotools
|
| 23 |
--extra-index-url https://download.pytorch.org/whl/cu124
|
| 24 |
torchaudio==2.5.1+cu124
|
|
|
|
| 19 |
colorlog
|
| 20 |
loguru
|
| 21 |
torchdiffeq
|
| 22 |
+
open_clip_torch
|
| 23 |
git+https://github.com/descriptinc/audiotools
|
| 24 |
--extra-index-url https://download.pytorch.org/whl/cu124
|
| 25 |
torchaudio==2.5.1+cu124
|