Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

App Files Files Community

BoxOfColors committed 6 days ago

Commit

8185e9f

1 Parent(s): 36c1b45

Add open_clip dep; remove tab descriptions; fix hunyuan .pt→.pth rename

Browse files

Files changed (2) hide show

app.py +6 -22
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -57,7 +57,12 @@ HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
 HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
 print("Downloading HunyuanVideoFoley checkpoints…")
-hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pth",      cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth",            cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth",  cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 print("HunyuanVideoFoley checkpoints downloaded.")
@@ -586,12 +591,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
         # Tab 1 — TARO                                                #
         # ---------------------------------------------------------- #
         with gr.Tab("TARO"):
-            gr.Markdown(
-                "**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed — "
-                "sound is derived entirely from visual motion. "
-                "Best for scenes with clear physics-driven events: footsteps, impacts, splashing water, "
-                "crackling fire, rustling leaves, machinery. The model learns timing directly from the video."
-            )
             with gr.Row():
                 with gr.Column():
                     taro_video   = gr.Video(label="Input Video")
@@ -645,13 +644,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
         # Tab 2 — MMAudio                                             #
         # ---------------------------------------------------------- #
         with gr.Tab("MMAudio"):
-            gr.Markdown(
-                "**MMAudio** — Multimodal flow-matching (CVPR 2025). "
-                "Combines visual grounding with optional text guidance, making it the most flexible choice. "
-                "Best for mixed or ambiguous scenes — busy environments, nature montages, abstract visuals — "
-                "where a short prompt lets you steer which element of the scene to emphasise "
-                "(e.g. *'heavy rain'* over a street scene to suppress traffic noise)."
-            )
             with gr.Row():
                 with gr.Column():
                     mma_video    = gr.Video(label="Input Video")
@@ -699,14 +691,6 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
         # Tab 3 — HunyuanVideoFoley                                   #
         # ---------------------------------------------------------- #
         with gr.Tab("HunyuanFoley"):
-            gr.Markdown(
-                "**HunyuanVideo-Foley** (Tencent Hunyuan, 2025). "
-                "Highest-fidelity model for cinematic and creative foley. "
-                "Best for scenes that call for rich, layered sound design — dramatic SFX, "
-                "complex environments (crowd + rain + distant thunder), or any clip where you have "
-                "a clear creative vision you can describe in a prompt. "
-                "Requires a text prompt."
-            )
             with gr.Row():
                 with gr.Column():
                     hf_video    = gr.Video(label="Input Video")

 HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
 print("Downloading HunyuanVideoFoley checkpoints…")
+_hf_raw = hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/hunyuanvideo_foley.pt",  cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
+# model_utils.py expects the file named .pth — copy the downloaded .pt under that name if it is missing
+_hf_pth = Path(_hf_raw).parent / "hunyuanvideo_foley.pth"
+if not _hf_pth.exists():
+    import shutil as _shutil
+    _shutil.copy2(_hf_raw, _hf_pth)
 hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/vae_128d_48k.pth",            cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 hf_hub_download(repo_id=CKPT_REPO_ID, filename="HunyuanVideo-Foley/synchformer_state_dict.pth",  cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
 print("HunyuanVideoFoley checkpoints downloaded.")
         # Tab 1 — TARO                                                #
         # ---------------------------------------------------------- #
         with gr.Tab("TARO"):
             with gr.Row():
                 with gr.Column():
                     taro_video   = gr.Video(label="Input Video")
         # Tab 2 — MMAudio                                             #
         # ---------------------------------------------------------- #
         with gr.Tab("MMAudio"):
             with gr.Row():
                 with gr.Column():
                     mma_video    = gr.Video(label="Input Video")
         # Tab 3 — HunyuanVideoFoley                                   #
         # ---------------------------------------------------------- #
         with gr.Tab("HunyuanFoley"):
             with gr.Row():
                 with gr.Column():
                     hf_video    = gr.Video(label="Input Video")

requirements.txt CHANGED Viewed

@@ -19,6 +19,7 @@ av
 colorlog
 loguru
 torchdiffeq
 git+https://github.com/descriptinc/audiotools
 --extra-index-url https://download.pytorch.org/whl/cu124
 torchaudio==2.5.1+cu124

 colorlog
 loguru
 torchdiffeq
+open_clip_torch
 git+https://github.com/descriptinc/audiotools
 --extra-index-url https://download.pytorch.org/whl/cu124
 torchaudio==2.5.1+cu124