Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running

App Files Files Community

JackIsNotInTheBox committed on 24 days ago

Commit

2c38be2

verified ·

1 Parent(s): f012a29

Reroute encoders + MMAudio downloads through user-controlled mirror

Browse files

app.py: replace direct upstream snapshot_download() for siglip2/clap/clip/audioldm2/bigvgan with downloads from JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints/encoders/<orig>, then symlink into HF cache under the upstream repo IDs so existing from_pretrained() calls resolve from the mirror. Set HF_HUB_OFFLINE=1 after the parallel download phase to force any later HF Hub call (CLAP, AudioLDM2, MMAudio open_clip 'hf-hub:apple/...') to be cache-only.

MMAudio/mmaudio/utils/download_utils.py: hardcoded huggingface.co/hkchengrex/MMAudio + github.com/hkchengrex/MMAudio/releases URLs repointed at upstream/MMAudio/{weights,ext_weights}/ in the mirror. Dormant under normal startup, live if MMAudio's download_model_if_needed() is ever invoked.

Result: full upstream-protection mode. Space pulls all 11 main weights + 5 encoder/vocoder repos from the mirror with no upstream calls at runtime.

Files changed (2) hide show

MMAudio/mmaudio/utils/download_utils.py +16 -10
app.py +67 -20

MMAudio/mmaudio/utils/download_utils.py CHANGED Viewed

@@ -7,51 +7,57 @@ from tqdm import tqdm
 log = logging.getLogger()
 links = [
     {
         'name': 'mmaudio_small_16k.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth',
         'md5': 'af93cde404179f58e3919ac085b8033b',
     },
     {
         'name': 'mmaudio_small_44k.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth',
         'md5': 'babd74c884783d13701ea2820a5f5b6d',
     },
     {
         'name': 'mmaudio_medium_44k.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth',
         'md5': '5a56b6665e45a1e65ada534defa903d0',
     },
     {
         'name': 'mmaudio_large_44k.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth',
         'md5': 'fed96c325a6785b85ce75ae1aafd2673'
     },
     {
         'name': 'mmaudio_large_44k_v2.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
         'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
     },
     {
         'name': 'v1-16.pth',
-        'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth',
         'md5': '69f56803f59a549a1a507c93859fd4d7'
     },
     {
         'name': 'best_netG.pt',
-        'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt',
         'md5': 'eeaf372a38a9c31c362120aba2dde292'
     },
     {
         'name': 'v1-44.pth',
-        'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth',
         'md5': 'fab020275fa44c6589820ce025191600'
     },
     {
         'name': 'synchformer_state_dict.pth',
-        'url':
-        'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth',
         'md5': '5b2f5594b0730f70e41e549b7c94390c'
     },
 ]

 log = logging.getLogger()
+# Every download URL points into the user-controlled mirror
+# JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints under
+# upstream/MMAudio/, a copy of hkchengrex/MMAudio at sha eb13a1a9. Files that
+# were originally published as GitHub release assets (v1-16.pth etc.) sit in
+# that copy's ext_weights/ folder, so one prefix covers both groups.
+_MIRROR = ("https://huggingface.co/JackIsNotInTheBox/"
+           "Generate_Audio_for_Video_Checkpoints/resolve/main/upstream/MMAudio")
+
+# (mirror folder, file name, md5 hex digest), in the order `links` exposes them.
+_MIRRORED_FILES = (
+    ('weights', 'mmaudio_small_16k.pth', 'af93cde404179f58e3919ac085b8033b'),
+    ('weights', 'mmaudio_small_44k.pth', 'babd74c884783d13701ea2820a5f5b6d'),
+    ('weights', 'mmaudio_medium_44k.pth', '5a56b6665e45a1e65ada534defa903d0'),
+    ('weights', 'mmaudio_large_44k.pth', 'fed96c325a6785b85ce75ae1aafd2673'),
+    ('weights', 'mmaudio_large_44k_v2.pth', '01ad4464f049b2d7efdaa4c1a59b8dfe'),
+    ('ext_weights', 'v1-16.pth', '69f56803f59a549a1a507c93859fd4d7'),
+    ('ext_weights', 'best_netG.pt', 'eeaf372a38a9c31c362120aba2dde292'),
+    ('ext_weights', 'v1-44.pth', 'fab020275fa44c6589820ce025191600'),
+    ('ext_weights', 'synchformer_state_dict.pth', '5b2f5594b0730f70e41e549b7c94390c'),
+)
+
+# One dict per downloadable file, with keys 'name', 'url' and 'md5'.
+links = [
+    {
+        'name': file_name,
+        'url': f'{_MIRROR}/{folder}/{file_name}',
+        'md5': digest,
+    }
+    for folder, file_name, digest in _MIRRORED_FILES
+]

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ import torchaudio
 import ffmpeg
 import spaces
 import gradio as gr
-from huggingface_hub import hf_hub_download, snapshot_download
 # ================================================================== #
 #                     CHECKPOINT CONFIGURATION                        #
@@ -82,34 +82,74 @@ def _dl_hunyuan():
                     cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
     print("HunyuanVideoFoley checkpoints downloaded.")
 def _dl_siglip2():
-    """Pre-download SigLIP2 (~1.5 GB) used by HunyuanFoley's visual encoder."""
-    snapshot_download(repo_id="google/siglip2-base-patch16-512")
-    print("SigLIP2 pre-downloaded.")
 def _dl_clap():
-    """Pre-download CLAP so from_pretrained() hits local cache inside the ZeroGPU worker."""
-    snapshot_download(repo_id="laion/larger_clap_general")
-    print("CLAP model pre-downloaded.")
 def _dl_clip():
-    """Pre-download MMAudio's CLIP model (~3.95 GB) to avoid GPU-window budget drain."""
-    snapshot_download(repo_id="apple/DFN5B-CLIP-ViT-H-14-384")
-    print("MMAudio CLIP model pre-downloaded.")
 def _dl_audioldm2():
-    """Pre-download AudioLDM2 VAE/vocoder used by TARO's from_pretrained() calls."""
-    snapshot_download(repo_id="cvssp/audioldm2")
-    print("AudioLDM2 pre-downloaded.")
 def _dl_bigvgan():
-    """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
-    Returns the local snapshot directory so _load_mmaudio_models can pass it
-    to BigVGANv2.from_pretrained() as a local path, avoiding a network hit
-    inside the ZeroGPU worker."""
-    local_dir = snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
-    print(f"BigVGAN vocoder pre-downloaded to {local_dir}.")
-    return local_dir
 print("[startup] Starting parallel checkpoint + model downloads…")
 _t_dl_start = time.perf_counter()
@@ -133,6 +173,13 @@ mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.re
 bigvgan_local_dir = _fut_bigvgan.result()
 print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
 # ================================================================== #
 #                     SHARED CONSTANTS / HELPERS                      #
 # ================================================================== #

 import ffmpeg
 import spaces
 import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download
 # ================================================================== #
 #                     CHECKPOINT CONFIGURATION                        #
                     cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
     print("HunyuanVideoFoley checkpoints downloaded.")
+def _populate_hf_cache_from_mirror(orig_repo_id, mirror_subpath):
+    """Expose a mirrored encoder under its upstream repo id in the HF cache.
+
+    Downloads only `mirror_subpath` from CKPT_REPO_ID, then points
+    `<hub cache>/models--<org>--<name>/snapshots/mirror-snapshot` at the
+    downloaded folder and writes `refs/main`, so that existing
+    from_pretrained() calls keyed by `orig_repo_id` resolve to the mirrored
+    files without networking upstream.
+
+    Args:
+        orig_repo_id: Upstream repo id ("org/name") used by existing calls.
+        mirror_subpath: Folder inside CKPT_REPO_ID holding the mirrored copy.
+
+    Returns:
+        The snapshot path (a symlink to the mirrored folder) as a string.
+
+    Raises:
+        FileNotFoundError: If the download produced no `mirror_subpath`
+            folder, e.g. because the pattern matched nothing in the mirror.
+    """
+    # Local import keeps this function independent of the file's import list.
+    from huggingface_hub import constants as hf_constants
+
+    sha = "mirror-snapshot"  # placeholder revision name, not a real commit sha
+    mirror_root = snapshot_download(
+        repo_id=CKPT_REPO_ID,
+        allow_patterns=[f"{mirror_subpath}/**"],
+    )
+    src = Path(mirror_root) / mirror_subpath
+    if not src.is_dir():
+        # Fail at startup instead of leaving a dangling symlink that would
+        # only surface later as a cache miss.
+        raise FileNotFoundError(
+            f"Mirror folder '{mirror_subpath}' not found in {CKPT_REPO_ID} "
+            f"(looked in {mirror_root})"
+        )
+
+    # Use the cache directory huggingface_hub itself resolved: it honours
+    # HF_HUB_CACHE, HF_HOME and XDG_CACHE_HOME. Rebuilding the path by hand
+    # from HF_HUB_CACHE alone writes the alias where the library never looks
+    # whenever only HF_HOME is set.
+    cache_root = Path(hf_constants.HF_HUB_CACHE)
+    repo_cache = cache_root / f"models--{orig_repo_id.replace('/', '--')}"
+    snap = repo_cache / "snapshots" / sha
+    refs = repo_cache / "refs"
+    snap.parent.mkdir(parents=True, exist_ok=True)
+    refs.mkdir(parents=True, exist_ok=True)
+
+    # Drop whatever an earlier run left behind. is_symlink() is tested first
+    # because exists() reports False for a dangling link.
+    if snap.is_symlink():
+        snap.unlink()
+    elif snap.exists():
+        shutil.rmtree(snap)
+    snap.symlink_to(src, target_is_directory=True)
+    # refs/main names the snapshot folder that lookups for "main" resolve to.
+    (refs / "main").write_text(sha)
+    return str(snap)
 def _dl_siglip2():
+    """Fetch SigLIP2 from the mirror and alias it under its upstream repo id."""
+    snapshot_path = _populate_hf_cache_from_mirror(
+        orig_repo_id="google/siglip2-base-patch16-512",
+        mirror_subpath="encoders/google/siglip2-base-patch16-512",
+    )
+    print(f"SigLIP2 pre-downloaded from mirror -> {snapshot_path}")
 def _dl_clap():
+    """Fetch CLAP from the mirror and alias it under its upstream repo id."""
+    snapshot_path = _populate_hf_cache_from_mirror(
+        orig_repo_id="laion/larger_clap_general",
+        mirror_subpath="encoders/laion/larger_clap_general",
+    )
+    print(f"CLAP pre-downloaded from mirror -> {snapshot_path}")
 def _dl_clip():
+    """Fetch MMAudio's CLIP model from the mirror into the HF cache.
+
+    The mirror stores the model under its canonical '-378' name, while the
+    Space's 'hf-hub:apple/...-384' calls use the legacy '-384' alias (which
+    upstream redirects to '-378'). The cache entry is therefore keyed under
+    the alias so those calls resolve.
+    """
+    snapshot_path = _populate_hf_cache_from_mirror(
+        orig_repo_id="apple/DFN5B-CLIP-ViT-H-14-384",
+        mirror_subpath="encoders/apple/DFN5B-CLIP-ViT-H-14-378",
+    )
+    print(f"MMAudio CLIP pre-downloaded from mirror -> {snapshot_path}")
 def _dl_audioldm2():
+    """Fetch AudioLDM2 from the mirror and alias it under its upstream repo id."""
+    snapshot_path = _populate_hf_cache_from_mirror(
+        orig_repo_id="cvssp/audioldm2",
+        mirror_subpath="encoders/cvssp/audioldm2",
+    )
+    print(f"AudioLDM2 pre-downloaded from mirror -> {snapshot_path}")
 def _dl_bigvgan():
+    """Fetch the BigVGAN vocoder from the mirror and return its local path.
+
+    The snapshot directory is returned so that MMAudio's
+    BigVGANv2.from_pretrained() can be handed a local path rather than
+    reaching out to HF Hub.
+    """
+    snapshot_path = _populate_hf_cache_from_mirror(
+        orig_repo_id="nvidia/bigvgan_v2_44khz_128band_512x",
+        mirror_subpath="encoders/nvidia/bigvgan_v2_44khz_128band_512x",
+    )
+    print(f"BigVGAN pre-downloaded from mirror -> {snapshot_path}")
+    return snapshot_path
 print("[startup] Starting parallel checkpoint + model downloads…")
 _t_dl_start = time.perf_counter()
 bigvgan_local_dir = _fut_bigvgan.result()
 print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
+# Force any later HF Hub call to be cache-only. With every encoder cache
+# pre-populated above from the user's mirror, downstream from_pretrained()
+# calls (CLAP, AudioLDM2, MMAudio's open_clip 'hf-hub:...') resolve from
+# the populated cache instead of networking out to upstream — full
+# upstream-protection mode.
+#
+# The environment variable covers child processes and libraries imported
+# after this point.
+os.environ["HF_HUB_OFFLINE"] = "1"
+# huggingface_hub was imported near the top of this file and evaluates
+# HF_HUB_OFFLINE when it is imported, so the variable alone does not switch
+# the already-loaded library to offline mode. Set its in-process flag too.
+import huggingface_hub.constants as _hf_hub_constants
+_hf_hub_constants.HF_HUB_OFFLINE = True
 # ================================================================== #
 #                     SHARED CONSTANTS / HELPERS                      #
 # ================================================================== #