Reroute encoders + MMAudio downloads through user-controlled mirror
app.py: replace direct upstream snapshot_download() calls for siglip2/clap/clip/audioldm2/bigvgan with downloads from JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints/encoders/<orig>, then symlink them into the HF cache under the upstream repo IDs so existing from_pretrained() calls resolve from the mirror. Set HF_HUB_OFFLINE=1 after the parallel download phase to force any later HF Hub call (CLAP, AudioLDM2, MMAudio open_clip 'hf-hub:apple/...') to be cache-only.
MMAudio/mmaudio/utils/download_utils.py: hardcoded huggingface.co/hkchengrex/MMAudio + github.com/hkchengrex/MMAudio/releases URLs repointed at upstream/MMAudio/{weights,ext_weights}/ in the mirror. Dormant under normal startup, live if MMAudio's download_model_if_needed() is ever invoked.
Result: full upstream-protection mode. Space pulls all 11 main weights + 5 encoder/vocoder repos from the mirror with no upstream calls at runtime.
- MMAudio/mmaudio/utils/download_utils.py +16 -10
- app.py +67 -20
|
@@ -7,51 +7,57 @@ from tqdm import tqdm
|
|
| 7 |
|
| 8 |
log = logging.getLogger()
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
links = [
|
| 11 |
{
|
| 12 |
'name': 'mmaudio_small_16k.pth',
|
| 13 |
-
'url': '
|
| 14 |
'md5': 'af93cde404179f58e3919ac085b8033b',
|
| 15 |
},
|
| 16 |
{
|
| 17 |
'name': 'mmaudio_small_44k.pth',
|
| 18 |
-
'url': '
|
| 19 |
'md5': 'babd74c884783d13701ea2820a5f5b6d',
|
| 20 |
},
|
| 21 |
{
|
| 22 |
'name': 'mmaudio_medium_44k.pth',
|
| 23 |
-
'url': '
|
| 24 |
'md5': '5a56b6665e45a1e65ada534defa903d0',
|
| 25 |
},
|
| 26 |
{
|
| 27 |
'name': 'mmaudio_large_44k.pth',
|
| 28 |
-
'url': '
|
| 29 |
'md5': 'fed96c325a6785b85ce75ae1aafd2673'
|
| 30 |
},
|
| 31 |
{
|
| 32 |
'name': 'mmaudio_large_44k_v2.pth',
|
| 33 |
-
'url': '
|
| 34 |
'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
|
| 35 |
},
|
| 36 |
{
|
| 37 |
'name': 'v1-16.pth',
|
| 38 |
-
'url': '
|
| 39 |
'md5': '69f56803f59a549a1a507c93859fd4d7'
|
| 40 |
},
|
| 41 |
{
|
| 42 |
'name': 'best_netG.pt',
|
| 43 |
-
'url': '
|
| 44 |
'md5': 'eeaf372a38a9c31c362120aba2dde292'
|
| 45 |
},
|
| 46 |
{
|
| 47 |
'name': 'v1-44.pth',
|
| 48 |
-
'url': '
|
| 49 |
'md5': 'fab020275fa44c6589820ce025191600'
|
| 50 |
},
|
| 51 |
{
|
| 52 |
'name': 'synchformer_state_dict.pth',
|
| 53 |
-
'url':
|
| 54 |
-
'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth',
|
| 55 |
'md5': '5b2f5594b0730f70e41e549b7c94390c'
|
| 56 |
},
|
| 57 |
]
|
|
|
|
| 7 |
|
| 8 |
log = logging.getLogger()
|
| 9 |
|
| 10 |
+
# Every download URL below targets the user-controlled mirror
# JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints, folder
# upstream/MMAudio/, which mirrors hkchengrex/MMAudio at sha eb13a1a9.
# Files that were originally GitHub release assets (v1-16.pth etc.) sit in
# that repo's ext_weights/ folder, so they share the same URL prefix.
_MIRROR = (
    "https://huggingface.co/"
    "JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints"
    "/resolve/main/upstream/MMAudio"
)

# One entry per downloadable file: target file name, mirror URL, and the
# expected MD5 digest. Built from (mirror folder, file name, md5) rows.
links = [
    {
        'name': file_name,
        'url': f'{_MIRROR}/{folder}/{file_name}',
        'md5': digest,
    }
    for folder, file_name, digest in (
        ('weights', 'mmaudio_small_16k.pth', 'af93cde404179f58e3919ac085b8033b'),
        ('weights', 'mmaudio_small_44k.pth', 'babd74c884783d13701ea2820a5f5b6d'),
        ('weights', 'mmaudio_medium_44k.pth', '5a56b6665e45a1e65ada534defa903d0'),
        ('weights', 'mmaudio_large_44k.pth', 'fed96c325a6785b85ce75ae1aafd2673'),
        ('weights', 'mmaudio_large_44k_v2.pth', '01ad4464f049b2d7efdaa4c1a59b8dfe'),
        ('ext_weights', 'v1-16.pth', '69f56803f59a549a1a507c93859fd4d7'),
        ('ext_weights', 'best_netG.pt', 'eeaf372a38a9c31c362120aba2dde292'),
        ('ext_weights', 'v1-44.pth', 'fab020275fa44c6589820ce025191600'),
        ('ext_weights', 'synchformer_state_dict.pth', '5b2f5594b0730f70e41e549b7c94390c'),
    )
]
|
|
@@ -27,7 +27,7 @@ import torchaudio
|
|
| 27 |
import ffmpeg
|
| 28 |
import spaces
|
| 29 |
import gradio as gr
|
| 30 |
-
from huggingface_hub import hf_hub_download, snapshot_download
|
| 31 |
|
| 32 |
# ================================================================== #
|
| 33 |
# CHECKPOINT CONFIGURATION #
|
|
@@ -82,34 +82,74 @@ def _dl_hunyuan():
|
|
| 82 |
cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 83 |
print("HunyuanVideoFoley checkpoints downloaded.")
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
def _dl_siglip2():
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
|
| 90 |
def _dl_clap():
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
def _dl_clip():
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
def _dl_audioldm2():
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def _dl_bigvgan():
|
| 106 |
-
"""
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
print("[startup] Starting parallel checkpoint + model downloads…")
|
| 115 |
_t_dl_start = time.perf_counter()
|
|
@@ -133,6 +173,13 @@ mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.re
|
|
| 133 |
bigvgan_local_dir = _fut_bigvgan.result()
|
| 134 |
print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
# ================================================================== #
|
| 137 |
# SHARED CONSTANTS / HELPERS #
|
| 138 |
# ================================================================== #
|
|
|
|
| 27 |
import ffmpeg
|
| 28 |
import spaces
|
| 29 |
import gradio as gr
|
| 30 |
+
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
|
| 31 |
|
| 32 |
# ================================================================== #
|
| 33 |
# CHECKPOINT CONFIGURATION #
|
|
|
|
| 82 |
cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
|
| 83 |
print("HunyuanVideoFoley checkpoints downloaded.")
|
| 84 |
|
| 85 |
+
def _hf_hub_cache_root():
    """Return the directory huggingface_hub uses as its hub cache.

    Prefers the value huggingface_hub already resolved at import time; falls
    back to the same environment lookup order (HF_HUB_CACHE, legacy
    HUGGINGFACE_HUB_CACHE, HF_HOME/hub, XDG_CACHE_HOME/huggingface/hub,
    ~/.cache/huggingface/hub) when that constant is unavailable.
    """
    try:
        from huggingface_hub import constants as hf_constants
        return Path(hf_constants.HF_HUB_CACHE)
    except (ImportError, AttributeError):
        pass
    explicit = (os.environ.get("HF_HUB_CACHE")
                or os.environ.get("HUGGINGFACE_HUB_CACHE"))
    if explicit:
        return Path(os.path.expanduser(explicit))
    hf_home = os.environ.get("HF_HOME")
    if not hf_home:
        xdg_cache = (os.environ.get("XDG_CACHE_HOME")
                     or os.path.expanduser("~/.cache"))
        hf_home = os.path.join(xdg_cache, "huggingface")
    return Path(os.path.expanduser(hf_home)) / "hub"


def _populate_hf_cache_from_mirror(orig_repo_id, mirror_subpath):
    """Download an encoder slice from CKPT_REPO_ID and wire it into the HF
    cache under `orig_repo_id`, so existing from_pretrained() calls keyed by
    the upstream id resolve from the mirror without networking upstream.

    Args:
        orig_repo_id: Upstream repo id ("org/name") to register in the cache.
        mirror_subpath: Folder inside CKPT_REPO_ID holding the mirrored files.

    Returns:
        Path (as str) of the cache snapshot entry, a symlink to the
        downloaded mirror folder.

    Raises:
        FileNotFoundError: If the download did not produce `mirror_subpath`
            (for example the folder is missing from the mirror repo).
    """
    # Placeholder revision name; refs/main is pointed at it below.
    sha = "mirror-snapshot"
    mirror_root = snapshot_download(
        repo_id=CKPT_REPO_ID,
        allow_patterns=[f"{mirror_subpath}/**"],
    )
    src = Path(mirror_root) / mirror_subpath
    # Fail loudly here rather than leaving a dangling symlink that only
    # surfaces later as an opaque "file not found in cache" error.
    if not src.is_dir():
        raise FileNotFoundError(
            f"Mirror folder '{mirror_subpath}' was not found in {mirror_root}"
        )

    # Must be the cache directory later loaders actually read; looking only
    # at HF_HUB_CACHE would miss setups configured through HF_HOME.
    cache_root = _hf_hub_cache_root()
    repo_cache = cache_root / f"models--{orig_repo_id.replace('/', '--')}"
    snap = repo_cache / "snapshots" / sha
    refs = repo_cache / "refs"
    snap.parent.mkdir(parents=True, exist_ok=True)
    refs.mkdir(parents=True, exist_ok=True)

    # Drop whatever an earlier run left at the snapshot path. is_symlink()
    # is checked first so a broken link (exists() is False) is removed too.
    if snap.is_symlink() or snap.is_file():
        snap.unlink()
    elif snap.exists():
        shutil.rmtree(snap)

    snap.symlink_to(src, target_is_directory=True)
    (refs / "main").write_text(sha)
    return str(snap)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
def _dl_siglip2():
    """Register the mirrored SigLIP2 files in the HF cache and log the path."""
    snapshot_path = _populate_hf_cache_from_mirror(
        orig_repo_id="google/siglip2-base-patch16-512",
        mirror_subpath="encoders/google/siglip2-base-patch16-512",
    )
    print(f"SigLIP2 pre-downloaded from mirror -> {snapshot_path}")
|
| 119 |
|
| 120 |
def _dl_clap():
    """Register the mirrored CLAP files in the HF cache and log the path."""
    snapshot_path = _populate_hf_cache_from_mirror(
        orig_repo_id="laion/larger_clap_general",
        mirror_subpath="encoders/laion/larger_clap_general",
    )
    print(f"CLAP pre-downloaded from mirror -> {snapshot_path}")
|
| 126 |
|
| 127 |
def _dl_clip():
    """Register the mirrored MMAudio CLIP files in the HF cache and log the path.

    The cache entry is keyed by the legacy '-384' repo id while the files are
    read from the mirror folder named after the canonical '-378' id.
    """
    # The Space still refers to the legacy '-384' alias, which upstream
    # redirects to '-378'. The mirror stores the files under the canonical
    # name, so the cache is keyed under the alias to keep the existing
    # 'hf-hub:apple/...-384' calls resolving.
    snapshot_path = _populate_hf_cache_from_mirror(
        orig_repo_id="apple/DFN5B-CLIP-ViT-H-14-384",
        mirror_subpath="encoders/apple/DFN5B-CLIP-ViT-H-14-378",
    )
    print(f"MMAudio CLIP pre-downloaded from mirror -> {snapshot_path}")
|
| 136 |
|
| 137 |
def _dl_audioldm2():
    """Register the mirrored AudioLDM2 files in the HF cache and log the path."""
    snapshot_path = _populate_hf_cache_from_mirror(
        orig_repo_id="cvssp/audioldm2",
        mirror_subpath="encoders/cvssp/audioldm2",
    )
    print(f"AudioLDM2 pre-downloaded from mirror -> {snapshot_path}")
|
| 143 |
|
| 144 |
def _dl_bigvgan():
    """Register the mirrored BigVGAN files in the HF cache and return the path.

    The local snapshot directory is returned so that MMAudio's
    BigVGANv2.from_pretrained() can be given a local path rather than
    reaching out to HF Hub.
    """
    local_snapshot = _populate_hf_cache_from_mirror(
        orig_repo_id="nvidia/bigvgan_v2_44khz_128band_512x",
        mirror_subpath="encoders/nvidia/bigvgan_v2_44khz_128band_512x",
    )
    print(f"BigVGAN pre-downloaded from mirror -> {local_snapshot}")
    return local_snapshot
|
| 153 |
|
| 154 |
print("[startup] Starting parallel checkpoint + model downloads…")
|
| 155 |
_t_dl_start = time.perf_counter()
|
|
|
|
| 173 |
bigvgan_local_dir = _fut_bigvgan.result()
|
| 174 |
print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
|
| 175 |
|
| 176 |
+
# Force any later HF Hub call to be cache-only. With every encoder cache
# pre-populated above from the user's mirror, downstream from_pretrained()
# calls (CLAP, AudioLDM2, MMAudio's open_clip 'hf-hub:...') resolve from
# the populated cache instead of networking out to upstream — full
# upstream-protection mode.
os.environ["HF_HUB_OFFLINE"] = "1"

# huggingface_hub is already imported at this point, so the environment
# variable alone only reaches code that reads os.environ later (for example
# child processes). Flip the library's in-process flag as well.
# NOTE(review): this assumes huggingface_hub reads
# constants.HF_HUB_OFFLINE at call time — confirm against the pinned version.
import huggingface_hub.constants as _hf_hub_constants

_hf_hub_constants.HF_HUB_OFFLINE = True
|
| 182 |
+
|
| 183 |
# ================================================================== #
|
| 184 |
# SHARED CONSTANTS / HELPERS #
|
| 185 |
# ================================================================== #
|