JackIsNotInTheBox committed on
Commit
2c38be2
·
verified ·
1 Parent(s): f012a29

Reroute encoders + MMAudio downloads through user-controlled mirror

Browse files

app.py: replace the direct upstream snapshot_download() calls for siglip2/clap/clip/audioldm2/bigvgan with downloads from JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints/encoders/<orig>, then symlink each download into the HF cache under its upstream repo ID so that existing from_pretrained() calls resolve from the mirror. Set HF_HUB_OFFLINE=1 after the parallel download phase to force any later HF Hub call (CLAP, AudioLDM2, MMAudio's open_clip 'hf-hub:apple/...') to be cache-only.

MMAudio/mmaudio/utils/download_utils.py: the hardcoded huggingface.co/hkchengrex/MMAudio and github.com/hkchengrex/MMAudio/releases URLs are repointed at upstream/MMAudio/{weights,ext_weights}/ in the mirror. These URLs are dormant under normal startup and only become live if MMAudio's download_model_if_needed() is ever invoked.

Result: full upstream-protection mode. The Space pulls all 11 main weights and 5 encoder/vocoder repos from the mirror, with no upstream calls at runtime.

Files changed (2) hide show
  1. MMAudio/mmaudio/utils/download_utils.py +16 -10
  2. app.py +67 -20
MMAudio/mmaudio/utils/download_utils.py CHANGED
@@ -7,51 +7,57 @@ from tqdm import tqdm
7
 
8
  log = logging.getLogger()
9
 
 
 
 
 
 
 
 
10
  links = [
11
  {
12
  'name': 'mmaudio_small_16k.pth',
13
- 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth',
14
  'md5': 'af93cde404179f58e3919ac085b8033b',
15
  },
16
  {
17
  'name': 'mmaudio_small_44k.pth',
18
- 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth',
19
  'md5': 'babd74c884783d13701ea2820a5f5b6d',
20
  },
21
  {
22
  'name': 'mmaudio_medium_44k.pth',
23
- 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth',
24
  'md5': '5a56b6665e45a1e65ada534defa903d0',
25
  },
26
  {
27
  'name': 'mmaudio_large_44k.pth',
28
- 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth',
29
  'md5': 'fed96c325a6785b85ce75ae1aafd2673'
30
  },
31
  {
32
  'name': 'mmaudio_large_44k_v2.pth',
33
- 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
34
  'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
35
  },
36
  {
37
  'name': 'v1-16.pth',
38
- 'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth',
39
  'md5': '69f56803f59a549a1a507c93859fd4d7'
40
  },
41
  {
42
  'name': 'best_netG.pt',
43
- 'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt',
44
  'md5': 'eeaf372a38a9c31c362120aba2dde292'
45
  },
46
  {
47
  'name': 'v1-44.pth',
48
- 'url': 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth',
49
  'md5': 'fab020275fa44c6589820ce025191600'
50
  },
51
  {
52
  'name': 'synchformer_state_dict.pth',
53
- 'url':
54
- 'https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth',
55
  'md5': '5b2f5594b0730f70e41e549b7c94390c'
56
  },
57
  ]
 
7
 
8
  log = logging.getLogger()
9
 
10
+ # URLs repointed at JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints/
11
+ # upstream/MMAudio/, which mirrors hkchengrex/MMAudio at sha eb13a1a9. The
12
+ # original GitHub-released files (v1-16.pth etc.) live in upstream's
13
+ # ext_weights/ folder, so they map cleanly under the same prefix.
14
+ _MIRROR = ("https://huggingface.co/JackIsNotInTheBox/"
15
+ "Generate_Audio_for_Video_Checkpoints/resolve/main/upstream/MMAudio")
16
+
17
  links = [
18
  {
19
  'name': 'mmaudio_small_16k.pth',
20
+ 'url': f'{_MIRROR}/weights/mmaudio_small_16k.pth',
21
  'md5': 'af93cde404179f58e3919ac085b8033b',
22
  },
23
  {
24
  'name': 'mmaudio_small_44k.pth',
25
+ 'url': f'{_MIRROR}/weights/mmaudio_small_44k.pth',
26
  'md5': 'babd74c884783d13701ea2820a5f5b6d',
27
  },
28
  {
29
  'name': 'mmaudio_medium_44k.pth',
30
+ 'url': f'{_MIRROR}/weights/mmaudio_medium_44k.pth',
31
  'md5': '5a56b6665e45a1e65ada534defa903d0',
32
  },
33
  {
34
  'name': 'mmaudio_large_44k.pth',
35
+ 'url': f'{_MIRROR}/weights/mmaudio_large_44k.pth',
36
  'md5': 'fed96c325a6785b85ce75ae1aafd2673'
37
  },
38
  {
39
  'name': 'mmaudio_large_44k_v2.pth',
40
+ 'url': f'{_MIRROR}/weights/mmaudio_large_44k_v2.pth',
41
  'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
42
  },
43
  {
44
  'name': 'v1-16.pth',
45
+ 'url': f'{_MIRROR}/ext_weights/v1-16.pth',
46
  'md5': '69f56803f59a549a1a507c93859fd4d7'
47
  },
48
  {
49
  'name': 'best_netG.pt',
50
+ 'url': f'{_MIRROR}/ext_weights/best_netG.pt',
51
  'md5': 'eeaf372a38a9c31c362120aba2dde292'
52
  },
53
  {
54
  'name': 'v1-44.pth',
55
+ 'url': f'{_MIRROR}/ext_weights/v1-44.pth',
56
  'md5': 'fab020275fa44c6589820ce025191600'
57
  },
58
  {
59
  'name': 'synchformer_state_dict.pth',
60
+ 'url': f'{_MIRROR}/ext_weights/synchformer_state_dict.pth',
 
61
  'md5': '5b2f5594b0730f70e41e549b7c94390c'
62
  },
63
  ]
app.py CHANGED
@@ -27,7 +27,7 @@ import torchaudio
27
  import ffmpeg
28
  import spaces
29
  import gradio as gr
30
- from huggingface_hub import hf_hub_download, snapshot_download
31
 
32
  # ================================================================== #
33
  # CHECKPOINT CONFIGURATION #
@@ -82,34 +82,74 @@ def _dl_hunyuan():
82
  cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
83
  print("HunyuanVideoFoley checkpoints downloaded.")
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def _dl_siglip2():
86
- """Pre-download SigLIP2 (~1.5 GB) used by HunyuanFoley's visual encoder."""
87
- snapshot_download(repo_id="google/siglip2-base-patch16-512")
88
- print("SigLIP2 pre-downloaded.")
 
 
89
 
90
  def _dl_clap():
91
- """Pre-download CLAP so from_pretrained() hits local cache inside the ZeroGPU worker."""
92
- snapshot_download(repo_id="laion/larger_clap_general")
93
- print("CLAP model pre-downloaded.")
 
 
94
 
95
  def _dl_clip():
96
- """Pre-download MMAudio's CLIP model (~3.95 GB) to avoid GPU-window budget drain."""
97
- snapshot_download(repo_id="apple/DFN5B-CLIP-ViT-H-14-384")
98
- print("MMAudio CLIP model pre-downloaded.")
 
 
 
 
 
99
 
100
  def _dl_audioldm2():
101
- """Pre-download AudioLDM2 VAE/vocoder used by TARO's from_pretrained() calls."""
102
- snapshot_download(repo_id="cvssp/audioldm2")
103
- print("AudioLDM2 pre-downloaded.")
 
 
104
 
105
  def _dl_bigvgan():
106
- """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
107
- Returns the local snapshot directory so _load_mmaudio_models can pass it
108
- to BigVGANv2.from_pretrained() as a local path, avoiding a network hit
109
- inside the ZeroGPU worker."""
110
- local_dir = snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
111
- print(f"BigVGAN vocoder pre-downloaded to {local_dir}.")
112
- return local_dir
 
113
 
114
  print("[startup] Starting parallel checkpoint + model downloads…")
115
  _t_dl_start = time.perf_counter()
@@ -133,6 +173,13 @@ mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.re
133
  bigvgan_local_dir = _fut_bigvgan.result()
134
  print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
135
 
 
 
 
 
 
 
 
136
  # ================================================================== #
137
  # SHARED CONSTANTS / HELPERS #
138
  # ================================================================== #
 
27
  import ffmpeg
28
  import spaces
29
  import gradio as gr
30
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
31
 
32
  # ================================================================== #
33
  # CHECKPOINT CONFIGURATION #
 
82
  cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR), local_dir_use_symlinks=False)
83
  print("HunyuanVideoFoley checkpoints downloaded.")
84
 
85
+ def _populate_hf_cache_from_mirror(orig_repo_id, mirror_subpath):
86
+ """Download an encoder slice from CKPT_REPO_ID and wire it into the HF
87
+ cache under `orig_repo_id` so existing from_pretrained() calls keyed by
88
+ the upstream id resolve from the mirror without networking upstream."""
89
+ sha = "mirror-snapshot"
90
+ mirror_root = snapshot_download(
91
+ repo_id=CKPT_REPO_ID,
92
+ allow_patterns=[f"{mirror_subpath}/**"],
93
+ )
94
+ src = Path(mirror_root) / mirror_subpath
95
+ cache_root = Path(os.environ.get(
96
+ "HF_HUB_CACHE", os.path.expanduser("~/.cache/huggingface/hub")
97
+ ))
98
+ repo_cache = cache_root / f"models--{orig_repo_id.replace('/', '--')}"
99
+ snap = repo_cache / "snapshots" / sha
100
+ refs = repo_cache / "refs"
101
+ snap.parent.mkdir(parents=True, exist_ok=True)
102
+ refs.mkdir(parents=True, exist_ok=True)
103
+ if snap.is_symlink() or snap.exists():
104
+ if snap.is_symlink():
105
+ snap.unlink()
106
+ else:
107
+ shutil.rmtree(snap)
108
+ snap.symlink_to(src, target_is_directory=True)
109
+ (refs / "main").write_text(sha)
110
+ return str(snap)
111
+
112
+
113
  def _dl_siglip2():
114
+ p = _populate_hf_cache_from_mirror(
115
+ "google/siglip2-base-patch16-512",
116
+ "encoders/google/siglip2-base-patch16-512",
117
+ )
118
+ print(f"SigLIP2 pre-downloaded from mirror -> {p}")
119
 
120
  def _dl_clap():
121
+ p = _populate_hf_cache_from_mirror(
122
+ "laion/larger_clap_general",
123
+ "encoders/laion/larger_clap_general",
124
+ )
125
+ print(f"CLAP pre-downloaded from mirror -> {p}")
126
 
127
  def _dl_clip():
128
+ # Space code uses the legacy '-384' alias which upstream redirects to
129
+ # '-378'. We mirror under the canonical name and key the cache under
130
+ # the alias so the existing 'hf-hub:apple/...-384' calls resolve.
131
+ p = _populate_hf_cache_from_mirror(
132
+ "apple/DFN5B-CLIP-ViT-H-14-384",
133
+ "encoders/apple/DFN5B-CLIP-ViT-H-14-378",
134
+ )
135
+ print(f"MMAudio CLIP pre-downloaded from mirror -> {p}")
136
 
137
  def _dl_audioldm2():
138
+ p = _populate_hf_cache_from_mirror(
139
+ "cvssp/audioldm2",
140
+ "encoders/cvssp/audioldm2",
141
+ )
142
+ print(f"AudioLDM2 pre-downloaded from mirror -> {p}")
143
 
144
  def _dl_bigvgan():
145
+ """Returns the local snapshot dir so MMAudio's BigVGANv2.from_pretrained()
146
+ receives a local path rather than reaching out to HF Hub."""
147
+ p = _populate_hf_cache_from_mirror(
148
+ "nvidia/bigvgan_v2_44khz_128band_512x",
149
+ "encoders/nvidia/bigvgan_v2_44khz_128band_512x",
150
+ )
151
+ print(f"BigVGAN pre-downloaded from mirror -> {p}")
152
+ return p
153
 
154
  print("[startup] Starting parallel checkpoint + model downloads…")
155
  _t_dl_start = time.perf_counter()
 
173
  bigvgan_local_dir = _fut_bigvgan.result()
174
  print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
175
 
176
+ # Force any later HF Hub call to be cache-only. With every encoder cache
177
+ # pre-populated above from the user's mirror, downstream from_pretrained()
178
+ # calls (CLAP, AudioLDM2, MMAudio's open_clip 'hf-hub:...') resolve from
179
+ # the populated cache instead of networking out to upstream — full
180
+ # upstream-protection mode.
181
+ os.environ["HF_HUB_OFFLINE"] = "1"
182
+
183
  # ================================================================== #
184
  # SHARED CONSTANTS / HELPERS #
185
  # ================================================================== #