BoxOfColors Claude Sonnet 4.6 committed on
Commit
04fdc6c
·
1 Parent(s): aa53ba5

Fix MMAudio: load BigVGAN from local snapshot dir, not HF network

Browse files

BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x') was hitting
the HF network inside the ZeroGPU worker because AutoEncoderModule hardcoded
the repo ID string for 44k mode, ignoring the vocoder_ckpt_path argument.

Fixes:
- _dl_bigvgan() now returns the local snapshot dir from snapshot_download()
- bigvgan_local_dir captured at startup alongside other checkpoint paths
- _load_mmaudio_models passes bigvgan_local_dir as bigvgan_vocoder_ckpt
- AutoEncoderModule (44k path) uses vocoder_ckpt_path as the from_pretrained
source when it points to an existing local directory, falling back to the
repo ID string when the path is None or is not a directory

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

MMAudio/mmaudio/ext/autoencoder/autoencoder.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import Literal, Optional
2
 
3
  import torch
@@ -27,8 +28,12 @@ class AutoEncoderModule(nn.Module):
27
  assert vocoder_ckpt_path is not None
28
  self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
29
  elif mode == '44k':
30
- self.vocoder = BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x',
31
- use_cuda_kernel=False)
 
 
 
 
32
  self.vocoder.remove_weight_norm()
33
  else:
34
  raise ValueError(f'Unknown mode: {mode}')
 
1
+ import os
2
  from typing import Literal, Optional
3
 
4
  import torch
 
28
  assert vocoder_ckpt_path is not None
29
  self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
30
  elif mode == '44k':
31
+ # If vocoder_ckpt_path points to a local snapshot directory, use it
32
+ # directly to avoid a network fetch inside ZeroGPU workers.
33
+ bigvgan_src = vocoder_ckpt_path if (
34
+ vocoder_ckpt_path is not None and os.path.isdir(vocoder_ckpt_path)
35
+ ) else 'nvidia/bigvgan_v2_44khz_128band_512x'
36
+ self.vocoder = BigVGANv2.from_pretrained(bigvgan_src, use_cuda_kernel=False)
37
  self.vocoder.remove_weight_norm()
38
  else:
39
  raise ValueError(f'Unknown mode: {mode}')
app.py CHANGED
@@ -98,9 +98,13 @@ def _dl_audioldm2():
98
  print("AudioLDM2 pre-downloaded.")
99
 
100
  def _dl_bigvgan():
101
- """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio."""
102
- snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
103
- print("BigVGAN vocoder pre-downloaded.")
 
 
 
 
104
 
105
  print("[startup] Starting parallel checkpoint + model downloads…")
106
  _t_dl_start = time.perf_counter()
@@ -119,6 +123,7 @@ with ThreadPoolExecutor(max_workers=7) as _pool:
119
 
120
  cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
121
  mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
 
122
  print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
123
 
124
  # ================================================================== #
@@ -380,7 +385,7 @@ def _load_mmaudio_models(device, dtype):
380
  tod_vae_ckpt=str(model_cfg.vae_path),
381
  synchformer_ckpt=str(model_cfg.synchformer_ckpt),
382
  enable_conditions=True, mode=model_cfg.mode,
383
- bigvgan_vocoder_ckpt=None, need_vae_encoder=False,
384
  ).to(device, dtype).eval()
385
  return net, feature_utils, model_cfg, seq_cfg
386
 
 
98
  print("AudioLDM2 pre-downloaded.")
99
 
100
  def _dl_bigvgan():
101
+ """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
102
+ Returns the local snapshot directory so _load_mmaudio_models can pass it
103
+ to BigVGANv2.from_pretrained() as a local path, avoiding a network hit
104
+ inside the ZeroGPU worker."""
105
+ local_dir = snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
106
+ print(f"BigVGAN vocoder pre-downloaded to {local_dir}.")
107
+ return local_dir
108
 
109
  print("[startup] Starting parallel checkpoint + model downloads…")
110
  _t_dl_start = time.perf_counter()
 
123
 
124
  cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
125
  mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
126
+ bigvgan_local_dir = _fut_bigvgan.result()
127
  print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
128
 
129
  # ================================================================== #
 
385
  tod_vae_ckpt=str(model_cfg.vae_path),
386
  synchformer_ckpt=str(model_cfg.synchformer_ckpt),
387
  enable_conditions=True, mode=model_cfg.mode,
388
+ bigvgan_vocoder_ckpt=bigvgan_local_dir, need_vae_encoder=False,
389
  ).to(device, dtype).eval()
390
  return net, feature_utils, model_cfg, seq_cfg
391