Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors committed about 8 hours ago

Commit

00e0606

1 Parent(s): 00ee17c

Fix TARO GPU worker crash: use local_files_only=True for audioldm2 VAE/vocoder

from_pretrained('cvssp/audioldm2') was contacting the Hugging Face Hub from inside
the ZeroGPU worker to resolve the cached files; the request timed out and crashed
the worker. Passing local_files_only=True loads the VAE and vocoder directly from
the pre-downloaded cache, with no network calls.

Files changed (1) hide show

app.py +4 -2

app.py CHANGED Viewed

@@ -322,8 +322,10 @@ def _load_taro_models(device, weight_dtype):
     model_net = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
     model_net.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
     model_net.eval().to(weight_dtype)
-    vae     = AutoencoderKL.from_pretrained("cvssp/audioldm2", subfolder="vae").to(device).eval()
-    vocoder = SpeechT5HifiGan.from_pretrained("cvssp/audioldm2", subfolder="vocoder").to(device)
     latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
     return model_net, vae, vocoder, latents_scale

     model_net = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
     model_net.load_state_dict(torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"])
     model_net.eval().to(weight_dtype)
+    vae     = AutoencoderKL.from_pretrained("cvssp/audioldm2", subfolder="vae",
+                                            local_files_only=True).to(device).eval()
+    vocoder = SpeechT5HifiGan.from_pretrained("cvssp/audioldm2", subfolder="vocoder",
+                                              local_files_only=True).to(device)
     latents_scale = torch.tensor([0.18215] * 8).view(1, 8, 1, 1).to(device)
     return model_net, vae, vocoder, latents_scale