BoxOfColors Claude Opus 4.6 committed on
Commit
26e8bca
·
1 Parent(s): ef4f0ff

Fix FlashSR CUDA init: bypass FASR class, load SynthesizerTrn on CPU

Browse files

FASR.__init__ calls torch.cuda.is_available() and .to(device),
which initializes CUDA in the main process and violates ZeroGPU
stateless-GPU rule — aborting all subsequent GPU tasks.

Now we load SynthesizerTrn directly on CPU, replicating the same
hyperparams and normalization that FASR uses, without touching CUDA.
This allows FlashSR to run safely in the CPU post-processing step
outside @spaces.GPU, saving GPU quota per segment.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +32 -15
app.py CHANGED
@@ -514,23 +514,42 @@ FLASHSR_SR_OUT = 48000
514
 
515
 
516
  def _load_flashsr():
517
- """Load FlashSR model (cached after first call). Returns FASR instance."""
 
 
 
 
 
 
 
518
  global _FLASHSR_MODEL
519
  with _FLASHSR_LOCK:
520
  if _FLASHSR_MODEL is not None:
521
  return _FLASHSR_MODEL
522
- print("[FlashSR] Loading model weights from HF Hub …")
523
  from huggingface_hub import hf_hub_download
524
- from FastAudioSR import FASR
 
525
  ckpt_path = hf_hub_download(
526
  repo_id="YatharthS/FlashSR",
527
  filename="upsampler.pth",
528
  local_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), ".flashsr_cache"),
529
  )
530
- # Always load on CPU — ZeroGPU forbids CUDA init outside @spaces.GPU.
531
- # FlashSR is tiny (1.72 MB) and fast enough on CPU for post-processing.
532
- model = FASR(ckpt_path)
533
- print("[FlashSR] Model loaded on CPU (fp32)")
 
 
 
 
 
 
 
 
 
 
 
534
  _FLASHSR_MODEL = model
535
  return model
536
 
@@ -543,16 +562,14 @@ def _apply_flashsr(wav_16k: np.ndarray) -> np.ndarray:
543
  """
544
  try:
545
  model = _load_flashsr()
546
- # Keep on CPU — no CUDA outside @spaces.GPU in ZeroGPU environment
547
- t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0)
548
 print(f"[FlashSR] Upsampling {len(wav_16k)/FLASHSR_SR_IN:.2f}s @ 16kHz → 48kHz (CPU) …")
549
  with torch.no_grad():
550
- out = model.run(t)
551
- # out is a tensor or numpy array — normalise to numpy float32 cpu
552
- if isinstance(out, torch.Tensor):
553
- out = out.float().cpu().squeeze().numpy()
554
- else:
555
- out = np.array(out, dtype=np.float32).squeeze()
556
 print(f"[FlashSR] Done — output shape {out.shape}, sr={FLASHSR_SR_OUT}")
557
  return out
558
  except Exception as e:
 
514
 
515
 
516
  def _load_flashsr():
517
+ """Load FlashSR SynthesizerTrn on CPU (cached after first call).
518
+
519
+ We bypass the FASR wrapper class because it calls
520
+ ``torch.cuda.is_available()`` and ``.to(device)`` in ``__init__``,
521
+ which initialises CUDA in the main process and violates ZeroGPU's
522
+ stateless-GPU rule (aborting all subsequent GPU tasks).
523
+ Instead we instantiate the underlying ``SynthesizerTrn`` directly on CPU.
524
+ """
525
  global _FLASHSR_MODEL
526
  with _FLASHSR_LOCK:
527
  if _FLASHSR_MODEL is not None:
528
  return _FLASHSR_MODEL
529
+ print("[FlashSR] Loading SynthesizerTrn on CPU (bypassing FASR to avoid CUDA init) …")
530
  from huggingface_hub import hf_hub_download
531
+ from FastAudioSR.speechsr import SynthesizerTrn
532
+
533
  ckpt_path = hf_hub_download(
534
  repo_id="YatharthS/FlashSR",
535
  filename="upsampler.pth",
536
  local_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), ".flashsr_cache"),
537
  )
538
+ # Replicate FASR's hps exactly, but stay on CPU
539
+ model = SynthesizerTrn(
540
+ spec_channels=128, # n_mel_channels
541
+ segment_size=9600 // 320, # segment_size // hop_length = 30
542
+ resblock="0",
543
+ resblock_kernel_sizes=[3, 7, 11],
544
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
545
+ upsample_rates=[3],
546
+ upsample_initial_channel=32,
547
+ upsample_kernel_sizes=[3],
548
+ )
549
+ checkpoint_dict = torch.load(ckpt_path, map_location="cpu")["model"]
550
+ model.load_state_dict(checkpoint_dict)
551
+ model.eval()
552
+ print("[FlashSR] SynthesizerTrn loaded on CPU (fp32) β€” no CUDA touched")
553
  _FLASHSR_MODEL = model
554
  return model
555
 
 
562
  """
563
  try:
564
  model = _load_flashsr()
565
+ t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0) # [1, T]
 
566
 print(f"[FlashSR] Upsampling {len(wav_16k)/FLASHSR_SR_IN:.2f}s @ 16kHz → 48kHz (CPU) …")
567
  with torch.no_grad():
568
+ # SynthesizerTrn.forward expects [B, 1, T] — add channel dim
569
+ out = model(t.unsqueeze(1))  # → [B, 1, T*3]
570
+ out = out.squeeze()  # → [T*3]
571
+ out = out / (torch.abs(out).max() + 1e-8) * 0.999 # normalize like FASR.super_resolution
572
+ out = out.cpu().float().numpy()
 
573
 print(f"[FlashSR] Done — output shape {out.shape}, sr={FLASHSR_SR_OUT}")
574
  return out
575
  except Exception as e: