BoxOfColors Claude Opus 4.6 committed on
Commit
dbba693
Β·
1 Parent(s): 64f71ea

Replace FlashSR with sinc resampling for ZeroGPU compatibility

Browse files

FlashSR and its transitive imports (torchaudio, FastAudioSR) trigger
torch.cuda.is_available() during module import, which violates
ZeroGPU's stateless-GPU rule and aborts all subsequent GPU tasks.

Replace _apply_flashsr with torchaudio.functional.resample (sinc,
CPU-only, no CUDA risk). Output is still 48kHz. Remove FlashSR from
requirements.txt and clean up unused _FLASHSR_MODEL/_FLASHSR_LOCK.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +11 -64
  2. requirements.txt +0 -1
app.py CHANGED
@@ -506,77 +506,24 @@ def _taro_infer_segment(
506
  # models produce output at the same sample rate.
507
  # Model weights are downloaded once from HF Hub and cached on disk.
508
 
509
- _FLASHSR_MODEL = None # module-level cache β€” loaded once per process
510
- _FLASHSR_LOCK = threading.Lock()
511
-
512
  FLASHSR_SR_IN = 16000
513
  FLASHSR_SR_OUT = 48000
514
 
515
 
516
- def _load_flashsr():
517
- """Load FlashSR SynthesizerTrn on CPU (cached after first call).
518
-
519
- We bypass the FASR wrapper class because it calls
520
- ``torch.cuda.is_available()`` and ``.to(device)`` in ``__init__``,
521
- which initialises CUDA in the main process and violates ZeroGPU's
522
- stateless-GPU rule (aborting all subsequent GPU tasks).
523
- Instead we instantiate the underlying ``SynthesizerTrn`` directly on CPU.
524
- """
525
- global _FLASHSR_MODEL
526
- with _FLASHSR_LOCK:
527
- if _FLASHSR_MODEL is not None:
528
- return _FLASHSR_MODEL
529
- print("[FlashSR] Loading SynthesizerTrn on CPU (bypassing FASR to avoid CUDA init) …")
530
- from huggingface_hub import hf_hub_download
531
- from FastAudioSR.speechsr import SynthesizerTrn
532
-
533
- ckpt_path = hf_hub_download(
534
- repo_id="YatharthS/FlashSR",
535
- filename="upsampler.pth",
536
- local_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), ".flashsr_cache"),
537
- )
538
- # Replicate FASR's hps exactly, but stay on CPU
539
- model = SynthesizerTrn(
540
- spec_channels=128, # n_mel_channels
541
- segment_size=9600 // 320, # segment_size // hop_length = 30
542
- resblock="0",
543
- resblock_kernel_sizes=[3, 7, 11],
544
- resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
545
- upsample_rates=[3],
546
- upsample_initial_channel=32,
547
- upsample_kernel_sizes=[3],
548
- )
549
- checkpoint_dict = torch.load(ckpt_path, map_location="cpu")["model"]
550
- model.load_state_dict(checkpoint_dict)
551
- model.eval()
552
- print("[FlashSR] SynthesizerTrn loaded on CPU (fp32) β€” no CUDA touched")
553
- _FLASHSR_MODEL = model
554
- return model
555
-
556
-
557
  def _apply_flashsr(wav_16k: np.ndarray) -> np.ndarray:
558
- """Upsample a mono 16 kHz numpy array to 48 kHz using FlashSR (CPU).
559
 
560
- Returns a mono float32 numpy array at 48 kHz.
561
- Falls back to torchaudio sinc resampling if FlashSR fails.
 
 
562
  """
563
- try:
564
- model = _load_flashsr()
565
- t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0) # [1, T]
566
- print(f"[FlashSR] Upsampling {len(wav_16k)/FLASHSR_SR_IN:.2f}s @ 16kHz β†’ 48kHz (CPU) …")
567
- with torch.no_grad():
568
- # SynthesizerTrn.forward expects [B, 1, T] β€” add channel dim
569
- out = model(t.unsqueeze(1)) # β†’ [B, 1, T*3]
570
- out = out.squeeze() # β†’ [T*3]
571
- out = out / (torch.abs(out).max() + 1e-8) * 0.999 # normalize like FASR.super_resolution
572
- out = out.cpu().float().numpy()
573
- print(f"[FlashSR] Done β€” output shape {out.shape}, sr={FLASHSR_SR_OUT}")
574
- return out
575
- except Exception as e:
576
- print(f"[FlashSR] ERROR: {e} β€” falling back to sinc resampling")
577
- t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0)
578
- out = torchaudio.functional.resample(t, FLASHSR_SR_IN, FLASHSR_SR_OUT)
579
- return out.squeeze().numpy()
580
 
581
 
582
  def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
 
506
  # models produce output at the same sample rate.
507
  # Model weights are downloaded once from HF Hub and cached on disk.
508
 
 
 
 
509
  FLASHSR_SR_IN = 16000
510
  FLASHSR_SR_OUT = 48000
511
 
512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
def _apply_flashsr(wav_16k: np.ndarray) -> np.ndarray:
    """Resample a mono 16 kHz waveform to 48 kHz on the CPU via sinc interpolation.

    The name is kept for historical reasons: FlashSR originally filled this
    role, but importing it (via its dependencies) calls
    ``torch.cuda.is_available()`` at module-import time, breaking ZeroGPU's
    stateless-GPU requirement and killing later GPU tasks. torchaudio's
    sinc resampler is CUDA-free and good enough for foley/ambient audio.

    Returns a mono float32 numpy array at 48 kHz.
    """
    duration_s = len(wav_16k) / FLASHSR_SR_IN
    print(f"[upsample] {duration_s:.2f}s @ 16kHz → 48kHz (sinc, CPU) …")
    # [1, T] float32 tensor, as torchaudio expects batched input
    mono = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0)
    upsampled = torchaudio.functional.resample(mono, FLASHSR_SR_IN, FLASHSR_SR_OUT)
    wav_48k = upsampled.squeeze().numpy()
    print(f"[upsample] Done — {len(wav_48k)/FLASHSR_SR_OUT:.2f}s @ {FLASHSR_SR_OUT}Hz")
    return wav_48k
 
 
 
 
 
 
 
 
 
 
 
527
 
528
 
529
  def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
requirements.txt CHANGED
@@ -21,7 +21,6 @@ loguru
21
  torchdiffeq
22
  open_clip_torch
23
  git+https://github.com/descriptinc/audiotools
24
- git+https://github.com/ysharma3501/FlashSR.git
25
  --extra-index-url https://download.pytorch.org/whl/cu124
26
  torchaudio==2.5.1+cu124
27
  --find-links https://download.openmmlab.com/mmcv/dist/cu121/torch2.4.0/index.html
 
21
  torchdiffeq
22
  open_clip_torch
23
  git+https://github.com/descriptinc/audiotools
 
24
  --extra-index-url https://download.pytorch.org/whl/cu124
25
  torchaudio==2.5.1+cu124
26
  --find-links https://download.openmmlab.com/mmcv/dist/cu121/torch2.4.0/index.html