Spaces:

cronos3k
/

LongCat-AudioDiT-Enhanced

Sleeping

App Files Community

cronos3k committed 6 days ago

Commit

f732a4a

verified ·

1 Parent(s): d7efb89

feat: pre-download models at startup + ZeroGPU support

Browse files

Files changed (2) hide show

requirements.txt +1 -0
spaces_app.py +95 -17

requirements.txt CHANGED Viewed

@@ -10,3 +10,4 @@ faster-whisper>=1.0.0
 gradio>=5.0.0
 huggingface-hub>=1.3.0
 tqdm>=4.65.0

 gradio>=5.0.0
 huggingface-hub>=1.3.0
 tqdm>=4.65.0
+spaces

spaces_app.py CHANGED Viewed

@@ -1,26 +1,45 @@
 """
 HuggingFace Spaces entry point for LongCat-AudioDiT Enhanced.
-Handles:
-  - /tmp storage for outputs, models, voices (Spaces has no persistent /app writes)
-  - HF_HOME → /tmp so model cache lands in writable space
-  - GPU detection and graceful CPU fallback
-  - Gradio theme passed to launch() (Gradio 6 compat)
 """
 import os
 import sys
 from pathlib import Path
-# ── Redirect HF cache + all writable dirs to /tmp ────────────────────────────
-os.environ["HF_HOME"]              = "/tmp/hf_home"
-os.environ["TRANSFORMERS_CACHE"]   = "/tmp/hf_home/transformers"
-os.environ["HF_DATASETS_CACHE"]    = "/tmp/hf_home/datasets"
-for d in ["/tmp/hf_home", "/tmp/audiodit_outputs", "/tmp/audiodit_voices"]:
     Path(d).mkdir(parents=True, exist_ok=True)
-# ── Patch app constants before import ────────────────────────────────────────
 import app as _app
 import voice_library as _vl
 import whisper_helper as _wh
@@ -31,23 +50,82 @@ _app.OUTPUT_DIR = Path("/tmp/audiodit_outputs")
 _vl.VOICES_DIR   = Path("/tmp/audiodit_voices")
 _vl.LIBRARY_FILE = Path("/tmp/audiodit_voices/library.json")
 _vl.VOICES_DIR.mkdir(parents=True, exist_ok=True)
-_vl._library = None  # reset singleton so it picks up patched paths
-# Patch Whisper download root
 _orig_wh_init = _wh.WhisperHelper.__init__
 def _patched_wh_init(self, model_size="turbo", device="auto", compute_type="auto", download_root=None):
     _orig_wh_init(self, model_size=model_size, device=device, compute_type=compute_type,
                   download_root=download_root or "/tmp/hf_home/whisper")
 _wh.WhisperHelper.__init__ = _patched_wh_init
-# ── Launch ───────────────────────────────────────────────────────────────────
 import torch
 import gradio as gr
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"[Spaces] device={device}  CUDA={torch.cuda.is_available()}")
-demo = _app.build_ui(default_device=device)
 demo.launch(
     server_name="0.0.0.0",
     server_port=int(os.environ.get("PORT", 7860)),

 """
 HuggingFace Spaces entry point for LongCat-AudioDiT Enhanced.
+Hackathon version:
+  - Pre-downloads all models at startup (no download lag during use)
+  - Uses ZeroGPU (@spaces.GPU) for on-demand GPU allocation
+  - /tmp storage for outputs, models, voices
 """
 import os
 import sys
+import time
 from pathlib import Path
+# ── Redirect HF cache + writable dirs to /tmp ────────────────────────────────
+os.environ["HF_HOME"]            = "/tmp/hf_home"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home/transformers"
+os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_home/datasets"
+for d in ["/tmp/hf_home", "/tmp/audiodit_outputs", "/tmp/audiodit_voices", "/tmp/hf_home/whisper"]:
     Path(d).mkdir(parents=True, exist_ok=True)
+# ── Pre-download all models at startup ────────────────────────────────────────
+from huggingface_hub import snapshot_download
+t0 = time.time()
+print("[Spaces] Pre-downloading AudioDiT-1B …")
+snapshot_download("meituan-longcat/LongCat-AudioDiT-1B")
+print("[Spaces] Pre-downloading text encoder (google/umt5-base) …")
+snapshot_download("google/umt5-base")
+print("[Spaces] Pre-downloading Whisper Turbo …")
+snapshot_download(
+    "deepdml/faster-whisper-large-v3-turbo-ct2",
+    local_dir="/tmp/hf_home/whisper",
+)
+print(f"[Spaces] All models pre-downloaded in {time.time() - t0:.0f}s")
+# ── Patch app constants before import ─────────────────────────────────────────
 import app as _app
 import voice_library as _vl
 import whisper_helper as _wh
 _vl.VOICES_DIR   = Path("/tmp/audiodit_voices")
 _vl.LIBRARY_FILE = Path("/tmp/audiodit_voices/library.json")
 _vl.VOICES_DIR.mkdir(parents=True, exist_ok=True)
+_vl._library = None
+# Patch Whisper download root to /tmp (already pre-downloaded there)
 _orig_wh_init = _wh.WhisperHelper.__init__
 def _patched_wh_init(self, model_size="turbo", device="auto", compute_type="auto", download_root=None):
     _orig_wh_init(self, model_size=model_size, device=device, compute_type=compute_type,
                   download_root=download_root or "/tmp/hf_home/whisper")
 _wh.WhisperHelper.__init__ = _patched_wh_init
+# ── ZeroGPU: wrap GPU-needing functions before build_ui references them ───────
+import spaces
 import torch
+_orig_clone_voice = _app.clone_voice
+@spaces.GPU(duration=180)
+def _gpu_clone_voice(text, ref_audio_path, ref_transcription, audiodit_size, nfe,
+                     guidance_strength, guidance_method, seed, memory_mode, device):
+    try:
+        _app.get_manager(memory_mode).release_all()
+    except Exception:
+        pass
+    return _orig_clone_voice(text, ref_audio_path, ref_transcription, audiodit_size,
+                             nfe, guidance_strength, guidance_method, seed,
+                             memory_mode, "cuda")
+_app.clone_voice = _gpu_clone_voice
+_orig_plain_tts = _app.plain_tts
+@spaces.GPU(duration=180)
+def _gpu_plain_tts(text, audiodit_size, nfe, guidance_strength, guidance_method,
+                   seed, memory_mode, device):
+    try:
+        _app.get_manager(memory_mode).release_all()
+    except Exception:
+        pass
+    return _orig_plain_tts(text, audiodit_size, nfe, guidance_strength, guidance_method,
+                           seed, memory_mode, "cuda")
+_app.plain_tts = _gpu_plain_tts
+_orig_transcribe = _app.transcribe_reference
+@spaces.GPU(duration=120)
+def _gpu_transcribe(audio_path, whisper_size, language, memory_mode, device):
+    try:
+        _app.get_manager(memory_mode).release_all()
+    except Exception:
+        pass
+    return _orig_transcribe(audio_path, whisper_size, language, memory_mode, "cuda")
+_app.transcribe_reference = _gpu_transcribe
+_orig_stt_flat = _app._stt_flat
+@spaces.GPU(duration=120)
+def _gpu_stt_flat(audio_path, whisper_size, language, memory_mode, device):
+    try:
+        _app.get_manager(memory_mode).release_all()
+    except Exception:
+        pass
+    return _orig_stt_flat(audio_path, whisper_size, language, memory_mode, "cuda")
+_app._stt_flat = _gpu_stt_flat
+# ── Launch ────────────────────────────────────────────────────────────────────
 import gradio as gr
+print(f"[Spaces] ZeroGPU active, CUDA at launch: {torch.cuda.is_available()}")
+demo = _app.build_ui(default_device="cuda")
 demo.launch(
     server_name="0.0.0.0",
     server_port=int(os.environ.get("PORT", 7860)),