cronos3k committed on
Commit
f732a4a
·
verified ·
1 Parent(s): d7efb89

feat: pre-download models at startup + ZeroGPU support

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. spaces_app.py +95 -17
requirements.txt CHANGED
@@ -10,3 +10,4 @@ faster-whisper>=1.0.0
10
  gradio>=5.0.0
11
  huggingface-hub>=1.3.0
12
  tqdm>=4.65.0
 
 
10
  gradio>=5.0.0
11
  huggingface-hub>=1.3.0
12
  tqdm>=4.65.0
13
+ spaces
spaces_app.py CHANGED
@@ -1,26 +1,45 @@
1
  """
2
  HuggingFace Spaces entry point for LongCat-AudioDiT Enhanced.
3
 
4
- Handles:
5
- - /tmp storage for outputs, models, voices (Spaces has no persistent /app writes)
6
- - HF_HOME → /tmp so model cache lands in writable space
7
- - GPU detection and graceful CPU fallback
8
- - Gradio theme passed to launch() (Gradio 6 compat)
9
  """
10
 
11
  import os
12
  import sys
 
13
  from pathlib import Path
14
 
15
- # ── Redirect HF cache + all writable dirs to /tmp ────────────────────────────
16
- os.environ["HF_HOME"] = "/tmp/hf_home"
17
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home/transformers"
18
- os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_home/datasets"
19
 
20
- for d in ["/tmp/hf_home", "/tmp/audiodit_outputs", "/tmp/audiodit_voices"]:
21
  Path(d).mkdir(parents=True, exist_ok=True)
22
 
23
- # ── Patch app constants before import ────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  import app as _app
25
  import voice_library as _vl
26
  import whisper_helper as _wh
@@ -31,23 +50,82 @@ _app.OUTPUT_DIR = Path("/tmp/audiodit_outputs")
31
  _vl.VOICES_DIR = Path("/tmp/audiodit_voices")
32
  _vl.LIBRARY_FILE = Path("/tmp/audiodit_voices/library.json")
33
  _vl.VOICES_DIR.mkdir(parents=True, exist_ok=True)
34
- _vl._library = None # reset singleton so it picks up patched paths
35
 
36
- # Patch Whisper download root
37
  _orig_wh_init = _wh.WhisperHelper.__init__
38
  def _patched_wh_init(self, model_size="turbo", device="auto", compute_type="auto", download_root=None):
39
  _orig_wh_init(self, model_size=model_size, device=device, compute_type=compute_type,
40
  download_root=download_root or "/tmp/hf_home/whisper")
41
  _wh.WhisperHelper.__init__ = _patched_wh_init
42
 
43
- # ── Launch ───────────────────────────────────────────────────────────────────
 
44
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  import gradio as gr
46
 
47
- device = "cuda" if torch.cuda.is_available() else "cpu"
48
- print(f"[Spaces] device={device} CUDA={torch.cuda.is_available()}")
49
 
50
- demo = _app.build_ui(default_device=device)
51
  demo.launch(
52
  server_name="0.0.0.0",
53
  server_port=int(os.environ.get("PORT", 7860)),
 
1
  """
2
  HuggingFace Spaces entry point for LongCat-AudioDiT Enhanced.
3
 
4
+ Hackathon version:
5
+ - Pre-downloads all models at startup (no download lag during use)
6
+ - Uses ZeroGPU (@spaces.GPU) for on-demand GPU allocation
7
+ - /tmp storage for outputs, models, voices
 
8
  """
9
 
10
  import os
11
  import sys
12
+ import time
13
  from pathlib import Path
14
 
15
+ # ── Redirect HF cache + writable dirs to /tmp ─────────────────────────────────
16
+ os.environ["HF_HOME"] = "/tmp/hf_home"
17
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home/transformers"
18
+ os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_home/datasets"
19
 
20
+ for d in ["/tmp/hf_home", "/tmp/audiodit_outputs", "/tmp/audiodit_voices", "/tmp/hf_home/whisper"]:
21
  Path(d).mkdir(parents=True, exist_ok=True)
22
 
23
+ # ── Pre-download all models at startup ────────────────────────────────────────
24
+ from huggingface_hub import snapshot_download
25
+
26
+ t0 = time.time()
27
+
28
+ print("[Spaces] Pre-downloading AudioDiT-1B …")
29
+ snapshot_download("meituan-longcat/LongCat-AudioDiT-1B")
30
+
31
+ print("[Spaces] Pre-downloading text encoder (google/umt5-base) …")
32
+ snapshot_download("google/umt5-base")
33
+
34
+ print("[Spaces] Pre-downloading Whisper Turbo …")
35
+ snapshot_download(
36
+ "deepdml/faster-whisper-large-v3-turbo-ct2",
37
+ local_dir="/tmp/hf_home/whisper",
38
+ )
39
+
40
+ print(f"[Spaces] All models pre-downloaded in {time.time() - t0:.0f}s")
41
+
42
+ # ── Patch app constants before import ─────────────────────────────────────────
43
  import app as _app
44
  import voice_library as _vl
45
  import whisper_helper as _wh
 
50
  _vl.VOICES_DIR = Path("/tmp/audiodit_voices")
51
  _vl.LIBRARY_FILE = Path("/tmp/audiodit_voices/library.json")
52
  _vl.VOICES_DIR.mkdir(parents=True, exist_ok=True)
53
+ _vl._library = None
54
 
55
+ # Patch Whisper download root to /tmp (already pre-downloaded there)
56
  _orig_wh_init = _wh.WhisperHelper.__init__
57
  def _patched_wh_init(self, model_size="turbo", device="auto", compute_type="auto", download_root=None):
58
  _orig_wh_init(self, model_size=model_size, device=device, compute_type=compute_type,
59
  download_root=download_root or "/tmp/hf_home/whisper")
60
  _wh.WhisperHelper.__init__ = _patched_wh_init
61
 
62
+ # ── ZeroGPU: wrap GPU-needing functions before build_ui references them ───────
63
+ import spaces
64
  import torch
65
+
66
+
67
+ _orig_clone_voice = _app.clone_voice
68
+
69
+ @spaces.GPU(duration=180)
70
+ def _gpu_clone_voice(text, ref_audio_path, ref_transcription, audiodit_size, nfe,
71
+ guidance_strength, guidance_method, seed, memory_mode, device):
72
+ try:
73
+ _app.get_manager(memory_mode).release_all()
74
+ except Exception:
75
+ pass
76
+ return _orig_clone_voice(text, ref_audio_path, ref_transcription, audiodit_size,
77
+ nfe, guidance_strength, guidance_method, seed,
78
+ memory_mode, "cuda")
79
+
80
+ _app.clone_voice = _gpu_clone_voice
81
+
82
+
83
+ _orig_plain_tts = _app.plain_tts
84
+
85
+ @spaces.GPU(duration=180)
86
+ def _gpu_plain_tts(text, audiodit_size, nfe, guidance_strength, guidance_method,
87
+ seed, memory_mode, device):
88
+ try:
89
+ _app.get_manager(memory_mode).release_all()
90
+ except Exception:
91
+ pass
92
+ return _orig_plain_tts(text, audiodit_size, nfe, guidance_strength, guidance_method,
93
+ seed, memory_mode, "cuda")
94
+
95
+ _app.plain_tts = _gpu_plain_tts
96
+
97
+
98
+ _orig_transcribe = _app.transcribe_reference
99
+
100
+ @spaces.GPU(duration=120)
101
+ def _gpu_transcribe(audio_path, whisper_size, language, memory_mode, device):
102
+ try:
103
+ _app.get_manager(memory_mode).release_all()
104
+ except Exception:
105
+ pass
106
+ return _orig_transcribe(audio_path, whisper_size, language, memory_mode, "cuda")
107
+
108
+ _app.transcribe_reference = _gpu_transcribe
109
+
110
+
111
+ _orig_stt_flat = _app._stt_flat
112
+
113
+ @spaces.GPU(duration=120)
114
+ def _gpu_stt_flat(audio_path, whisper_size, language, memory_mode, device):
115
+ try:
116
+ _app.get_manager(memory_mode).release_all()
117
+ except Exception:
118
+ pass
119
+ return _orig_stt_flat(audio_path, whisper_size, language, memory_mode, "cuda")
120
+
121
+ _app._stt_flat = _gpu_stt_flat
122
+
123
+ # ── Launch ────────────────────────────────────────────────────────────────────
124
  import gradio as gr
125
 
126
+ print(f"[Spaces] ZeroGPU active, CUDA at launch: {torch.cuda.is_available()}")
 
127
 
128
+ demo = _app.build_ui(default_device="cuda")
129
  demo.launch(
130
  server_name="0.0.0.0",
131
  server_port=int(os.environ.get("PORT", 7860)),