fix: T4/ZeroGPU compat — unpin ctranslate2 (cuDNN 8 vs 9 mismatch), font packages, GPU detection

#2
Files changed (4)
  1. app.py +11 -5
  2. packages.txt +2 -0
  3. requirements.txt +3 -2
  4. src/amuseme/transcriber.py +12 -3
app.py CHANGED
@@ -12,22 +12,27 @@ if str(SRC_DIR) not in sys.path:
12
 
13
  from amuseme.transcriber import transcribe
14
  from amuseme.renderer import render_frames
15
- from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
16
  from amuseme.video_assembler import assemble
17
  from amuseme.logger import get_logger
18
 
19
  logger = get_logger("app")
20
 
21
- # Try to import spaces for ZeroGPU; gracefully degrade locally
 
 
 
 
22
  try:
23
  import spaces
24
  HAS_SPACES = True
25
  except ImportError:
26
  HAS_SPACES = False
27
 
 
28
  if HAS_SPACES:
29
  from huggingface_hub import snapshot_download
30
- logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
31
  try:
32
  snapshot_download(repo_id="Systran/faster-whisper-large-v3")
33
  snapshot_download(repo_id="openbmb/MiniCPM5-1B")
@@ -41,8 +46,9 @@ def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_pre
41
  return transcribe(audio_path, model_size=model_size, use_demucs=use_demucs, condition_on_previous_text=cond_prev, use_vad=use_vad, theme=theme, visual_prompt=visual_prompt)
42
 
43
 
44
- if HAS_SPACES:
45
- _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe)
 
46
 
47
 
48
  def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
 
12
 
13
  from amuseme.transcriber import transcribe
14
  from amuseme.renderer import render_frames
15
+ from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES
16
  from amuseme.video_assembler import assemble
17
  from amuseme.logger import get_logger
18
 
19
  logger = get_logger("app")
20
 
21
+ import os
22
+
23
+ # ZeroGPU Spaces set SPACES_ZERO_GPU=1; permanent GPU Spaces (T4 etc.) do not.
24
+ IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
25
+
26
  try:
27
  import spaces
28
  HAS_SPACES = True
29
  except ImportError:
30
  HAS_SPACES = False
31
 
32
+ # Pre-download models at Space startup so they're cached before inference
33
  if HAS_SPACES:
34
  from huggingface_hub import snapshot_download
35
+ logger.info("HF Space detected. Pre-downloading heavy models...")
36
  try:
37
  snapshot_download(repo_id="Systran/faster-whisper-large-v3")
38
  snapshot_download(repo_id="openbmb/MiniCPM5-1B")
 
46
  return transcribe(audio_path, model_size=model_size, use_demucs=use_demucs, condition_on_previous_text=cond_prev, use_vad=use_vad, theme=theme, visual_prompt=visual_prompt)
47
 
48
 
49
+ # Only wrap with spaces.GPU on ZeroGPU — on permanent GPU Spaces it raises RuntimeError
50
+ if IS_ZEROGPU and HAS_SPACES:
51
+ _gpu_transcribe = spaces.GPU(duration=150)(_gpu_transcribe)
52
 
53
 
54
  def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fonts-dejavu-core
2
+ fonts-liberation
requirements.txt CHANGED
@@ -3,10 +3,11 @@
3
  # torch/torchaudio come from the ZeroGPU runtime + the demucs dependency,
4
  # so they are intentionally not pinned here.
5
 
 
 
 
6
  faster-whisper
7
- ctranslate2==4.3.1
8
  demucs
9
- torchcodec
10
  pillow
11
  pydantic
12
  spaces
 
3
  # torch/torchaudio come from the ZeroGPU runtime + the demucs dependency,
4
  # so they are intentionally not pinned here.
5
 
6
+ # ctranslate2: do NOT pin — let pip resolve the wheel matching ZeroGPU's
7
+ # CUDA/cuDNN version. Pinning 4.3.1 (cuDNN 8 build) breaks on ZeroGPU's cuDNN 9
8
+ # with "libcudnn_ops_infer.so.8: cannot open shared object file".
9
  faster-whisper
 
10
  demucs
 
11
  pillow
12
  pydantic
13
  spaces
src/amuseme/transcriber.py CHANGED
@@ -70,15 +70,24 @@ def _load_model(model_size: str = "large-v3"):
70
  if _model is None:
71
  if model_size == "turbo":
72
  model_size = "large-v3-turbo"
73
-
74
  device = "cpu" if os.environ.get("FORCE_CPU") == "1" else "cuda"
75
  logger.info(f"Loading Whisper {model_size} on {device}...")
76
  compute_type = "float16" if device == "cuda" else "int8"
77
  try:
78
  _model = WhisperModel(model_size, device=device, compute_type=compute_type)
79
  except Exception as e:
80
- logger.warning(f"Failed to load {model_size} with {compute_type}: {e}. Falling back to float32.")
81
- _model = WhisperModel(model_size, device=device, compute_type="float32")
 
 
 
 
 
 
 
 
 
82
  return _model
83
 
84
 
 
70
  if _model is None:
71
  if model_size == "turbo":
72
  model_size = "large-v3-turbo"
73
+
74
  device = "cpu" if os.environ.get("FORCE_CPU") == "1" else "cuda"
75
  logger.info(f"Loading Whisper {model_size} on {device}...")
76
  compute_type = "float16" if device == "cuda" else "int8"
77
  try:
78
  _model = WhisperModel(model_size, device=device, compute_type=compute_type)
79
  except Exception as e:
80
+ if device == "cuda":
81
+ # Most likely a CUDA/cuDNN library mismatch (e.g. on ZeroGPU), but any CUDA load failure lands here. Retry on CPU.
82
+ logger.warning(f"CUDA load failed ({e}). Falling back to CPU int8.")
83
+ try:
84
+ _model = WhisperModel(model_size, device="cpu", compute_type="int8")
85
+ except Exception as e2:
86
+ logger.error(f"CPU fallback also failed: {e2}")
87
+ raise
88
+ else:
89
+ logger.warning(f"Failed to load with {compute_type}: {e}. Retrying with float32.")
90
+ _model = WhisperModel(model_size, device=device, compute_type="float32")
91
  return _model
92
 
93