ruslanmv committed
Commit a197317 · 1 Parent(s): ead9609
Files changed (2):
  1. Makefile +171 -0
  2. app.py +100 -77
Makefile ADDED
@@ -0,0 +1,171 @@
+ # ================================================================
+ # Makefile — AI Story Server (Python 3.11)
+ # ================================================================
+ # Common usage:
+ #   make help
+ #   make install        # CPU-friendly install
+ #   make install-cuda   # build llama-cpp-python with CUDA offload
+ #   make precache       # download models + compute voice latents once
+ #   make run            # run the Gradio app (prefers GPU if available)
+ #   make clean          # clean caches (keeps venv)
+ #   make deepclean      # remove venv + caches
+ # ---------------------------------------------------------------
+
+ # ---- Configurable vars ----
+ PYTHON ?= python3.11
+ VENV   ?= .venv
+ PY     := $(VENV)/bin/python
+ PIP    := $(VENV)/bin/pip
+
+ APP  ?= app.py
+ PORT ?= 7860
+
+ # Core runtime deps (CPU-safe). Torch comes via transitive deps where needed;
+ # you may pin torch externally if required by your environment.
+ REQS = \
+ 	"numpy<2" \
+ 	"gradio==4.27.0" \
+ 	"python-dotenv" \
+ 	"huggingface_hub" \
+ 	"ffmpeg-python" \
+ 	"nltk" \
+ 	"emoji" \
+ 	"langid" \
+ 	"noisereduce" \
+ 	"TTS" \
+ 	"llama-cpp-python>=0.2.90"
+
+ # Dev tools (optional)
+ DEV_REQS = \
+ 	"ruff" \
+ 	"black" \
+ 	"pip-tools"
+
+ # ================================================================
+ # Meta
+ # ================================================================
+ .PHONY: help venv install install-no-llama install-cuda install-dev \
+ 	precache run run-gpu check-ffmpeg check-python lint format \
+ 	freeze deps-update clean deepclean
+
+ help:
+ 	@echo "Targets:"
+ 	@echo "  install       - Create venv (Python 3.11) and install CPU-safe deps"
+ 	@echo "  install-cuda  - Build llama-cpp-python with CUDA offload + install deps"
+ 	@echo "  install-dev   - Install dev tools (ruff, black, pip-tools)"
+ 	@echo "  precache      - Download models & compute voice latents once (no UI)"
+ 	@echo "  run           - Run Gradio app on PORT=$(PORT) (prefers native GPU if present)"
+ 	@echo "  run-gpu       - Run app forcing CUDA_VISIBLE_DEVICES (default 0)"
+ 	@echo "  lint          - Run ruff"
+ 	@echo "  format        - Run black and ruff --fix"
+ 	@echo "  freeze        - Write requirements.txt from current venv"
+ 	@echo "  deps-update   - Upgrade runtime deps"
+ 	@echo "  check-ffmpeg  - Verify ffmpeg is installed"
+ 	@echo "  check-python  - Verify Python 3.11 is available"
+ 	@echo "  clean         - Clear caches/artifacts (keeps venv)"
+ 	@echo "  deepclean     - Remove venv and caches"
+
+ # ================================================================
+ # Environment / setup
+ # ================================================================
+ check-python:
+ 	@command -v $(PYTHON) >/dev/null 2>&1 || \
+ 		{ echo "ERROR: $(PYTHON) not found. Please install Python 3.11 and retry."; exit 1; }
+ 	@echo "OK: $(PYTHON) found."
+
+ venv: check-python
+ 	$(PYTHON) -m venv $(VENV)
+ 	@echo "Virtual environment created at $(VENV)"
+
+ install-no-llama: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	$(PIP) install "numpy<2" "gradio==4.27.0" python-dotenv huggingface_hub ffmpeg-python nltk emoji langid noisereduce TTS
+
+ install: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	# CPU-friendly install of all deps, including llama-cpp-python
+ 	$(PIP) install $(REQS)
+
+ # CUDA build for llama-cpp-python (requires the CUDA toolkit & a compiler)
+ install-cuda: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	@echo "Building llama-cpp-python with CUDA (GGML_CUDA; replaces the removed LLAMA_CUBLAS flag)…"
+ 	@export CMAKE_ARGS="-DGGML_CUDA=on"; \
+ 	export FORCE_CMAKE=1; \
+ 	$(PIP) install --no-binary=:all: --force-reinstall "llama-cpp-python>=0.2.90"
+ 	# Install the rest of the deps (excluding llama-cpp-python, which we just built)
+ 	$(MAKE) install-no-llama
+ 	@echo "CUDA install complete."
+
+ install-dev: venv
+ 	$(PIP) install --upgrade pip
+ 	$(PIP) install $(DEV_REQS)
+
+ # ================================================================
+ # Utility checks
+ # ================================================================
+ check-ffmpeg:
+ 	@command -v ffmpeg >/dev/null 2>&1 || { echo "ERROR: ffmpeg not found. Install ffmpeg and retry."; exit 1; }
+ 	@ffmpeg -version | head -n 1
+
+ # ================================================================
+ # Workflow targets
+ # ================================================================
+ # Pre-download model assets and compute voice latents (runs your app's functions).
+ # Note: a heredoc cannot span recipe lines (each line runs in its own shell),
+ # so this is a single "python -c" invocation split with backslashes.
+ precache: install check-ffmpeg
+ 	$(PY) -c "from app import precache_assets, init_models_and_latents; \
+ 	precache_assets(); \
+ 	init_models_and_latents(); \
+ 	print('Precache complete.')"
+
+ run: install
+ 	@echo "Starting app on port $(PORT)…"
+ 	PORT=$(PORT) $(PY) $(APP)
+
+ # Run, preferring a specific GPU (default GPU 0). The app itself auto-detects CUDA.
+ run-gpu: install
+ 	@echo "Starting app with CUDA_VISIBLE_DEVICES=$${CUDA_VISIBLE_DEVICES:-0} on port $(PORT)…"
+ 	CUDA_VISIBLE_DEVICES=$${CUDA_VISIBLE_DEVICES:-0} PORT=$(PORT) $(PY) $(APP)
+
+ # Lint / format
+ lint: install-dev
+ 	$(VENV)/bin/ruff check .
+
+ format: install-dev
+ 	$(VENV)/bin/black .
+ 	$(VENV)/bin/ruff check --fix .
+
+ # Freeze dependency snapshot
+ freeze:
+ 	@echo "Writing requirements.txt from current venv…"
+ 	$(VENV)/bin/pip freeze > requirements.txt
+ 	@echo "requirements.txt updated."
+
+ # Upgrade runtime deps (keeps the numpy<2 guard)
+ deps-update: venv
+ 	$(PIP) install --upgrade pip
+ 	$(PIP) install --upgrade "numpy<2" "gradio==4.27.0" python-dotenv huggingface_hub ffmpeg-python nltk emoji langid noisereduce TTS "llama-cpp-python>=0.2.90"
+
+ # ================================================================
+ # Cleanup
+ # ================================================================
+ clean:
+ 	@echo "Cleaning caches…"
+ 	@rm -rf __pycache__ */__pycache__
+ 	@rm -rf .pytest_cache .ruff_cache
+ 	@rm -rf voices/*.tmp
+ 	@rm -rf ~/.cache/huggingface/hub/tmp
+ 	@rm -rf ~/.cache/huggingface/transformers
+ 	@rm -rf ~/.cache/torch
+ 	@rm -rf ~/.cache/pip
+ 	@rm -rf ~/.local/share/tts/tmp
+ 	@echo "Done."
+
+ deepclean: clean
+ 	@echo "Removing venv and model caches…"
+ 	@rm -rf $(VENV)
+ 	@rm -rf ~/.local/share/tts
+ 	@rm -rf voices
+ 	@echo "Done."
app.py CHANGED
@@ -2,6 +2,7 @@
  # 1) SETUP & IMPORTS
  # ===================================================================================
  from __future__ import annotations
+
  import os
  import sys
  import base64
@@ -9,24 +10,24 @@ import struct
  import textwrap
  import requests
  import atexit
- from typing import List, Dict, Tuple, Generator
+ from typing import List, Dict, Tuple, Generator, Any

  # --- Fast, safe defaults ---
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
- os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
- os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")  # avoid torchaudio/ffmpeg linkage issues
+ os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
+ os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")  # avoid torchaudio/ffmpeg linkage quirks

  # --- .env early (HF_TOKEN / SECRET_TOKEN) ---
  from dotenv import load_dotenv
  load_dotenv()

- # --- NumPy sanity (Torch 2.2.x wants NumPy 1.x) ---
+ # --- NumPy sanity (Torch 2.2.x prefers NumPy 1.x) ---
  import numpy as _np
  if int(_np.__version__.split(".", 1)[0]) >= 2:
      raise RuntimeError(
-         f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4) for this Space."
+         f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4)."
      )

  # --- Hugging Face Spaces & ZeroGPU (import BEFORE CUDA libs) ---
@@ -48,7 +49,7 @@ import numpy as np
  from huggingface_hub import HfApi, hf_hub_download
  from llama_cpp import Llama

- # --- Audio decoding (use ffmpeg-python; no torchaudio) ---
+ # --- Audio decoding (pure ffmpeg-python; no torchaudio) ---
  import ffmpeg

  # --- TTS Libraries ---
@@ -63,6 +64,7 @@ import langid
  import emoji
  import noisereduce as nr

+
  # ===================================================================================
  # 2) GLOBALS & HELPERS
  # ===================================================================================
@@ -70,9 +72,11 @@ import noisereduce as nr
  # NLTK data
  nltk.download("punkt", quiet=True)

- # Cached models & latents
+ # Models & caches
  tts_model: Xtts | None = None
  llm_model: Llama | None = None
+
+ # Store latents as NumPy on CPU for portability; convert to device at inference time
  voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}

  # Config
@@ -83,9 +87,6 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
  SENTENCE_SPLIT_LENGTH = 250
  LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]

- # Prefer native GPU if available; otherwise we’ll rely on ZeroGPU (or CPU)
- PREFER_NATIVE_GPU = torch.cuda.is_available()
-
  # System prompts and roles
  default_system_message = (
      "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
@@ -99,7 +100,25 @@ ROLE_PROMPTS["Pirate"] = (
      "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
  )

- # ---------- small utils ----------
+
+ # ---------- tiny utilities ----------
+ def _model_device(m: torch.nn.Module) -> torch.device:
+     try:
+         return next(m.parameters()).device
+     except StopIteration:
+         return torch.device("cpu")
+
+ def _to_device_float_tensor(x: Any, device: torch.device) -> torch.Tensor:
+     if isinstance(x, np.ndarray):
+         return torch.from_numpy(x).float().to(device)
+     if torch.is_tensor(x):
+         return x.to(device, dtype=torch.float32)
+     return torch.as_tensor(x, dtype=torch.float32, device=device)
+
+ def _latents_for_device(latents: Tuple[Any, Any], device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     gpt_cond, spk = latents
+     return _to_device_float_tensor(gpt_cond, device), _to_device_float_tensor(spk, device)
+
  def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
      if pcm_data.startswith(b"RIFF"):
          return pcm_data
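
Note: the helpers added above separate where latents are stored (CPU NumPy arrays) from where they are consumed. A usage sketch with illustrative shapes (the real ones come from get_conditioning_latents):

    import numpy as np

    # Hypothetical stored pair, as init_models_and_latents() produces below
    latents = (np.zeros((1, 32, 1024), dtype=np.float32),
               np.zeros((1, 512, 1), dtype=np.float32))

    device = _model_device(tts_model)                     # device the model is on now
    gpt_cond, spk = _latents_for_device(latents, device)  # float32 tensors on that device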
@@ -132,6 +151,7 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
      prompt += f"<|user|>\n{message}</s><|assistant|>"
      return prompt

+
  # ---------- robust audio decode (mono via ffmpeg) ----------
  def _decode_audio_ffmpeg_to_mono(path: str, target_sr: int) -> np.ndarray:
      """
@@ -153,6 +173,7 @@ def _decode_audio_ffmpeg_to_mono(path: str, target_sr: int) -> np.ndarray:
      except ffmpeg.Error as e:
          raise RuntimeError(f"ffmpeg decode failed: {e.stderr.decode(errors='ignore') if e.stderr else e}") from e

+
  # ---------- monkey-patch XTTS internal loader to avoid torchaudio/torio ----------
  def _patched_load_audio(audiopath: str, load_sr: int):
      """
@@ -163,30 +184,28 @@
      """
      wav = _decode_audio_ffmpeg_to_mono(audiopath, target_sr=load_sr)
      import torch as _torch  # local import to avoid any circularities
-     audio = _torch.from_numpy(wav).float().unsqueeze(0)  # [1, N]
+     audio = _torch.from_numpy(wav).float().unsqueeze(0)  # [1, N] on CPU
      return audio

  xtts_module.load_audio = _patched_load_audio
-
- # Also patch the common utility location, in case this version imports from there:
  try:
      import TTS.utils.audio as _tts_audio_mod
      _tts_audio_mod.load_audio = _patched_load_audio
  except Exception:
      pass

- # ---------- where Coqui caches models (avoid get_user_data_dir import) ----------
+
  def _coqui_cache_dir() -> str:
      # Matches what TTS uses on Linux: ~/.local/share/tts
      return os.path.join(os.path.expanduser("~"), ".local", "share", "tts")

+
  # ===================================================================================
- # 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
+ # 3) PRECACHE & MODEL LOADERS (CPU at startup to avoid ZeroGPU issues)
  # ===================================================================================

  def precache_assets() -> None:
      """Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
-     # Voices
      print("Pre-caching voice files...")
      file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
      base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
@@ -202,11 +221,9 @@
      except Exception as e:
          print(f"Failed to download {name}: {e}")

-     # XTTS model files
      print("Pre-caching XTTS v2 model files...")
      ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")

-     # LLM GGUF
      print("Pre-caching Zephyr GGUF...")
      try:
          hf_hub_download(
@@ -217,8 +234,9 @@
      except Exception as e:
          print(f"Warning: GGUF pre-cache error: {e}")

- def _load_xtts(device: str) -> Xtts:
-     """Load XTTS from the local cache. Use checkpoint_dir to avoid None path bugs."""
+
+ def _load_xtts(device: str = "cpu") -> Xtts:
+     """Load XTTS from the local cache. Keep CPU at startup to avoid ZeroGPU device mixups."""
      print(f"Loading Coqui XTTS V2 model on {device.upper()}...")
      model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
      ModelManager().download_model(model_name)  # idempotent
@@ -237,75 +255,72 @@
      print("XTTS model loaded.")
      return model

+
  def _load_llama() -> Llama:
      """
-     Load Llama (Zephyr GGUF). Prefer GPU offload if native CUDA build is present,
-     otherwise fall back to pure CPU.
+     Load Llama (Zephyr GGUF).
+     Keep simple & robust: default to CPU (works everywhere).
      """
      print("Loading LLM (Zephyr GGUF)...")
      zephyr_model_path = hf_hub_download(
          repo_id="TheBloke/zephyr-7B-beta-GGUF",
          filename="zephyr-7b-beta.Q5_K_M.gguf"
      )
+     llm = Llama(
+         model_path=zephyr_model_path,
+         n_gpu_layers=0,  # CPU-only for reliability across Spaces/ZeroGPU
+         n_ctx=4096,
+         n_batch=512,
+         verbose=False
+     )
+     print("LLM loaded (CPU).")
+     return llm

-     # Heuristic: try to offload a large number of layers if CUDA build exists.
-     gpu_layers_env = int(os.getenv("LLAMA_GPU_LAYERS", "100"))
-     n_gpu_layers = gpu_layers_env if PREFER_NATIVE_GPU else 0
-
-     try:
-         llm = Llama(
-             model_path=zephyr_model_path,
-             n_gpu_layers=n_gpu_layers,  # if CUDA build exists, this offloads layers
-             n_ctx=4096,
-             n_batch=512,
-             verbose=False
-         )
-         used = "GPU-offload" if n_gpu_layers > 0 else "CPU"
-         print(f"LLM loaded ({used}).")
-         return llm
-     except Exception as e:
-         print(f"LLM GPU offload failed ({e}); falling back to CPU.")
-         llm = Llama(
-             model_path=zephyr_model_path,
-             n_gpu_layers=0,
-             n_ctx=4096,
-             n_batch=512,
-             verbose=False
-         )
-         print("LLM loaded (CPU).")
-         return llm

  def init_models_and_latents() -> None:
      """
-     Preload TTS and LLM. If native GPU is available at startup, load XTTS on CUDA
-     and precompute voice latents there; otherwise do it on CPU (ZeroGPU will move it later).
+     Preload models on CPU and compute voice latents on CPU.
+     This avoids ZeroGPU's "mixed device" errors from torchaudio-based resampling.
      """
      global tts_model, llm_model, voice_latents

-     target_device = "cuda" if PREFER_NATIVE_GPU else "cpu"
-
      if tts_model is None:
-         tts_model = _load_xtts(device=target_device)
+         tts_model = _load_xtts(device="cpu")  # always CPU at startup

      if llm_model is None:
          llm_model = _load_llama()

-     # Pre-compute latents once; uses patched loader (ffmpeg) under the hood
      if not voice_latents:
-         print("Computing voice conditioning latents...")
-         for role, filename in [
-             ("Cloée", "cloee-1.wav"),
-             ("Julian", "julian-bedtime-style-1.wav"),
-             ("Pirate", "pirate_by_coqui.wav"),
-             ("Thera", "thera-1.wav"),
-         ]:
-             path = os.path.join("voices", filename)
-             with torch.no_grad():
-                 voice_latents[role] = tts_model.get_conditioning_latents(
+         print("Computing voice conditioning latents (CPU)...")
+         # Ensure the TTS model is on CPU while computing latents
+         orig_dev = _model_device(tts_model)
+         if orig_dev.type != "cpu":
+             tts_model.to("cpu")
+
+         with torch.no_grad():
+             for role, filename in [
+                 ("Cloée", "cloee-1.wav"),
+                 ("Julian", "julian-bedtime-style-1.wav"),
+                 ("Pirate", "pirate_by_coqui.wav"),
+                 ("Thera", "thera-1.wav"),
+             ]:
+                 path = os.path.join("voices", filename)
+                 gpt_lat, spk_emb = tts_model.get_conditioning_latents(
                      audio_path=path, gpt_cond_len=30, max_ref_length=60
                  )
+                 # Store as NumPy on CPU; convert to device on demand later
+                 voice_latents[role] = (
+                     gpt_lat.detach().cpu().numpy(),
+                     spk_emb.detach().cpu().numpy(),
+                 )
+
+         # Return model to original device (keep CPU at startup for safety)
+         if orig_dev.type != "cpu":
+             tts_model.to(orig_dev)
+
      print("Voice latents ready.")

+
  # Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
  def _close_llm():
      global llm_model
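
Note: the commit pins n_gpu_layers=0 above. If llama-cpp-python was built with the Makefile's install-cuda target, offload could be re-enabled; a sketch, not part of this commit (in llama-cpp-python, n_gpu_layers=-1 offloads all layers):

    from llama_cpp import Llama

    llm = Llama(
        model_path=zephyr_model_path,  # path from hf_hub_download above
        n_gpu_layers=-1,               # -1 = offload every layer when built with CUDA
        n_ctx=4096,
        n_batch=512,
        verbose=False,
    )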
@@ -316,6 +331,7 @@ def _close_llm():
          pass
  atexit.register(_close_llm)

+
  # ===================================================================================
  # 4) INFERENCE HELPERS
  # ===================================================================================
@@ -342,15 +358,19 @@ def generate_text_stream(llm_instance: Llama, prompt: str,
              continue
          yield ch

+
  def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
                            latents: Tuple[np.ndarray, np.ndarray]) -> Generator[bytes, None, None]:
-     gpt_cond_latent, speaker_embedding = latents
+     # Convert stored CPU NumPy latents to tensors on the model's current device
+     device = _model_device(tts_instance)
+     gpt_cond_latent_t, speaker_embedding_t = _latents_for_device(latents, device)
+
      try:
          for chunk in tts_instance.inference_stream(
              text=text,
              language=language,
-             gpt_cond_latent=gpt_cond_latent,
-             speaker_embedding=speaker_embedding,
+             gpt_cond_latent=gpt_cond_latent_t,
+             speaker_embedding=speaker_embedding_t,
              temperature=0.85,
          ):
              if chunk is None:
@@ -360,6 +380,7 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
              f32 = np.clip(f32, -1.0, 1.0).astype(np.float32)
              s16 = (f32 * 32767.0).astype(np.int16)
              yield s16.tobytes()
+
      except RuntimeError as e:
          print(f"Error during TTS inference: {e}")
          if "device-side assert" in str(e) and api:
@@ -369,22 +390,23 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
          except Exception:
              pass

+
  # ===================================================================================
- # 5) ZERO-GPU ENTRYPOINT (also works on native GPU)
+ # 5) ZERO-GPU ENTRYPOINT (safe on native GPU as well)
  # ===================================================================================

- @spaces.GPU(duration=120)  # On native-GPU Spaces this simply runs with the resident GPU.
+ @spaces.GPU(duration=120)  # GPU ops must occur inside this function when on ZeroGPU
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
      if secret_token_input != SECRET_TOKEN:
          raise gr.Error("Invalid secret token provided.")
      if not input_text:
          return []

-     # Ensure models/latents exist
+     # Ensure models/latents exist (loaded on CPU)
      if tts_model is None or llm_model is None or not voice_latents:
          init_models_and_latents()

-     # Prefer GPU if available at call time (ZeroGPU grants CUDA during this function)
+     # During the GPU window, move XTTS to CUDA if available; otherwise stay on CPU
      try:
          if torch.cuda.is_available():
              tts_model.to("cuda")
@@ -393,7 +415,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
      except Exception:
          tts_model.to("cpu")

-     # Generate story text
+     # Generate story text (LLM kept CPU for simplicity & reliability)
      history: List[Tuple[str, str | None]] = [(input_text, None)]
      full_story_text = "".join(
          generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
@@ -428,7 +450,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
          b64_wav = base64.b64encode(pcm_to_wav(final_pcm, sample_rate=24000, channels=1, bit_depth=16)).decode("utf-8")
          results.append({"text": sentence, "audio": b64_wav})

-     # Release GPU immediately if we were in a ZeroGPU window
+     # Leave model on CPU after the ZeroGPU window
      try:
          tts_model.to("cpu")
      except Exception:
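
Note: the diff elides the body of pcm_to_wav used here; for reference, a generic 16-bit PCM WAV wrapper looks like this (sketch, not the commit's exact code):

    import struct

    def wrap_pcm_in_wav(pcm: bytes, sample_rate: int = 24000, channels: int = 1,
                        bit_depth: int = 16) -> bytes:
        # Standard 44-byte RIFF/WAVE header followed by the raw PCM payload
        byte_rate = sample_rate * channels * bit_depth // 8
        block_align = channels * bit_depth // 8
        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF", 36 + len(pcm), b"WAVE",
            b"fmt ", 16, 1, channels, sample_rate, byte_rate, block_align, bit_depth,
            b"data", len(pcm),
        )
        return header + pcm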
@@ -436,6 +458,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_

      return results

+
  # ===================================================================================
  # 6) STARTUP: PRECACHE & UI
  # ===================================================================================
@@ -450,16 +473,16 @@ def build_ui() -> gr.Interface:
          ],
          outputs=gr.JSON(label="Story and Audio Output"),
          title="AI Storyteller with ZeroGPU",
-         description="Enter a prompt to generate a short story with voice narration using on-demand GPU or native GPU when available.",
+         description="Enter a prompt to generate a short story with voice narration. Uses GPU only within the generation call when available.",
          flagging_mode="never",
          allow_flagging="never",
      )

  if __name__ == "__main__":
-     print("===== Startup: pre-cache assets and preload models =====")
+     print("===== Startup: pre-cache assets and preload models (CPU) =====")
      print(f"Python: {sys.version.split()[0]} | Torch CUDA available: {torch.cuda.is_available()}")
      precache_assets()          # 1) download everything to disk
-     init_models_and_latents()  # 2) load models (prefer native GPU) + compute voice latents
+     init_models_and_latents()  # 2) load models on CPU + compute voice latents on CPU
      print("Models and assets ready. Launching UI...")

      demo = build_ui()
 
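Note: each element of the JSON output pairs one sentence with a base64 WAV. Decoding a single item client-side (sketch; field names per the diff above):

    import base64

    item = results[0]  # {"text": "...", "audio": "<base64 WAV>"}
    with open("sentence_0.wav", "wb") as f:
        f.write(base64.b64decode(item["audio"]))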