ruslanmv committed
Commit a197317 · 1 Parent(s): ead9609
Files changed (2):
  1. Makefile +171 -0
  2. app.py +100 -77
Makefile ADDED
@@ -0,0 +1,171 @@
+ # ================================================================
+ # Makefile — AI Story Server (Python 3.11)
+ # ================================================================
+ # Common usage:
+ #   make help
+ #   make install        # CPU-friendly install
+ #   make install-cuda   # build llama-cpp-python with CUDA offload
+ #   make precache       # download models + compute voice latents once
+ #   make run            # run the Gradio app (prefers GPU if available)
+ #   make clean          # clean caches (keeps venv)
+ #   make deepclean      # remove venv + caches
+ # ---------------------------------------------------------------
+
+ # ---- Configurable vars ----
+ PYTHON ?= python3.11
+ VENV   ?= .venv
+ PY     := $(VENV)/bin/python
+ PIP    := $(VENV)/bin/pip
+
+ APP  ?= app.py
+ PORT ?= 7860
+
+ # Core runtime deps (CPU-safe). Torch comes via transitive deps where needed;
+ # you may pin torch externally if required by your environment.
+ REQS = \
+ 	"numpy<2" \
+ 	"gradio==4.27.0" \
+ 	"python-dotenv" \
+ 	"huggingface_hub" \
+ 	"ffmpeg-python" \
+ 	"nltk" \
+ 	"emoji" \
+ 	"langid" \
+ 	"noisereduce" \
+ 	"TTS" \
+ 	"llama-cpp-python>=0.2.90"
+
+ # Dev tools (optional)
+ DEV_REQS = \
+ 	"ruff" \
+ 	"black" \
+ 	"pip-tools"
+
+ # ================================================================
+ # Meta
+ # ================================================================
+ .PHONY: help venv install install-no-llama install-cuda install-dev \
+ 	precache run run-gpu check-ffmpeg check-python lint format \
+ 	freeze deps-update clean deepclean
+
+ help:
+ 	@echo "Targets:"
+ 	@echo "  install       - Create venv (Python 3.11) and install CPU-safe deps"
+ 	@echo "  install-cuda  - Build llama-cpp-python with CUDA offload + install deps"
+ 	@echo "  install-dev   - Install dev tools (ruff, black, pip-tools)"
+ 	@echo "  precache      - Download models & compute voice latents once (no UI)"
+ 	@echo "  run           - Run Gradio app on PORT=$(PORT) (prefers native GPU if present)"
+ 	@echo "  run-gpu       - Run app forcing CUDA_VISIBLE_DEVICES (default 0)"
+ 	@echo "  lint          - Run ruff"
+ 	@echo "  format        - Run black and ruff --fix"
+ 	@echo "  freeze        - Write requirements.txt from current venv"
+ 	@echo "  deps-update   - Upgrade runtime deps"
+ 	@echo "  check-ffmpeg  - Verify ffmpeg is installed"
+ 	@echo "  check-python  - Verify Python 3.11 is available"
+ 	@echo "  clean         - Clear caches/artifacts (keeps venv)"
+ 	@echo "  deepclean     - Remove venv and caches"
+
+ # ================================================================
+ # Environment / setup
+ # ================================================================
+ check-python:
+ 	@command -v $(PYTHON) >/dev/null 2>&1 || \
+ 		{ echo "ERROR: $(PYTHON) not found. Please install Python 3.11 and retry."; exit 1; }
+ 	@echo "OK: $(PYTHON) found."
+
+ venv: check-python
+ 	$(PYTHON) -m venv $(VENV)
+ 	@echo "Virtual environment created at $(VENV)"
+
+ install-no-llama: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	$(PIP) install "numpy<2" "gradio==4.27.0" python-dotenv huggingface_hub ffmpeg-python nltk emoji langid noisereduce TTS
+
+ install: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	# CPU-friendly install of all deps, including llama-cpp-python
+ 	$(PIP) install $(REQS)
+
+ # CUDA build for llama-cpp-python (requires the CUDA toolkit & a compiler)
+ install-cuda: venv
+ 	$(PIP) install --upgrade pip setuptools wheel
+ 	@echo "Building llama-cpp-python with CUDA (GGML_CUDA; replaces the removed LLAMA_CUBLAS flag)…"
+ 	@export CMAKE_ARGS="-DGGML_CUDA=on"; \
+ 	export FORCE_CMAKE=1; \
+ 	$(PIP) install --no-binary=:all: --force-reinstall "llama-cpp-python>=0.2.90"
+ 	# Install the rest of the deps (excluding llama-cpp-python, which we just built)
+ 	$(MAKE) install-no-llama
+ 	@echo "CUDA install complete."
+
+ install-dev: venv
+ 	$(PIP) install --upgrade pip
+ 	$(PIP) install $(DEV_REQS)
+
+ # ================================================================
+ # Utility checks
+ # ================================================================
+ check-ffmpeg:
+ 	@command -v ffmpeg >/dev/null 2>&1 || { echo "ERROR: ffmpeg not found. Install ffmpeg and retry."; exit 1; }
+ 	@ffmpeg -version | head -n 1
+
+ # ================================================================
+ # Workflow targets
+ # ================================================================
+ # Pre-download model assets and compute voice latents (runs your app's functions).
+ # Note: a heredoc cannot span recipe lines (each line runs in its own shell),
+ # so this is a single "python -c" invocation split with backslashes.
+ precache: install check-ffmpeg
+ 	$(PY) -c "from app import precache_assets, init_models_and_latents; \
+ 	precache_assets(); \
+ 	init_models_and_latents(); \
+ 	print('Precache complete.')"
+
+ run: install
+ 	@echo "Starting app on port $(PORT)…"
+ 	PORT=$(PORT) $(PY) $(APP)
+
+ # Run, preferring a specific GPU (default GPU 0). The app itself auto-detects CUDA.
+ run-gpu: install
+ 	@echo "Starting app with CUDA_VISIBLE_DEVICES=$${CUDA_VISIBLE_DEVICES:-0} on port $(PORT)…"
+ 	CUDA_VISIBLE_DEVICES=$${CUDA_VISIBLE_DEVICES:-0} PORT=$(PORT) $(PY) $(APP)
+
+ # Lint / format
+ lint: install-dev
+ 	$(VENV)/bin/ruff check .
+
+ format: install-dev
+ 	$(VENV)/bin/black .
+ 	$(VENV)/bin/ruff check --fix .
+
+ # Freeze dependency snapshot
+ freeze:
+ 	@echo "Writing requirements.txt from current venv…"
+ 	$(VENV)/bin/pip freeze > requirements.txt
+ 	@echo "requirements.txt updated."
+
+ # Upgrade runtime deps (keeps the numpy<2 guard)
+ deps-update: venv
+ 	$(PIP) install --upgrade pip
+ 	$(PIP) install --upgrade "numpy<2" "gradio==4.27.0" python-dotenv huggingface_hub ffmpeg-python nltk emoji langid noisereduce TTS "llama-cpp-python>=0.2.90"
+
+ # ================================================================
+ # Cleanup
+ # ================================================================
+ clean:
+ 	@echo "Cleaning caches…"
+ 	@rm -rf __pycache__ */__pycache__
+ 	@rm -rf .pytest_cache .ruff_cache
+ 	@rm -rf voices/*.tmp
+ 	@rm -rf ~/.cache/huggingface/hub/tmp
+ 	@rm -rf ~/.cache/huggingface/transformers
+ 	@rm -rf ~/.cache/torch
+ 	@rm -rf ~/.cache/pip
+ 	@rm -rf ~/.local/share/tts/tmp
+ 	@echo "Done."
+
+ deepclean: clean
+ 	@echo "Removing venv and model caches…"
+ 	@rm -rf $(VENV)
+ 	@rm -rf ~/.local/share/tts
+ 	@rm -rf voices
+ 	@echo "Done."
app.py CHANGED
@@ -2,6 +2,7 @@
  # 1) SETUP & IMPORTS
  # ===================================================================================
  from __future__ import annotations
+
  import os
  import sys
  import base64
@@ -9,24 +10,24 @@ import struct
  import textwrap
  import requests
  import atexit
- from typing import List, Dict, Tuple, Generator
+ from typing import List, Dict, Tuple, Generator, Any

  # --- Fast, safe defaults ---
  os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
- os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
- os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")  # avoid torchaudio/ffmpeg linkage issues
+ os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
+ os.environ.setdefault("TORCHAUDIO_USE_FFMPEG", "0")  # avoid torchaudio/ffmpeg linkage quirks

  # --- .env early (HF_TOKEN / SECRET_TOKEN) ---
  from dotenv import load_dotenv
  load_dotenv()

- # --- NumPy sanity (Torch 2.2.x wants NumPy 1.x) ---
+ # --- NumPy sanity (Torch 2.2.x prefers NumPy 1.x) ---
  import numpy as _np
  if int(_np.__version__.split(".", 1)[0]) >= 2:
      raise RuntimeError(
-         f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4) for this Space."
+         f"Detected numpy=={_np.__version__}. Please ensure numpy<2 (e.g., 1.26.4)."
      )

  # --- Hugging Face Spaces & ZeroGPU (import BEFORE CUDA libs) ---
@@ -48,7 +49,7 @@ import numpy as np
  from huggingface_hub import HfApi, hf_hub_download
  from llama_cpp import Llama

- # --- Audio decoding (use ffmpeg-python; no torchaudio) ---
+ # --- Audio decoding (pure ffmpeg-python; no torchaudio) ---
  import ffmpeg

  # --- TTS Libraries ---
@@ -63,6 +64,7 @@ import langid
  import emoji
  import noisereduce as nr

+
  # ===================================================================================
  # 2) GLOBALS & HELPERS
  # ===================================================================================
@@ -70,9 +72,11 @@ import noisereduce as nr
  # NLTK data
  nltk.download("punkt", quiet=True)

- # Cached models & latents
+ # Models & caches
  tts_model: Xtts | None = None
  llm_model: Llama | None = None
+
+ # Store latents as NumPy on CPU for portability; convert to device at inference time
  voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}

  # Config
@@ -83,9 +87,6 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
  SENTENCE_SPLIT_LENGTH = 250
  LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]

- # Prefer native GPU if available; otherwise we’ll rely on ZeroGPU (or CPU)
- PREFER_NATIVE_GPU = torch.cuda.is_available()
-
  # System prompts and roles
  default_system_message = (
      "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
@@ -99,7 +100,25 @@ ROLE_PROMPTS["Pirate"] = (
      "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
  )

- # ---------- small utils ----------
+
+ # ---------- tiny utilities ----------
+ def _model_device(m: torch.nn.Module) -> torch.device:
+     try:
+         return next(m.parameters()).device
+     except StopIteration:
+         return torch.device("cpu")
+
+ def _to_device_float_tensor(x: Any, device: torch.device) -> torch.Tensor:
+     if isinstance(x, np.ndarray):
+         return torch.from_numpy(x).float().to(device)
+     if torch.is_tensor(x):
+         return x.to(device, dtype=torch.float32)
+     return torch.as_tensor(x, dtype=torch.float32, device=device)
+
+ def _latents_for_device(latents: Tuple[Any, Any], device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+     gpt_cond, spk = latents
+     return _to_device_float_tensor(gpt_cond, device), _to_device_float_tensor(spk, device)
+
  def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
      if pcm_data.startswith(b"RIFF"):
          return pcm_data
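
Note: the helpers added above separate where latents are stored (CPU NumPy arrays) from where they are consumed. A usage sketch with illustrative shapes (the real ones come from get_conditioning_latents):

    import numpy as np

    # Hypothetical stored pair, as init_models_and_latents() produces below
    latents = (np.zeros((1, 32, 1024), dtype=np.float32),
               np.zeros((1, 512, 1), dtype=np.float32))

    device = _model_device(tts_model)                     # device the model is on now
    gpt_cond, spk = _latents_for_device(latents, device)  # float32 tensors on that device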
@@ -132,6 +151,7 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
      prompt += f"<|user|>\n{message}</s><|assistant|>"
      return prompt

+
  # ---------- robust audio decode (mono via ffmpeg) ----------
  def _decode_audio_ffmpeg_to_mono(path: str, target_sr: int) -> np.ndarray:
      """
@@ -153,6 +173,7 @@ def _decode_audio_ffmpeg_to_mono(path: str, target_sr: int) -> np.ndarray:
      except ffmpeg.Error as e:
          raise RuntimeError(f"ffmpeg decode failed: {e.stderr.decode(errors='ignore') if e.stderr else e}") from e

+
  # ---------- monkey-patch XTTS internal loader to avoid torchaudio/torio ----------
  def _patched_load_audio(audiopath: str, load_sr: int):
      """
@@ -163,30 +184,28 @@
      """
      wav = _decode_audio_ffmpeg_to_mono(audiopath, target_sr=load_sr)
      import torch as _torch  # local import to avoid any circularities
-     audio = _torch.from_numpy(wav).float().unsqueeze(0)  # [1, N]
+     audio = _torch.from_numpy(wav).float().unsqueeze(0)  # [1, N] on CPU
      return audio

  xtts_module.load_audio = _patched_load_audio
-
- # Also patch the common utility location, in case this version imports from there:
  try:
      import TTS.utils.audio as _tts_audio_mod
      _tts_audio_mod.load_audio = _patched_load_audio
  except Exception:
      pass

- # ---------- where Coqui caches models (avoid get_user_data_dir import) ----------
+
  def _coqui_cache_dir() -> str:
      # Matches what TTS uses on Linux: ~/.local/share/tts
      return os.path.join(os.path.expanduser("~"), ".local", "share", "tts")

+
  # ===================================================================================
- # 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
+ # 3) PRECACHE & MODEL LOADERS (CPU at startup to avoid ZeroGPU issues)
  # ===================================================================================

  def precache_assets() -> None:
      """Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
-     # Voices
      print("Pre-caching voice files...")
      file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
      base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
@@ -202,11 +221,9 @@
      except Exception as e:
          print(f"Failed to download {name}: {e}")

-     # XTTS model files
      print("Pre-caching XTTS v2 model files...")
      ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")

-     # LLM GGUF
      print("Pre-caching Zephyr GGUF...")
      try:
          hf_hub_download(
@@ -217,8 +234,9 @@
      except Exception as e:
          print(f"Warning: GGUF pre-cache error: {e}")

- def _load_xtts(device: str) -> Xtts:
-     """Load XTTS from the local cache. Use checkpoint_dir to avoid None path bugs."""
+
+ def _load_xtts(device: str = "cpu") -> Xtts:
+     """Load XTTS from the local cache. Keep CPU at startup to avoid ZeroGPU device mixups."""
      print(f"Loading Coqui XTTS V2 model on {device.upper()}...")
      model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
      ModelManager().download_model(model_name)  # idempotent
@@ -237,75 +255,72 @@
      print("XTTS model loaded.")
      return model

+
  def _load_llama() -> Llama:
      """
-     Load Llama (Zephyr GGUF). Prefer GPU offload if native CUDA build is present,
-     otherwise fall back to pure CPU.
+     Load Llama (Zephyr GGUF).
+     Keep simple & robust: default to CPU (works everywhere).
      """
      print("Loading LLM (Zephyr GGUF)...")
      zephyr_model_path = hf_hub_download(
          repo_id="TheBloke/zephyr-7B-beta-GGUF",
          filename="zephyr-7b-beta.Q5_K_M.gguf"
      )
+     llm = Llama(
+         model_path=zephyr_model_path,
+         n_gpu_layers=0,  # CPU-only for reliability across Spaces/ZeroGPU
+         n_ctx=4096,
+         n_batch=512,
+         verbose=False
+     )
+     print("LLM loaded (CPU).")
+     return llm

-     # Heuristic: try to offload a large number of layers if CUDA build exists.
-     gpu_layers_env = int(os.getenv("LLAMA_GPU_LAYERS", "100"))
-     n_gpu_layers = gpu_layers_env if PREFER_NATIVE_GPU else 0
-
-     try:
-         llm = Llama(
-             model_path=zephyr_model_path,
-             n_gpu_layers=n_gpu_layers,  # if CUDA build exists, this offloads layers
-             n_ctx=4096,
-             n_batch=512,
-             verbose=False
-         )
-         used = "GPU-offload" if n_gpu_layers > 0 else "CPU"
-         print(f"LLM loaded ({used}).")
-         return llm
-     except Exception as e:
-         print(f"LLM GPU offload failed ({e}); falling back to CPU.")
-         llm = Llama(
-             model_path=zephyr_model_path,
-             n_gpu_layers=0,
-             n_ctx=4096,
-             n_batch=512,
-             verbose=False
-         )
-         print("LLM loaded (CPU).")
-         return llm

  def init_models_and_latents() -> None:
      """
-     Preload TTS and LLM. If native GPU is available at startup, load XTTS on CUDA
-     and precompute voice latents there; otherwise do it on CPU (ZeroGPU will move it later).
+     Preload models on CPU and compute voice latents on CPU.
+     This avoids ZeroGPU's "mixed device" errors from torchaudio-based resampling.
      """
      global tts_model, llm_model, voice_latents

-     target_device = "cuda" if PREFER_NATIVE_GPU else "cpu"
-
      if tts_model is None:
-         tts_model = _load_xtts(device=target_device)
+         tts_model = _load_xtts(device="cpu")  # always CPU at startup

      if llm_model is None:
          llm_model = _load_llama()

-     # Pre-compute latents once; uses patched loader (ffmpeg) under the hood
      if not voice_latents:
-         print("Computing voice conditioning latents...")
-         for role, filename in [
-             ("Cloée", "cloee-1.wav"),
-             ("Julian", "julian-bedtime-style-1.wav"),
-             ("Pirate", "pirate_by_coqui.wav"),
-             ("Thera", "thera-1.wav"),
-         ]:
-             path = os.path.join("voices", filename)
-             with torch.no_grad():
-                 voice_latents[role] = tts_model.get_conditioning_latents(
+         print("Computing voice conditioning latents (CPU)...")
+         # Ensure the TTS model is on CPU while computing latents
+         orig_dev = _model_device(tts_model)
+         if orig_dev.type != "cpu":
+             tts_model.to("cpu")
+
+         with torch.no_grad():
+             for role, filename in [
+                 ("Cloée", "cloee-1.wav"),
+                 ("Julian", "julian-bedtime-style-1.wav"),
+                 ("Pirate", "pirate_by_coqui.wav"),
+                 ("Thera", "thera-1.wav"),
+             ]:
+                 path = os.path.join("voices", filename)
+                 gpt_lat, spk_emb = tts_model.get_conditioning_latents(
                      audio_path=path, gpt_cond_len=30, max_ref_length=60
                  )
+                 # Store as NumPy on CPU; convert to device on demand later
+                 voice_latents[role] = (
+                     gpt_lat.detach().cpu().numpy(),
+                     spk_emb.detach().cpu().numpy(),
+                 )
+
+         # Return model to original device (keep CPU at startup for safety)
+         if orig_dev.type != "cpu":
+             tts_model.to(orig_dev)
+
      print("Voice latents ready.")

+
  # Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
  def _close_llm():
      global llm_model
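
Note: the commit pins n_gpu_layers=0 above. If llama-cpp-python was built with the Makefile's install-cuda target, offload could be re-enabled; a sketch, not part of this commit (in llama-cpp-python, n_gpu_layers=-1 offloads all layers):

    from llama_cpp import Llama

    llm = Llama(
        model_path=zephyr_model_path,  # path from hf_hub_download above
        n_gpu_layers=-1,               # -1 = offload every layer when built with CUDA
        n_ctx=4096,
        n_batch=512,
        verbose=False,
    )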
@@ -316,6 +331,7 @@ def _close_llm():
          pass
  atexit.register(_close_llm)

+
  # ===================================================================================
  # 4) INFERENCE HELPERS
  # ===================================================================================
@@ -342,15 +358,19 @@ def generate_text_stream(llm_instance: Llama, prompt: str,
              continue
          yield ch

+
  def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
                            latents: Tuple[np.ndarray, np.ndarray]) -> Generator[bytes, None, None]:
-     gpt_cond_latent, speaker_embedding = latents
+     # Convert stored CPU NumPy latents to tensors on the model's current device
+     device = _model_device(tts_instance)
+     gpt_cond_latent_t, speaker_embedding_t = _latents_for_device(latents, device)
+
      try:
          for chunk in tts_instance.inference_stream(
              text=text,
              language=language,
-             gpt_cond_latent=gpt_cond_latent,
-             speaker_embedding=speaker_embedding,
+             gpt_cond_latent=gpt_cond_latent_t,
+             speaker_embedding=speaker_embedding_t,
              temperature=0.85,
          ):
              if chunk is None:
@@ -360,6 +380,7 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
              f32 = np.clip(f32, -1.0, 1.0).astype(np.float32)
              s16 = (f32 * 32767.0).astype(np.int16)
              yield s16.tobytes()
+
      except RuntimeError as e:
          print(f"Error during TTS inference: {e}")
          if "device-side assert" in str(e) and api:
@@ -369,22 +390,23 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
          except Exception:
              pass

+
  # ===================================================================================
- # 5) ZERO-GPU ENTRYPOINT (also works on native GPU)
+ # 5) ZERO-GPU ENTRYPOINT (safe on native GPU as well)
  # ===================================================================================

- @spaces.GPU(duration=120)  # On native-GPU Spaces this simply runs with the resident GPU.
+ @spaces.GPU(duration=120)  # GPU ops must occur inside this function when on ZeroGPU
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
      if secret_token_input != SECRET_TOKEN:
          raise gr.Error("Invalid secret token provided.")
      if not input_text:
          return []

-     # Ensure models/latents exist
+     # Ensure models/latents exist (loaded on CPU)
      if tts_model is None or llm_model is None or not voice_latents:
          init_models_and_latents()

-     # Prefer GPU if available at call time (ZeroGPU grants CUDA during this function)
+     # During the GPU window, move XTTS to CUDA if available; otherwise stay on CPU
      try:
          if torch.cuda.is_available():
              tts_model.to("cuda")
@@ -393,7 +415,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
      except Exception:
          tts_model.to("cpu")

-     # Generate story text
+     # Generate story text (LLM kept CPU for simplicity & reliability)
      history: List[Tuple[str, str | None]] = [(input_text, None)]
      full_story_text = "".join(
          generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
@@ -428,7 +450,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
          b64_wav = base64.b64encode(pcm_to_wav(final_pcm, sample_rate=24000, channels=1, bit_depth=16)).decode("utf-8")
          results.append({"text": sentence, "audio": b64_wav})

-     # Release GPU immediately if we were in a ZeroGPU window
+     # Leave model on CPU after the ZeroGPU window
      try:
          tts_model.to("cpu")
      except Exception:
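
Note: the diff elides the body of pcm_to_wav used here; for reference, a generic 16-bit PCM WAV wrapper looks like this (sketch, not the commit's exact code):

    import struct

    def wrap_pcm_in_wav(pcm: bytes, sample_rate: int = 24000, channels: int = 1,
                        bit_depth: int = 16) -> bytes:
        # Standard 44-byte RIFF/WAVE header followed by the raw PCM payload
        byte_rate = sample_rate * channels * bit_depth // 8
        block_align = channels * bit_depth // 8
        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF", 36 + len(pcm), b"WAVE",
            b"fmt ", 16, 1, channels, sample_rate, byte_rate, block_align, bit_depth,
            b"data", len(pcm),
        )
        return header + pcm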
@@ -436,6 +458,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_

      return results

+
  # ===================================================================================
  # 6) STARTUP: PRECACHE & UI
  # ===================================================================================
@@ -450,16 +473,16 @@ def build_ui() -> gr.Interface:
          ],
          outputs=gr.JSON(label="Story and Audio Output"),
          title="AI Storyteller with ZeroGPU",
-         description="Enter a prompt to generate a short story with voice narration using on-demand GPU or native GPU when available.",
+         description="Enter a prompt to generate a short story with voice narration. Uses GPU only within the generation call when available.",
          flagging_mode="never",
          allow_flagging="never",
      )

  if __name__ == "__main__":
-     print("===== Startup: pre-cache assets and preload models =====")
+     print("===== Startup: pre-cache assets and preload models (CPU) =====")
      print(f"Python: {sys.version.split()[0]} | Torch CUDA available: {torch.cuda.is_available()}")
      precache_assets()          # 1) download everything to disk
-     init_models_and_latents()  # 2) load models (prefer native GPU) + compute voice latents
+     init_models_and_latents()  # 2) load models on CPU + compute voice latents on CPU
      print("Models and assets ready. Launching UI...")

      demo = build_ui()
 
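Note: each element of the JSON output pairs one sentence with a base64 WAV. Decoding a single item client-side (sketch; field names per the diff above):

    import base64

    item = results[0]  # {"text": "...", "audio": "<base64 WAV>"}
    with open("sentence_0.wav", "wb") as f:
        f.write(base64.b64decode(item["audio"]))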