ruslanmv committed
Commit d662d9a · 1 Parent(s): f3fa464

Update app.py

Files changed (1)
  1. app.py +145 -107
app.py CHANGED
@@ -1,20 +1,22 @@
 # ===================================================================================
-# 1. SETUP AND IMPORTS
+# 1) SETUP & IMPORTS
 # ===================================================================================
 from __future__ import annotations
 import os
-import requests
 import base64
 import struct
 import re
 import textwrap
-import uuid
+import requests
 from typing import List, Dict, Tuple, Generator
 
-# Make sure Gradio analytics is off (so we don't need pandas 2.x)
-os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
+# --- Fast, safe defaults ---
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("COQUI_TOS_AGREED", "1")
+os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")  # truly disable analytics
 
-# --- Load .env early (for HF_TOKEN / SECRET_TOKEN) ---
+# --- Load .env early (HF_TOKEN / SECRET_TOKEN) ---
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -22,7 +24,6 @@ load_dotenv()
 try:
     import spaces  # Required for ZeroGPU on HF
 except Exception:
-    # Allow local runs without the spaces package
     class _SpacesShim:
         def GPU(self, *args, **kwargs):
             def _wrap(fn):
@@ -51,19 +52,18 @@ import emoji
 import noisereduce as nr
 
 # ===================================================================================
-# 2. GLOBAL CONFIGURATION & HELPER FUNCTIONS
+# 2) GLOBALS & HELPERS
 # ===================================================================================
 
-# Download NLTK data (punkt)
+# Download NLTK data (punkt) once
 nltk.download("punkt", quiet=True)
 
-os.environ["COQUI_TOS_AGREED"] = "1"
-
-# Cached models
+# Cached models & latents
 tts_model: Xtts | None = None
 llm_model: Llama | None = None
+voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
 
-# Configuration
+# Config
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 repo_id = "ruslanmv/ai-story-server"
@@ -84,7 +84,7 @@ ROLE_PROMPTS["Pirate"] = (
84
  "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
85
  )
86
 
87
- # --- Audio helpers ---
88
  def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
89
  if pcm_data.startswith(b"RIFF"):
90
  return pcm_data
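
Review note: the body of pcm_to_wav lies outside this hunk, so only its RIFF guard is visible. For context, a minimal sketch of the standard 44-byte RIFF/WAVE header such a helper prepends to raw 16-bit PCM (field layout per the WAV format; the committed helper may differ in details):

    import struct

    def wav_header(n_bytes: int, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
        # Standard 44-byte RIFF/WAVE header for uncompressed PCM (format tag 1).
        byte_rate = sample_rate * channels * bit_depth // 8
        block_align = channels * bit_depth // 8
        return struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF", 36 + n_bytes, b"WAVE",
            b"fmt ", 16, 1, channels, sample_rate, byte_rate, block_align, bit_depth,
            b"data", n_bytes,
        )
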
@@ -118,68 +118,117 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
     return prompt
 
 # ===================================================================================
-# 3. CORE AI FUNCTIONS (Model Loading & Inference)
+# 3) PRECACHE & MODEL LOADERS (RUN BEFORE FIRST INFERENCE)
 # ===================================================================================
 
+def precache_assets() -> None:
+    """Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
+    # Voices
+    print("Pre-caching voice files...")
+    file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
+    base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
+    os.makedirs("voices", exist_ok=True)
+    for name in file_names:
+        dst = os.path.join("voices", name)
+        if not os.path.exists(dst):
+            try:
+                resp = requests.get(base_url + name, timeout=30)
+                resp.raise_for_status()
+                with open(dst, "wb") as f:
+                    f.write(resp.content)
+            except Exception as e:
+                print(f"Failed to download {name}: {e}")
+
+    # XTTS model files
+    print("Pre-caching XTTS v2 model files...")
+    ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")
+
+    # LLM GGUF
+    print("Pre-caching Zephyr GGUF...")
+    try:
+        hf_hub_download(
+            repo_id="TheBloke/zephyr-7B-beta-GGUF",
+            filename="zephyr-7b-beta.Q5_K_M.gguf",
+            force_download=False
+        )
+    except Exception as e:
+        print(f"Warning: GGUF pre-cache error: {e}")
+
 def _load_xtts(device: str) -> Xtts:
-    print("Loading Coqui XTTS V2 model (first run)...")
+    """Load XTTS from the local cache. Use checkpoint_dir to avoid None path bug."""
+    print("Loading Coqui XTTS V2 model (CPU first)...")
     model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-    ModelManager().download_model(model_name)
-    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+    ModelManager().download_model(model_name)  # idempotent
+    model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+
+    cfg = XttsConfig()
+    cfg.load_json(os.path.join(model_dir, "config.json"))
+    model = Xtts.init_from_config(cfg)
 
-    config = XttsConfig()
-    config.load_json(os.path.join(model_path, "config.json"))
-    model = Xtts.init_from_config(config)
-    # NOTE: deepspeed not installed; keep False for Spaces
+    # IMPORTANT: use checkpoint_dir (fixes speakers file path resolution)
     model.load_checkpoint(
-        config,
-        checkpoint_path=os.path.join(model_path, "model.pth"),
-        vocab_path=os.path.join(model_path, "vocab.json"),
+        cfg,
+        checkpoint_dir=model_dir,
         eval=True,
-        use_deepspeed=False,
+        use_deepspeed=False,  # deepspeed not installed in Spaces
     )
     model.to(device)
     print("XTTS model loaded.")
     return model
 
 def _load_llama() -> Llama:
-    print("Loading LLM (Zephyr) (first run)...")
+    """Load Llama (Zephyr GGUF) on CPU so it's ready immediately."""
+    print("Loading LLM (Zephyr GGUF) on CPU...")
     zephyr_model_path = hf_hub_download(
         repo_id="TheBloke/zephyr-7B-beta-GGUF",
         filename="zephyr-7b-beta.Q5_K_M.gguf"
     )
-    # Try GPU offload if available, else CPU
-    for n_gpu_layers in (-1, 0):
-        try:
-            llm = Llama(
-                model_path=zephyr_model_path,
-                n_gpu_layers=n_gpu_layers,
-                n_ctx=4096,
-                n_batch=512,
-                verbose=False
-            )
-            if n_gpu_layers == -1:
-                print("LLM loaded with GPU offload.")
-            else:
-                print("LLM loaded (CPU).")
-            return llm
-        except Exception as e:
-            print(f"LLM init with n_gpu_layers={n_gpu_layers} failed: {e}")
-    raise RuntimeError("Failed to initialize Llama model.")
-
-def load_models() -> Tuple[Xtts, Llama]:
-    global tts_model, llm_model
+    # Initialize CPU instance (n_gpu_layers=0). If you want GPU offload, you can
+    # create a second instance inside the GPU window, but CPU is simpler & ready now.
+    llm = Llama(
+        model_path=zephyr_model_path,
+        n_gpu_layers=0,  # CPU by default to keep it ready without GPU
+        n_ctx=4096,
+        n_batch=512,
+        verbose=False
+    )
+    print("LLM loaded (CPU).")
+    return llm
+
+def init_models_and_latents() -> None:
+    """Preload TTS and LLM on CPU and compute voice latents once."""
+    global tts_model, llm_model, voice_latents
     device = "cuda" if torch.cuda.is_available() else "cpu"
+
     if tts_model is None:
-        tts_model = _load_xtts(device)
+        tts_model = _load_xtts(device="cpu")  # keep on CPU at startup
+
     if llm_model is None:
         llm_model = _load_llama()
-    return tts_model, llm_model
+
+    # Pre-compute latents once (CPU OK)
+    if not voice_latents:
+        print("Computing voice conditioning latents...")
+        for role, filename in [
+            ("Cloée", "cloee-1.wav"),
+            ("Julian", "julian-bedtime-style-1.wav"),
+            ("Pirate", "pirate_by_coqui.wav"),
+            ("Thera", "thera-1.wav"),
+        ]:
+            path = os.path.join("voices", filename)
+            voice_latents[role] = tts_model.get_conditioning_latents(
+                audio_path=path, gpt_cond_len=30, max_ref_length=60
+            )
+        print("Voice latents ready.")
+
+# ===================================================================================
+# 4) INFERENCE HELPERS
+# ===================================================================================
 
 def generate_text_stream(llm_instance: Llama, prompt: str,
                          history: List[Tuple[str, str | None]],
-                         system_message: str) -> Generator[str, None, None]:
-    formatted_prompt = format_prompt_zephyr(prompt, history, system_message)
+                         system_message_text: str) -> Generator[str, None, None]:
+    formatted_prompt = format_prompt_zephyr(prompt, history, system_message_text)
     stream = llm_instance(
         formatted_prompt,
         temperature=0.7,
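
Review note: this hunk pins llama-cpp to CPU (n_gpu_layers=0) so the LLM is usable before any ZeroGPU window opens. If GPU offload is wanted again, an env-var knob is a smaller change than the old retry loop; a sketch (the LLAMA_N_GPU_LAYERS variable is hypothetical, not part of this commit):

    import os
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    zephyr_model_path = hf_hub_download(
        repo_id="TheBloke/zephyr-7B-beta-GGUF",
        filename="zephyr-7b-beta.Q5_K_M.gguf",
    )
    # 0 keeps the CPU default above; -1 offloads all layers when
    # llama-cpp-python was built with CUDA support.
    n_gpu_layers = int(os.getenv("LLAMA_N_GPU_LAYERS", "0"))
    llm = Llama(model_path=zephyr_model_path, n_gpu_layers=n_gpu_layers,
                n_ctx=4096, n_batch=512, verbose=False)
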
@@ -190,9 +239,8 @@ def generate_text_stream(llm_instance: Llama, prompt: str,
     )
     for response in stream:
         ch = response["choices"][0]["text"]
-        # Guard against control tokens & isolated emoji artefacts
        try:
-            is_single_emoji = (len(ch) == 1 and emoji.is_emoji(ch))  # emoji>=2.x
+            is_single_emoji = (len(ch) == 1 and emoji.is_emoji(ch))
        except Exception:
            is_single_emoji = False
        if "<|user|>" in ch or is_single_emoji:
@@ -214,7 +262,6 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
             yield chunk.detach().cpu().numpy().squeeze().tobytes()
     except RuntimeError as e:
         print(f"Error during TTS inference: {e}")
-        # Soft-restart if GPU went bad and we can talk to the HF API
         if "device-side assert" in str(e) and api:
             gr.Warning("Critical GPU error. Attempting to restart the Space...")
             try:
@@ -223,38 +270,34 @@
             pass
 
 # ===================================================================================
-# 4. MAIN GRADIO FUNCTION (Decorated for ZeroGPU)
+# 5) ZERO-GPU ENTRYPOINT
 # ===================================================================================
 
-@spaces.GPU(duration=120)  # Request GPU for 120 seconds
+@spaces.GPU(duration=120)  # Request GPU for 120s (can tune later)
 def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
     if secret_token_input != SECRET_TOKEN:
         raise gr.Error("Invalid secret token provided.")
     if not input_text:
         return []
 
-    # Load models
-    tts, llm = load_models()
-
-    # Pre-compute voice latents
-    latent_map: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
-    for role, filename in [
-        ("Cloée", "cloee-1.wav"),
-        ("Julian", "julian-bedtime-style-1.wav"),
-        ("Pirate", "pirate_by_coqui.wav"),
-        ("Thera", "thera-1.wav"),
-    ]:
-        path = os.path.join("voices", filename)
-        latent_map[role] = tts.get_conditioning_latents(
-            audio_path=path, gpt_cond_len=30, max_ref_length=60
-        )
+    # Models & latents are preloaded at startup; ensure available
+    if tts_model is None or llm_model is None or not voice_latents:
+        init_models_and_latents()
+
+    # If ZeroGPU provided a GPU for this call, move XTTS to CUDA for faster audio
+    try:
+        if torch.cuda.is_available():
+            tts_model.to("cuda")
+        else:
+            tts_model.to("cpu")
+    except Exception:
+        tts_model.to("cpu")
 
     # Generate story text
     history: List[Tuple[str, str | None]] = [(input_text, None)]
     full_story_text = "".join(
-        generate_text_stream(llm, history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role])
+        generate_text_stream(llm_model, history[-1][0], history[:-1], system_message_text=ROLE_PROMPTS[chatbot_role])
     ).strip()
-
     if not full_story_text:
         return []
 
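
Review note: the CUDA move above is undone at the end of the function, but an exception mid-story would leave XTTS on the GPU. A try/finally wrapper makes the hand-back unconditional; a sketch against the tts_model global this commit introduces (run_on_gpu_window is a hypothetical helper, not part of the diff):

    import torch

    def run_on_gpu_window(fn, *args, **kwargs):
        # Move XTTS to CUDA for one ZeroGPU call; hand the GPU back even if fn raises.
        tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
        try:
            return fn(*args, **kwargs)
        finally:
            tts_model.to("cpu")
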
@@ -267,7 +310,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
         if not any(c.isalnum() for c in sentence):
             continue
 
-        audio_chunks = generate_audio_stream(tts, sentence, lang, latent_map[chatbot_role])
+        audio_chunks = generate_audio_stream(tts_model, sentence, lang, voice_latents[chatbot_role])
         pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
 
         # Optional noise reduction (best-effort)
@@ -285,43 +328,38 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
         b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
         results.append({"text": sentence, "audio": b64_wav})
 
+    # Return XTTS to CPU to free GPU instantly after the call
+    try:
+        tts_model.to("cpu")
+    except Exception:
+        pass
+
     return results
 
 # ===================================================================================
-# 5. GRADIO INTERFACE LAUNCH
+# 6) STARTUP: PRECACHE & UI
 # ===================================================================================
 
-# Download voice files on startup
-print("Downloading voice files...")
-file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
-base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
-os.makedirs("voices", exist_ok=True)
-for name in file_names:
-    dst = os.path.join("voices", name)
-    if not os.path.exists(dst):
-        try:
-            resp = requests.get(base_url + name, timeout=30)
-            resp.raise_for_status()
-            with open(dst, "wb") as f:
-                f.write(resp.content)
-        except Exception as e:
-            print(f"Failed to download {name}: {e}")
-
-# Define the Gradio Interface
-demo = gr.Interface(
-    fn=generate_story_and_speech,
-    inputs=[
-        gr.Textbox(label="Secret Token", type="password", value=SECRET_TOKEN),
-        gr.Textbox(placeholder="What should the story be about?", label="Story Prompt"),
-        gr.Dropdown(choices=ROLES, label="Select a Storyteller", value="Cloée"),
-    ],
-    outputs=gr.JSON(label="Story and Audio Output"),
-    title="AI Storyteller with ZeroGPU",
-    description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
-    allow_flagging="never",
-    analytics_enabled=False,  # <- keep analytics off to avoid pandas 2.x requirement
-)
+def build_ui() -> gr.Interface:
+    return gr.Interface(
+        fn=generate_story_and_speech,
+        inputs=[
+            gr.Textbox(label="Secret Token", type="password", value=SECRET_TOKEN),
+            gr.Textbox(placeholder="What should the story be about?", label="Story Prompt"),
+            gr.Dropdown(choices=ROLES, label="Select a Storyteller", value="Cloée"),
+        ],
+        outputs=gr.JSON(label="Story and Audio Output"),
+        title="AI Storyteller with ZeroGPU",
+        description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
+        flagging_mode="never",  # replaces deprecated allow_flagging
+    )
 
 if __name__ == "__main__":
-    # For Spaces or Docker, these defaults are handy; adjust as needed.
+    print("===== Startup: pre-cache assets and preload models =====")
+    precache_assets()            # 1) download everything to disk
+    init_models_and_latents()    # 2) load models on CPU + compute voice latents
+    print("Models and assets ready. Launching UI...")
+
+    demo = build_ui()
+    # queue + analytics disabled (env) keeps pandas out of the path
     demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
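
Review note: the endpoint returns a JSON list of {"text", "audio"} pairs with base64-encoded WAV per sentence. A minimal client-side smoke test, assuming the public Space name below and Gradio's default /predict route (both are assumptions, not part of the diff):

    import base64
    from gradio_client import Client  # pip install gradio_client

    client = Client("ruslanmv/ai-story-server")  # assumed Space name
    results = client.predict(
        "<secret token>",               # Secret Token
        "A dragon who learns to sing",  # Story Prompt
        "Cloée",                        # Select a Storyteller
        api_name="/predict",            # assumed default gr.Interface route
    )
    # results is the parsed JSON output: one dict per narrated sentence
    for i, item in enumerate(results):
        with open(f"sentence_{i:02d}.wav", "wb") as f:
            f.write(base64.b64decode(item["audio"]))
        print(item["text"])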