ruslanmv commited on
Commit
207e4c3
·
1 Parent(s): 7741539

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -189
app.py CHANGED
@@ -1,211 +1,168 @@
1
  # ===================================================================================
2
- # 1) SETUP & IMPORTS
3
  # ===================================================================================
4
  from __future__ import annotations
5
- import os, base64, struct, textwrap, re
6
  import requests
7
- from typing import List, Tuple, Dict, Generator
8
-
9
- # Load .env early (HF_TOKEN / SECRET_TOKEN)
 
 
 
 
 
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
13
- # Fast downloads & stable behavior
14
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster HF downloads
15
- os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
16
- os.environ.setdefault("COQUI_TOS_AGREED", "1")
17
- os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false") # avoid pandas analytics path
18
- os.environ.setdefault("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "1")
19
-
20
- # HF Spaces / Gradio
21
  try:
22
- import spaces # ZeroGPU decorator
23
  except Exception:
 
24
  class _SpacesShim:
25
- def GPU(self, *a, **k):
26
- def _wrap(fn): return fn
 
27
  return _wrap
28
  spaces = _SpacesShim()
29
 
30
  import gradio as gr
31
 
32
- # Core ML
33
  import torch
34
  import numpy as np
35
  from huggingface_hub import HfApi, hf_hub_download
36
  from llama_cpp import Llama
37
 
38
- # Coqui TTS (XTTS v2)
39
  from TTS.tts.configs.xtts_config import XttsConfig
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.utils.manage import ModelManager
42
  from TTS.utils.generic_utils import get_user_data_dir
43
 
44
- # Text / audio processing
45
- import nltk, langid, emoji, noisereduce as nr
46
-
47
- # Download NLTK data once
48
- nltk.download("punkt", quiet=True)
49
 
50
  # ===================================================================================
51
- # 2) GLOBALS & HELPERS
52
  # ===================================================================================
53
- HF_TOKEN = os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
54
  api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
55
  repo_id = "ruslanmv/ai-story-server"
56
-
57
  SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
58
  SENTENCE_SPLIT_LENGTH = 250
59
  LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
60
 
61
- # Cached models & latents
62
- tts_model: Xtts | None = None
63
- llm_model: Llama | None = None
64
- voice_latents: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
65
-
66
- ROLES = ["Cloée", "Julian", "Pirate", "Thera"]
67
  default_system_message = (
68
  "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
69
  "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
70
  )
71
  system_message = os.environ.get("SYSTEM_MESSAGE", default_system_message)
72
- ROLE_PROMPTS = {r: system_message for r in ROLES}
 
73
  ROLE_PROMPTS["Pirate"] = (
74
  "You are AI Beard, a pirate. Craft your response from his first-person perspective. "
75
  "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
76
  )
77
 
78
- def pcm_to_wav(pcm: bytes, sr: int = 24000, ch: int = 1, bit: int = 16) -> bytes:
79
- if pcm.startswith(b"RIFF"): # already WAV
80
- return pcm
81
- chunk = 36 + len(pcm)
82
- hdr = struct.pack(
 
83
  "<4sI4s4sIHHIIHH4sI",
84
- b"RIFF", chunk, b"WAVE", b"fmt ", 16, 1, ch, sr,
85
- sr * ch * bit // 8, ch * bit // 8, bit, b"data", len(pcm)
 
 
 
86
  )
87
- return hdr + pcm
88
 
89
  def split_sentences(text: str, max_len: int) -> List[str]:
90
- out: List[str] = []
91
- for sent in nltk.sent_tokenize(text):
 
92
  if len(sent) > max_len:
93
- out.extend(textwrap.wrap(sent, max_len, break_long_words=True))
94
  else:
95
- out.append(sent)
96
- return out
97
-
98
- def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sys_msg: str) -> str:
99
- prompt = f"<|system|>\n{sys_msg}</s>"
100
- for u, a in history:
101
- if a:
102
- prompt += f"<|user|>\n{u}</s><|assistant|>\n{a}</s>"
103
  prompt += f"<|user|>\n{message}</s><|assistant|>"
104
  return prompt
105
 
106
  # ===================================================================================
107
- # 3) PRE-CACHE (FIRST-RUN DOWNLOADS ONLY)
108
- # ===================================================================================
109
-
110
- def _xtts_paths() -> Tuple[str, str, str, str]:
111
- """
112
- Returns (model_dir, model_pth, vocab_json, speakers_pth) for XTTS v2.
113
- Ensures the model is downloaded.
114
- """
115
- model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
116
- ModelManager().download_model(model_name) # idempotent
117
- model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
118
- return (
119
- model_dir,
120
- os.path.join(model_dir, "model.pth"),
121
- os.path.join(model_dir, "vocab.json"),
122
- os.path.join(model_dir, "speakers_xtts.pth"),
123
- )
124
-
125
- def precache_assets() -> None:
126
- """Download all large artifacts so the first inference is fast."""
127
- # Voices
128
- print("Pre-caching voice files...")
129
- base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
130
- os.makedirs("voices", exist_ok=True)
131
- for name in ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]:
132
- dst = os.path.join("voices", name)
133
- if not os.path.exists(dst):
134
- try:
135
- r = requests.get(base_url + name, timeout=30)
136
- r.raise_for_status()
137
- with open(dst, "wb") as f:
138
- f.write(r.content)
139
- except Exception as e:
140
- print(f"Warning: failed to fetch {name}: {e}")
141
-
142
- # XTTS weights (CPU-safe: just files)
143
- print("Pre-caching XTTS model files...")
144
- model_dir, model_pth, vocab_json, speakers_pth = _xtts_paths()
145
- for p in [model_pth, vocab_json, speakers_pth, os.path.join(model_dir, "config.json")]:
146
- if not os.path.exists(p):
147
- print(f"Warning: missing expected XTTS file: {p}")
148
-
149
- # Llama GGUF
150
- print("Pre-caching LLM (Zephyr GGUF)...")
151
- try:
152
- hf_hub_download(
153
- repo_id="TheBloke/zephyr-7B-beta-GGUF",
154
- filename="zephyr-7b-beta.Q5_K_M.gguf",
155
- force_download=False
156
- )
157
- except Exception as e:
158
- print(f"Warning: GGUF download error: {e}")
159
-
160
- # Run pre-cache at import time (downloads only; no GPU needed)
161
- precache_assets()
162
-
163
- # ===================================================================================
164
- # 4) MODEL LOADERS
165
  # ===================================================================================
166
 
167
  def _load_xtts(device: str) -> Xtts:
168
  print("Loading Coqui XTTS V2 model (first run)...")
169
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
170
- ModelManager().download_model(model_name) # idempotent
171
- model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
172
 
173
- cfg = XttsConfig()
174
- cfg.load_json(os.path.join(model_dir, "config.json"))
175
-
176
- model = Xtts.init_from_config(cfg)
177
- # Use checkpoint_dir so the library finds model.pth, vocab.json and speakers_xtts.pth itself
178
  model.load_checkpoint(
179
- cfg,
180
- checkpoint_dir=model_dir,
 
181
  eval=True,
182
- use_deepspeed=False, # deepspeed not installed in your Space
183
  )
184
  model.to(device)
185
- print("XTTS model ready.")
186
  return model
187
 
188
  def _load_llama() -> Llama:
189
  print("Loading LLM (Zephyr) (first run)...")
190
- gguf = hf_hub_download(
191
  repo_id="TheBloke/zephyr-7B-beta-GGUF",
192
  filename="zephyr-7b-beta.Q5_K_M.gguf"
193
  )
194
- # Try GPU offload then CPU
195
  for n_gpu_layers in (-1, 0):
196
  try:
197
  llm = Llama(
198
- model_path=gguf,
199
  n_gpu_layers=n_gpu_layers,
200
  n_ctx=4096,
201
  n_batch=512,
202
  verbose=False
203
  )
204
- print("LLM loaded with " + ("GPU offload" if n_gpu_layers == -1 else "CPU"))
 
 
 
205
  return llm
206
  except Exception as e:
207
- print(f"LLM init failed (n_gpu_layers={n_gpu_layers}): {e}")
208
- raise RuntimeError("Failed to initialize Llama.")
209
 
210
  def load_models() -> Tuple[Xtts, Llama]:
211
  global tts_model, llm_model
@@ -216,118 +173,134 @@ def load_models() -> Tuple[Xtts, Llama]:
216
  llm_model = _load_llama()
217
  return tts_model, llm_model
218
 
219
- # ===================================================================================
220
- # 5) GENERATION
221
- # ===================================================================================
222
-
223
- def generate_text_stream(llm: Llama, prompt: str,
224
  history: List[Tuple[str, str | None]],
225
- sys_msg: str) -> Generator[str, None, None]:
226
- formatted = format_prompt_zephyr(prompt, history, sys_msg)
227
- stream = llm(
228
- formatted,
229
  temperature=0.7,
230
  max_tokens=512,
231
  top_p=0.95,
232
  stop=LLM_STOP_WORDS,
233
  stream=True
234
  )
235
- for resp in stream:
236
- ch = resp["choices"][0]["text"]
 
237
  if "<|user|>" in ch or (len(ch) == 1 and emoji.is_emoji(ch)):
238
  continue
239
  yield ch
240
 
241
- def generate_audio_stream(tts: Xtts, text: str, lang: str,
242
  latents: Tuple[np.ndarray, np.ndarray]) -> Generator[bytes, None, None]:
243
- gpt_lat, spk_emb = latents
244
  try:
245
- for chunk in tts.inference_stream(
246
  text=text,
247
- language=lang,
248
- gpt_cond_latent=gpt_lat,
249
- speaker_embedding=spk_emb,
250
  temperature=0.85,
251
  ):
252
  if chunk is not None:
253
  yield chunk.detach().cpu().numpy().squeeze().tobytes()
254
  except RuntimeError as e:
255
- print(f"TTS inference error: {e}")
 
256
  if "device-side assert" in str(e) and api:
 
257
  try:
258
- gr.Warning("Critical GPU error. Attempting to restart the Space...")
259
  api.restart_space(repo_id=repo_id)
260
- except Exception:
261
  pass
262
 
263
  # ===================================================================================
264
- # 6) ZERO-GPU MAIN FUNCTION
265
  # ===================================================================================
266
 
267
- @spaces.GPU(duration=120)
268
- def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str):
269
  if secret_token_input != SECRET_TOKEN:
270
  raise gr.Error("Invalid secret token provided.")
271
  if not input_text:
272
  return []
273
 
 
274
  tts, llm = load_models()
275
 
276
- # Pre-compute & cache voice latents once per worker
277
- global voice_latents
278
- if not voice_latents:
279
- for role, fname in [
280
- ("Cloée", "cloee-1.wav"),
281
- ("Julian", "julian-bedtime-style-1.wav"),
282
- ("Pirate", "pirate_by_coqui.wav"),
283
- ("Thera", "thera-1.wav"),
284
- ]:
285
- path = os.path.join("voices", fname)
286
- voice_latents[role] = tts.get_conditioning_latents(
287
- audio_path=path, gpt_cond_len=30, max_ref_length=60
288
- )
 
 
 
 
 
289
 
290
- # Generate story
291
- history = [(input_text, None)]
292
- story = "".join(generate_text_stream(llm, history[-1][0], history[:-1], ROLE_PROMPTS[chatbot_role])).strip()
293
- if not story:
294
  return []
295
 
296
- # Clean & split
297
- story = re.sub(r"([^\x00-\x7F]|\w)([.?!]+)", r"\1 \2", story)
298
- sentences = split_sentences(story, SENTENCE_SPLIT_LENGTH)
299
  lang = langid.classify(sentences[0])[0] if sentences else "en"
300
 
301
- results = []
302
- for s in sentences:
303
- if not any(c.isalnum() for c in s):
304
  continue
305
 
306
- pcm_chunks = generate_audio_stream(tts, s, lang, voice_latents[chatbot_role])
307
- pcm = b"".join(ch for ch in pcm_chunks if ch)
308
 
309
- # Best-effort noise reduction
310
  try:
311
- arr = np.frombuffer(pcm, dtype=np.int16)
312
- if arr.size:
313
- wav_f32 = arr.astype(np.float32) / 32767.0
314
- denoised = nr.reduce_noise(y=wav_f32, sr=24000)
315
- pcm = (denoised * 32767).astype(np.int16).tobytes()
 
 
316
  except Exception:
317
- pass
318
 
319
- b64 = base64.b64encode(pcm_to_wav(pcm)).decode("utf-8")
320
- results.append({"text": s, "audio": b64})
321
 
322
  return results
323
 
324
  # ===================================================================================
325
- # 7) UI
326
  # ===================================================================================
327
 
328
- print("Downloading voice files (idempotent)...")
329
- # Already handled in precache, but keep for local dev logs
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
 
331
  demo = gr.Interface(
332
  fn=generate_story_and_speech,
333
  inputs=[
@@ -338,8 +311,8 @@ demo = gr.Interface(
338
  outputs=gr.JSON(label="Story and Audio Output"),
339
  title="AI Storyteller with ZeroGPU",
340
  description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
341
- flagging_mode="never", # replaces deprecated allow_flagging
342
  )
343
 
344
  if __name__ == "__main__":
345
- demo.queue().launch(analytics_enabled=False)
 
1
  # ===================================================================================
2
+ # 1. SETUP AND IMPORTS
3
  # ===================================================================================
4
  from __future__ import annotations
5
+ import os
6
  import requests
7
+ import base64
8
+ import struct
9
+ import re
10
+ import textwrap
11
+ import uuid
12
+ from typing import List, Dict, Tuple, Generator
13
+
14
+ # --- Load .env early (for HF_TOKEN / SECRET_TOKEN) ---
15
  from dotenv import load_dotenv
16
  load_dotenv()
17
 
18
+ # --- Hugging Face Spaces & ZeroGPU ---
 
 
 
 
 
 
 
19
try:
    import spaces  # Required for ZeroGPU on HF
except Exception:
    # Fallback so the app still runs locally where the `spaces` package is
    # absent: expose a GPU(...) decorator factory that leaves functions
    # completely untouched.
    class _SpacesShim:
        def GPU(self, *args, **kwargs):
            """Stand-in for spaces.GPU: accepts any arguments, returns a no-op decorator."""
            return lambda fn: fn

    spaces = _SpacesShim()
29
 
30
  import gradio as gr
31
 
32
+ # --- Core ML & Data Libraries ---
33
  import torch
34
  import numpy as np
35
  from huggingface_hub import HfApi, hf_hub_download
36
  from llama_cpp import Llama
37
 
38
+ # --- TTS Libraries ---
39
  from TTS.tts.configs.xtts_config import XttsConfig
40
  from TTS.tts.models.xtts import Xtts
41
  from TTS.utils.manage import ModelManager
42
  from TTS.utils.generic_utils import get_user_data_dir
43
 
44
+ # --- Text & Audio Processing ---
45
+ import nltk
46
+ import langid
47
+ import emoji
48
+ import noisereduce as nr
49
 
50
# ===================================================================================
# 2. GLOBAL CONFIGURATION & HELPER FUNCTIONS
# ===================================================================================

# Download NLTK sentence-tokenizer data. NLTK >= 3.8.2 looks up the "punkt_tab"
# resource instead of "punkt" in sent_tokenize, so fetch both to stay compatible
# across NLTK versions (quiet download is a no-op when already present).
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

os.environ["COQUI_TOS_AGREED"] = "1"  # accept Coqui TTS license non-interactively

# Cached model singletons, populated lazily by load_models() (one per worker).
tts_model: Xtts | None = None
llm_model: Llama | None = None

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=HF_TOKEN) if HF_TOKEN else None  # used to soft-restart the Space on GPU faults
repo_id = "ruslanmv/ai-story-server"
SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
SENTENCE_SPLIT_LENGTH = 250  # max characters handed to the TTS per chunk
LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]

# System prompts and roles
default_system_message = (
    "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
    "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
)
system_message = os.environ.get("SYSTEM_MESSAGE", default_system_message)
ROLES = ["Cloée", "Julian", "Pirate", "Thera"]
ROLE_PROMPTS = {role: system_message for role in ROLES}
ROLE_PROMPTS["Pirate"] = (
    "You are AI Beard, a pirate. Craft your response from his first-person perspective. "
    "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
)
83
 
84
+ # --- Audio helpers ---
85
# --- Audio helpers ---
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
    """Wrap raw PCM samples in a minimal 44-byte RIFF/WAVE header.

    Data that already starts with a RIFF header is returned unchanged.
    """
    if pcm_data.startswith(b"RIFF"):
        return pcm_data

    bytes_per_frame = channels * bit_depth // 8
    byte_rate = sample_rate * bytes_per_frame
    wav_header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + len(pcm_data),  # RIFF chunk size: header remainder + payload
        b"WAVE",
        b"fmt ",
        16,                  # fmt sub-chunk size for plain PCM
        1,                   # audio format 1 = linear PCM
        channels,
        sample_rate,
        byte_rate,
        bytes_per_frame,     # block align
        bit_depth,
        b"data",
        len(pcm_data),
    )
    return wav_header + pcm_data
98
 
99
def split_sentences(text: str, max_len: int) -> List[str]:
    """Tokenize *text* into sentences, wrapping any sentence longer than *max_len* characters."""
    out: List[str] = []
    for sentence in nltk.sent_tokenize(text):
        pieces = (
            textwrap.wrap(sentence, max_len, break_long_words=True)
            if len(sentence) > max_len
            else [sentence]
        )
        out.extend(pieces)
    return out
108
+
109
def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], system_message: str) -> str:
    """Assemble a Zephyr chat-format prompt from system message, history and the new turn.

    History turns whose bot response is missing or empty are skipped.
    """
    parts = [f"<|system|>\n{system_message}</s>"]
    parts.extend(
        f"<|user|>\n{user_turn}</s><|assistant|>\n{bot_turn}</s>"
        for user_turn, bot_turn in history
        if bot_turn
    )
    parts.append(f"<|user|>\n{message}</s><|assistant|>")
    return "".join(parts)
116
 
117
  # ===================================================================================
118
+ # 3. CORE AI FUNCTIONS (Model Loading & Inference)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # ===================================================================================
120
 
121
def _load_xtts(device: str) -> Xtts:
    """Download (if needed) and load the Coqui XTTS v2 model onto *device*.

    NOTE(review): loads checkpoint via explicit checkpoint_path/vocab_path;
    assumes the downloaded model dir contains model.pth, vocab.json and
    config.json — verify against the installed TTS version.
    """
    print("Loading Coqui XTTS V2 model (first run)...")
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    ModelManager().download_model(model_name)  # idempotent: skips files already on disk
    xtts_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))

    xtts_config = XttsConfig()
    xtts_config.load_json(os.path.join(xtts_dir, "config.json"))
    model = Xtts.init_from_config(xtts_config)
    # NOTE: deepspeed not installed; keep False for Spaces
    model.load_checkpoint(
        xtts_config,
        checkpoint_path=os.path.join(xtts_dir, "model.pth"),
        vocab_path=os.path.join(xtts_dir, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )
    model.to(device)
    print("XTTS model loaded.")
    return model
141
 
142
def _load_llama() -> Llama:
    """Download the Zephyr-7B GGUF weights and build a Llama instance.

    Tries full GPU offload first (n_gpu_layers=-1) and falls back to a
    CPU-only load (n_gpu_layers=0).

    Raises:
        RuntimeError: if neither configuration initializes; chained to the
            last underlying exception so the real cause is not lost.
    """
    print("Loading LLM (Zephyr) (first run)...")
    zephyr_model_path = hf_hub_download(
        repo_id="TheBloke/zephyr-7B-beta-GGUF",
        filename="zephyr-7b-beta.Q5_K_M.gguf"
    )
    # Try GPU offload if available, else CPU
    last_error: Exception | None = None
    for n_gpu_layers in (-1, 0):
        try:
            llm = Llama(
                model_path=zephyr_model_path,
                n_gpu_layers=n_gpu_layers,
                n_ctx=4096,
                n_batch=512,
                verbose=False
            )
            if n_gpu_layers == -1:
                print("LLM loaded with GPU offload.")
            else:
                print("LLM loaded (CPU).")
            return llm
        except Exception as e:
            last_error = e
            print(f"LLM init with n_gpu_layers={n_gpu_layers} failed: {e}")
    # Both attempts failed: surface the last failure as the cause.
    raise RuntimeError("Failed to initialize Llama model.") from last_error
166
 
167
  def load_models() -> Tuple[Xtts, Llama]:
168
  global tts_model, llm_model
 
173
  llm_model = _load_llama()
174
  return tts_model, llm_model
175
 
176
def generate_text_stream(llm_instance: Llama, prompt: str,
                         history: List[Tuple[str, str | None]],
                         system_message: str) -> Generator[str, None, None]:
    """Stream text fragments from the LLM for *prompt*, filtering artefacts.

    Fragments containing the <|user|> control token, or consisting of a
    single emoji, are dropped; everything else is yielded as it arrives.
    """
    zephyr_prompt = format_prompt_zephyr(prompt, history, system_message)
    token_stream = llm_instance(
        zephyr_prompt,
        temperature=0.7,
        max_tokens=512,
        top_p=0.95,
        stop=LLM_STOP_WORDS,
        stream=True,
    )
    for event in token_stream:
        fragment = event["choices"][0]["text"]
        is_control = "<|user|>" in fragment
        is_lone_emoji = len(fragment) == 1 and emoji.is_emoji(fragment)
        if not (is_control or is_lone_emoji):
            yield fragment
194
 
195
def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
                          latents: Tuple[np.ndarray, np.ndarray]) -> Generator[bytes, None, None]:
    """Stream raw PCM byte chunks synthesized by XTTS for *text*.

    *latents* is the (gpt_cond_latent, speaker_embedding) pair for the voice.
    On a "device-side assert" RuntimeError, attempts a soft restart of the
    Space via the HF API when a token is configured.
    """
    cond_latent, spk_embedding = latents
    try:
        stream = tts_instance.inference_stream(
            text=text,
            language=language,
            gpt_cond_latent=cond_latent,
            speaker_embedding=spk_embedding,
            temperature=0.85,
        )
        for chunk in stream:
            if chunk is None:
                continue
            yield chunk.detach().cpu().numpy().squeeze().tobytes()
    except RuntimeError as e:
        print(f"Error during TTS inference: {e}")
        # Soft-restart if GPU went bad and we can talk to the HF API
        if "device-side assert" in str(e) and api:
            gr.Warning("Critical GPU error. Attempting to restart the Space...")
            try:
                api.restart_space(repo_id=repo_id)
            except Exception:
                # Best-effort: the restart call itself may fail; nothing to do.
                pass
217
 
218
  # ===================================================================================
219
+ # 4. MAIN GRADIO FUNCTION (Decorated for ZeroGPU)
220
  # ===================================================================================
221
 
222
# Reference voice file for each supported narrator role.
_ROLE_VOICE_FILES: Dict[str, str] = {
    "Cloée": "cloee-1.wav",
    "Julian": "julian-bedtime-style-1.wav",
    "Pirate": "pirate_by_coqui.wav",
    "Thera": "thera-1.wav",
}

# Per-worker cache of (gpt_cond_latent, speaker_embedding) keyed by role, so
# the expensive conditioning pass over the reference WAV runs once per role
# instead of for all four roles on every single request.
_VOICE_LATENT_CACHE: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}

def _get_voice_latents(tts: Xtts, role: str) -> Tuple[np.ndarray, np.ndarray]:
    """Return cached conditioning latents for *role*, computing them on first use."""
    if role not in _VOICE_LATENT_CACHE:
        path = os.path.join("voices", _ROLE_VOICE_FILES[role])
        _VOICE_LATENT_CACHE[role] = tts.get_conditioning_latents(
            audio_path=path, gpt_cond_len=30, max_ref_length=60
        )
    return _VOICE_LATENT_CACHE[role]

def _denoise_pcm(pcm_data: bytes) -> bytes:
    """Best-effort noise reduction on 16-bit mono PCM; returns the input unchanged on any failure."""
    try:
        data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
        if data_s16.size == 0:
            return pcm_data
        float_data = data_s16.astype(np.float32) / 32767.0
        reduced = nr.reduce_noise(y=float_data, sr=24000)
        return (reduced * 32767).astype(np.int16).tobytes()
    except Exception:
        # Noise reduction is optional polish; never let it break synthesis.
        return pcm_data

@spaces.GPU(duration=120)  # Request GPU for 120 seconds
def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
    """Generate a short story for *input_text* and narrate it in *chatbot_role*'s voice.

    Returns a list of {"text": sentence, "audio": base64-encoded WAV} dicts,
    or [] when the prompt is empty or the LLM produced no text.

    Raises:
        gr.Error: when the supplied secret token does not match SECRET_TOKEN.
    """
    if secret_token_input != SECRET_TOKEN:
        raise gr.Error("Invalid secret token provided.")
    if not input_text:
        return []

    # Load (or reuse) the per-worker TTS and LLM singletons.
    tts, llm = load_models()

    # Conditioning latents for the requested voice only; cached across requests.
    role_latents = _get_voice_latents(tts, chatbot_role)

    # Generate story text
    history: List[Tuple[str, str | None]] = [(input_text, None)]
    full_story_text = "".join(
        generate_text_stream(llm, history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role])
    ).strip()
    if not full_story_text:
        return []

    # Tokenize into shorter sentences for TTS
    sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
    lang = langid.classify(sentences[0])[0] if sentences else "en"

    results: List[Dict[str, str]] = []
    for sentence in sentences:
        # Skip punctuation-only fragments; they produce no usable audio.
        if not any(c.isalnum() for c in sentence):
            continue

        audio_chunks = generate_audio_stream(tts, sentence, lang, role_latents)
        pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
        final_pcm = _denoise_pcm(pcm_data)

        b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
        results.append({"text": sentence, "audio": b64_wav})

    return results
282
 
283
  # ===================================================================================
284
+ # 5. GRADIO INTERFACE LAUNCH
285
  # ===================================================================================
286
 
287
# Download voice files on startup: fetch each reference WAV used for voice
# conditioning, skipping any that are already on disk.
print("Downloading voice files...")
base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
os.makedirs("voices", exist_ok=True)
for name in ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]:
    dst = os.path.join("voices", name)
    if os.path.exists(dst):
        continue
    try:
        resp = requests.get(base_url + name, timeout=30)
        resp.raise_for_status()
        with open(dst, "wb") as f:
            f.write(resp.content)
    except Exception as e:
        # Best-effort: a missing voice only breaks that one role at latent time.
        print(f"Failed to download {name}: {e}")
302
 
303
+ # Define the Gradio Interface
304
  demo = gr.Interface(
305
  fn=generate_story_and_speech,
306
  inputs=[
 
311
  outputs=gr.JSON(label="Story and Audio Output"),
312
  title="AI Storyteller with ZeroGPU",
313
  description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
314
+ allow_flagging="never",
315
  )
316
 
317
  if __name__ == "__main__":
318
+ demo.queue().launch()