Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import struct
|
|
| 8 |
import textwrap
|
| 9 |
import requests
|
| 10 |
import atexit
|
|
|
|
| 11 |
from typing import List, Dict, Tuple, Generator
|
| 12 |
|
| 13 |
# --- Fast, safe defaults ---
|
|
@@ -38,8 +39,8 @@ import torch
|
|
| 38 |
import numpy as np
|
| 39 |
from huggingface_hub import HfApi, hf_hub_download
|
| 40 |
from llama_cpp import Llama
|
| 41 |
-
import torchaudio
|
| 42 |
-
import soundfile as sf
|
| 43 |
|
| 44 |
# --- TTS Libraries ---
|
| 45 |
from TTS.tts.configs.xtts_config import XttsConfig
|
|
@@ -57,15 +58,12 @@ import noisereduce as nr
|
|
| 57 |
# 2) GLOBALS & HELPERS
|
| 58 |
# ===================================================================================
|
| 59 |
|
| 60 |
-
# Download NLTK data (punkt) once
|
| 61 |
nltk.download("punkt", quiet=True)
|
| 62 |
|
| 63 |
-
# Cached models & latents
|
| 64 |
tts_model: Xtts | None = None
|
| 65 |
llm_model: Llama | None = None
|
| 66 |
voice_latents: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
|
| 67 |
|
| 68 |
-
# Config
|
| 69 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 70 |
api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 71 |
repo_id = "ruslanmv/ai-story-server"
|
|
@@ -73,7 +71,6 @@ SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
|
|
| 73 |
SENTENCE_SPLIT_LENGTH = 250
|
| 74 |
LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
|
| 75 |
|
| 76 |
-
# System prompts and roles
|
| 77 |
default_system_message = (
|
| 78 |
"You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
|
| 79 |
"Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
|
|
@@ -86,7 +83,6 @@ ROLE_PROMPTS["Pirate"] = (
|
|
| 86 |
"Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
|
| 87 |
)
|
| 88 |
|
| 89 |
-
# ---------- small utils ----------
|
| 90 |
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
|
| 91 |
if pcm_data.startswith(b"RIFF"):
|
| 92 |
return pcm_data
|
|
@@ -124,7 +120,6 @@ def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], sy
|
|
| 124 |
# ===================================================================================
|
| 125 |
|
| 126 |
def precache_assets() -> None:
|
| 127 |
-
"""Download voice WAVs, XTTS weights, and Zephyr GGUF to local cache before any inference."""
|
| 128 |
print("Pre-caching voice files...")
|
| 129 |
file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
|
| 130 |
base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
|
|
@@ -154,7 +149,6 @@ def precache_assets() -> None:
|
|
| 154 |
print(f"Warning: GGUF pre-cache error: {e}")
|
| 155 |
|
| 156 |
def _load_xtts(device: str) -> Xtts:
|
| 157 |
-
"""Load XTTS from the local cache."""
|
| 158 |
print("Loading Coqui XTTS V2 model (CPU first)...")
|
| 159 |
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 160 |
model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
|
@@ -170,7 +164,6 @@ def _load_xtts(device: str) -> Xtts:
|
|
| 170 |
return model
|
| 171 |
|
| 172 |
def _load_llama() -> Llama:
|
| 173 |
-
"""Load Llama (Zephyr GGUF) on CPU so it's ready immediately."""
|
| 174 |
print("Loading LLM (Zephyr GGUF) on CPU...")
|
| 175 |
zephyr_model_path = hf_hub_download(
|
| 176 |
repo_id="TheBloke/zephyr-7B-beta-GGUF",
|
|
@@ -183,33 +176,26 @@ def _load_llama() -> Llama:
|
|
| 183 |
print("LLM loaded (CPU).")
|
| 184 |
return llm
|
| 185 |
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
"""Loads audio using soundfile, converts to a Torch tensor, and resamples if needed."""
|
| 189 |
try:
|
| 190 |
-
# Read audio file into a NumPy array
|
| 191 |
audio_np, original_sr = sf.read(path, dtype='float32')
|
| 192 |
-
|
| 193 |
-
# Ensure it's mono
|
| 194 |
if audio_np.ndim > 1:
|
| 195 |
audio_np = np.mean(audio_np, axis=1)
|
| 196 |
-
|
| 197 |
-
# Convert to a PyTorch tensor
|
| 198 |
waveform = torch.from_numpy(audio_np).float()
|
| 199 |
|
| 200 |
-
# Resample if the sample rate is not the target rate
|
| 201 |
if original_sr != target_sr:
|
| 202 |
print(f"Resampling audio from {original_sr}Hz to {target_sr}Hz.")
|
| 203 |
resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
|
| 204 |
waveform = resampler(waveform)
|
| 205 |
-
|
| 206 |
-
return waveform.unsqueeze(0)
|
| 207 |
except Exception as e:
|
| 208 |
print(f"Error loading audio file {path}: {e}")
|
| 209 |
raise
|
| 210 |
|
| 211 |
def init_models_and_latents() -> None:
|
| 212 |
-
"""Preload
|
| 213 |
global tts_model, llm_model, voice_latents
|
| 214 |
|
| 215 |
if tts_model is None:
|
|
@@ -220,17 +206,28 @@ def init_models_and_latents() -> None:
|
|
| 220 |
|
| 221 |
if not voice_latents:
|
| 222 |
print("Computing voice conditioning latents...")
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
print("Voice latents ready.")
|
| 235 |
|
| 236 |
def _close_llm():
|
|
@@ -270,7 +267,6 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
|
|
| 270 |
if not input_text:
|
| 271 |
return []
|
| 272 |
|
| 273 |
-
# Models must be preloaded, this is a fallback.
|
| 274 |
if tts_model is None or llm_model is None:
|
| 275 |
raise gr.Error("Models not initialized. Please restart the Space.")
|
| 276 |
|
|
@@ -311,7 +307,6 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
|
|
| 311 |
return results
|
| 312 |
|
| 313 |
finally:
|
| 314 |
-
# Crucial for ZeroGPU: ensure model returns to CPU to free the GPU
|
| 315 |
if tts_model is not None:
|
| 316 |
tts_model.to("cpu")
|
| 317 |
|
|
|
|
| 8 |
import textwrap
|
| 9 |
import requests
|
| 10 |
import atexit
|
| 11 |
+
import tempfile # <-- FIX: Import tempfile to manage temporary audio files
|
| 12 |
from typing import List, Dict, Tuple, Generator
|
| 13 |
|
| 14 |
# --- Fast, safe defaults ---
|
|
|
|
| 39 |
import numpy as np
|
| 40 |
from huggingface_hub import HfApi, hf_hub_download
|
| 41 |
from llama_cpp import Llama
|
| 42 |
+
import torchaudio
|
| 43 |
+
import soundfile as sf
|
| 44 |
|
| 45 |
# --- TTS Libraries ---
|
| 46 |
from TTS.tts.configs.xtts_config import XttsConfig
|
|
|
|
| 58 |
# 2) GLOBALS & HELPERS
|
| 59 |
# ===================================================================================
|
| 60 |
|
|
|
|
| 61 |
nltk.download("punkt", quiet=True)
|
| 62 |
|
|
|
|
| 63 |
tts_model: Xtts | None = None
|
| 64 |
llm_model: Llama | None = None
|
| 65 |
voice_latents: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
|
| 66 |
|
|
|
|
| 67 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 68 |
api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 69 |
repo_id = "ruslanmv/ai-story-server"
|
|
|
|
| 71 |
SENTENCE_SPLIT_LENGTH = 250
|
| 72 |
LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
|
| 73 |
|
|
|
|
| 74 |
default_system_message = (
|
| 75 |
"You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
|
| 76 |
"Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
|
|
|
|
| 83 |
"Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
|
| 84 |
)
|
| 85 |
|
|
|
|
| 86 |
def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
|
| 87 |
if pcm_data.startswith(b"RIFF"):
|
| 88 |
return pcm_data
|
|
|
|
| 120 |
# ===================================================================================
|
| 121 |
|
| 122 |
def precache_assets() -> None:
|
|
|
|
| 123 |
print("Pre-caching voice files...")
|
| 124 |
file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
|
| 125 |
base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
|
|
|
|
| 149 |
print(f"Warning: GGUF pre-cache error: {e}")
|
| 150 |
|
| 151 |
def _load_xtts(device: str) -> Xtts:
|
|
|
|
| 152 |
print("Loading Coqui XTTS V2 model (CPU first)...")
|
| 153 |
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 154 |
model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
|
|
|
| 164 |
return model
|
| 165 |
|
| 166 |
def _load_llama() -> Llama:
|
|
|
|
| 167 |
print("Loading LLM (Zephyr GGUF) on CPU...")
|
| 168 |
zephyr_model_path = hf_hub_download(
|
| 169 |
repo_id="TheBloke/zephyr-7B-beta-GGUF",
|
|
|
|
| 176 |
print("LLM loaded (CPU).")
|
| 177 |
return llm
|
| 178 |
|
| 179 |
+
def load_and_resample_audio(path: str, target_sr: int = 24000) -> torch.Tensor:
|
| 180 |
+
"""Loads audio, converts to a Torch tensor, and resamples if needed."""
|
|
|
|
| 181 |
try:
|
|
|
|
| 182 |
audio_np, original_sr = sf.read(path, dtype='float32')
|
|
|
|
|
|
|
| 183 |
if audio_np.ndim > 1:
|
| 184 |
audio_np = np.mean(audio_np, axis=1)
|
|
|
|
|
|
|
| 185 |
waveform = torch.from_numpy(audio_np).float()
|
| 186 |
|
|
|
|
| 187 |
if original_sr != target_sr:
|
| 188 |
print(f"Resampling audio from {original_sr}Hz to {target_sr}Hz.")
|
| 189 |
resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
|
| 190 |
waveform = resampler(waveform)
|
| 191 |
+
|
| 192 |
+
return waveform.unsqueeze(0)
|
| 193 |
except Exception as e:
|
| 194 |
print(f"Error loading audio file {path}: {e}")
|
| 195 |
raise
|
| 196 |
|
| 197 |
def init_models_and_latents() -> None:
|
| 198 |
+
"""Preload models and compute voice latents, using temporary files for compatibility."""
|
| 199 |
global tts_model, llm_model, voice_latents
|
| 200 |
|
| 201 |
if tts_model is None:
|
|
|
|
| 206 |
|
| 207 |
if not voice_latents:
|
| 208 |
print("Computing voice conditioning latents...")
|
| 209 |
+
# --- FIX: Use a temporary directory to store resampled audio files ---
|
| 210 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 211 |
+
voice_files = {
|
| 212 |
+
"Cloée": "cloee-1.wav", "Julian": "julian-bedtime-style-1.wav",
|
| 213 |
+
"Pirate": "pirate_by_coqui.wav", "Thera": "thera-1.wav",
|
| 214 |
+
}
|
| 215 |
+
for role, filename in voice_files.items():
|
| 216 |
+
original_path = os.path.join("voices", filename)
|
| 217 |
+
|
| 218 |
+
# 1. Load and resample audio into a tensor
|
| 219 |
+
resampled_waveform = load_and_resample_audio(original_path)
|
| 220 |
+
|
| 221 |
+
# 2. Save the corrected tensor to a temporary file
|
| 222 |
+
temp_path = os.path.join(temp_dir, f"resampled_{filename}")
|
| 223 |
+
torchaudio.save(temp_path, resampled_waveform.squeeze(0), 24000)
|
| 224 |
+
|
| 225 |
+
# 3. Pass the path of the clean, temporary file to the model
|
| 226 |
+
voice_latents[role] = tts_model.get_conditioning_latents(
|
| 227 |
+
audio_path=temp_path,
|
| 228 |
+
gpt_cond_len=30,
|
| 229 |
+
max_ref_length=60
|
| 230 |
+
)
|
| 231 |
print("Voice latents ready.")
|
| 232 |
|
| 233 |
def _close_llm():
|
|
|
|
| 267 |
if not input_text:
|
| 268 |
return []
|
| 269 |
|
|
|
|
| 270 |
if tts_model is None or llm_model is None:
|
| 271 |
raise gr.Error("Models not initialized. Please restart the Space.")
|
| 272 |
|
|
|
|
| 307 |
return results
|
| 308 |
|
| 309 |
finally:
|
|
|
|
| 310 |
if tts_model is not None:
|
| 311 |
tts_model.to("cpu")
|
| 312 |
|