ruslanmv committed on
Commit
8c573f7
·
1 Parent(s): d662d9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -7
app.py CHANGED
@@ -5,9 +5,9 @@ from __future__ import annotations
5
  import os
6
  import base64
7
  import struct
8
- import re
9
  import textwrap
10
  import requests
 
11
  from typing import List, Dict, Tuple, Generator
12
 
13
  # --- Fast, safe defaults ---
@@ -39,6 +39,23 @@ import numpy as np
39
  from huggingface_hub import HfApi, hf_hub_download
40
  from llama_cpp import Llama
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # --- TTS Libraries ---
43
  from TTS.tts.configs.xtts_config import XttsConfig
44
  from TTS.tts.models.xtts import Xtts
@@ -183,8 +200,6 @@ def _load_llama() -> Llama:
183
  repo_id="TheBloke/zephyr-7B-beta-GGUF",
184
  filename="zephyr-7b-beta.Q5_K_M.gguf"
185
  )
186
- # Initialize CPU instance (n_gpu_layers=0). If you want GPU offload, you can
187
- # create a second instance inside the GPU window, but CPU is simpler & ready now.
188
  llm = Llama(
189
  model_path=zephyr_model_path,
190
  n_gpu_layers=0, # CPU by default to keep it ready without GPU
@@ -198,7 +213,6 @@ def _load_llama() -> Llama:
198
  def init_models_and_latents() -> None:
199
  """Preload TTS and LLM on CPU and compute voice latents once."""
200
  global tts_model, llm_model, voice_latents
201
- device = "cuda" if torch.cuda.is_available() else "cpu"
202
 
203
  if tts_model is None:
204
  tts_model = _load_xtts(device="cpu") # keep on CPU at startup
@@ -206,7 +220,7 @@ def init_models_and_latents() -> None:
206
  if llm_model is None:
207
  llm_model = _load_llama()
208
 
209
- # Pre-compute latents once (CPU OK)
210
  if not voice_latents:
211
  print("Computing voice conditioning latents...")
212
  for role, filename in [
@@ -221,6 +235,16 @@ def init_models_and_latents() -> None:
221
  )
222
  print("Voice latents ready.")
223
 
 
 
 
 
 
 
 
 
 
 
224
  # ===================================================================================
225
  # 4) INFERENCE HELPERS
226
  # ===================================================================================
@@ -273,7 +297,7 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
273
  # 5) ZERO-GPU ENTRYPOINT
274
  # ===================================================================================
275
 
276
- @spaces.GPU(duration=120) # Request GPU for 120s (can tune later)
277
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
278
  if secret_token_input != SECRET_TOKEN:
279
  raise gr.Error("Invalid secret token provided.")
@@ -361,5 +385,4 @@ if __name__ == "__main__":
361
  print("Models and assets ready. Launching UI...")
362
 
363
  demo = build_ui()
364
- # queue + analytics disabled (env) keeps pandas out of the path
365
  demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
5
  import os
6
  import base64
7
  import struct
 
8
  import textwrap
9
  import requests
10
+ import atexit
11
  from typing import List, Dict, Tuple, Generator
12
 
13
  # --- Fast, safe defaults ---
 
39
  from huggingface_hub import HfApi, hf_hub_download
40
  from llama_cpp import Llama
41
 
42
# --- Prefer torchaudio sox_io/soundfile backend (avoid FFmpeg/torio bug) ---
# Try each candidate backend in order; fall back to disabling the FFmpeg path
# via env var if neither can be selected.
try:
    import torchaudio

    _backend_set = False
    for _cand in ("sox_io", "soundfile"):
        try:
            # NOTE(review): torchaudio.set_audio_backend is deprecated in
            # torchaudio 2.x and removed in 2.2+ (backend is chosen per-call
            # via the `backend=` argument there). The inner except keeps this
            # a harmless no-op on those versions — confirm against the pinned
            # torchaudio release.
            torchaudio.set_audio_backend(_cand)
            _backend_set = True
            break
        except Exception:
            pass
    if not _backend_set:
        # If neither backend is available, at least try to disable the
        # ffmpeg/torio code path.
        os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"
except Exception:
    torchaudio = None  # continue; TTS can still read via its own loaders
59
  # --- TTS Libraries ---
60
  from TTS.tts.configs.xtts_config import XttsConfig
61
  from TTS.tts.models.xtts import Xtts
 
200
  repo_id="TheBloke/zephyr-7B-beta-GGUF",
201
  filename="zephyr-7b-beta.Q5_K_M.gguf"
202
  )
 
 
203
  llm = Llama(
204
  model_path=zephyr_model_path,
205
  n_gpu_layers=0, # CPU by default to keep it ready without GPU
 
213
  def init_models_and_latents() -> None:
214
  """Preload TTS and LLM on CPU and compute voice latents once."""
215
  global tts_model, llm_model, voice_latents
 
216
 
217
  if tts_model is None:
218
  tts_model = _load_xtts(device="cpu") # keep on CPU at startup
 
220
  if llm_model is None:
221
  llm_model = _load_llama()
222
 
223
+ # Pre-compute latents once (CPU OK); torchaudio backend already forced above
224
  if not voice_latents:
225
  print("Computing voice conditioning latents...")
226
  for role, filename in [
 
235
  )
236
  print("Voice latents ready.")
237
 
238
# Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
def _close_llm():
    """Best-effort release of the module-level Llama instance.

    Runs on the interpreter-shutdown path (registered with atexit below),
    so it must never raise — all failures are swallowed.
    """
    global llm_model
    handle = None
    try:
        handle = llm_model
    except NameError:
        pass  # global was never created; nothing to release
    if handle is None:
        return
    try:
        handle.close()
    except Exception:
        pass  # shutdown-time cleanup must not raise

atexit.register(_close_llm)
248
  # ===================================================================================
249
  # 4) INFERENCE HELPERS
250
  # ===================================================================================
 
297
  # 5) ZERO-GPU ENTRYPOINT
298
  # ===================================================================================
299
 
300
+ @spaces.GPU(duration=120) # Request GPU for 120s (tune as needed)
301
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
302
  if secret_token_input != SECRET_TOKEN:
303
  raise gr.Error("Invalid secret token provided.")
 
385
  print("Models and assets ready. Launching UI...")
386
 
387
  demo = build_ui()
 
388
  demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))