Update app.py

app.py CHANGED
@@ -5,9 +5,9 @@ from __future__ import annotations
 import os
 import base64
 import struct
-import re
 import textwrap
 import requests
+import atexit
 from typing import List, Dict, Tuple, Generator
 
 # --- Fast, safe defaults ---
@@ -39,6 +39,23 @@ import numpy as np
 from huggingface_hub import HfApi, hf_hub_download
 from llama_cpp import Llama
 
+# --- Prefer torchaudio sox_io/soundfile backend (avoid FFmpeg/torio bug) ---
+try:
+    import torchaudio
+    _backend_set = False
+    for _cand in ("sox_io", "soundfile"):
+        try:
+            torchaudio.set_audio_backend(_cand)
+            _backend_set = True
+            break
+        except Exception:
+            pass
+    if not _backend_set:
+        # If neither is available, at least try to disable ffmpeg path
+        os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"
+except Exception:
+    torchaudio = None  # continue; TTS can still read via its own loaders
+
 # --- TTS Libraries ---
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
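Note on the hunk above: torchaudio.set_audio_backend was deprecated in the torchaudio 2.x line and eventually removed, which is why every call in the probe is wrapped in try/except. On the newer dispatcher the backend can instead be chosen per call. A hedged sketch of that alternative, assuming torchaudio >= 2.1 with the soundfile package installed (the file path is a placeholder, not a file from this repo):

import torchaudio

# torchaudio >= 2.1: pick the backend per call instead of globally.
waveform, sample_rate = torchaudio.load("voices/sample.wav", backend="soundfile")
print(waveform.shape, sample_rate)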
@@ -183,8 +200,6 @@ def _load_llama() -> Llama:
         repo_id="TheBloke/zephyr-7B-beta-GGUF",
         filename="zephyr-7b-beta.Q5_K_M.gguf"
     )
-    # Initialize CPU instance (n_gpu_layers=0). If you want GPU offload, you can
-    # create a second instance inside the GPU window, but CPU is simpler & ready now.
     llm = Llama(
         model_path=zephyr_model_path,
         n_gpu_layers=0,  # CPU by default to keep it ready without GPU
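For context on the n_gpu_layers=0 choice kept above: llama-cpp-python runs every layer on CPU at 0 and offloads all layers at -1 when the wheel was built with GPU support. A minimal standalone sketch of the same loader; n_ctx=2048 is an illustrative context-size choice, not a value from this file:

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF weights once; hf_hub_download caches them locally.
model_path = hf_hub_download(
    repo_id="TheBloke/zephyr-7B-beta-GGUF",
    filename="zephyr-7b-beta.Q5_K_M.gguf",
)
# n_gpu_layers=0 -> pure CPU (as in this Space); -1 -> offload all layers
# if llama-cpp-python was compiled with CUDA/Metal support.
llm = Llama(model_path=model_path, n_gpu_layers=0, n_ctx=2048)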
@@ -198,7 +213,6 @@ def _load_llama() -> Llama:
 def init_models_and_latents() -> None:
     """Preload TTS and LLM on CPU and compute voice latents once."""
     global tts_model, llm_model, voice_latents
-    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     if tts_model is None:
         tts_model = _load_xtts(device="cpu")  # keep on CPU at startup
@@ -206,7 +220,7 @@ def init_models_and_latents() -> None:
     if llm_model is None:
         llm_model = _load_llama()
 
-    # Pre-compute latents once (CPU OK)
+    # Pre-compute latents once (CPU OK); torchaudio backend already forced above
     if not voice_latents:
         print("Computing voice conditioning latents...")
         for role, filename in [
@@ -221,6 +235,16 @@ def init_models_and_latents() -> None:
         )
     print("Voice latents ready.")
 
+# Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
+def _close_llm():
+    global llm_model
+    try:
+        if llm_model is not None:
+            llm_model.close()
+    except Exception:
+        pass
+atexit.register(_close_llm)
+
 # ===================================================================================
 # 4) INFERENCE HELPERS
 # ===================================================================================
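The atexit hook added above exists because Llama.__del__ can fire during interpreter teardown, after the native state it needs has already been torn down; Llama.close() is only present in recent llama-cpp-python releases, so the try/except keeps older versions safe. One small note on the pattern: atexit.register returns its argument, so the same guarded cleanup can be written as a decorator. A self-contained sketch, with llm_model standing in for the module global in app.py:

import atexit

llm_model = None  # stands in for the module global in app.py

@atexit.register
def _close_llm_decorated() -> None:
    # Same guarded cleanup as the diff, in decorator form:
    # atexit.register returns func, so this registers and keeps the name.
    try:
        if llm_model is not None:
            llm_model.close()
    except Exception:
        pass  # never raise during interpreter shutdown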
@@ -273,7 +297,7 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
 # 5) ZERO-GPU ENTRYPOINT
 # ===================================================================================
 
-@spaces.GPU(duration=120)  # Request GPU for 120s (
+@spaces.GPU(duration=120)  # Request GPU for 120s (tune as needed)
 def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
     if secret_token_input != SECRET_TOKEN:
         raise gr.Error("Invalid secret token provided.")
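For readers new to ZeroGPU: spaces.GPU attaches a GPU to the process only for calls into the decorated function, for at most roughly `duration` seconds, which is why this Space keeps its models on CPU at import time. A hedged sketch of the pattern under those assumptions; the nn.Linear model and run_on_gpu function are illustrative stand-ins, not code from this file:

import spaces
import torch
import torch.nn as nn

model = nn.Linear(4, 4)  # stands in for the XTTS model preloaded on CPU

@spaces.GPU(duration=120)  # GPU is attached for at most ~120s per call
def run_on_gpu(x: torch.Tensor) -> torch.Tensor:
    # Inside the decorated call a CUDA device is available: move the
    # model in, compute, and move it back so CPU-only paths keep working.
    model.to("cuda")
    try:
        return model(x.to("cuda")).cpu()
    finally:
        model.to("cpu")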
@@ -361,5 +385,4 @@ if __name__ == "__main__":
     print("Models and assets ready. Launching UI...")
 
     demo = build_ui()
-    # queue + analytics disabled (env) keeps pandas out of the path
     demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))