ruslanmv committed on
Commit
8c573f7
·
1 Parent(s): d662d9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -7
app.py CHANGED
@@ -5,9 +5,9 @@ from __future__ import annotations
5
  import os
6
  import base64
7
  import struct
8
- import re
9
  import textwrap
10
  import requests
 
11
  from typing import List, Dict, Tuple, Generator
12
 
13
  # --- Fast, safe defaults ---
@@ -39,6 +39,23 @@ import numpy as np
39
  from huggingface_hub import HfApi, hf_hub_download
40
  from llama_cpp import Llama
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # --- TTS Libraries ---
43
  from TTS.tts.configs.xtts_config import XttsConfig
44
  from TTS.tts.models.xtts import Xtts
@@ -183,8 +200,6 @@ def _load_llama() -> Llama:
183
  repo_id="TheBloke/zephyr-7B-beta-GGUF",
184
  filename="zephyr-7b-beta.Q5_K_M.gguf"
185
  )
186
- # Initialize CPU instance (n_gpu_layers=0). If you want GPU offload, you can
187
- # create a second instance inside the GPU window, but CPU is simpler & ready now.
188
  llm = Llama(
189
  model_path=zephyr_model_path,
190
  n_gpu_layers=0, # CPU by default to keep it ready without GPU
@@ -198,7 +213,6 @@ def _load_llama() -> Llama:
198
  def init_models_and_latents() -> None:
199
  """Preload TTS and LLM on CPU and compute voice latents once."""
200
  global tts_model, llm_model, voice_latents
201
- device = "cuda" if torch.cuda.is_available() else "cpu"
202
 
203
  if tts_model is None:
204
  tts_model = _load_xtts(device="cpu") # keep on CPU at startup
@@ -206,7 +220,7 @@ def init_models_and_latents() -> None:
206
  if llm_model is None:
207
  llm_model = _load_llama()
208
 
209
- # Pre-compute latents once (CPU OK)
210
  if not voice_latents:
211
  print("Computing voice conditioning latents...")
212
  for role, filename in [
@@ -221,6 +235,16 @@ def init_models_and_latents() -> None:
221
  )
222
  print("Voice latents ready.")
223
 
 
 
 
 
 
 
 
 
 
 
224
  # ===================================================================================
225
  # 4) INFERENCE HELPERS
226
  # ===================================================================================
@@ -273,7 +297,7 @@ def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
273
  # 5) ZERO-GPU ENTRYPOINT
274
  # ===================================================================================
275
 
276
- @spaces.GPU(duration=120) # Request GPU for 120s (can tune later)
277
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
278
  if secret_token_input != SECRET_TOKEN:
279
  raise gr.Error("Invalid secret token provided.")
@@ -361,5 +385,4 @@ if __name__ == "__main__":
361
  print("Models and assets ready. Launching UI...")
362
 
363
  demo = build_ui()
364
- # queue + analytics disabled (env) keeps pandas out of the path
365
  demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
5
  import os
6
  import base64
7
  import struct
 
8
  import textwrap
9
  import requests
10
+ import atexit
11
  from typing import List, Dict, Tuple, Generator
12
 
13
  # --- Fast, safe defaults ---
 
39
  from huggingface_hub import HfApi, hf_hub_download
40
  from llama_cpp import Llama
41
 
42
# --- Prefer torchaudio sox_io/soundfile backend (avoid FFmpeg/torio bug) ---
# Try each candidate backend in order; fall back to disabling the FFmpeg path
# via env var if neither can be selected.
try:
    import torchaudio

    _backend_set = False
    for _cand in ("sox_io", "soundfile"):
        try:
            # NOTE(review): torchaudio.set_audio_backend is deprecated in
            # torchaudio 2.x and removed in 2.2+ (backend is chosen per-call
            # via the `backend=` argument there). The inner except keeps this
            # a harmless no-op on those versions — confirm against the pinned
            # torchaudio release.
            torchaudio.set_audio_backend(_cand)
            _backend_set = True
            break
        except Exception:
            pass
    if not _backend_set:
        # If neither backend is available, at least try to disable the
        # ffmpeg/torio code path.
        os.environ["TORCHAUDIO_USE_FFMPEG"] = "0"
except Exception:
    torchaudio = None  # continue; TTS can still read via its own loaders
59
  # --- TTS Libraries ---
60
  from TTS.tts.configs.xtts_config import XttsConfig
61
  from TTS.tts.models.xtts import Xtts
 
200
  repo_id="TheBloke/zephyr-7B-beta-GGUF",
201
  filename="zephyr-7b-beta.Q5_K_M.gguf"
202
  )
 
 
203
  llm = Llama(
204
  model_path=zephyr_model_path,
205
  n_gpu_layers=0, # CPU by default to keep it ready without GPU
 
213
  def init_models_and_latents() -> None:
214
  """Preload TTS and LLM on CPU and compute voice latents once."""
215
  global tts_model, llm_model, voice_latents
 
216
 
217
  if tts_model is None:
218
  tts_model = _load_xtts(device="cpu") # keep on CPU at startup
 
220
  if llm_model is None:
221
  llm_model = _load_llama()
222
 
223
+ # Pre-compute latents once (CPU OK); torchaudio backend already forced above
224
  if not voice_latents:
225
  print("Computing voice conditioning latents...")
226
  for role, filename in [
 
235
  )
236
  print("Voice latents ready.")
237
 
238
# Ensure we close Llama cleanly to avoid __del__ issues at interpreter shutdown
def _close_llm():
    """Best-effort release of the module-level Llama instance.

    Runs on the interpreter-shutdown path (registered with atexit below),
    so it must never raise — all failures are swallowed.
    """
    global llm_model
    handle = None
    try:
        handle = llm_model
    except NameError:
        pass  # global was never created; nothing to release
    if handle is None:
        return
    try:
        handle.close()
    except Exception:
        pass  # shutdown-time cleanup must not raise

atexit.register(_close_llm)
248
  # ===================================================================================
249
  # 4) INFERENCE HELPERS
250
  # ===================================================================================
 
297
  # 5) ZERO-GPU ENTRYPOINT
298
  # ===================================================================================
299
 
300
+ @spaces.GPU(duration=120) # Request GPU for 120s (tune as needed)
301
  def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
302
  if secret_token_input != SECRET_TOKEN:
303
  raise gr.Error("Invalid secret token provided.")
 
385
  print("Models and assets ready. Launching UI...")
386
 
387
  demo = build_ui()
 
388
  demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))