ruslanmv committed on
Commit
7741539
·
1 Parent(s): 96b9f29
Files changed (2) hide show
  1. app.py +14 -14
  2. requirements.txt +3 -5
app.py CHANGED
@@ -10,12 +10,12 @@ from typing import List, Tuple, Dict, Generator
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
13
- # Make downloads fast & quiet
14
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
15
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
16
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
17
- # Avoid Gradio analytics pandas edge-cases
18
- os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
19
 
20
  # HF Spaces / Gradio
21
  try:
@@ -79,7 +79,8 @@ def pcm_to_wav(pcm: bytes, sr: int = 24000, ch: int = 1, bit: int = 16) -> bytes
79
  if pcm.startswith(b"RIFF"): # already WAV
80
  return pcm
81
  chunk = 36 + len(pcm)
82
- hdr = struct.pack("<4sI4s4sIHHIIHH4sI",
 
83
  b"RIFF", chunk, b"WAVE", b"fmt ", 16, 1, ch, sr,
84
  sr * ch * bit // 8, ch * bit // 8, bit, b"data", len(pcm)
85
  )
@@ -165,20 +166,20 @@ precache_assets()
165
 
166
  def _load_xtts(device: str) -> Xtts:
167
  print("Loading Coqui XTTS V2 model (first run)...")
168
- model_dir, model_pth, vocab_json, speakers_pth = _xtts_paths()
 
 
169
 
170
  cfg = XttsConfig()
171
  cfg.load_json(os.path.join(model_dir, "config.json"))
172
 
173
  model = Xtts.init_from_config(cfg)
174
- # IMPORTANT: pass speaker_file_path to avoid NoneType join inside library
175
  model.load_checkpoint(
176
  cfg,
177
- checkpoint_path=model_pth,
178
- vocab_path=vocab_json,
179
- speaker_file_path=speakers_pth, # <-- fixes TypeError
180
  eval=True,
181
- use_deepspeed=False, # deepspeed not installed
182
  )
183
  model.to(device)
184
  print("XTTS model ready.")
@@ -272,7 +273,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
272
 
273
  tts, llm = load_models()
274
 
275
- # Pre-compute & cache voice latents once per session
276
  global voice_latents
277
  if not voice_latents:
278
  for role, fname in [
@@ -326,7 +327,6 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
326
 
327
  print("Downloading voice files (idempotent)...")
328
  # Already handled in precache, but keep for local dev logs
329
- # (No-op if files exist)
330
 
331
  demo = gr.Interface(
332
  fn=generate_story_and_speech,
@@ -342,4 +342,4 @@ demo = gr.Interface(
342
  )
343
 
344
  if __name__ == "__main__":
345
- demo.queue().launch() # you can add ssr_mode=False if you prefer
 
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
13
+ # Fast downloads & stable behavior
14
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster HF downloads
15
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
16
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
17
+ os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false") # avoid pandas analytics path
18
+ os.environ.setdefault("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "1")
19
 
20
  # HF Spaces / Gradio
21
  try:
 
79
  if pcm.startswith(b"RIFF"): # already WAV
80
  return pcm
81
  chunk = 36 + len(pcm)
82
+ hdr = struct.pack(
83
+ "<4sI4s4sIHHIIHH4sI",
84
  b"RIFF", chunk, b"WAVE", b"fmt ", 16, 1, ch, sr,
85
  sr * ch * bit // 8, ch * bit // 8, bit, b"data", len(pcm)
86
  )
 
166
 
167
  def _load_xtts(device: str) -> Xtts:
168
  print("Loading Coqui XTTS V2 model (first run)...")
169
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
170
+ ModelManager().download_model(model_name) # idempotent
171
+ model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
172
 
173
  cfg = XttsConfig()
174
  cfg.load_json(os.path.join(model_dir, "config.json"))
175
 
176
  model = Xtts.init_from_config(cfg)
177
+ # Use checkpoint_dir so the library finds model.pth, vocab.json and speakers_xtts.pth itself
178
  model.load_checkpoint(
179
  cfg,
180
+ checkpoint_dir=model_dir,
 
 
181
  eval=True,
182
+ use_deepspeed=False, # deepspeed not installed in your Space
183
  )
184
  model.to(device)
185
  print("XTTS model ready.")
 
273
 
274
  tts, llm = load_models()
275
 
276
+ # Pre-compute & cache voice latents once per worker
277
  global voice_latents
278
  if not voice_latents:
279
  for role, fname in [
 
327
 
328
  print("Downloading voice files (idempotent)...")
329
  # Already handled in precache, but keep for local dev logs
 
330
 
331
  demo = gr.Interface(
332
  fn=generate_story_and_speech,
 
342
  )
343
 
344
  if __name__ == "__main__":
345
+ demo.queue().launch(analytics_enabled=False)
requirements.txt CHANGED
@@ -7,16 +7,14 @@ python-dotenv
7
  spaces
8
  requests
9
  numpy
10
- pandas>=2.2.2,<3 # Fixes Gradio analytics OptionError
11
 
12
- # TTS
13
- TTS @ git+https://github.com/coqui-ai/TTS@v0.22.0
14
- pydantic==2.5.3
15
 
16
  # LLM
17
  llama-cpp-python==0.2.79
18
 
19
- # Audio & Text
20
  noisereduce==3.0.3
21
  pydub
22
  langid
 
7
  spaces
8
  requests
9
  numpy
 
10
 
11
+ # TTS (maintained fork; keeps "from TTS..." imports)
12
+ coqui-tts==0.27.2
 
13
 
14
  # LLM
15
  llama-cpp-python==0.2.79
16
 
17
+ # Audio & Text Processing
18
  noisereduce==3.0.3
19
  pydub
20
  langid