Spaces:
Running
on
Zero
Running
on
Zero
updates
Browse files
- app.py +14 -14
- requirements.txt +3 -5
app.py
CHANGED
|
@@ -10,12 +10,12 @@ from typing import List, Tuple, Dict, Generator
|
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
load_dotenv()
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
|
| 15 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 16 |
os.environ.setdefault("COQUI_TOS_AGREED", "1")
|
| 17 |
-
#
|
| 18 |
-
os.environ.setdefault("
|
| 19 |
|
| 20 |
# HF Spaces / Gradio
|
| 21 |
try:
|
|
@@ -79,7 +79,8 @@ def pcm_to_wav(pcm: bytes, sr: int = 24000, ch: int = 1, bit: int = 16) -> bytes
|
|
| 79 |
if pcm.startswith(b"RIFF"): # already WAV
|
| 80 |
return pcm
|
| 81 |
chunk = 36 + len(pcm)
|
| 82 |
-
hdr = struct.pack(
|
|
|
|
| 83 |
b"RIFF", chunk, b"WAVE", b"fmt ", 16, 1, ch, sr,
|
| 84 |
sr * ch * bit // 8, ch * bit // 8, bit, b"data", len(pcm)
|
| 85 |
)
|
|
@@ -165,20 +166,20 @@ precache_assets()
|
|
| 165 |
|
| 166 |
def _load_xtts(device: str) -> Xtts:
|
| 167 |
print("Loading Coqui XTTS V2 model (first run)...")
|
| 168 |
-
|
|
|
|
|
|
|
| 169 |
|
| 170 |
cfg = XttsConfig()
|
| 171 |
cfg.load_json(os.path.join(model_dir, "config.json"))
|
| 172 |
|
| 173 |
model = Xtts.init_from_config(cfg)
|
| 174 |
-
#
|
| 175 |
model.load_checkpoint(
|
| 176 |
cfg,
|
| 177 |
-
|
| 178 |
-
vocab_path=vocab_json,
|
| 179 |
-
speaker_file_path=speakers_pth, # <-- fixes TypeError
|
| 180 |
eval=True,
|
| 181 |
-
use_deepspeed=False,
|
| 182 |
)
|
| 183 |
model.to(device)
|
| 184 |
print("XTTS model ready.")
|
|
@@ -272,7 +273,7 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
|
|
| 272 |
|
| 273 |
tts, llm = load_models()
|
| 274 |
|
| 275 |
-
# Pre-compute & cache voice latents once per
|
| 276 |
global voice_latents
|
| 277 |
if not voice_latents:
|
| 278 |
for role, fname in [
|
|
@@ -326,7 +327,6 @@ def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_
|
|
| 326 |
|
| 327 |
print("Downloading voice files (idempotent)...")
|
| 328 |
# Already handled in precache, but keep for local dev logs
|
| 329 |
-
# (No-op if files exist)
|
| 330 |
|
| 331 |
demo = gr.Interface(
|
| 332 |
fn=generate_story_and_speech,
|
|
@@ -342,4 +342,4 @@ demo = gr.Interface(
|
|
| 342 |
)
|
| 343 |
|
| 344 |
if __name__ == "__main__":
|
| 345 |
-
demo.queue().launch(
|
|
|
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
load_dotenv()
|
| 12 |
|
| 13 |
+
# Fast downloads & stable behavior
|
| 14 |
+
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # faster HF downloads
|
| 15 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 16 |
os.environ.setdefault("COQUI_TOS_AGREED", "1")
|
| 17 |
+
os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false") # avoid pandas analytics path
|
| 18 |
+
os.environ.setdefault("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "1")
|
| 19 |
|
| 20 |
# HF Spaces / Gradio
|
| 21 |
try:
|
|
|
|
| 79 |
if pcm.startswith(b"RIFF"): # already WAV
|
| 80 |
return pcm
|
| 81 |
chunk = 36 + len(pcm)
|
| 82 |
+
hdr = struct.pack(
|
| 83 |
+
"<4sI4s4sIHHIIHH4sI",
|
| 84 |
b"RIFF", chunk, b"WAVE", b"fmt ", 16, 1, ch, sr,
|
| 85 |
sr * ch * bit // 8, ch * bit // 8, bit, b"data", len(pcm)
|
| 86 |
)
|
|
|
|
| 166 |
|
| 167 |
def _load_xtts(device: str) -> Xtts:
|
| 168 |
print("Loading Coqui XTTS V2 model (first run)...")
|
| 169 |
+
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 170 |
+
ModelManager().download_model(model_name) # idempotent
|
| 171 |
+
model_dir = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
| 172 |
|
| 173 |
cfg = XttsConfig()
|
| 174 |
cfg.load_json(os.path.join(model_dir, "config.json"))
|
| 175 |
|
| 176 |
model = Xtts.init_from_config(cfg)
|
| 177 |
+
# Use checkpoint_dir so the library finds model.pth, vocab.json and speakers_xtts.pth itself
|
| 178 |
model.load_checkpoint(
|
| 179 |
cfg,
|
| 180 |
+
checkpoint_dir=model_dir,
|
|
|
|
|
|
|
| 181 |
eval=True,
|
| 182 |
+
use_deepspeed=False, # deepspeed not installed in your Space
|
| 183 |
)
|
| 184 |
model.to(device)
|
| 185 |
print("XTTS model ready.")
|
|
|
|
| 273 |
|
| 274 |
tts, llm = load_models()
|
| 275 |
|
| 276 |
+
# Pre-compute & cache voice latents once per worker
|
| 277 |
global voice_latents
|
| 278 |
if not voice_latents:
|
| 279 |
for role, fname in [
|
|
|
|
| 327 |
|
| 328 |
print("Downloading voice files (idempotent)...")
|
| 329 |
# Already handled in precache, but keep for local dev logs
|
|
|
|
| 330 |
|
| 331 |
demo = gr.Interface(
|
| 332 |
fn=generate_story_and_speech,
|
|
|
|
| 342 |
)
|
| 343 |
|
| 344 |
if __name__ == "__main__":
|
| 345 |
+
demo.queue().launch(analytics_enabled=False)
|
requirements.txt
CHANGED
|
@@ -7,16 +7,14 @@ python-dotenv
|
|
| 7 |
spaces
|
| 8 |
requests
|
| 9 |
numpy
|
| 10 |
-
pandas>=2.2.2,<3 # Fixes Gradio analytics OptionError
|
| 11 |
|
| 12 |
-
# TTS
|
| 13 |
-
|
| 14 |
-
pydantic==2.5.3
|
| 15 |
|
| 16 |
# LLM
|
| 17 |
llama-cpp-python==0.2.79
|
| 18 |
|
| 19 |
-
# Audio & Text
|
| 20 |
noisereduce==3.0.3
|
| 21 |
pydub
|
| 22 |
langid
|
|
|
|
| 7 |
spaces
|
| 8 |
requests
|
| 9 |
numpy
|
|
|
|
| 10 |
|
| 11 |
+
# TTS (maintained fork; keeps "from TTS..." imports)
|
| 12 |
+
coqui-tts==0.27.2
|
|
|
|
| 13 |
|
| 14 |
# LLM
|
| 15 |
llama-cpp-python==0.2.79
|
| 16 |
|
| 17 |
+
# Audio & Text Processing
|
| 18 |
noisereduce==3.0.3
|
| 19 |
pydub
|
| 20 |
langid
|