Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -329,10 +329,17 @@ VOICE_CHOICES = {
|
|
| 329 |
}
|
| 330 |
# --- ENGINE ---
|
| 331 |
print("🚀 BOOTING HIGH-RAM ENGINE...")
|
| 332 |
-
# Enable fast networking immediately
|
| 333 |
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
| 334 |
-
# 1. Phonemizer -
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
# 2. Tokenizer
|
| 337 |
vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
|
| 338 |
with open(vocab_path, "r", encoding="utf-8") as f:
|
|
@@ -363,17 +370,30 @@ sess_options.inter_op_num_threads = 0
|
|
| 363 |
SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
|
| 364 |
print("✅ ENGINE READY")
|
| 365 |
# --- CORE LOGIC (Shared by UI and API) ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
@lru_cache(maxsize=5000)
|
| 367 |
def get_tokens(text):
|
| 368 |
-
|
| 369 |
-
phonemes, _ = G2P(text)
|
| 370 |
-
# 🔥 FIXED: Filter out invalid tokens (prevents audio gaps)
|
| 371 |
-
tokens = []
|
| 372 |
-
for p in phonemes:
|
| 373 |
-
token = TOKENIZER.get(p)
|
| 374 |
-
if token is not None and token > 0:
|
| 375 |
-
tokens.append(token)
|
| 376 |
-
return tokens
|
| 377 |
def trim_silence(audio, threshold=0.01):
|
| 378 |
if audio.size == 0: return audio
|
| 379 |
mask = np.abs(audio) > threshold
|
|
@@ -393,7 +413,9 @@ def infer(text, voice_name, speed):
|
|
| 393 |
"speed": np.array([speed], dtype=np.float32)
|
| 394 |
})[0]
|
| 395 |
return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
|
| 396 |
-
except
|
|
|
|
|
|
|
| 397 |
def tuned_splitter(text):
|
| 398 |
chunks = re.split(r'([.,!?;:\n]+)', text)
|
| 399 |
buffer = ""
|
|
@@ -427,7 +449,7 @@ with gr.Blocks(title="Kokoro TTS") as app:
|
|
| 427 |
gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
|
| 428 |
with gr.Row():
|
| 429 |
with gr.Column():
|
| 430 |
-
text_in = gr.Textbox(label="Input Text", lines=3, value="
|
| 431 |
voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
|
| 432 |
speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
|
| 433 |
btn = gr.Button("Generate", variant="primary")
|
|
@@ -440,15 +462,8 @@ INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
|
|
| 440 |
G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
|
| 441 |
INFERENCE_QUEUE = asyncio.Queue()
|
| 442 |
def g2p_task(text):
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
# 🔥 FIXED: Filter out invalid tokens
|
| 446 |
-
tokens = []
|
| 447 |
-
for p in phonemes:
|
| 448 |
-
token = TOKENIZER.get(p)
|
| 449 |
-
if token is not None and token > 0:
|
| 450 |
-
tokens.append(token)
|
| 451 |
-
return tokens
|
| 452 |
async def audio_engine_loop():
|
| 453 |
print("⚡ API AUDIO PIPELINE STARTED")
|
| 454 |
loop = asyncio.get_running_loop()
|
|
@@ -480,7 +495,7 @@ async def audio_engine_loop():
|
|
| 480 |
pass
|
| 481 |
|
| 482 |
except Exception as e:
|
| 483 |
-
print(f"API Engine Error: {e}")
|
| 484 |
@api.on_event("startup")
|
| 485 |
async def startup():
|
| 486 |
asyncio.create_task(audio_engine_loop())
|
|
@@ -523,19 +538,24 @@ async def websocket_endpoint(ws: WebSocket):
|
|
| 523 |
text = data["text"]
|
| 524 |
for chunk in tuned_splitter(text):
|
| 525 |
if chunk.strip():
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
if style is None:
|
| 530 |
-
get_voice(voice_key)
|
| 531 |
style = VOICE_CACHE.get(voice_key)
|
| 532 |
-
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
if "flush" in data:
|
| 536 |
pass
|
| 537 |
except Exception as e:
|
| 538 |
print(f"🔥 Critical WS Error: {e}")
|
|
|
|
|
|
|
| 539 |
finally:
|
| 540 |
heartbeat_task.cancel()
|
| 541 |
# --- FINAL MOUNT ---
|
|
|
|
| 329 |
}
|
| 330 |
# --- ENGINE ---
|
| 331 |
print("🚀 BOOTING HIGH-RAM ENGINE...")
|
|
|
|
| 332 |
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
| 333 |
+
# 1. Phonemizer - build the misaki English G2P, preferring an espeak-backed
# fallback and degrading to a fallback-free G2P when that cannot be set up.
try:
    from misaki.espeak import EspeakFallback
    # misaki's documented usage passes the accent flag explicitly; calling
    # EspeakFallback() with no arguments raises TypeError, which the handler
    # below would swallow, silently disabling the fallback on every boot.
    # Keep this flag in sync with british=False passed to en.G2P.
    espeak_fallback = EspeakFallback(british=False)
    G2P = en.G2P(trf=False, british=False, fallback=espeak_fallback)
    print("✅ G2P initialized with espeak fallback")
except Exception as e:
    # Deliberate best-effort: any failure importing or constructing the
    # espeak fallback is reported and the engine boots without a fallback.
    print(f"⚠️ Could not load espeak fallback: {e}")
    G2P = en.G2P(trf=False, british=False, fallback=None)
    print("✅ G2P initialized without fallback")
|
| 343 |
# 2. Tokenizer
|
| 344 |
vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
|
| 345 |
with open(vocab_path, "r", encoding="utf-8") as f:
|
|
|
|
| 370 |
SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
|
| 371 |
print("✅ ENGINE READY")
|
| 372 |
# --- CORE LOGIC (Shared by UI and API) ---
|
| 373 |
+
def safe_g2p(text):
    """Convert text to a list of tokenizer ids without ever raising.

    Args:
        text: Input text; may be None, empty, or whitespace-only.

    Returns:
        A list of ids looked up in TOKENIZER for each phoneme returned by
        G2P. Phonemes with no TOKENIZER entry, or whose id is not greater
        than zero, are dropped. Returns an empty list for blank input or
        when G2P/tokenization raises (the error is printed, not propagated).
    """
    if not text or not text.strip():
        return []

    # Pronunciation override for the product name. misaki's inline syntax
    # for supplying phonemes is [word](/phonemes/); substituting the bare
    # phoneme string would make G2P treat it as ordinary text to phonemize.
    # str.replace is a no-op when the word is absent, so no `in` check.
    marked_up = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")

    try:
        phonemes, _ = G2P(marked_up)
        # Look each phoneme up once, then keep only valid (positive) ids.
        candidate_ids = (TOKENIZER.get(p) for p in phonemes)
        return [tid for tid in candidate_ids if tid is not None and tid > 0]
    except Exception as e:
        # Best-effort boundary: report using the caller's original text and
        # return no tokens so callers can simply skip this chunk.
        print(f"⚠️ G2P error for '{text[:30]}...': {e}")
        return []
|
| 394 |
@lru_cache(maxsize=5000)
def get_tokens(text):
    """Return the token ids for ``text``, memoized on the exact input string.

    Delegates to safe_g2p. Because safe_g2p returns an empty list instead of
    raising, an empty result caused by a G2P failure is cached like any other
    result. The cached list object is handed back on every hit, so callers
    should treat it as read-only.
    """
    token_ids = safe_g2p(text)
    return token_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def trim_silence(audio, threshold=0.01):
|
| 398 |
if audio.size == 0: return audio
|
| 399 |
mask = np.abs(audio) > threshold
|
|
|
|
| 413 |
"speed": np.array([speed], dtype=np.float32)
|
| 414 |
})[0]
|
| 415 |
return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
|
| 416 |
+
except Exception as e:
|
| 417 |
+
print(f"⚠️ Inference error: {e}")
|
| 418 |
+
return None
|
| 419 |
def tuned_splitter(text):
|
| 420 |
chunks = re.split(r'([.,!?;:\n]+)', text)
|
| 421 |
buffer = ""
|
|
|
|
| 449 |
gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
|
| 450 |
with gr.Row():
|
| 451 |
with gr.Column():
|
| 452 |
+
text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! This is a test of the Kokoro TTS system.")
|
| 453 |
voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
|
| 454 |
speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
|
| 455 |
btn = gr.Button("Generate", variant="primary")
|
|
|
|
| 462 |
G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
|
| 463 |
INFERENCE_QUEUE = asyncio.Queue()
|
| 464 |
def g2p_task(text):
    """Executor entry point: convert ``text`` to token ids via safe_g2p.

    Unlike get_tokens, this path is not memoized. It returns whatever
    safe_g2p returns, including an empty list for blank input or on error.
    """
    token_ids = safe_g2p(text)
    return token_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
async def audio_engine_loop():
|
| 468 |
print("⚡ API AUDIO PIPELINE STARTED")
|
| 469 |
loop = asyncio.get_running_loop()
|
|
|
|
| 495 |
pass
|
| 496 |
|
| 497 |
except Exception as e:
|
| 498 |
+
print(f"⚠️ API Engine Error: {e}")
|
| 499 |
@api.on_event("startup")
|
| 500 |
async def startup():
|
| 501 |
asyncio.create_task(audio_engine_loop())
|
|
|
|
| 538 |
text = data["text"]
|
| 539 |
for chunk in tuned_splitter(text):
|
| 540 |
if chunk.strip():
|
| 541 |
+
try:
|
| 542 |
+
tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
|
| 543 |
+
if tokens:
|
|
|
|
|
|
|
| 544 |
style = VOICE_CACHE.get(voice_key)
|
| 545 |
+
if style is None:
|
| 546 |
+
get_voice(voice_key)
|
| 547 |
+
style = VOICE_CACHE.get(voice_key)
|
| 548 |
+
|
| 549 |
+
await INFERENCE_QUEUE.put((tokens, style, speed, ws))
|
| 550 |
+
except Exception as e:
|
| 551 |
+
print(f"⚠️ G2P task error: {e}")
|
| 552 |
|
| 553 |
if "flush" in data:
|
| 554 |
pass
|
| 555 |
except Exception as e:
|
| 556 |
print(f"🔥 Critical WS Error: {e}")
|
| 557 |
+
import traceback
|
| 558 |
+
traceback.print_exc()
|
| 559 |
finally:
|
| 560 |
heartbeat_task.cancel()
|
| 561 |
# --- FINAL MOUNT ---
|