auralodyssey committed on
Commit
b8af37a
·
verified ·
1 Parent(s): be1838a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -31
app.py CHANGED
@@ -329,10 +329,17 @@ VOICE_CHOICES = {
329
  }
330
  # --- ENGINE ---
331
  print("🚀 BOOTING HIGH-RAM ENGINE...")
332
- # Enable fast networking immediately
333
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
334
- # 1. Phonemizer - 🔥 FIXED: Added espeak fallback for proper nouns!
335
- G2P = en.G2P(trf=False, british=False, fallback='espeak') # ← THIS IS THE KEY FIX!
 
 
 
 
 
 
 
 
336
  # 2. Tokenizer
337
  vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
338
  with open(vocab_path, "r", encoding="utf-8") as f:
@@ -363,17 +370,30 @@ sess_options.inter_op_num_threads = 0
363
  SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
364
  print("✅ ENGINE READY")
365
  # --- CORE LOGIC (Shared by UI and API) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  @lru_cache(maxsize=5000)
367
  def get_tokens(text):
368
- if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
369
- phonemes, _ = G2P(text)
370
- # 🔥 FIXED: Filter out invalid tokens (prevents audio gaps)
371
- tokens = []
372
- for p in phonemes:
373
- token = TOKENIZER.get(p)
374
- if token is not None and token > 0:
375
- tokens.append(token)
376
- return tokens
377
  def trim_silence(audio, threshold=0.01):
378
  if audio.size == 0: return audio
379
  mask = np.abs(audio) > threshold
@@ -393,7 +413,9 @@ def infer(text, voice_name, speed):
393
  "speed": np.array([speed], dtype=np.float32)
394
  })[0]
395
  return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
396
- except: return None
 
 
397
  def tuned_splitter(text):
398
  chunks = re.split(r'([.,!?;:\n]+)', text)
399
  buffer = ""
@@ -427,7 +449,7 @@ with gr.Blocks(title="Kokoro TTS") as app:
427
  gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
428
  with gr.Row():
429
  with gr.Column():
430
- text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
431
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
432
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
433
  btn = gr.Button("Generate", variant="primary")
@@ -440,15 +462,8 @@ INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
440
  G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
441
  INFERENCE_QUEUE = asyncio.Queue()
442
  def g2p_task(text):
443
- if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
444
- phonemes, _ = G2P(text)
445
- # 🔥 FIXED: Filter out invalid tokens
446
- tokens = []
447
- for p in phonemes:
448
- token = TOKENIZER.get(p)
449
- if token is not None and token > 0:
450
- tokens.append(token)
451
- return tokens
452
  async def audio_engine_loop():
453
  print("⚡ API AUDIO PIPELINE STARTED")
454
  loop = asyncio.get_running_loop()
@@ -480,7 +495,7 @@ async def audio_engine_loop():
480
  pass
481
 
482
  except Exception as e:
483
- print(f"API Engine Error: {e}")
484
  @api.on_event("startup")
485
  async def startup():
486
  asyncio.create_task(audio_engine_loop())
@@ -523,19 +538,24 @@ async def websocket_endpoint(ws: WebSocket):
523
  text = data["text"]
524
  for chunk in tuned_splitter(text):
525
  if chunk.strip():
526
- tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
527
- if tokens:
528
- style = VOICE_CACHE.get(voice_key)
529
- if style is None:
530
- get_voice(voice_key)
531
  style = VOICE_CACHE.get(voice_key)
532
-
533
- await INFERENCE_QUEUE.put((tokens, style, speed, ws))
 
 
 
 
 
534
 
535
  if "flush" in data:
536
  pass
537
  except Exception as e:
538
  print(f"🔥 Critical WS Error: {e}")
 
 
539
  finally:
540
  heartbeat_task.cancel()
541
  # --- FINAL MOUNT ---
 
329
  }
330
  # --- ENGINE ---
331
  print("🚀 BOOTING HIGH-RAM ENGINE...")
 
332
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
333
+ # 1. Phonemizer - Try with espeak fallback, fall back to None if it fails
334
+ try:
335
+ from misaki.espeak import EspeakFallback
336
+ espeak_fallback = EspeakFallback()
337
+ G2P = en.G2P(trf=False, british=False, fallback=espeak_fallback)
338
+ print("✅ G2P initialized with espeak fallback")
339
+ except Exception as e:
340
+ print(f"⚠️ Could not load espeak fallback: {e}")
341
+ G2P = en.G2P(trf=False, british=False, fallback=None)
342
+ print("✅ G2P initialized without fallback")
343
  # 2. Tokenizer
344
  vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
345
  with open(vocab_path, "r", encoding="utf-8") as f:
 
370
  SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
371
  print("✅ ENGINE READY")
372
  # --- CORE LOGIC (Shared by UI and API) ---
373
+ def safe_g2p(text):
374
+ """Safely convert text to phonemes, handling errors gracefully"""
375
+ if not text or not text.strip():
376
+ return []
377
+
378
+ # Special replacements
379
+ if "Kokoro" in text:
380
+ text = text.replace("Kokoro", "kˈOkəɹO")
381
+
382
+ try:
383
+ phonemes, _ = G2P(text)
384
+ # Filter out invalid tokens
385
+ tokens = []
386
+ for p in phonemes:
387
+ token = TOKENIZER.get(p)
388
+ if token is not None and token > 0:
389
+ tokens.append(token)
390
+ return tokens
391
+ except Exception as e:
392
+ print(f"⚠️ G2P error for '{text[:30]}...': {e}")
393
+ return []
394
  @lru_cache(maxsize=5000)
395
  def get_tokens(text):
396
+ return safe_g2p(text)
 
 
 
 
 
 
 
 
397
  def trim_silence(audio, threshold=0.01):
398
  if audio.size == 0: return audio
399
  mask = np.abs(audio) > threshold
 
413
  "speed": np.array([speed], dtype=np.float32)
414
  })[0]
415
  return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
416
+ except Exception as e:
417
+ print(f"⚠️ Inference error: {e}")
418
+ return None
419
  def tuned_splitter(text):
420
  chunks = re.split(r'([.,!?;:\n]+)', text)
421
  buffer = ""
 
449
  gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
450
  with gr.Row():
451
  with gr.Column():
452
+ text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! This is a test of the Kokoro TTS system.")
453
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
454
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
455
  btn = gr.Button("Generate", variant="primary")
 
462
  G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
463
  INFERENCE_QUEUE = asyncio.Queue()
464
  def g2p_task(text):
465
+ """Thread-safe G2P task"""
466
+ return safe_g2p(text)
 
 
 
 
 
 
 
467
  async def audio_engine_loop():
468
  print("⚡ API AUDIO PIPELINE STARTED")
469
  loop = asyncio.get_running_loop()
 
495
  pass
496
 
497
  except Exception as e:
498
+ print(f"⚠️ API Engine Error: {e}")
499
  @api.on_event("startup")
500
  async def startup():
501
  asyncio.create_task(audio_engine_loop())
 
538
  text = data["text"]
539
  for chunk in tuned_splitter(text):
540
  if chunk.strip():
541
+ try:
542
+ tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
543
+ if tokens:
 
 
544
  style = VOICE_CACHE.get(voice_key)
545
+ if style is None:
546
+ get_voice(voice_key)
547
+ style = VOICE_CACHE.get(voice_key)
548
+
549
+ await INFERENCE_QUEUE.put((tokens, style, speed, ws))
550
+ except Exception as e:
551
+ print(f"⚠️ G2P task error: {e}")
552
 
553
  if "flush" in data:
554
  pass
555
  except Exception as e:
556
  print(f"🔥 Critical WS Error: {e}")
557
+ import traceback
558
+ traceback.print_exc()
559
  finally:
560
  heartbeat_task.cancel()
561
  # --- FINAL MOUNT ---