auralodyssey committed on
Commit
eff63e9
·
verified ·
1 Parent(s): b8af37a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -125
app.py CHANGED
@@ -294,26 +294,25 @@
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
-
298
  import os
299
  import json
300
  import time
301
  import re
302
  import numpy as np
303
- import onnxruntime as ort
304
  import gradio as gr
305
- from huggingface_hub import hf_hub_download
306
- from misaki import en
307
  from functools import lru_cache
308
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
309
  import asyncio
310
  import uvloop
311
  import uvicorn
312
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
313
  # --- CONFIGURATION ---
314
- MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
315
- MODEL_FILE = "onnx/model.onnx"
316
- TOKENIZER_FILE = "tokenizer.json"
317
  # --- VOICE UI ---
318
  VOICE_CHOICES = {
319
  'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
@@ -327,96 +326,52 @@ VOICE_CHOICES = {
327
  'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
328
  'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
329
  }
 
330
  # --- ENGINE ---
331
- print("πŸš€ BOOTING HIGH-RAM ENGINE...")
332
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
333
- # 1. Phonemizer - Try with espeak fallback, fall back to None if it fails
334
- try:
335
- from misaki.espeak import EspeakFallback
336
- espeak_fallback = EspeakFallback()
337
- G2P = en.G2P(trf=False, british=False, fallback=espeak_fallback)
338
- print("βœ… G2P initialized with espeak fallback")
339
- except Exception as e:
340
- print(f"⚠️ Could not load espeak fallback: {e}")
341
- G2P = en.G2P(trf=False, british=False, fallback=None)
342
- print("βœ… G2P initialized without fallback")
343
- # 2. Tokenizer
344
- vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
345
- with open(vocab_path, "r", encoding="utf-8") as f:
346
- data = json.load(f)
347
- TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
348
- # 3. Voices (Lazy Load)
349
- VOICE_CACHE = {}
350
- def get_voice(name):
351
- code = VOICE_CHOICES.get(name, name)
352
- if code not in VOICE_CACHE:
353
- try:
354
- print(f"⬇️ Loading Voice: {code}")
355
- path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
356
- VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
357
- except:
358
- if 'af_bella' not in VOICE_CACHE:
359
- p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
360
- VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
361
- return VOICE_CACHE['af_bella']
362
- return VOICE_CACHE[code]
363
- # 4. ONNX Engine
364
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
365
- sess_options = ort.SessionOptions()
366
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
367
- sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
368
- sess_options.intra_op_num_threads = 0
369
- sess_options.inter_op_num_threads = 0
370
- SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
371
- print("βœ… ENGINE READY")
372
- # --- CORE LOGIC (Shared by UI and API) ---
373
- def safe_g2p(text):
374
- """Safely convert text to phonemes, handling errors gracefully"""
375
  if not text or not text.strip():
376
- return []
377
 
378
- # Special replacements
379
- if "Kokoro" in text:
380
- text = text.replace("Kokoro", "kˈOkΙ™ΙΉO")
381
 
382
  try:
383
- phonemes, _ = G2P(text)
384
- # Filter out invalid tokens
385
- tokens = []
386
- for p in phonemes:
387
- token = TOKENIZER.get(p)
388
- if token is not None and token > 0:
389
- tokens.append(token)
390
- return tokens
 
 
 
 
 
391
  except Exception as e:
392
- print(f"⚠️ G2P error for '{text[:30]}...': {e}")
393
- return []
394
- @lru_cache(maxsize=5000)
395
- def get_tokens(text):
396
- return safe_g2p(text)
397
  def trim_silence(audio, threshold=0.01):
398
- if audio.size == 0: return audio
 
399
  mask = np.abs(audio) > threshold
400
- if not np.any(mask): return audio
 
401
  start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
402
  return audio[max(0, start-50) : min(len(audio), end+50)]
403
- def infer(text, voice_name, speed):
404
- if not text.strip(): return None
405
- ids = get_tokens(text)[:510]
406
- if not ids: return None
407
- voice = get_voice(voice_name)
408
- style = voice[min(len(ids), voice.shape[0]-1)]
409
- try:
410
- audio = SESSION.run(None, {
411
- "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
412
- "style": style,
413
- "speed": np.array([speed], dtype=np.float32)
414
- })[0]
415
- return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
416
- except Exception as e:
417
- print(f"⚠️ Inference error: {e}")
418
- return None
419
  def tuned_splitter(text):
 
420
  chunks = re.split(r'([.,!?;:\n]+)', text)
421
  buffer = ""
422
  chunk_count = 0
@@ -433,81 +388,85 @@ def tuned_splitter(text):
433
  buffer = ""
434
  if buffer.strip():
435
  yield buffer.strip()
 
436
  def stream_generator(text, voice_name, speed):
437
- print("--- START STREAM ---")
438
- get_voice(voice_name)
439
  for i, chunk in enumerate(tuned_splitter(text)):
440
  t0 = time.time()
441
- audio = infer(chunk, voice_name, speed)
442
- if audio:
 
443
  dur = time.time() - t0
444
  print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
445
- yield audio
 
 
446
  print("--- END STREAM ---")
 
447
  # --- UI DEFINITION ---
448
  with gr.Blocks(title="Kokoro TTS") as app:
449
- gr.Markdown("## ⚑ Kokoro-82M (High-RAM Tuned)")
450
  with gr.Row():
451
  with gr.Column():
452
- text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! This is a test of the Kokoro TTS system.")
453
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
454
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
455
  btn = gr.Button("Generate", variant="primary")
456
  with gr.Column():
457
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
458
  btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
 
459
  # --- API INTEGRATION ---
460
  api = FastAPI()
 
461
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
462
- G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
463
  INFERENCE_QUEUE = asyncio.Queue()
464
- def g2p_task(text):
465
- """Thread-safe G2P task"""
466
- return safe_g2p(text)
467
  async def audio_engine_loop():
 
468
  print("⚑ API AUDIO PIPELINE STARTED")
469
  loop = asyncio.get_running_loop()
470
 
471
  while True:
472
  job = await INFERENCE_QUEUE.get()
473
- tokens, style, speed, ws = job
474
 
475
  try:
476
  if ws.client_state.value > 1:
477
  continue
478
- input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
479
- style_vec = style[min(len(tokens), style.shape[0]-1)]
480
-
481
  audio = await loop.run_in_executor(
482
- INFERENCE_EXECUTOR,
483
- lambda: SESSION.run(None, {
484
- "input_ids": input_ids,
485
- "style": style_vec,
486
- "speed": np.array([speed], dtype=np.float32)
487
- })[0]
488
  )
489
 
490
- pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()
491
-
492
- try:
493
- await ws.send_bytes(pcm_bytes)
494
- except Exception:
495
- pass
496
 
 
 
 
 
 
497
  except Exception as e:
498
  print(f"⚠️ API Engine Error: {e}")
 
499
  @api.on_event("startup")
500
  async def startup():
501
  asyncio.create_task(audio_engine_loop())
 
502
  @api.websocket("/ws/audio")
503
  async def websocket_endpoint(ws: WebSocket):
504
  await ws.accept()
505
 
506
  voice_key = "af_bella"
507
  speed = 1.0
508
- loop = asyncio.get_running_loop()
509
 
510
  print(f"βœ… Client connected: {ws.client}")
 
511
  async def keep_alive():
512
  while True:
513
  try:
@@ -517,6 +476,7 @@ async def websocket_endpoint(ws: WebSocket):
517
  break
518
 
519
  heartbeat_task = asyncio.create_task(keep_alive())
 
520
  try:
521
  while True:
522
  try:
@@ -527,10 +487,10 @@ async def websocket_endpoint(ws: WebSocket):
527
  except Exception as e:
528
  print(f"⚠️ Connection lost: {e}")
529
  break
 
530
  if "config" in data:
531
  voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
532
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
533
- get_voice(voice_name)
534
  voice_key = voice_code
535
  speed = float(data.get("speed", speed))
536
 
@@ -538,27 +498,20 @@ async def websocket_endpoint(ws: WebSocket):
538
  text = data["text"]
539
  for chunk in tuned_splitter(text):
540
  if chunk.strip():
541
- try:
542
- tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
543
- if tokens:
544
- style = VOICE_CACHE.get(voice_key)
545
- if style is None:
546
- get_voice(voice_key)
547
- style = VOICE_CACHE.get(voice_key)
548
-
549
- await INFERENCE_QUEUE.put((tokens, style, speed, ws))
550
- except Exception as e:
551
- print(f"⚠️ G2P task error: {e}")
552
 
553
  if "flush" in data:
554
  pass
 
555
  except Exception as e:
556
  print(f"πŸ”₯ Critical WS Error: {e}")
557
  import traceback
558
  traceback.print_exc()
559
  finally:
560
  heartbeat_task.cancel()
 
561
  # --- FINAL MOUNT ---
562
  final_app = gr.mount_gradio_app(api, app, path="/")
 
563
  if __name__ == "__main__":
564
  uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
297
  import os
298
  import json
299
  import time
300
  import re
301
  import numpy as np
 
302
  import gradio as gr
 
 
303
  from functools import lru_cache
304
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
305
  import asyncio
306
  import uvloop
307
  import uvicorn
308
  from concurrent.futures import ThreadPoolExecutor
309
+
310
+ # πŸ”₯ USE KOKORO PIPELINE INSTEAD OF RAW MISAKI
311
+ from kokoro import KPipeline
312
+
313
  # --- CONFIGURATION ---
314
+ SAMPLE_RATE = 24000
315
+
 
316
  # --- VOICE UI ---
317
  VOICE_CHOICES = {
318
  'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
 
326
  'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
327
  'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
328
  }
329
+
330
# --- ENGINE ---
print("πŸš€ BOOTING KOKORO PIPELINE ENGINE...")
# uvloop replaces the default asyncio event loop policy for the FastAPI/websocket side.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Initialize KPipeline - this handles espeak fallback automatically!
# Single shared pipeline instance, created once at import time and used by
# both the Gradio UI path and the API worker loop.
PIPELINE = KPipeline(lang_code='a') # 'a' = American English
print("βœ… KOKORO PIPELINE READY")
337
+
338
+ # --- CORE LOGIC ---
339
def generate_audio(text, voice_name, speed):
    """Synthesize *text* into one float waveform via the shared KPipeline.

    *voice_name* may be a UI display name (looked up in VOICE_CHOICES) or a
    raw voice code, which passes through unchanged. Returns a 1-D numpy
    float array, or None when the text is empty, no audio was produced, or
    synthesis raised.
    """
    if not text or not text.strip():
        return None

    # Display name -> voice code; unknown names are used as-is.
    voice = VOICE_CHOICES.get(voice_name, voice_name)

    try:
        # The pipeline yields (graphemes, phonemes, audio) per segment;
        # keep only the non-empty audio pieces.
        segments = [
            seg_audio
            for _, _, seg_audio in PIPELINE(text, voice=voice, speed=speed)
            if seg_audio is not None and len(seg_audio) > 0
        ]
        if not segments:
            return None
        return np.concatenate(segments)
    except Exception as e:
        print(f"⚠️ Audio generation error: {e}")
        return None
363
+
 
 
364
def trim_silence(audio, threshold=0.01):
    """Strip leading/trailing near-silence from a 1-D float audio array.

    Keeps a 50-sample pad on each side of the first/last sample whose
    magnitude exceeds *threshold*. The input is returned unchanged when it
    is None, empty, or entirely below the threshold.
    """
    if audio is None or audio.size == 0:
        return audio
    loud = np.abs(audio) > threshold
    if not np.any(loud):
        return audio
    first = np.argmax(loud)
    last = len(loud) - np.argmax(loud[::-1])
    lo = max(0, first - 50)
    hi = min(len(audio), last + 50)
    return audio[lo:hi]
372
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  def tuned_splitter(text):
374
+ """Split text into chunks for streaming"""
375
  chunks = re.split(r'([.,!?;:\n]+)', text)
376
  buffer = ""
377
  chunk_count = 0
 
388
  buffer = ""
389
  if buffer.strip():
390
  yield buffer.strip()
391
+
392
def stream_generator(text, voice_name, speed):
    """Yield (sample_rate, int16 PCM array) tuples chunk-by-chunk for Gradio.

    Text is split by tuned_splitter; chunks that produce no audio are
    skipped silently.
    """
    print(f"--- START STREAM: {text[:50]}... ---")
    for i, chunk in enumerate(tuned_splitter(text)):
        t0 = time.time()
        audio = generate_audio(chunk, voice_name, speed)
        if audio is None or len(audio) == 0:
            continue
        audio = trim_silence(audio)
        dur = time.time() - t0
        print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
        # Scale float [-1, 1] samples to 16-bit PCM for the audio widget.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        yield (SAMPLE_RATE, pcm)
    print("--- END STREAM ---")
406
+
407
  # --- UI DEFINITION ---
408
with gr.Blocks(title="Kokoro TTS") as app:
    gr.Markdown("## ⚑ Kokoro-82M with KPipeline (Proper Name Support!)")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            text_box = gr.Textbox(label="Input Text", lines=3, value="Hello! My name is Yaman and I work at Willo. Testing pronunciation of names!")
            voice_dd = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
            speed_slider = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
            generate_btn = gr.Button("Generate", variant="primary")
        # Right column: streaming audio player.
        with gr.Column():
            audio_player = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
    generate_btn.click(stream_generator, inputs=[text_box, voice_dd, speed_slider], outputs=[audio_player])
419
+
420
  # --- API INTEGRATION ---
421
  api = FastAPI()
422
+
423
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
424
  INFERENCE_QUEUE = asyncio.Queue()
425
+
 
 
426
async def audio_engine_loop():
    """Background worker: drains INFERENCE_QUEUE and streams PCM to clients.

    Each job is a (text, voice, speed, ws) tuple. Synthesis runs on the
    single-thread inference executor so the event loop stays responsive;
    per-job errors are logged and the loop keeps running.
    """
    print("⚑ API AUDIO PIPELINE STARTED")
    loop = asyncio.get_running_loop()

    while True:
        text, voice, speed, ws = await INFERENCE_QUEUE.get()
        try:
            # Skip jobs whose socket is already closing/closed.
            if ws.client_state.value > 1:
                continue

            # Off-load the blocking KPipeline call to the worker thread.
            audio = await loop.run_in_executor(
                INFERENCE_EXECUTOR, generate_audio, text, voice, speed
            )

            if audio is None or len(audio) == 0:
                continue

            audio = trim_silence(audio)
            # float [-1, 1] samples -> 16-bit PCM bytes.
            pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
            try:
                await ws.send_bytes(pcm)
            except Exception:
                # Client vanished mid-send; drop the chunk.
                pass
        except Exception as e:
            print(f"⚠️ API Engine Error: {e}")
456
+
457
@api.on_event("startup")
async def startup():
    """Start the background audio worker when the API boots.

    The task handle is stored on ``api.state`` so the worker is not
    garbage-collected mid-run: asyncio keeps only a weak reference to
    tasks created via ``create_task``, so a discarded handle can let the
    task disappear.
    """
    # NOTE(review): @on_event is deprecated in newer FastAPI (lifespan
    # handlers are preferred) — kept to avoid changing the app wiring.
    api.state.audio_engine_task = asyncio.create_task(audio_engine_loop())
460
+
461
  @api.websocket("/ws/audio")
462
  async def websocket_endpoint(ws: WebSocket):
463
  await ws.accept()
464
 
465
  voice_key = "af_bella"
466
  speed = 1.0
 
467
 
468
  print(f"βœ… Client connected: {ws.client}")
469
+
470
  async def keep_alive():
471
  while True:
472
  try:
 
476
  break
477
 
478
  heartbeat_task = asyncio.create_task(keep_alive())
479
+
480
  try:
481
  while True:
482
  try:
 
487
  except Exception as e:
488
  print(f"⚠️ Connection lost: {e}")
489
  break
490
+
491
  if "config" in data:
492
  voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
493
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
 
494
  voice_key = voice_code
495
  speed = float(data.get("speed", speed))
496
 
 
498
  text = data["text"]
499
  for chunk in tuned_splitter(text):
500
  if chunk.strip():
501
+ await INFERENCE_QUEUE.put((chunk, voice_key, speed, ws))
 
 
 
 
 
 
 
 
 
 
502
 
503
  if "flush" in data:
504
  pass
505
+
506
  except Exception as e:
507
  print(f"πŸ”₯ Critical WS Error: {e}")
508
  import traceback
509
  traceback.print_exc()
510
  finally:
511
  heartbeat_task.cancel()
512
+
513
  # --- FINAL MOUNT ---
514
  final_app = gr.mount_gradio_app(api, app, path="/")
515
+
516
  if __name__ == "__main__":
517
  uvicorn.run(final_app, host="0.0.0.0", port=7860)