auralodyssey commited on
Commit
89cbd38
·
verified ·
1 Parent(s): f78ae4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +517 -62
app.py CHANGED
@@ -294,11 +294,337 @@
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  import os
298
  import re
 
299
  import time
300
  import asyncio
301
- from concurrent.futures import ThreadPoolExecutor
 
302
 
303
  import numpy as np
304
  import gradio as gr
@@ -309,26 +635,37 @@ import torch
309
  from kokoro import KPipeline
310
 
311
  # ----------------------------
312
- # HARD LIMIT CPU THREADS (2 vCPU box)
313
  # ----------------------------
314
- os.environ.setdefault("OMP_NUM_THREADS", "2")
315
- os.environ.setdefault("MKL_NUM_THREADS", "2")
316
- os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
 
 
317
 
318
  try:
319
- torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
320
- torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
321
  except Exception:
322
  pass
323
 
324
- # Optional: uvloop for faster event loop on HF Linux
325
  try:
326
- import uvloop # type: ignore
327
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
328
- except Exception:
329
- pass
 
 
 
330
 
331
- print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 
 
 
 
 
 
332
 
333
  # ----------------------------
334
  # VOICES
@@ -349,29 +686,31 @@ VOICE_CHOICES = {
349
  def voice_to_lang_code(voice_code: str) -> str:
350
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
351
  return "b" # British
352
- return "a" # American
353
 
354
  # ----------------------------
355
- # PIPELINES (keep hot in RAM)
 
356
  # ----------------------------
 
357
  PIPELINES = {
358
  "a": KPipeline(lang_code="a"),
359
  "b": KPipeline(lang_code="b"),
360
  }
 
361
 
362
  # ----------------------------
363
- # TEXT NORMALIZATION (matches your pasted official docs)
364
  # ----------------------------
365
  def normalize_text(text: str) -> str:
366
  if not text:
367
  return ""
368
- return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
 
 
369
 
370
  # ----------------------------
371
- # LOW LATENCY SEGMENTATION
372
- # One pipeline call per request.
373
- # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
374
- # We also force a small first segment for fast first audio.
375
  # ----------------------------
376
  _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
377
 
@@ -380,10 +719,10 @@ def inject_newlines_for_fast_stream(text: str) -> str:
380
  if not text:
381
  return ""
382
 
383
- # Sentence boundaries -> newline so official split_pattern can segment
384
  text = _SENT_BOUNDARY.sub(r"\1\n", text)
385
-
386
- # Also split on existing multi-newlines
387
  text = re.sub(r"\n{3,}", "\n\n", text)
388
 
389
  # Guarantee a small first segment for low time-to-first-audio
@@ -396,7 +735,7 @@ def inject_newlines_for_fast_stream(text: str) -> str:
396
  return text
397
 
398
  # ----------------------------
399
- # AUDIO CONVERSION (fast, safe)
400
  # ----------------------------
401
  def audio_to_int16_np(audio):
402
  if isinstance(audio, torch.Tensor):
@@ -404,7 +743,7 @@ def audio_to_int16_np(audio):
404
  audio = torch.clamp(audio, -1.0, 1.0)
405
  return (audio * 32767.0).to(torch.int16).numpy()
406
 
407
- audio = np.asarray(audio)
408
  audio = np.clip(audio, -1.0, 1.0)
409
  return (audio * 32767.0).astype(np.int16)
410
 
@@ -412,10 +751,13 @@ def audio_to_pcm_bytes(audio) -> bytes:
412
  return audio_to_int16_np(audio).tobytes()
413
 
414
  # ----------------------------
415
- # OFFICIAL GENERATION PATH (single pipeline call)
416
- # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
417
  # ----------------------------
418
  def kokoro_generator_full(text: str, voice_code: str, speed: float):
 
 
 
 
419
  lang_code = voice_to_lang_code(voice_code)
420
  pipeline = PIPELINES[lang_code]
421
  text = inject_newlines_for_fast_stream(text)
@@ -423,27 +765,50 @@ def kokoro_generator_full(text: str, voice_code: str, speed: float):
423
  if not text:
424
  return
425
 
426
- with torch.inference_mode():
427
- generator = pipeline(
428
- text,
429
- voice=voice_code,
430
- speed=float(speed),
431
- split_pattern=r"\n+",
432
- )
433
- for _, _, audio in generator:
434
- yield audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
  # ----------------------------
437
- # WARMUP (pay cold-start cost at boot)
438
  # ----------------------------
439
  def warmup():
440
  try:
441
  t0 = time.time()
442
- for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
443
  break
444
  print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
445
  except Exception as e:
446
  print(f"⚠️ WARMUP FAILED: {e}")
 
447
 
448
  # ----------------------------
449
  # GRADIO UI STREAM
@@ -461,74 +826,134 @@ def gradio_stream(text, voice_name, speed):
461
  yield 24000, audio_to_int16_np(audio)
462
 
463
  # ----------------------------
464
- # FASTAPI WS ENGINE
465
- # Single worker thread for actual generation.
466
- # Stream frames to client as soon as they exist.
467
- # No buffering a full list before sending.
468
  # ----------------------------
469
  api = FastAPI()
470
 
471
- INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
472
  INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
473
 
474
  async def audio_engine_loop():
475
- print("⚡ API AUDIO PIPELINE STARTED")
 
 
 
 
476
  loop = asyncio.get_running_loop()
477
 
478
  while True:
479
- ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
 
 
 
 
480
 
481
  # Skip dead clients early
482
- if ws.client_state.value > 1:
 
 
 
 
483
  continue
484
 
485
- frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
 
486
 
487
  def _worker():
 
 
 
 
488
  try:
 
 
489
  for audio in kokoro_generator_full(text, voice_code, speed):
490
  b = audio_to_pcm_bytes(audio)
491
- # backpressure aware
492
- while True:
 
 
 
 
 
 
 
 
493
  try:
494
  loop.call_soon_threadsafe(frame_q.put_nowait, b)
495
  break
496
- except Exception:
497
  time.sleep(0.001)
 
 
 
 
 
 
 
498
  loop.call_soon_threadsafe(frame_q.put_nowait, None)
 
 
499
  except Exception as e:
500
- print(f"API Worker Error: {e}")
 
501
  try:
502
  loop.call_soon_threadsafe(frame_q.put_nowait, None)
503
  except Exception:
504
  pass
505
 
 
506
  INFERENCE_EXECUTOR.submit(_worker)
507
 
 
508
  first_sent = False
509
  started = time.time()
 
510
 
511
  while True:
512
- frame = await frame_q.get()
 
 
 
 
 
 
513
  if frame is None:
514
  break
515
 
516
- if ws.client_state.value > 1:
 
 
 
 
 
517
  break
518
 
519
  try:
520
  await ws.send_bytes(frame)
 
 
521
  if not first_sent:
522
- print(f"⚡ API first audio in {time.time() - started:.2f}s")
523
  first_sent = True
524
- except Exception:
 
525
  break
526
 
 
 
527
  @api.on_event("startup")
528
  async def startup():
529
  loop = asyncio.get_running_loop()
 
 
530
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
 
 
531
  asyncio.create_task(audio_engine_loop())
 
 
532
 
533
  @api.websocket("/ws/audio")
534
  async def websocket_endpoint(ws: WebSocket):
@@ -536,10 +961,12 @@ async def websocket_endpoint(ws: WebSocket):
536
 
537
  voice_code = "af_bella"
538
  speed = 1.0
 
539
 
540
- print(f"✅ Client connected: {ws.client}")
541
 
542
  async def keep_alive():
 
543
  while True:
544
  try:
545
  await asyncio.sleep(15)
@@ -552,41 +979,63 @@ async def websocket_endpoint(ws: WebSocket):
552
  try:
553
  while True:
554
  try:
555
- data = await ws.receive_json()
 
 
 
556
  except WebSocketDisconnect:
557
- print("❌ Client disconnected cleanly")
558
  break
559
  except Exception as e:
560
- print(f"⚠️ Connection lost: {e}")
561
  break
562
 
 
563
  if "config" in data:
564
  voice_name = data.get("voice", "🇺🇸 🚺 Bella")
565
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
566
  speed = float(data.get("speed", speed))
 
567
 
 
568
  if "text" in data:
569
  text = normalize_text(data.get("text", ""))
570
  if text.strip():
 
571
  await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
572
 
 
573
  if "flush" in data:
574
  pass
575
 
576
  finally:
577
  heartbeat_task.cancel()
 
 
 
 
 
 
 
 
 
 
 
 
578
 
579
  # ----------------------------
580
  # GRADIO APP
581
  # ----------------------------
582
  with gr.Blocks(title="Kokoro TTS") as app:
583
- gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
 
 
584
  with gr.Row():
585
  with gr.Column():
586
  text_in = gr.Textbox(
587
  label="Input Text",
588
  lines=3,
589
- value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
590
  )
591
  voice_in = gr.Dropdown(
592
  list(VOICE_CHOICES.keys()),
@@ -603,4 +1052,10 @@ with gr.Blocks(title="Kokoro TTS") as app:
603
  final_app = gr.mount_gradio_app(api, app, path="/")
604
 
605
  if __name__ == "__main__":
606
- uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
+ #OLD KOKORO CHATGPT CODE
298
+ # import os
299
+ # import re
300
+ # import time
301
+ # import asyncio
302
+ # from concurrent.futures import ThreadPoolExecutor
303
+
304
+ # import numpy as np
305
+ # import gradio as gr
306
+ # from fastapi import FastAPI, WebSocket, WebSocketDisconnect
307
+ # import uvicorn
308
+
309
+ # import torch
310
+ # from kokoro import KPipeline
311
+
312
+ # # ----------------------------
313
+ # # HARD LIMIT CPU THREADS (2 vCPU box)
314
+ # # ----------------------------
315
+ # os.environ.setdefault("OMP_NUM_THREADS", "2")
316
+ # os.environ.setdefault("MKL_NUM_THREADS", "2")
317
+ # os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
318
+
319
+ # try:
320
+ # torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
321
+ # torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
322
+ # except Exception:
323
+ # pass
324
+
325
+ # # Optional: uvloop for faster event loop on HF Linux
326
+ # try:
327
+ # import uvloop # type: ignore
328
+ # asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
329
+ # except Exception:
330
+ # pass
331
+
332
+ # print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
333
+
334
+ # # ----------------------------
335
+ # # VOICES
336
+ # # ----------------------------
337
+ # VOICE_CHOICES = {
338
+ # "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
339
+ # "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
340
+ # "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
341
+ # "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
342
+ # "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
343
+ # "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
344
+ # "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
345
+ # "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
346
+ # "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
347
+ # "🇬🇧 🚹 Daniel": "bm_daniel",
348
+ # }
349
+
350
+ # def voice_to_lang_code(voice_code: str) -> str:
351
+ # if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
352
+ # return "b" # British
353
+ # return "a" # American
354
+
355
+ # # ----------------------------
356
+ # # PIPELINES (keep hot in RAM)
357
+ # # ----------------------------
358
+ # PIPELINES = {
359
+ # "a": KPipeline(lang_code="a"),
360
+ # "b": KPipeline(lang_code="b"),
361
+ # }
362
+
363
+ # # ----------------------------
364
+ # # TEXT NORMALIZATION (matches your pasted official docs)
365
+ # # ----------------------------
366
+ # def normalize_text(text: str) -> str:
367
+ # if not text:
368
+ # return ""
369
+ # return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
370
+
371
+ # # ----------------------------
372
+ # # LOW LATENCY SEGMENTATION
373
+ # # One pipeline call per request.
374
+ # # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
375
+ # # We also force a small first segment for fast first audio.
376
+ # # ----------------------------
377
+ # _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
378
+
379
+ # def inject_newlines_for_fast_stream(text: str) -> str:
380
+ # text = normalize_text(text).strip()
381
+ # if not text:
382
+ # return ""
383
+
384
+ # # Sentence boundaries -> newline so official split_pattern can segment
385
+ # text = _SENT_BOUNDARY.sub(r"\1\n", text)
386
+
387
+ # # Also split on existing multi-newlines
388
+ # text = re.sub(r"\n{3,}", "\n\n", text)
389
+
390
+ # # Guarantee a small first segment for low time-to-first-audio
391
+ # if "\n" not in text and len(text) > 90:
392
+ # cut = text.rfind(" ", 0, 70)
393
+ # if cut < 35:
394
+ # cut = 70
395
+ # text = text[:cut].strip() + "\n" + text[cut:].strip()
396
+
397
+ # return text
398
+
399
+ # # ----------------------------
400
+ # # AUDIO CONVERSION (fast, safe)
401
+ # # ----------------------------
402
+ # def audio_to_int16_np(audio):
403
+ # if isinstance(audio, torch.Tensor):
404
+ # audio = audio.detach().cpu()
405
+ # audio = torch.clamp(audio, -1.0, 1.0)
406
+ # return (audio * 32767.0).to(torch.int16).numpy()
407
+
408
+ # audio = np.asarray(audio)
409
+ # audio = np.clip(audio, -1.0, 1.0)
410
+ # return (audio * 32767.0).astype(np.int16)
411
+
412
+ # def audio_to_pcm_bytes(audio) -> bytes:
413
+ # return audio_to_int16_np(audio).tobytes()
414
+
415
+ # # ----------------------------
416
+ # # OFFICIAL GENERATION PATH (single pipeline call)
417
+ # # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
418
+ # # ----------------------------
419
+ # def kokoro_generator_full(text: str, voice_code: str, speed: float):
420
+ # lang_code = voice_to_lang_code(voice_code)
421
+ # pipeline = PIPELINES[lang_code]
422
+ # text = inject_newlines_for_fast_stream(text)
423
+
424
+ # if not text:
425
+ # return
426
+
427
+ # with torch.inference_mode():
428
+ # generator = pipeline(
429
+ # text,
430
+ # voice=voice_code,
431
+ # speed=float(speed),
432
+ # split_pattern=r"\n+",
433
+ # )
434
+ # for _, _, audio in generator:
435
+ # yield audio
436
+
437
+ # # ----------------------------
438
+ # # WARMUP (pay cold-start cost at boot)
439
+ # # ----------------------------
440
+ # def warmup():
441
+ # try:
442
+ # t0 = time.time()
443
+ # for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
444
+ # break
445
+ # print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
446
+ # except Exception as e:
447
+ # print(f"⚠️ WARMUP FAILED: {e}")
448
+
449
+ # # ----------------------------
450
+ # # GRADIO UI STREAM
451
+ # # ----------------------------
452
+ # def gradio_stream(text, voice_name, speed):
453
+ # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
454
+ # text = normalize_text(text)
455
+
456
+ # i = 0
457
+ # t0 = time.time()
458
+ # for audio in kokoro_generator_full(text, voice_code, speed):
459
+ # if i == 0:
460
+ # print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
461
+ # i += 1
462
+ # yield 24000, audio_to_int16_np(audio)
463
+
464
+ # # ----------------------------
465
+ # # FASTAPI WS ENGINE
466
+ # # Single worker thread for actual generation.
467
+ # # Stream frames to client as soon as they exist.
468
+ # # No buffering a full list before sending.
469
+ # # ----------------------------
470
+ # api = FastAPI()
471
+
472
+ # INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
473
+ # INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
474
+
475
+ # async def audio_engine_loop():
476
+ # print("⚡ API AUDIO PIPELINE STARTED")
477
+ # loop = asyncio.get_running_loop()
478
+
479
+ # while True:
480
+ # ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
481
+
482
+ # # Skip dead clients early
483
+ # if ws.client_state.value > 1:
484
+ # continue
485
+
486
+ # frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
487
+
488
+ # def _worker():
489
+ # try:
490
+ # for audio in kokoro_generator_full(text, voice_code, speed):
491
+ # b = audio_to_pcm_bytes(audio)
492
+ # # backpressure aware
493
+ # while True:
494
+ # try:
495
+ # loop.call_soon_threadsafe(frame_q.put_nowait, b)
496
+ # break
497
+ # except Exception:
498
+ # time.sleep(0.001)
499
+ # loop.call_soon_threadsafe(frame_q.put_nowait, None)
500
+ # except Exception as e:
501
+ # print(f"API Worker Error: {e}")
502
+ # try:
503
+ # loop.call_soon_threadsafe(frame_q.put_nowait, None)
504
+ # except Exception:
505
+ # pass
506
+
507
+ # INFERENCE_EXECUTOR.submit(_worker)
508
+
509
+ # first_sent = False
510
+ # started = time.time()
511
+
512
+ # while True:
513
+ # frame = await frame_q.get()
514
+ # if frame is None:
515
+ # break
516
+
517
+ # if ws.client_state.value > 1:
518
+ # break
519
+
520
+ # try:
521
+ # await ws.send_bytes(frame)
522
+ # if not first_sent:
523
+ # print(f"⚡ API first audio in {time.time() - started:.2f}s")
524
+ # first_sent = True
525
+ # except Exception:
526
+ # break
527
+
528
+ # @api.on_event("startup")
529
+ # async def startup():
530
+ # loop = asyncio.get_running_loop()
531
+ # await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
532
+ # asyncio.create_task(audio_engine_loop())
533
+
534
+ # @api.websocket("/ws/audio")
535
+ # async def websocket_endpoint(ws: WebSocket):
536
+ # await ws.accept()
537
+
538
+ # voice_code = "af_bella"
539
+ # speed = 1.0
540
+
541
+ # print(f"✅ Client connected: {ws.client}")
542
+
543
+ # async def keep_alive():
544
+ # while True:
545
+ # try:
546
+ # await asyncio.sleep(15)
547
+ # await ws.send_json({"type": "ping"})
548
+ # except Exception:
549
+ # break
550
+
551
+ # heartbeat_task = asyncio.create_task(keep_alive())
552
+
553
+ # try:
554
+ # while True:
555
+ # try:
556
+ # data = await ws.receive_json()
557
+ # except WebSocketDisconnect:
558
+ # print("❌ Client disconnected cleanly")
559
+ # break
560
+ # except Exception as e:
561
+ # print(f"⚠️ Connection lost: {e}")
562
+ # break
563
+
564
+ # if "config" in data:
565
+ # voice_name = data.get("voice", "🇺🇸 🚺 Bella")
566
+ # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
567
+ # speed = float(data.get("speed", speed))
568
+
569
+ # if "text" in data:
570
+ # text = normalize_text(data.get("text", ""))
571
+ # if text.strip():
572
+ # await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
573
+
574
+ # if "flush" in data:
575
+ # pass
576
+
577
+ # finally:
578
+ # heartbeat_task.cancel()
579
+
580
+ # # ----------------------------
581
+ # # GRADIO APP
582
+ # # ----------------------------
583
+ # with gr.Blocks(title="Kokoro TTS") as app:
584
+ # gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
585
+ # with gr.Row():
586
+ # with gr.Column():
587
+ # text_in = gr.Textbox(
588
+ # label="Input Text",
589
+ # lines=3,
590
+ # value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
591
+ # )
592
+ # voice_in = gr.Dropdown(
593
+ # list(VOICE_CHOICES.keys()),
594
+ # value="🇺🇸 🚺 Bella",
595
+ # label="Voice",
596
+ # )
597
+ # speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
598
+ # btn = gr.Button("Generate", variant="primary")
599
+ # with gr.Column():
600
+ # audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
601
+
602
+ # btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
603
+
604
+ # final_app = gr.mount_gradio_app(api, app, path="/")
605
+
606
+ # if __name__ == "__main__":
607
+ # uvicorn.run(final_app, host="0.0.0.0", port=7860)
608
+ #claude code
609
+ """
610
+ Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
611
+ ============================================================
612
+ Fixes:
613
+ - Backpressure loop timeout prevents worker thread hang
614
+ - Parallel inference workers (2, one per vCPU)
615
+ - Proper error handling with traceback logging
616
+ - Generation timeout to prevent infinite hangs
617
+ - Memory-optimized with periodic garbage collection
618
+ - Aggressive batching for throughput
619
+ """
620
+
621
  import os
622
  import re
623
+ import gc
624
  import time
625
  import asyncio
626
+ import traceback
627
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
628
 
629
  import numpy as np
630
  import gradio as gr
 
635
  from kokoro import KPipeline
636
 
637
  # ----------------------------
638
+ # MAXIMIZE 2 vCPU UTILIZATION
639
  # ----------------------------
640
+ CPU_COUNT = 2
641
+ os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
642
+ os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
643
+ os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
644
+ os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
645
 
646
  try:
647
+ torch.set_num_threads(CPU_COUNT)
648
+ torch.set_num_interop_threads(CPU_COUNT)
649
  except Exception:
650
  pass
651
 
652
+ # Use uvloop for faster async on Linux
653
  try:
654
+ import uvloop
655
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
656
+ print("✅ Using uvloop for faster async")
657
+ except ImportError:
658
+ print("⚠️ uvloop not available, using default event loop")
659
+
660
+ print(f"🚀 BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
661
 
662
+ # ----------------------------
663
+ # CONFIGURATION
664
+ # ----------------------------
665
+ GENERATION_TIMEOUT_SECONDS = 60 # Max time for a single TTS generation
666
+ BACKPRESSURE_TIMEOUT_MS = 10000 # Max wait for queue space (10 seconds)
667
+ WORKER_COUNT = 2 # One per vCPU for parallel processing
668
+ QUEUE_MAXSIZE = 12 # Buffer more frames for smoother streaming
669
 
670
  # ----------------------------
671
  # VOICES
 
686
  def voice_to_lang_code(voice_code: str) -> str:
687
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
688
  return "b" # British
689
+ return "a" # American
690
 
691
  # ----------------------------
692
+ # PIPELINES (hot in RAM - uses ~2GB per pipeline)
693
+ # With 16GB RAM we can comfortably hold both
694
  # ----------------------------
695
+ print("📦 Loading Kokoro pipelines into RAM...")
696
  PIPELINES = {
697
  "a": KPipeline(lang_code="a"),
698
  "b": KPipeline(lang_code="b"),
699
  }
700
+ print(f"✅ Pipelines loaded. Memory usage: ~4GB for models")
701
 
702
  # ----------------------------
703
+ # TEXT NORMALIZATION
704
  # ----------------------------
705
  def normalize_text(text: str) -> str:
706
  if not text:
707
  return ""
708
+ # Kokoro pronunciation helper
709
+ text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
710
+ return text
711
 
712
  # ----------------------------
713
+ # FAST SEGMENTATION FOR STREAMING
 
 
 
714
  # ----------------------------
715
  _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
716
 
 
719
  if not text:
720
  return ""
721
 
722
+ # Sentence boundaries -> newline for pipeline segmentation
723
  text = _SENT_BOUNDARY.sub(r"\1\n", text)
724
+
725
+ # Normalize excessive newlines
726
  text = re.sub(r"\n{3,}", "\n\n", text)
727
 
728
  # Guarantee a small first segment for low time-to-first-audio
 
735
  return text
736
 
737
  # ----------------------------
738
+ # AUDIO CONVERSION (optimized)
739
  # ----------------------------
740
  def audio_to_int16_np(audio):
741
  if isinstance(audio, torch.Tensor):
 
743
  audio = torch.clamp(audio, -1.0, 1.0)
744
  return (audio * 32767.0).to(torch.int16).numpy()
745
 
746
+ audio = np.asarray(audio, dtype=np.float32)
747
  audio = np.clip(audio, -1.0, 1.0)
748
  return (audio * 32767.0).astype(np.int16)
749
 
 
751
  return audio_to_int16_np(audio).tobytes()
752
 
753
  # ----------------------------
754
+ # GENERATION WITH TIMEOUT
 
755
  # ----------------------------
756
  def kokoro_generator_full(text: str, voice_code: str, speed: float):
757
+ """
758
+ Generate audio chunks from text using Kokoro pipeline.
759
+ Yields audio tensors for each segment.
760
+ """
761
  lang_code = voice_to_lang_code(voice_code)
762
  pipeline = PIPELINES[lang_code]
763
  text = inject_newlines_for_fast_stream(text)
 
765
  if not text:
766
  return
767
 
768
+ chunk_count = 0
769
+ start_time = time.time()
770
+
771
+ try:
772
+ with torch.inference_mode():
773
+ generator = pipeline(
774
+ text,
775
+ voice=voice_code,
776
+ speed=float(speed),
777
+ split_pattern=r"\n+",
778
+ )
779
+ for _, _, audio in generator:
780
+ chunk_count += 1
781
+ elapsed = time.time() - start_time
782
+
783
+ # Timeout protection
784
+ if elapsed > GENERATION_TIMEOUT_SECONDS:
785
+ print(f"⚠️ Generation timeout after {elapsed:.1f}s, {chunk_count} chunks")
786
+ break
787
+
788
+ yield audio
789
+
790
+ print(f"✅ Generated {chunk_count} chunks in {time.time() - start_time:.2f}s")
791
+
792
+ except Exception as e:
793
+ print(f"❌ Generation error: {e}")
794
+ traceback.print_exc()
795
+ finally:
796
+ # Periodic garbage collection to prevent memory buildup
797
+ if chunk_count > 10:
798
+ gc.collect()
799
 
800
  # ----------------------------
801
+ # WARMUP (preload models)
802
  # ----------------------------
803
  def warmup():
804
  try:
805
  t0 = time.time()
806
+ for _ in kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0):
807
  break
808
  print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
809
  except Exception as e:
810
  print(f"⚠️ WARMUP FAILED: {e}")
811
+ traceback.print_exc()
812
 
813
  # ----------------------------
814
  # GRADIO UI STREAM
 
826
  yield 24000, audio_to_int16_np(audio)
827
 
828
  # ----------------------------
829
+ # FASTAPI WEBSOCKET ENGINE
 
 
 
830
  # ----------------------------
831
  api = FastAPI()
832
 
833
+ # Use multiple workers for parallel inference
834
+ INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
835
  INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
836
 
837
  async def audio_engine_loop():
838
+ """
839
+ Main audio processing loop.
840
+ Pulls requests from queue and streams audio back to clients.
841
+ """
842
+ print(f"⚡ API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
843
  loop = asyncio.get_running_loop()
844
 
845
  while True:
846
+ try:
847
+ ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
848
+ except Exception as e:
849
+ print(f"⚠️ Queue get error: {e}")
850
+ continue
851
 
852
  # Skip dead clients early
853
+ try:
854
+ if ws.client_state.value > 1:
855
+ print("⏭️ Skipping dead client")
856
+ continue
857
+ except Exception:
858
  continue
859
 
860
+ frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
861
+ generation_id = id(ws)
862
 
863
  def _worker():
864
+ """Worker thread for audio generation."""
865
+ chunk_count = 0
866
+ start_time = time.time()
867
+
868
  try:
869
+ print(f"🔊 [{generation_id}] Starting TTS: {text[:50]}...")
870
+
871
  for audio in kokoro_generator_full(text, voice_code, speed):
872
  b = audio_to_pcm_bytes(audio)
873
+ chunk_count += 1
874
+
875
+ if chunk_count == 1:
876
+ print(f"⚡ [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
877
+
878
+ # Backpressure with TIMEOUT to prevent infinite hang
879
+ attempts = 0
880
+ max_attempts = BACKPRESSURE_TIMEOUT_MS # 10 seconds at 1ms/attempt
881
+
882
+ while attempts < max_attempts:
883
  try:
884
  loop.call_soon_threadsafe(frame_q.put_nowait, b)
885
  break
886
+ except asyncio.QueueFull:
887
  time.sleep(0.001)
888
+ attempts += 1
889
+ else:
890
+ # Timeout reached - client too slow or disconnected
891
+ print(f"⚠️ [{generation_id}] Backpressure timeout after {attempts}ms - aborting")
892
+ break
893
+
894
+ # Send completion signal
895
  loop.call_soon_threadsafe(frame_q.put_nowait, None)
896
+ print(f"✅ [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
897
+
898
  except Exception as e:
899
+ print(f" [{generation_id}] Worker error: {e}")
900
+ traceback.print_exc()
901
  try:
902
  loop.call_soon_threadsafe(frame_q.put_nowait, None)
903
  except Exception:
904
  pass
905
 
906
+ # Submit to executor
907
  INFERENCE_EXECUTOR.submit(_worker)
908
 
909
+ # Stream frames to client
910
  first_sent = False
911
  started = time.time()
912
+ frames_sent = 0
913
 
914
  while True:
915
+ try:
916
+ # Timeout on frame retrieval to prevent infinite hang
917
+ frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
918
+ except asyncio.TimeoutError:
919
+ print(f"⚠️ [{generation_id}] Frame queue timeout - no data for 30s")
920
+ break
921
+
922
  if frame is None:
923
  break
924
 
925
+ # Check client still alive
926
+ try:
927
+ if ws.client_state.value > 1:
928
+ print(f"⏭️ [{generation_id}] Client disconnected mid-stream")
929
+ break
930
+ except Exception:
931
  break
932
 
933
  try:
934
  await ws.send_bytes(frame)
935
+ frames_sent += 1
936
+
937
  if not first_sent:
938
+ print(f"⚡ [{generation_id}] First audio sent in {time.time() - started:.2f}s")
939
  first_sent = True
940
+ except Exception as e:
941
+ print(f"⚠️ [{generation_id}] Send failed: {e}")
942
  break
943
 
944
+ print(f"📤 [{generation_id}] Streaming complete: {frames_sent} frames sent")
945
+
946
  @api.on_event("startup")
947
  async def startup():
948
  loop = asyncio.get_running_loop()
949
+
950
+ # Warmup in executor to not block startup
951
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
952
+
953
+ # Start the audio engine loop
954
  asyncio.create_task(audio_engine_loop())
955
+
956
+ print("🚀 Server ready!")
957
 
958
  @api.websocket("/ws/audio")
959
  async def websocket_endpoint(ws: WebSocket):
 
961
 
962
  voice_code = "af_bella"
963
  speed = 1.0
964
+ client_id = id(ws)
965
 
966
+ print(f"✅ [{client_id}] Client connected: {ws.client}")
967
 
968
  async def keep_alive():
969
+ """Send periodic pings to keep connection alive."""
970
  while True:
971
  try:
972
  await asyncio.sleep(15)
 
979
  try:
980
  while True:
981
  try:
982
+ data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
983
+ except asyncio.TimeoutError:
984
+ print(f"⏱️ [{client_id}] Connection timeout - no messages for 120s")
985
+ break
986
  except WebSocketDisconnect:
987
+ print(f"❌ [{client_id}] Client disconnected cleanly")
988
  break
989
  except Exception as e:
990
+ print(f"⚠️ [{client_id}] Connection error: {e}")
991
  break
992
 
993
+ # Handle config updates
994
  if "config" in data:
995
  voice_name = data.get("voice", "🇺🇸 🚺 Bella")
996
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
997
  speed = float(data.get("speed", speed))
998
+ print(f"🎛️ [{client_id}] Config: voice={voice_code}, speed={speed}")
999
 
1000
+ # Handle text-to-speech request
1001
  if "text" in data:
1002
  text = normalize_text(data.get("text", ""))
1003
  if text.strip():
1004
+ print(f"📥 [{client_id}] TTS request: {text[:50]}...")
1005
  await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
1006
 
1007
+ # Handle flush (no-op for now, could clear queue)
1008
  if "flush" in data:
1009
  pass
1010
 
1011
  finally:
1012
  heartbeat_task.cancel()
1013
+ print(f"👋 [{client_id}] Connection closed")
1014
+
1015
+ # ----------------------------
1016
+ # HEALTH CHECK ENDPOINT
1017
+ # ----------------------------
1018
+ @api.get("/health")
1019
+ async def health_check():
1020
+ return {
1021
+ "status": "healthy",
1022
+ "workers": WORKER_COUNT,
1023
+ "queue_size": INFERENCE_QUEUE.qsize(),
1024
+ }
1025
 
1026
  # ----------------------------
1027
  # GRADIO APP
1028
  # ----------------------------
1029
  with gr.Blocks(title="Kokoro TTS") as app:
1030
+ gr.Markdown("## ⚡ Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
1031
+ gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
1032
+
1033
  with gr.Row():
1034
  with gr.Column():
1035
  text_in = gr.Textbox(
1036
  label="Input Text",
1037
  lines=3,
1038
+ value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
1039
  )
1040
  voice_in = gr.Dropdown(
1041
  list(VOICE_CHOICES.keys()),
 
1052
  final_app = gr.mount_gradio_app(api, app, path="/")
1053
 
1054
  if __name__ == "__main__":
1055
+ uvicorn.run(
1056
+ final_app,
1057
+ host="0.0.0.0",
1058
+ port=7860,
1059
+ workers=1, # Single process, multiple threads
1060
+ log_level="info",
1061
+ )