auralodyssey commited on
Commit
d638591
ยท
verified ยท
1 Parent(s): 89cbd38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +638 -638
app.py CHANGED
@@ -295,767 +295,767 @@
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  #OLD KOKORO CHATGPT CODE
298
- # import os
299
- # import re
300
- # import time
301
- # import asyncio
302
- # from concurrent.futures import ThreadPoolExecutor
303
-
304
- # import numpy as np
305
- # import gradio as gr
306
- # from fastapi import FastAPI, WebSocket, WebSocketDisconnect
307
- # import uvicorn
308
 
309
- # import torch
310
- # from kokoro import KPipeline
 
 
311
 
312
- # # ----------------------------
313
- # # HARD LIMIT CPU THREADS (2 vCPU box)
314
- # # ----------------------------
315
- # os.environ.setdefault("OMP_NUM_THREADS", "2")
316
- # os.environ.setdefault("MKL_NUM_THREADS", "2")
317
- # os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
318
 
319
- # try:
320
- # torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
321
- # torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
322
- # except Exception:
323
- # pass
 
324
 
325
- # # Optional: uvloop for faster event loop on HF Linux
326
- # try:
327
- # import uvloop # type: ignore
328
- # asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
329
- # except Exception:
330
- # pass
331
 
332
- # print("๐Ÿš€ BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 
 
 
 
 
333
 
334
- # # ----------------------------
335
- # # VOICES
336
- # # ----------------------------
337
- # VOICE_CHOICES = {
338
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Heart": "af_heart", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella": "af_bella", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nicole": "af_nicole",
339
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Aoede": "af_aoede", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Kore": "af_kore", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sarah": "af_sarah",
340
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nova": "af_nova", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sky": "af_sky", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Alloy": "af_alloy",
341
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Jessica": "af_jessica", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ River": "af_river", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Michael": "am_michael",
342
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Fenrir": "am_fenrir", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Puck": "am_puck", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Echo": "am_echo",
343
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Eric": "am_eric", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Liam": "am_liam", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Onyx": "am_onyx",
344
- # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Santa": "am_santa", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Adam": "am_adam", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Emma": "bf_emma",
345
- # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Isabella": "bf_isabella", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Alice": "bf_alice", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Lily": "bf_lily",
346
- # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน George": "bm_george", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Fable": "bm_fable", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Lewis": "bm_lewis",
347
- # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Daniel": "bm_daniel",
348
- # }
349
 
350
- # def voice_to_lang_code(voice_code: str) -> str:
351
- # if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
352
- # return "b" # British
353
- # return "a" # American
 
 
 
 
 
 
 
 
 
 
 
354
 
355
- # # ----------------------------
356
- # # PIPELINES (keep hot in RAM)
357
- # # ----------------------------
358
- # PIPELINES = {
359
- # "a": KPipeline(lang_code="a"),
360
- # "b": KPipeline(lang_code="b"),
361
- # }
362
 
363
- # # ----------------------------
364
- # # TEXT NORMALIZATION (matches your pasted official docs)
365
- # # ----------------------------
366
- # def normalize_text(text: str) -> str:
367
- # if not text:
368
- # return ""
369
- # return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
370
 
371
- # # ----------------------------
372
- # # LOW LATENCY SEGMENTATION
373
- # # One pipeline call per request.
374
- # # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
375
- # # We also force a small first segment for fast first audio.
376
- # # ----------------------------
377
- # _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
378
 
379
- # def inject_newlines_for_fast_stream(text: str) -> str:
380
- # text = normalize_text(text).strip()
381
- # if not text:
382
- # return ""
 
 
 
383
 
384
- # # Sentence boundaries -> newline so official split_pattern can segment
385
- # text = _SENT_BOUNDARY.sub(r"\1\n", text)
 
 
386
 
387
- # # Also split on existing multi-newlines
388
- # text = re.sub(r"\n{3,}", "\n\n", text)
389
 
390
- # # Guarantee a small first segment for low time-to-first-audio
391
- # if "\n" not in text and len(text) > 90:
392
- # cut = text.rfind(" ", 0, 70)
393
- # if cut < 35:
394
- # cut = 70
395
- # text = text[:cut].strip() + "\n" + text[cut:].strip()
396
 
397
- # return text
 
 
 
 
 
398
 
399
- # # ----------------------------
400
- # # AUDIO CONVERSION (fast, safe)
401
- # # ----------------------------
402
- # def audio_to_int16_np(audio):
403
- # if isinstance(audio, torch.Tensor):
404
- # audio = audio.detach().cpu()
405
- # audio = torch.clamp(audio, -1.0, 1.0)
406
- # return (audio * 32767.0).to(torch.int16).numpy()
407
 
408
- # audio = np.asarray(audio)
409
- # audio = np.clip(audio, -1.0, 1.0)
410
- # return (audio * 32767.0).astype(np.int16)
 
 
 
 
 
411
 
412
- # def audio_to_pcm_bytes(audio) -> bytes:
413
- # return audio_to_int16_np(audio).tobytes()
 
414
 
415
- # # ----------------------------
416
- # # OFFICIAL GENERATION PATH (single pipeline call)
417
- # # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
418
- # # ----------------------------
419
- # def kokoro_generator_full(text: str, voice_code: str, speed: float):
420
- # lang_code = voice_to_lang_code(voice_code)
421
- # pipeline = PIPELINES[lang_code]
422
- # text = inject_newlines_for_fast_stream(text)
423
 
424
- # if not text:
425
- # return
 
 
 
 
 
 
426
 
427
- # with torch.inference_mode():
428
- # generator = pipeline(
429
- # text,
430
- # voice=voice_code,
431
- # speed=float(speed),
432
- # split_pattern=r"\n+",
433
- # )
434
- # for _, _, audio in generator:
435
- # yield audio
436
 
437
- # # ----------------------------
438
- # # WARMUP (pay cold-start cost at boot)
439
- # # ----------------------------
440
- # def warmup():
441
- # try:
442
- # t0 = time.time()
443
- # for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
444
- # break
445
- # print(f"โœ… WARMUP DONE in {time.time() - t0:.2f}s")
446
- # except Exception as e:
447
- # print(f"โš ๏ธ WARMUP FAILED: {e}")
448
 
449
- # # ----------------------------
450
- # # GRADIO UI STREAM
451
- # # ----------------------------
452
- # def gradio_stream(text, voice_name, speed):
453
- # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
454
- # text = normalize_text(text)
 
 
 
 
 
455
 
456
- # i = 0
457
- # t0 = time.time()
458
- # for audio in kokoro_generator_full(text, voice_code, speed):
459
- # if i == 0:
460
- # print(f"โšก UI first audio in {time.time() - t0:.2f}s")
461
- # i += 1
462
- # yield 24000, audio_to_int16_np(audio)
463
 
464
- # # ----------------------------
465
- # # FASTAPI WS ENGINE
466
- # # Single worker thread for actual generation.
467
- # # Stream frames to client as soon as they exist.
468
- # # No buffering a full list before sending.
469
- # # ----------------------------
470
- # api = FastAPI()
471
 
472
- # INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
473
- # INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
 
 
 
 
 
474
 
475
- # async def audio_engine_loop():
476
- # print("โšก API AUDIO PIPELINE STARTED")
477
- # loop = asyncio.get_running_loop()
478
 
479
- # while True:
480
- # ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
 
481
 
482
- # # Skip dead clients early
483
- # if ws.client_state.value > 1:
484
- # continue
485
 
486
- # frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
 
 
487
 
488
- # def _worker():
489
- # try:
490
- # for audio in kokoro_generator_full(text, voice_code, speed):
491
- # b = audio_to_pcm_bytes(audio)
492
- # # backpressure aware
493
- # while True:
494
- # try:
495
- # loop.call_soon_threadsafe(frame_q.put_nowait, b)
496
- # break
497
- # except Exception:
498
- # time.sleep(0.001)
499
- # loop.call_soon_threadsafe(frame_q.put_nowait, None)
500
- # except Exception as e:
501
- # print(f"API Worker Error: {e}")
502
- # try:
503
- # loop.call_soon_threadsafe(frame_q.put_nowait, None)
504
- # except Exception:
505
- # pass
506
 
507
- # INFERENCE_EXECUTOR.submit(_worker)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
- # first_sent = False
510
- # started = time.time()
511
 
512
- # while True:
513
- # frame = await frame_q.get()
514
- # if frame is None:
515
- # break
516
 
517
- # if ws.client_state.value > 1:
518
- # break
 
 
519
 
520
- # try:
521
- # await ws.send_bytes(frame)
522
- # if not first_sent:
523
- # print(f"โšก API first audio in {time.time() - started:.2f}s")
524
- # first_sent = True
525
- # except Exception:
526
- # break
527
 
528
- # @api.on_event("startup")
529
- # async def startup():
530
- # loop = asyncio.get_running_loop()
531
- # await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
532
- # asyncio.create_task(audio_engine_loop())
 
 
533
 
534
- # @api.websocket("/ws/audio")
535
- # async def websocket_endpoint(ws: WebSocket):
536
- # await ws.accept()
 
 
537
 
538
- # voice_code = "af_bella"
539
- # speed = 1.0
 
540
 
541
- # print(f"โœ… Client connected: {ws.client}")
 
542
 
543
- # async def keep_alive():
544
- # while True:
545
- # try:
546
- # await asyncio.sleep(15)
547
- # await ws.send_json({"type": "ping"})
548
- # except Exception:
549
- # break
550
 
551
- # heartbeat_task = asyncio.create_task(keep_alive())
 
 
 
 
 
 
552
 
553
- # try:
554
- # while True:
555
- # try:
556
- # data = await ws.receive_json()
557
- # except WebSocketDisconnect:
558
- # print("โŒ Client disconnected cleanly")
559
- # break
560
- # except Exception as e:
561
- # print(f"โš ๏ธ Connection lost: {e}")
562
- # break
563
 
564
- # if "config" in data:
565
- # voice_name = data.get("voice", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella")
566
- # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
567
- # speed = float(data.get("speed", speed))
 
 
 
 
 
 
568
 
569
- # if "text" in data:
570
- # text = normalize_text(data.get("text", ""))
571
- # if text.strip():
572
- # await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
573
 
574
- # if "flush" in data:
575
- # pass
 
 
576
 
577
- # finally:
578
- # heartbeat_task.cancel()
579
 
580
- # # ----------------------------
581
- # # GRADIO APP
582
- # # ----------------------------
583
- # with gr.Blocks(title="Kokoro TTS") as app:
584
- # gr.Markdown("## โšก Kokoro-82M (Official Pipeline, Low Latency)")
585
- # with gr.Row():
586
- # with gr.Column():
587
- # text_in = gr.Textbox(
588
- # label="Input Text",
589
- # lines=3,
590
- # value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
591
- # )
592
- # voice_in = gr.Dropdown(
593
- # list(VOICE_CHOICES.keys()),
594
- # value="๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella",
595
- # label="Voice",
596
- # )
597
- # speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
598
- # btn = gr.Button("Generate", variant="primary")
599
- # with gr.Column():
600
- # audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
601
 
602
- # btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
 
604
- # final_app = gr.mount_gradio_app(api, app, path="/")
605
 
606
- # if __name__ == "__main__":
607
- # uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
 
608
  #claude code
609
- """
610
- Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
611
- ============================================================
612
- Fixes:
613
- - Backpressure loop timeout prevents worker thread hang
614
- - Parallel inference workers (2, one per vCPU)
615
- - Proper error handling with traceback logging
616
- - Generation timeout to prevent infinite hangs
617
- - Memory-optimized with periodic garbage collection
618
- - Aggressive batching for throughput
619
- """
620
 
621
- import os
622
- import re
623
- import gc
624
- import time
625
- import asyncio
626
- import traceback
627
- from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
628
 
629
- import numpy as np
630
- import gradio as gr
631
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect
632
- import uvicorn
633
 
634
- import torch
635
- from kokoro import KPipeline
636
 
637
- # ----------------------------
638
- # MAXIMIZE 2 vCPU UTILIZATION
639
- # ----------------------------
640
- CPU_COUNT = 2
641
- os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
642
- os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
643
- os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
644
- os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
645
 
646
- try:
647
- torch.set_num_threads(CPU_COUNT)
648
- torch.set_num_interop_threads(CPU_COUNT)
649
- except Exception:
650
- pass
651
 
652
- # Use uvloop for faster async on Linux
653
- try:
654
- import uvloop
655
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
656
- print("โœ… Using uvloop for faster async")
657
- except ImportError:
658
- print("โš ๏ธ uvloop not available, using default event loop")
659
 
660
- print(f"๐Ÿš€ BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
661
 
662
- # ----------------------------
663
- # CONFIGURATION
664
- # ----------------------------
665
- GENERATION_TIMEOUT_SECONDS = 60 # Max time for a single TTS generation
666
- BACKPRESSURE_TIMEOUT_MS = 10000 # Max wait for queue space (10 seconds)
667
- WORKER_COUNT = 2 # One per vCPU for parallel processing
668
- QUEUE_MAXSIZE = 12 # Buffer more frames for smoother streaming
669
 
670
- # ----------------------------
671
- # VOICES
672
- # ----------------------------
673
- VOICE_CHOICES = {
674
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Heart": "af_heart", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella": "af_bella", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nicole": "af_nicole",
675
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Aoede": "af_aoede", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Kore": "af_kore", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sarah": "af_sarah",
676
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nova": "af_nova", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sky": "af_sky", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Alloy": "af_alloy",
677
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Jessica": "af_jessica", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ River": "af_river", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Michael": "am_michael",
678
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Fenrir": "am_fenrir", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Puck": "am_puck", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Echo": "am_echo",
679
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Eric": "am_eric", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Liam": "am_liam", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Onyx": "am_onyx",
680
- "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Santa": "am_santa", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Adam": "am_adam", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Emma": "bf_emma",
681
- "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Isabella": "bf_isabella", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Alice": "bf_alice", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Lily": "bf_lily",
682
- "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน George": "bm_george", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Fable": "bm_fable", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Lewis": "bm_lewis",
683
- "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Daniel": "bm_daniel",
684
- }
685
 
686
- def voice_to_lang_code(voice_code: str) -> str:
687
- if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
688
- return "b" # British
689
- return "a" # American
690
 
691
- # ----------------------------
692
- # PIPELINES (hot in RAM - uses ~2GB per pipeline)
693
- # With 16GB RAM we can comfortably hold both
694
- # ----------------------------
695
- print("๐Ÿ“ฆ Loading Kokoro pipelines into RAM...")
696
- PIPELINES = {
697
- "a": KPipeline(lang_code="a"),
698
- "b": KPipeline(lang_code="b"),
699
- }
700
- print(f"โœ… Pipelines loaded. Memory usage: ~4GB for models")
701
 
702
- # ----------------------------
703
- # TEXT NORMALIZATION
704
- # ----------------------------
705
- def normalize_text(text: str) -> str:
706
- if not text:
707
- return ""
708
- # Kokoro pronunciation helper
709
- text = text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
710
- return text
711
 
712
- # ----------------------------
713
- # FAST SEGMENTATION FOR STREAMING
714
- # ----------------------------
715
- _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
716
 
717
- def inject_newlines_for_fast_stream(text: str) -> str:
718
- text = normalize_text(text).strip()
719
- if not text:
720
- return ""
721
 
722
- # Sentence boundaries -> newline for pipeline segmentation
723
- text = _SENT_BOUNDARY.sub(r"\1\n", text)
724
 
725
- # Normalize excessive newlines
726
- text = re.sub(r"\n{3,}", "\n\n", text)
727
 
728
- # Guarantee a small first segment for low time-to-first-audio
729
- if "\n" not in text and len(text) > 90:
730
- cut = text.rfind(" ", 0, 70)
731
- if cut < 35:
732
- cut = 70
733
- text = text[:cut].strip() + "\n" + text[cut:].strip()
734
 
735
- return text
736
 
737
- # ----------------------------
738
- # AUDIO CONVERSION (optimized)
739
- # ----------------------------
740
- def audio_to_int16_np(audio):
741
- if isinstance(audio, torch.Tensor):
742
- audio = audio.detach().cpu()
743
- audio = torch.clamp(audio, -1.0, 1.0)
744
- return (audio * 32767.0).to(torch.int16).numpy()
745
 
746
- audio = np.asarray(audio, dtype=np.float32)
747
- audio = np.clip(audio, -1.0, 1.0)
748
- return (audio * 32767.0).astype(np.int16)
749
 
750
- def audio_to_pcm_bytes(audio) -> bytes:
751
- return audio_to_int16_np(audio).tobytes()
752
 
753
- # ----------------------------
754
- # GENERATION WITH TIMEOUT
755
- # ----------------------------
756
- def kokoro_generator_full(text: str, voice_code: str, speed: float):
757
- """
758
- Generate audio chunks from text using Kokoro pipeline.
759
- Yields audio tensors for each segment.
760
- """
761
- lang_code = voice_to_lang_code(voice_code)
762
- pipeline = PIPELINES[lang_code]
763
- text = inject_newlines_for_fast_stream(text)
764
 
765
- if not text:
766
- return
767
 
768
- chunk_count = 0
769
- start_time = time.time()
770
 
771
- try:
772
- with torch.inference_mode():
773
- generator = pipeline(
774
- text,
775
- voice=voice_code,
776
- speed=float(speed),
777
- split_pattern=r"\n+",
778
- )
779
- for _, _, audio in generator:
780
- chunk_count += 1
781
- elapsed = time.time() - start_time
782
 
783
- # Timeout protection
784
- if elapsed > GENERATION_TIMEOUT_SECONDS:
785
- print(f"โš ๏ธ Generation timeout after {elapsed:.1f}s, {chunk_count} chunks")
786
- break
787
 
788
- yield audio
789
 
790
- print(f"โœ… Generated {chunk_count} chunks in {time.time() - start_time:.2f}s")
791
 
792
- except Exception as e:
793
- print(f"โŒ Generation error: {e}")
794
- traceback.print_exc()
795
- finally:
796
- # Periodic garbage collection to prevent memory buildup
797
- if chunk_count > 10:
798
- gc.collect()
799
 
800
- # ----------------------------
801
- # WARMUP (preload models)
802
- # ----------------------------
803
- def warmup():
804
- try:
805
- t0 = time.time()
806
- for _ in kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0):
807
- break
808
- print(f"โœ… WARMUP DONE in {time.time() - t0:.2f}s")
809
- except Exception as e:
810
- print(f"โš ๏ธ WARMUP FAILED: {e}")
811
- traceback.print_exc()
812
 
813
- # ----------------------------
814
- # GRADIO UI STREAM
815
- # ----------------------------
816
- def gradio_stream(text, voice_name, speed):
817
- voice_code = VOICE_CHOICES.get(voice_name, voice_name)
818
- text = normalize_text(text)
819
 
820
- i = 0
821
- t0 = time.time()
822
- for audio in kokoro_generator_full(text, voice_code, speed):
823
- if i == 0:
824
- print(f"โšก UI first audio in {time.time() - t0:.2f}s")
825
- i += 1
826
- yield 24000, audio_to_int16_np(audio)
827
 
828
- # ----------------------------
829
- # FASTAPI WEBSOCKET ENGINE
830
- # ----------------------------
831
- api = FastAPI()
832
 
833
- # Use multiple workers for parallel inference
834
- INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
835
- INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
836
 
837
- async def audio_engine_loop():
838
- """
839
- Main audio processing loop.
840
- Pulls requests from queue and streams audio back to clients.
841
- """
842
- print(f"โšก API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
843
- loop = asyncio.get_running_loop()
844
 
845
- while True:
846
- try:
847
- ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
848
- except Exception as e:
849
- print(f"โš ๏ธ Queue get error: {e}")
850
- continue
851
 
852
- # Skip dead clients early
853
- try:
854
- if ws.client_state.value > 1:
855
- print("โญ๏ธ Skipping dead client")
856
- continue
857
- except Exception:
858
- continue
859
 
860
- frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
861
- generation_id = id(ws)
862
 
863
- def _worker():
864
- """Worker thread for audio generation."""
865
- chunk_count = 0
866
- start_time = time.time()
867
 
868
- try:
869
- print(f"๐Ÿ”Š [{generation_id}] Starting TTS: {text[:50]}...")
870
 
871
- for audio in kokoro_generator_full(text, voice_code, speed):
872
- b = audio_to_pcm_bytes(audio)
873
- chunk_count += 1
874
 
875
- if chunk_count == 1:
876
- print(f"โšก [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
877
 
878
- # Backpressure with TIMEOUT to prevent infinite hang
879
- attempts = 0
880
- max_attempts = BACKPRESSURE_TIMEOUT_MS # 10 seconds at 1ms/attempt
881
 
882
- while attempts < max_attempts:
883
- try:
884
- loop.call_soon_threadsafe(frame_q.put_nowait, b)
885
- break
886
- except asyncio.QueueFull:
887
- time.sleep(0.001)
888
- attempts += 1
889
- else:
890
- # Timeout reached - client too slow or disconnected
891
- print(f"โš ๏ธ [{generation_id}] Backpressure timeout after {attempts}ms - aborting")
892
- break
893
 
894
- # Send completion signal
895
- loop.call_soon_threadsafe(frame_q.put_nowait, None)
896
- print(f"โœ… [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
897
 
898
- except Exception as e:
899
- print(f"โŒ [{generation_id}] Worker error: {e}")
900
- traceback.print_exc()
901
- try:
902
- loop.call_soon_threadsafe(frame_q.put_nowait, None)
903
- except Exception:
904
- pass
905
 
906
- # Submit to executor
907
- INFERENCE_EXECUTOR.submit(_worker)
908
 
909
- # Stream frames to client
910
- first_sent = False
911
- started = time.time()
912
- frames_sent = 0
913
 
914
- while True:
915
- try:
916
- # Timeout on frame retrieval to prevent infinite hang
917
- frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
918
- except asyncio.TimeoutError:
919
- print(f"โš ๏ธ [{generation_id}] Frame queue timeout - no data for 30s")
920
- break
921
 
922
- if frame is None:
923
- break
924
 
925
- # Check client still alive
926
- try:
927
- if ws.client_state.value > 1:
928
- print(f"โญ๏ธ [{generation_id}] Client disconnected mid-stream")
929
- break
930
- except Exception:
931
- break
932
 
933
- try:
934
- await ws.send_bytes(frame)
935
- frames_sent += 1
936
 
937
- if not first_sent:
938
- print(f"โšก [{generation_id}] First audio sent in {time.time() - started:.2f}s")
939
- first_sent = True
940
- except Exception as e:
941
- print(f"โš ๏ธ [{generation_id}] Send failed: {e}")
942
- break
943
 
944
- print(f"๐Ÿ“ค [{generation_id}] Streaming complete: {frames_sent} frames sent")
945
 
946
- @api.on_event("startup")
947
- async def startup():
948
- loop = asyncio.get_running_loop()
949
 
950
- # Warmup in executor to not block startup
951
- await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
952
 
953
- # Start the audio engine loop
954
- asyncio.create_task(audio_engine_loop())
955
 
956
- print("๐Ÿš€ Server ready!")
957
 
958
- @api.websocket("/ws/audio")
959
- async def websocket_endpoint(ws: WebSocket):
960
- await ws.accept()
961
 
962
- voice_code = "af_bella"
963
- speed = 1.0
964
- client_id = id(ws)
965
 
966
- print(f"โœ… [{client_id}] Client connected: {ws.client}")
967
 
968
- async def keep_alive():
969
- """Send periodic pings to keep connection alive."""
970
- while True:
971
- try:
972
- await asyncio.sleep(15)
973
- await ws.send_json({"type": "ping"})
974
- except Exception:
975
- break
976
 
977
- heartbeat_task = asyncio.create_task(keep_alive())
978
 
979
- try:
980
- while True:
981
- try:
982
- data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
983
- except asyncio.TimeoutError:
984
- print(f"โฑ๏ธ [{client_id}] Connection timeout - no messages for 120s")
985
- break
986
- except WebSocketDisconnect:
987
- print(f"โŒ [{client_id}] Client disconnected cleanly")
988
- break
989
- except Exception as e:
990
- print(f"โš ๏ธ [{client_id}] Connection error: {e}")
991
- break
992
 
993
- # Handle config updates
994
- if "config" in data:
995
- voice_name = data.get("voice", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella")
996
- voice_code = VOICE_CHOICES.get(voice_name, voice_name)
997
- speed = float(data.get("speed", speed))
998
- print(f"๐ŸŽ›๏ธ [{client_id}] Config: voice={voice_code}, speed={speed}")
999
 
1000
- # Handle text-to-speech request
1001
- if "text" in data:
1002
- text = normalize_text(data.get("text", ""))
1003
- if text.strip():
1004
- print(f"๐Ÿ“ฅ [{client_id}] TTS request: {text[:50]}...")
1005
- await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
1006
 
1007
- # Handle flush (no-op for now, could clear queue)
1008
- if "flush" in data:
1009
- pass
1010
 
1011
- finally:
1012
- heartbeat_task.cancel()
1013
- print(f"๐Ÿ‘‹ [{client_id}] Connection closed")
1014
 
1015
- # ----------------------------
1016
- # HEALTH CHECK ENDPOINT
1017
- # ----------------------------
1018
- @api.get("/health")
1019
- async def health_check():
1020
- return {
1021
- "status": "healthy",
1022
- "workers": WORKER_COUNT,
1023
- "queue_size": INFERENCE_QUEUE.qsize(),
1024
- }
1025
 
1026
- # ----------------------------
1027
- # GRADIO APP
1028
- # ----------------------------
1029
- with gr.Blocks(title="Kokoro TTS") as app:
1030
- gr.Markdown("## โšก Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
1031
- gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
1032
 
1033
- with gr.Row():
1034
- with gr.Column():
1035
- text_in = gr.Textbox(
1036
- label="Input Text",
1037
- lines=3,
1038
- value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
1039
- )
1040
- voice_in = gr.Dropdown(
1041
- list(VOICE_CHOICES.keys()),
1042
- value="๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella",
1043
- label="Voice",
1044
- )
1045
- speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
1046
- btn = gr.Button("Generate", variant="primary")
1047
- with gr.Column():
1048
- audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
1049
 
1050
- btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
1051
 
1052
- final_app = gr.mount_gradio_app(api, app, path="/")
1053
 
1054
- if __name__ == "__main__":
1055
- uvicorn.run(
1056
- final_app,
1057
- host="0.0.0.0",
1058
- port=7860,
1059
- workers=1, # Single process, multiple threads
1060
- log_level="info",
1061
- )
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  #OLD KOKORO CHATGPT CODE
298
+ import os
299
+ import re
300
+ import time
301
+ import asyncio
302
+ from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
303
 
304
+ import numpy as np
305
+ import gradio as gr
306
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
307
+ import uvicorn
308
 
309
+ import torch
310
+ from kokoro import KPipeline
 
 
 
 
311
 
312
+ # ----------------------------
313
+ # HARD LIMIT CPU THREADS (2 vCPU box)
314
+ # ----------------------------
315
+ os.environ.setdefault("OMP_NUM_THREADS", "2")
316
+ os.environ.setdefault("MKL_NUM_THREADS", "2")
317
+ os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
318
 
319
+ try:
320
+ torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
321
+ torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
322
+ except Exception:
323
+ pass
 
324
 
325
+ # Optional: uvloop for faster event loop on HF Linux
326
+ try:
327
+ import uvloop # type: ignore
328
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
329
+ except Exception:
330
+ pass
331
 
332
+ print("๐Ÿš€ BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
+ # ----------------------------
335
+ # VOICES
336
+ # ----------------------------
337
+ VOICE_CHOICES = {
338
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Heart": "af_heart", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella": "af_bella", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nicole": "af_nicole",
339
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Aoede": "af_aoede", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Kore": "af_kore", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sarah": "af_sarah",
340
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nova": "af_nova", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sky": "af_sky", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Alloy": "af_alloy",
341
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Jessica": "af_jessica", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ River": "af_river", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Michael": "am_michael",
342
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Fenrir": "am_fenrir", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Puck": "am_puck", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Echo": "am_echo",
343
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Eric": "am_eric", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Liam": "am_liam", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Onyx": "am_onyx",
344
+ "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Santa": "am_santa", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Adam": "am_adam", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Emma": "bf_emma",
345
+ "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Isabella": "bf_isabella", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Alice": "bf_alice", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Lily": "bf_lily",
346
+ "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน George": "bm_george", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Fable": "bm_fable", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Lewis": "bm_lewis",
347
+ "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Daniel": "bm_daniel",
348
+ }
349
 
350
+ def voice_to_lang_code(voice_code: str) -> str:
351
+ if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
352
+ return "b" # British
353
+ return "a" # American
 
 
 
354
 
355
+ # ----------------------------
356
+ # PIPELINES (keep hot in RAM)
357
+ # ----------------------------
358
+ PIPELINES = {
359
+ "a": KPipeline(lang_code="a"),
360
+ "b": KPipeline(lang_code="b"),
361
+ }
362
 
363
+ # ----------------------------
364
+ # TEXT NORMALIZATION (matches your pasted official docs)
365
+ # ----------------------------
366
+ def normalize_text(text: str) -> str:
367
+ if not text:
368
+ return ""
369
+ return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
370
 
371
+ # ----------------------------
372
+ # LOW LATENCY SEGMENTATION
373
+ # One pipeline call per request.
374
+ # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
375
+ # We also force a small first segment for fast first audio.
376
+ # ----------------------------
377
+ _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
378
 
379
+ def inject_newlines_for_fast_stream(text: str) -> str:
380
+ text = normalize_text(text).strip()
381
+ if not text:
382
+ return ""
383
 
384
+ # Sentence boundaries -> newline so official split_pattern can segment
385
+ text = _SENT_BOUNDARY.sub(r"\1\n", text)
386
 
387
+ # Also split on existing multi-newlines
388
+ text = re.sub(r"\n{3,}", "\n\n", text)
 
 
 
 
389
 
390
+ # Guarantee a small first segment for low time-to-first-audio
391
+ if "\n" not in text and len(text) > 90:
392
+ cut = text.rfind(" ", 0, 70)
393
+ if cut < 35:
394
+ cut = 70
395
+ text = text[:cut].strip() + "\n" + text[cut:].strip()
396
 
397
+ return text
 
 
 
 
 
 
 
398
 
399
+ # ----------------------------
400
+ # AUDIO CONVERSION (fast, safe)
401
+ # ----------------------------
402
+ def audio_to_int16_np(audio):
403
+ if isinstance(audio, torch.Tensor):
404
+ audio = audio.detach().cpu()
405
+ audio = torch.clamp(audio, -1.0, 1.0)
406
+ return (audio * 32767.0).to(torch.int16).numpy()
407
 
408
+ audio = np.asarray(audio)
409
+ audio = np.clip(audio, -1.0, 1.0)
410
+ return (audio * 32767.0).astype(np.int16)
411
 
412
+ def audio_to_pcm_bytes(audio) -> bytes:
413
+ return audio_to_int16_np(audio).tobytes()
 
 
 
 
 
 
414
 
415
+ # ----------------------------
416
+ # OFFICIAL GENERATION PATH (single pipeline call)
417
+ # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
418
+ # ----------------------------
419
+ def kokoro_generator_full(text: str, voice_code: str, speed: float):
420
+ lang_code = voice_to_lang_code(voice_code)
421
+ pipeline = PIPELINES[lang_code]
422
+ text = inject_newlines_for_fast_stream(text)
423
 
424
+ if not text:
425
+ return
 
 
 
 
 
 
 
426
 
427
+ with torch.inference_mode():
428
+ generator = pipeline(
429
+ text,
430
+ voice=voice_code,
431
+ speed=float(speed),
432
+ split_pattern=r"\n+",
433
+ )
434
+ for _, _, audio in generator:
435
+ yield audio
 
 
436
 
437
+ # ----------------------------
438
+ # WARMUP (pay cold-start cost at boot)
439
+ # ----------------------------
440
+ def warmup():
441
+ try:
442
+ t0 = time.time()
443
+ for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
444
+ break
445
+ print(f"โœ… WARMUP DONE in {time.time() - t0:.2f}s")
446
+ except Exception as e:
447
+ print(f"โš ๏ธ WARMUP FAILED: {e}")
448
 
449
+ # ----------------------------
450
+ # GRADIO UI STREAM
451
+ # ----------------------------
452
+ def gradio_stream(text, voice_name, speed):
453
+ voice_code = VOICE_CHOICES.get(voice_name, voice_name)
454
+ text = normalize_text(text)
 
455
 
456
+ i = 0
457
+ t0 = time.time()
458
+ for audio in kokoro_generator_full(text, voice_code, speed):
459
+ if i == 0:
460
+ print(f"โšก UI first audio in {time.time() - t0:.2f}s")
461
+ i += 1
462
+ yield 24000, audio_to_int16_np(audio)
463
 
464
+ # ----------------------------
465
+ # FASTAPI WS ENGINE
466
+ # Single worker thread for actual generation.
467
+ # Stream frames to client as soon as they exist.
468
+ # No buffering a full list before sending.
469
+ # ----------------------------
470
+ api = FastAPI()
471
 
472
+ INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
473
+ INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
 
474
 
475
+ async def audio_engine_loop():
476
+ print("โšก API AUDIO PIPELINE STARTED")
477
+ loop = asyncio.get_running_loop()
478
 
479
+ while True:
480
+ ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
 
481
 
482
+ # Skip dead clients early
483
+ if ws.client_state.value > 1:
484
+ continue
485
 
486
+ frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
+ def _worker():
489
+ try:
490
+ for audio in kokoro_generator_full(text, voice_code, speed):
491
+ b = audio_to_pcm_bytes(audio)
492
+ # backpressure aware
493
+ while True:
494
+ try:
495
+ loop.call_soon_threadsafe(frame_q.put_nowait, b)
496
+ break
497
+ except Exception:
498
+ time.sleep(0.001)
499
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
500
+ except Exception as e:
501
+ print(f"API Worker Error: {e}")
502
+ try:
503
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
504
+ except Exception:
505
+ pass
506
 
507
+ INFERENCE_EXECUTOR.submit(_worker)
 
508
 
509
+ first_sent = False
510
+ started = time.time()
 
 
511
 
512
+ while True:
513
+ frame = await frame_q.get()
514
+ if frame is None:
515
+ break
516
 
517
+ if ws.client_state.value > 1:
518
+ break
 
 
 
 
 
519
 
520
+ try:
521
+ await ws.send_bytes(frame)
522
+ if not first_sent:
523
+ print(f"โšก API first audio in {time.time() - started:.2f}s")
524
+ first_sent = True
525
+ except Exception:
526
+ break
527
 
528
+ @api.on_event("startup")
529
+ async def startup():
530
+ loop = asyncio.get_running_loop()
531
+ await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
532
+ asyncio.create_task(audio_engine_loop())
533
 
534
+ @api.websocket("/ws/audio")
535
+ async def websocket_endpoint(ws: WebSocket):
536
+ await ws.accept()
537
 
538
+ voice_code = "af_bella"
539
+ speed = 1.0
540
 
541
+ print(f"โœ… Client connected: {ws.client}")
 
 
 
 
 
 
542
 
543
+ async def keep_alive():
544
+ while True:
545
+ try:
546
+ await asyncio.sleep(15)
547
+ await ws.send_json({"type": "ping"})
548
+ except Exception:
549
+ break
550
 
551
+ heartbeat_task = asyncio.create_task(keep_alive())
 
 
 
 
 
 
 
 
 
552
 
553
+ try:
554
+ while True:
555
+ try:
556
+ data = await ws.receive_json()
557
+ except WebSocketDisconnect:
558
+ print("โŒ Client disconnected cleanly")
559
+ break
560
+ except Exception as e:
561
+ print(f"โš ๏ธ Connection lost: {e}")
562
+ break
563
 
564
+ if "config" in data:
565
+ voice_name = data.get("voice", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella")
566
+ voice_code = VOICE_CHOICES.get(voice_name, voice_name)
567
+ speed = float(data.get("speed", speed))
568
 
569
+ if "text" in data:
570
+ text = normalize_text(data.get("text", ""))
571
+ if text.strip():
572
+ await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
573
 
574
+ if "flush" in data:
575
+ pass
576
 
577
+ finally:
578
+ heartbeat_task.cancel()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
 
580
+ # ----------------------------
581
+ # GRADIO APP
582
+ # ----------------------------
583
+ with gr.Blocks(title="Kokoro TTS") as app:
584
+ gr.Markdown("## โšก Kokoro-82M (Official Pipeline, Low Latency)")
585
+ with gr.Row():
586
+ with gr.Column():
587
+ text_in = gr.Textbox(
588
+ label="Input Text",
589
+ lines=3,
590
+ value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
591
+ )
592
+ voice_in = gr.Dropdown(
593
+ list(VOICE_CHOICES.keys()),
594
+ value="๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella",
595
+ label="Voice",
596
+ )
597
+ speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
598
+ btn = gr.Button("Generate", variant="primary")
599
+ with gr.Column():
600
+ audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
601
 
602
+ btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
603
 
604
+ final_app = gr.mount_gradio_app(api, app, path="/")
605
+
606
+ if __name__ == "__main__":
607
+ uvicorn.run(final_app, host="0.0.0.0", port=7860)
608
  #claude code
609
+ # """
610
+ # Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
611
+ # ============================================================
612
+ # Fixes:
613
+ # - Backpressure loop timeout prevents worker thread hang
614
+ # - Parallel inference workers (2, one per vCPU)
615
+ # - Proper error handling with traceback logging
616
+ # - Generation timeout to prevent infinite hangs
617
+ # - Memory-optimized with periodic garbage collection
618
+ # - Aggressive batching for throughput
619
+ # """
620
 
621
+ # import os
622
+ # import re
623
+ # import gc
624
+ # import time
625
+ # import asyncio
626
+ # import traceback
627
+ # from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
628
 
629
+ # import numpy as np
630
+ # import gradio as gr
631
+ # from fastapi import FastAPI, WebSocket, WebSocketDisconnect
632
+ # import uvicorn
633
 
634
+ # import torch
635
+ # from kokoro import KPipeline
636
 
637
+ # # ----------------------------
638
+ # # MAXIMIZE 2 vCPU UTILIZATION
639
+ # # ----------------------------
640
+ # CPU_COUNT = 2
641
+ # os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
642
+ # os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
643
+ # os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
644
+ # os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
645
 
646
+ # try:
647
+ # torch.set_num_threads(CPU_COUNT)
648
+ # torch.set_num_interop_threads(CPU_COUNT)
649
+ # except Exception:
650
+ # pass
651
 
652
+ # # Use uvloop for faster async on Linux
653
+ # try:
654
+ # import uvloop
655
+ # asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
656
+ # print("โœ… Using uvloop for faster async")
657
+ # except ImportError:
658
+ # print("โš ๏ธ uvloop not available, using default event loop")
659
 
660
+ # print(f"๐Ÿš€ BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
661
 
662
+ # # ----------------------------
663
+ # # CONFIGURATION
664
+ # # ----------------------------
665
+ # GENERATION_TIMEOUT_SECONDS = 60 # Max time for a single TTS generation
666
+ # BACKPRESSURE_TIMEOUT_MS = 10000 # Max wait for queue space (10 seconds)
667
+ # WORKER_COUNT = 2 # One per vCPU for parallel processing
668
+ # QUEUE_MAXSIZE = 12 # Buffer more frames for smoother streaming
669
 
670
+ # # ----------------------------
671
+ # # VOICES
672
+ # # ----------------------------
673
+ # VOICE_CHOICES = {
674
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Heart": "af_heart", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella": "af_bella", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nicole": "af_nicole",
675
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Aoede": "af_aoede", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Kore": "af_kore", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sarah": "af_sarah",
676
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Nova": "af_nova", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Sky": "af_sky", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Alloy": "af_alloy",
677
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Jessica": "af_jessica", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ River": "af_river", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Michael": "am_michael",
678
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Fenrir": "am_fenrir", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Puck": "am_puck", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Echo": "am_echo",
679
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Eric": "am_eric", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Liam": "am_liam", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Onyx": "am_onyx",
680
+ # "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Santa": "am_santa", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšน Adam": "am_adam", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Emma": "bf_emma",
681
+ # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Isabella": "bf_isabella", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Alice": "bf_alice", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšบ Lily": "bf_lily",
682
+ # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน George": "bm_george", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Fable": "bm_fable", "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Lewis": "bm_lewis",
683
+ # "๐Ÿ‡ฌ๐Ÿ‡ง ๐Ÿšน Daniel": "bm_daniel",
684
+ # }
685
 
686
+ # def voice_to_lang_code(voice_code: str) -> str:
687
+ # if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
688
+ # return "b" # British
689
+ # return "a" # American
690
 
691
+ # # ----------------------------
692
+ # # PIPELINES (hot in RAM - uses ~2GB per pipeline)
693
+ # # With 16GB RAM we can comfortably hold both
694
+ # # ----------------------------
695
+ # print("๐Ÿ“ฆ Loading Kokoro pipelines into RAM...")
696
+ # PIPELINES = {
697
+ # "a": KPipeline(lang_code="a"),
698
+ # "b": KPipeline(lang_code="b"),
699
+ # }
700
+ # print(f"โœ… Pipelines loaded. Memory usage: ~4GB for models")
701
 
702
+ # # ----------------------------
703
+ # # TEXT NORMALIZATION
704
+ # # ----------------------------
705
+ # def normalize_text(text: str) -> str:
706
+ # if not text:
707
+ # return ""
708
+ # # Kokoro pronunciation helper
709
+ # text = text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
710
+ # return text
711
 
712
+ # # ----------------------------
713
+ # # FAST SEGMENTATION FOR STREAMING
714
+ # # ----------------------------
715
+ # _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
716
 
717
+ # def inject_newlines_for_fast_stream(text: str) -> str:
718
+ # text = normalize_text(text).strip()
719
+ # if not text:
720
+ # return ""
721
 
722
+ # # Sentence boundaries -> newline for pipeline segmentation
723
+ # text = _SENT_BOUNDARY.sub(r"\1\n", text)
724
 
725
+ # # Normalize excessive newlines
726
+ # text = re.sub(r"\n{3,}", "\n\n", text)
727
 
728
+ # # Guarantee a small first segment for low time-to-first-audio
729
+ # if "\n" not in text and len(text) > 90:
730
+ # cut = text.rfind(" ", 0, 70)
731
+ # if cut < 35:
732
+ # cut = 70
733
+ # text = text[:cut].strip() + "\n" + text[cut:].strip()
734
 
735
+ # return text
736
 
737
+ # # ----------------------------
738
+ # # AUDIO CONVERSION (optimized)
739
+ # # ----------------------------
740
+ # def audio_to_int16_np(audio):
741
+ # if isinstance(audio, torch.Tensor):
742
+ # audio = audio.detach().cpu()
743
+ # audio = torch.clamp(audio, -1.0, 1.0)
744
+ # return (audio * 32767.0).to(torch.int16).numpy()
745
 
746
+ # audio = np.asarray(audio, dtype=np.float32)
747
+ # audio = np.clip(audio, -1.0, 1.0)
748
+ # return (audio * 32767.0).astype(np.int16)
749
 
750
+ # def audio_to_pcm_bytes(audio) -> bytes:
751
+ # return audio_to_int16_np(audio).tobytes()
752
 
753
+ # # ----------------------------
754
+ # # GENERATION WITH TIMEOUT
755
+ # # ----------------------------
756
+ # def kokoro_generator_full(text: str, voice_code: str, speed: float):
757
+ # """
758
+ # Generate audio chunks from text using Kokoro pipeline.
759
+ # Yields audio tensors for each segment.
760
+ # """
761
+ # lang_code = voice_to_lang_code(voice_code)
762
+ # pipeline = PIPELINES[lang_code]
763
+ # text = inject_newlines_for_fast_stream(text)
764
 
765
+ # if not text:
766
+ # return
767
 
768
+ # chunk_count = 0
769
+ # start_time = time.time()
770
 
771
+ # try:
772
+ # with torch.inference_mode():
773
+ # generator = pipeline(
774
+ # text,
775
+ # voice=voice_code,
776
+ # speed=float(speed),
777
+ # split_pattern=r"\n+",
778
+ # )
779
+ # for _, _, audio in generator:
780
+ # chunk_count += 1
781
+ # elapsed = time.time() - start_time
782
 
783
+ # # Timeout protection
784
+ # if elapsed > GENERATION_TIMEOUT_SECONDS:
785
+ # print(f"โš ๏ธ Generation timeout after {elapsed:.1f}s, {chunk_count} chunks")
786
+ # break
787
 
788
+ # yield audio
789
 
790
+ # print(f"โœ… Generated {chunk_count} chunks in {time.time() - start_time:.2f}s")
791
 
792
+ # except Exception as e:
793
+ # print(f"โŒ Generation error: {e}")
794
+ # traceback.print_exc()
795
+ # finally:
796
+ # # Periodic garbage collection to prevent memory buildup
797
+ # if chunk_count > 10:
798
+ # gc.collect()
799
 
800
+ # # ----------------------------
801
+ # # WARMUP (preload models)
802
+ # # ----------------------------
803
+ # def warmup():
804
+ # try:
805
+ # t0 = time.time()
806
+ # for _ in kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0):
807
+ # break
808
+ # print(f"โœ… WARMUP DONE in {time.time() - t0:.2f}s")
809
+ # except Exception as e:
810
+ # print(f"โš ๏ฟฝ๏ฟฝ๏ฟฝ WARMUP FAILED: {e}")
811
+ # traceback.print_exc()
812
 
813
+ # # ----------------------------
814
+ # # GRADIO UI STREAM
815
+ # # ----------------------------
816
+ # def gradio_stream(text, voice_name, speed):
817
+ # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
818
+ # text = normalize_text(text)
819
 
820
+ # i = 0
821
+ # t0 = time.time()
822
+ # for audio in kokoro_generator_full(text, voice_code, speed):
823
+ # if i == 0:
824
+ # print(f"โšก UI first audio in {time.time() - t0:.2f}s")
825
+ # i += 1
826
+ # yield 24000, audio_to_int16_np(audio)
827
 
828
+ # # ----------------------------
829
+ # # FASTAPI WEBSOCKET ENGINE
830
+ # # ----------------------------
831
+ # api = FastAPI()
832
 
833
+ # # Use multiple workers for parallel inference
834
+ # INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
835
+ # INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
836
 
837
+ # async def audio_engine_loop():
838
+ # """
839
+ # Main audio processing loop.
840
+ # Pulls requests from queue and streams audio back to clients.
841
+ # """
842
+ # print(f"โšก API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
843
+ # loop = asyncio.get_running_loop()
844
 
845
+ # while True:
846
+ # try:
847
+ # ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
848
+ # except Exception as e:
849
+ # print(f"โš ๏ธ Queue get error: {e}")
850
+ # continue
851
 
852
+ # # Skip dead clients early
853
+ # try:
854
+ # if ws.client_state.value > 1:
855
+ # print("โญ๏ธ Skipping dead client")
856
+ # continue
857
+ # except Exception:
858
+ # continue
859
 
860
+ # frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
861
+ # generation_id = id(ws)
862
 
863
+ # def _worker():
864
+ # """Worker thread for audio generation."""
865
+ # chunk_count = 0
866
+ # start_time = time.time()
867
 
868
+ # try:
869
+ # print(f"๐Ÿ”Š [{generation_id}] Starting TTS: {text[:50]}...")
870
 
871
+ # for audio in kokoro_generator_full(text, voice_code, speed):
872
+ # b = audio_to_pcm_bytes(audio)
873
+ # chunk_count += 1
874
 
875
+ # if chunk_count == 1:
876
+ # print(f"โšก [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
877
 
878
+ # # Backpressure with TIMEOUT to prevent infinite hang
879
+ # attempts = 0
880
+ # max_attempts = BACKPRESSURE_TIMEOUT_MS # 10 seconds at 1ms/attempt
881
 
882
+ # while attempts < max_attempts:
883
+ # try:
884
+ # loop.call_soon_threadsafe(frame_q.put_nowait, b)
885
+ # break
886
+ # except asyncio.QueueFull:
887
+ # time.sleep(0.001)
888
+ # attempts += 1
889
+ # else:
890
+ # # Timeout reached - client too slow or disconnected
891
+ # print(f"โš ๏ธ [{generation_id}] Backpressure timeout after {attempts}ms - aborting")
892
+ # break
893
 
894
+ # # Send completion signal
895
+ # loop.call_soon_threadsafe(frame_q.put_nowait, None)
896
+ # print(f"โœ… [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
897
 
898
+ # except Exception as e:
899
+ # print(f"โŒ [{generation_id}] Worker error: {e}")
900
+ # traceback.print_exc()
901
+ # try:
902
+ # loop.call_soon_threadsafe(frame_q.put_nowait, None)
903
+ # except Exception:
904
+ # pass
905
 
906
+ # # Submit to executor
907
+ # INFERENCE_EXECUTOR.submit(_worker)
908
 
909
+ # # Stream frames to client
910
+ # first_sent = False
911
+ # started = time.time()
912
+ # frames_sent = 0
913
 
914
+ # while True:
915
+ # try:
916
+ # # Timeout on frame retrieval to prevent infinite hang
917
+ # frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
918
+ # except asyncio.TimeoutError:
919
+ # print(f"โš ๏ธ [{generation_id}] Frame queue timeout - no data for 30s")
920
+ # break
921
 
922
+ # if frame is None:
923
+ # break
924
 
925
+ # # Check client still alive
926
+ # try:
927
+ # if ws.client_state.value > 1:
928
+ # print(f"โญ๏ธ [{generation_id}] Client disconnected mid-stream")
929
+ # break
930
+ # except Exception:
931
+ # break
932
 
933
+ # try:
934
+ # await ws.send_bytes(frame)
935
+ # frames_sent += 1
936
 
937
+ # if not first_sent:
938
+ # print(f"โšก [{generation_id}] First audio sent in {time.time() - started:.2f}s")
939
+ # first_sent = True
940
+ # except Exception as e:
941
+ # print(f"โš ๏ธ [{generation_id}] Send failed: {e}")
942
+ # break
943
 
944
+ # print(f"๐Ÿ“ค [{generation_id}] Streaming complete: {frames_sent} frames sent")
945
 
946
+ # @api.on_event("startup")
947
+ # async def startup():
948
+ # loop = asyncio.get_running_loop()
949
 
950
+ # # Warmup in executor to not block startup
951
+ # await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
952
 
953
+ # # Start the audio engine loop
954
+ # asyncio.create_task(audio_engine_loop())
955
 
956
+ # print("๐Ÿš€ Server ready!")
957
 
958
+ # @api.websocket("/ws/audio")
959
+ # async def websocket_endpoint(ws: WebSocket):
960
+ # await ws.accept()
961
 
962
+ # voice_code = "af_bella"
963
+ # speed = 1.0
964
+ # client_id = id(ws)
965
 
966
+ # print(f"โœ… [{client_id}] Client connected: {ws.client}")
967
 
968
+ # async def keep_alive():
969
+ # """Send periodic pings to keep connection alive."""
970
+ # while True:
971
+ # try:
972
+ # await asyncio.sleep(15)
973
+ # await ws.send_json({"type": "ping"})
974
+ # except Exception:
975
+ # break
976
 
977
+ # heartbeat_task = asyncio.create_task(keep_alive())
978
 
979
+ # try:
980
+ # while True:
981
+ # try:
982
+ # data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
983
+ # except asyncio.TimeoutError:
984
+ # print(f"โฑ๏ธ [{client_id}] Connection timeout - no messages for 120s")
985
+ # break
986
+ # except WebSocketDisconnect:
987
+ # print(f"โŒ [{client_id}] Client disconnected cleanly")
988
+ # break
989
+ # except Exception as e:
990
+ # print(f"โš ๏ธ [{client_id}] Connection error: {e}")
991
+ # break
992
 
993
+ # # Handle config updates
994
+ # if "config" in data:
995
+ # voice_name = data.get("voice", "๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella")
996
+ # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
997
+ # speed = float(data.get("speed", speed))
998
+ # print(f"๐ŸŽ›๏ธ [{client_id}] Config: voice={voice_code}, speed={speed}")
999
 
1000
+ # # Handle text-to-speech request
1001
+ # if "text" in data:
1002
+ # text = normalize_text(data.get("text", ""))
1003
+ # if text.strip():
1004
+ # print(f"๐Ÿ“ฅ [{client_id}] TTS request: {text[:50]}...")
1005
+ # await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
1006
 
1007
+ # # Handle flush (no-op for now, could clear queue)
1008
+ # if "flush" in data:
1009
+ # pass
1010
 
1011
+ # finally:
1012
+ # heartbeat_task.cancel()
1013
+ # print(f"๐Ÿ‘‹ [{client_id}] Connection closed")
1014
 
1015
+ # # ----------------------------
1016
+ # # HEALTH CHECK ENDPOINT
1017
+ # # ----------------------------
1018
+ # @api.get("/health")
1019
+ # async def health_check():
1020
+ # return {
1021
+ # "status": "healthy",
1022
+ # "workers": WORKER_COUNT,
1023
+ # "queue_size": INFERENCE_QUEUE.qsize(),
1024
+ # }
1025
 
1026
+ # # ----------------------------
1027
+ # # GRADIO APP
1028
+ # # ----------------------------
1029
+ # with gr.Blocks(title="Kokoro TTS") as app:
1030
+ # gr.Markdown("## โšก Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
1031
+ # gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
1032
 
1033
+ # with gr.Row():
1034
+ # with gr.Column():
1035
+ # text_in = gr.Textbox(
1036
+ # label="Input Text",
1037
+ # lines=3,
1038
+ # value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
1039
+ # )
1040
+ # voice_in = gr.Dropdown(
1041
+ # list(VOICE_CHOICES.keys()),
1042
+ # value="๐Ÿ‡บ๐Ÿ‡ธ ๐Ÿšบ Bella",
1043
+ # label="Voice",
1044
+ # )
1045
+ # speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
1046
+ # btn = gr.Button("Generate", variant="primary")
1047
+ # with gr.Column():
1048
+ # audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
1049
 
1050
+ # btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
1051
 
1052
+ # final_app = gr.mount_gradio_app(api, app, path="/")
1053
 
1054
+ # if __name__ == "__main__":
1055
+ # uvicorn.run(
1056
+ # final_app,
1057
+ # host="0.0.0.0",
1058
+ # port=7860,
1059
+ # workers=1, # Single process, multiple threads
1060
+ # log_level="info",
1061
+ # )