auralodyssey committed on
Commit
a649960
·
verified ·
1 Parent(s): 20fe4d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -203
app.py CHANGED
@@ -298,244 +298,280 @@ import os
298
  import re
299
  import time
300
  import asyncio
301
- from concurrent.futures import ThreadPoolExecutor
302
-
303
  import numpy as np
304
  import gradio as gr
 
 
305
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 
306
  import uvicorn
307
 
308
- import torch
309
  from kokoro import KPipeline
310
 
311
- # ----------------------------
312
- # HARD LIMIT CPU THREADS (2 vCPU box)
313
- # ----------------------------
 
314
  os.environ.setdefault("OMP_NUM_THREADS", "2")
315
  os.environ.setdefault("MKL_NUM_THREADS", "2")
316
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
317
 
318
- try:
319
- torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
320
- torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
321
- except Exception:
322
- pass
323
 
324
- # Optional: uvloop for faster event loop on HF Linux
325
- try:
326
- import uvloop # type: ignore
327
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
328
- except Exception:
329
- pass
330
 
331
- print("πŸš€ BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
332
 
333
- # ----------------------------
334
- # VOICES
335
- # ----------------------------
336
  VOICE_CHOICES = {
337
- "πŸ‡ΊπŸ‡Έ 🚺 Heart": "af_heart", "πŸ‡ΊπŸ‡Έ 🚺 Bella": "af_bella", "πŸ‡ΊπŸ‡Έ 🚺 Nicole": "af_nicole",
338
- "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede", "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore", "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
339
- "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova", "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky", "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
340
- "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica", "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river", "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
341
- "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir", "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck", "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
342
- "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric", "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam", "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
343
- "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa", "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam", "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
344
- "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella", "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice", "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
345
- "πŸ‡¬πŸ‡§ 🚹 George": "bm_george", "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable", "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
347
  }
348
 
349
- def voice_to_lang_code(voice_code: str) -> str:
350
- if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
351
- return "b" # British
352
- return "a" # American
353
-
354
- # ----------------------------
355
- # PIPELINES (keep hot in RAM)
356
- # ----------------------------
357
- PIPELINES = {
358
- "a": KPipeline(lang_code="a"),
359
- "b": KPipeline(lang_code="b"),
360
- }
361
-
362
- # ----------------------------
363
- # TEXT NORMALIZATION (matches your pasted official docs)
364
- # ----------------------------
365
- def normalize_text(text: str) -> str:
366
- if not text:
367
- return ""
368
- return text.replace("Kokoro", "[Kokoro](/kˈOkΙ™ΙΉO/)")
369
-
370
- # ----------------------------
371
- # LOW LATENCY SEGMENTATION
372
- # One pipeline call per request.
373
- # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
374
- # We also force a small first segment for fast first audio.
375
- # ----------------------------
376
- _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
377
-
378
- def inject_newlines_for_fast_stream(text: str) -> str:
379
- text = normalize_text(text).strip()
380
- if not text:
381
- return ""
382
-
383
- # Sentence boundaries -> newline so official split_pattern can segment
384
- text = _SENT_BOUNDARY.sub(r"\1\n", text)
385
-
386
- # Also split on existing multi-newlines
387
- text = re.sub(r"\n{3,}", "\n\n", text)
388
-
389
- # Guarantee a small first segment for low time-to-first-audio
390
- if "\n" not in text and len(text) > 90:
391
- cut = text.rfind(" ", 0, 70)
392
- if cut < 35:
393
- cut = 70
394
- text = text[:cut].strip() + "\n" + text[cut:].strip()
395
 
396
- return text
 
 
 
 
397
 
398
- # ----------------------------
399
- # AUDIO CONVERSION (fast, safe)
400
- # ----------------------------
401
- def audio_to_int16_np(audio):
 
402
  if isinstance(audio, torch.Tensor):
403
- audio = audio.detach().cpu()
404
- audio = torch.clamp(audio, -1.0, 1.0)
405
- return (audio * 32767.0).to(torch.int16).numpy()
406
-
407
- audio = np.asarray(audio)
408
- audio = np.clip(audio, -1.0, 1.0)
409
- return (audio * 32767.0).astype(np.int16)
410
-
411
- def audio_to_pcm_bytes(audio) -> bytes:
412
- return audio_to_int16_np(audio).tobytes()
413
-
414
- # ----------------------------
415
- # OFFICIAL GENERATION PATH (single pipeline call)
416
- # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
417
- # ----------------------------
418
- def kokoro_generator_full(text: str, voice_code: str, speed: float):
419
- lang_code = voice_to_lang_code(voice_code)
420
- pipeline = PIPELINES[lang_code]
421
- text = inject_newlines_for_fast_stream(text)
422
-
423
- if not text:
424
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
 
 
 
426
  with torch.inference_mode():
427
- generator = pipeline(
428
- text,
429
- voice=voice_code,
430
  speed=float(speed),
431
- split_pattern=r"\n+",
432
  )
433
- for _, _, audio in generator:
434
- yield audio
435
-
436
- # ----------------------------
437
- # WARMUP (pay cold-start cost at boot)
438
- # ----------------------------
 
 
 
 
 
439
  def warmup():
440
  try:
441
  t0 = time.time()
442
- for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
443
- break
444
- print(f"βœ… WARMUP DONE in {time.time() - t0:.2f}s")
445
  except Exception as e:
446
- print(f"⚠️ WARMUP FAILED: {e}")
447
-
448
- # ----------------------------
449
- # GRADIO UI STREAM
450
- # ----------------------------
451
- def gradio_stream(text, voice_name, speed):
452
- voice_code = VOICE_CHOICES.get(voice_name, voice_name)
453
- text = normalize_text(text)
454
-
455
- i = 0
456
- t0 = time.time()
457
- for audio in kokoro_generator_full(text, voice_code, speed):
458
- if i == 0:
459
- print(f"⚑ UI first audio in {time.time() - t0:.2f}s")
460
- i += 1
461
- yield 24000, audio_to_int16_np(audio)
462
-
463
- # ----------------------------
464
- # FASTAPI WS ENGINE
465
- # Single worker thread for actual generation.
466
- # Stream frames to client as soon as they exist.
467
- # No buffering a full list before sending.
468
- # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  api = FastAPI()
470
 
 
471
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
472
- INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
473
 
474
  async def audio_engine_loop():
475
  print("⚑ API AUDIO PIPELINE STARTED")
476
  loop = asyncio.get_running_loop()
477
 
478
  while True:
479
- ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
480
-
481
- # Skip dead clients early
482
- if ws.client_state.value > 1:
483
- continue
484
-
485
- frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
486
 
487
- def _worker():
488
- try:
489
- for audio in kokoro_generator_full(text, voice_code, speed):
490
- b = audio_to_pcm_bytes(audio)
491
- # backpressure aware
492
- while True:
493
- try:
494
- loop.call_soon_threadsafe(frame_q.put_nowait, b)
495
- break
496
- except Exception:
497
- time.sleep(0.001)
498
- loop.call_soon_threadsafe(frame_q.put_nowait, None)
499
- except Exception as e:
500
- print(f"API Worker Error: {e}")
501
- try:
502
- loop.call_soon_threadsafe(frame_q.put_nowait, None)
503
- except Exception:
504
- pass
505
-
506
- INFERENCE_EXECUTOR.submit(_worker)
507
-
508
- first_sent = False
509
- started = time.time()
510
 
511
- while True:
512
- frame = await frame_q.get()
513
- if frame is None:
514
- break
 
515
 
516
- if ws.client_state.value > 1:
517
- break
518
 
 
519
  try:
520
- await ws.send_bytes(frame)
521
- if not first_sent:
522
- print(f"⚑ API first audio in {time.time() - started:.2f}s")
523
- first_sent = True
524
  except Exception:
525
- break
 
 
 
526
 
527
  @api.on_event("startup")
528
  async def startup():
529
- loop = asyncio.get_running_loop()
530
- await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
531
  asyncio.create_task(audio_engine_loop())
532
 
533
  @api.websocket("/ws/audio")
534
  async def websocket_endpoint(ws: WebSocket):
535
  await ws.accept()
536
 
537
- voice_code = "af_bella"
538
  speed = 1.0
 
539
 
540
  print(f"βœ… Client connected: {ws.client}")
541
 
@@ -554,51 +590,52 @@ async def websocket_endpoint(ws: WebSocket):
554
  try:
555
  data = await ws.receive_json()
556
  except WebSocketDisconnect:
557
- print("❌ Client disconnected cleanly")
558
  break
559
- except Exception as e:
560
- print(f"⚠️ Connection lost: {e}")
561
  break
562
 
563
  if "config" in data:
564
- voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
565
- voice_code = VOICE_CHOICES.get(voice_name, voice_name)
566
  speed = float(data.get("speed", speed))
567
 
568
  if "text" in data:
569
- text = normalize_text(data.get("text", ""))
570
- if text.strip():
571
- await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
 
 
 
 
 
572
 
573
  if "flush" in data:
574
  pass
575
 
 
 
576
  finally:
577
  heartbeat_task.cancel()
578
 
579
- # ----------------------------
580
- # GRADIO APP
581
- # ----------------------------
582
  with gr.Blocks(title="Kokoro TTS") as app:
583
  gr.Markdown("## ⚑ Kokoro-82M (Official Pipeline, Low Latency)")
584
  with gr.Row():
585
  with gr.Column():
586
  text_in = gr.Textbox(
587
  label="Input Text",
588
- lines=3,
589
- value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
590
- )
591
- voice_in = gr.Dropdown(
592
- list(VOICE_CHOICES.keys()),
593
- value="πŸ‡ΊπŸ‡Έ 🚺 Bella",
594
- label="Voice",
595
  )
 
596
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
597
  btn = gr.Button("Generate", variant="primary")
598
  with gr.Column():
599
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
600
 
601
- btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
602
 
603
  final_app = gr.mount_gradio_app(api, app, path="/")
604
 
 
298
  import re
299
  import time
300
  import asyncio
301
+ import uvloop
 
302
  import numpy as np
303
  import gradio as gr
304
+ import torch
305
+
306
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
307
+ from concurrent.futures import ThreadPoolExecutor
308
  import uvicorn
309
 
310
+ # Official pipeline
311
  from kokoro import KPipeline
312
 
313
+ # -------------------------
314
+ # CPU + runtime tuning
315
+ # -------------------------
316
+ # Keep these conservative. HF CPU is usually 2 vCPU.
317
  os.environ.setdefault("OMP_NUM_THREADS", "2")
318
  os.environ.setdefault("MKL_NUM_THREADS", "2")
319
  os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
320
 
321
+ torch.set_num_threads(2)
322
+ torch.set_num_interop_threads(1)
 
 
 
323
 
324
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 
 
 
 
325
 
326
+ SAMPLE_RATE = 24000
327
 
328
+ # -------------------------
329
+ # Voices (use Kokoro voice ids)
330
+ # -------------------------
331
  VOICE_CHOICES = {
332
+ "πŸ‡ΊπŸ‡Έ 🚺 Heart": "af_heart",
333
+ "πŸ‡ΊπŸ‡Έ 🚺 Bella": "af_bella",
334
+ "πŸ‡ΊπŸ‡Έ 🚺 Nicole": "af_nicole",
335
+ "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
336
+ "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
337
+ "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
338
+ "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
339
+ "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
340
+ "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
341
+ "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
342
+ "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
343
+ "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
344
+ "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
345
+ "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
346
+ "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
347
+ "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
348
+ "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
349
+ "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
350
+ "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
351
+ "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
352
+ "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
353
+ "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
354
+ "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
355
+ "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
356
+ "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
357
+ "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
358
+ "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
359
  "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
360
  }
361
 
362
+ DEFAULT_VOICE_UI = "πŸ‡ΊπŸ‡Έ 🚺 Bella"
363
+ DEFAULT_VOICE = VOICE_CHOICES[DEFAULT_VOICE_UI]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
+ # -------------------------
366
+ # Kokoro pipeline (global)
367
+ # -------------------------
368
+ print("πŸš€ BOOTING KOKORO (OFFICIAL PIPELINE)")
369
+ PIPELINE = KPipeline(lang_code="a")
370
 
371
+ # -------------------------
372
+ # Helpers
373
+ # -------------------------
374
+ def _to_numpy_audio(audio):
375
+ # Kokoro may return a torch.Tensor or numpy array
376
  if isinstance(audio, torch.Tensor):
377
+ return audio.detach().cpu().numpy()
378
+ return np.asarray(audio)
379
+
380
+ def _float_to_int16(audio_f32):
381
+ audio_f32 = np.clip(audio_f32, -1.0, 1.0).astype(np.float32)
382
+ return (audio_f32 * 32767.0).astype(np.int16)
383
+
384
def trim_silence(audio_f32, threshold=0.01, pad=240):
    """Trim leading/trailing samples whose magnitude is <= threshold.

    Keeps `pad` samples of context on each side (~10 ms at 24 kHz).
    Returns the input unchanged when it is empty or entirely quiet.
    """
    if audio_f32.size == 0:
        return audio_f32
    loud = np.flatnonzero(np.abs(audio_f32) > threshold)
    if loud.size == 0:
        # All-silence input: nothing to anchor the trim on.
        return audio_f32
    lo = max(0, int(loud[0]) - pad)
    hi = min(len(audio_f32), int(loud[-1]) + 1 + pad)
    return audio_f32[lo:hi]
396
+
397
def crossfade_concat(a, b, overlap=1200):
    """Join two float audio arrays with a linear crossfade over `overlap` samples.

    `overlap` of 1200 is ~50 ms at 24 kHz. If either side is None the other
    is returned; if either is shorter than the overlap, a plain concat is used.
    """
    if a is None or b is None:
        return a if b is None else b
    if min(len(a), len(b)) < overlap:
        return np.concatenate([a, b])

    ramp_down = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
    ramp_up = 1.0 - ramp_down

    blended = a[-overlap:] * ramp_down + b[:overlap] * ramp_up
    return np.concatenate([a[:-overlap], blended, b[overlap:]])
414
+
415
def tuned_splitter(text):
    """Yield punctuation-bounded chunks of `text`.

    The first chunk is kept small (fast first audio packet); subsequent
    chunks use progressively larger size thresholds for efficiency.
    """
    thresholds = (80, 140, 220, 320)
    pending = ""
    emitted = 0

    for piece in re.split(r"([.,!?;:\n]+)", text):
        pending += piece
        limit = thresholds[min(emitted, len(thresholds) - 1)]
        # Flush only at a punctuation boundary once the size threshold is met.
        if len(pending) >= limit and re.search(r"[.,!?;:\n]$", pending):
            chunk = pending.strip()
            pending = ""
            if chunk:
                yield chunk
                emitted += 1

    tail = pending.strip()
    if tail:
        yield tail
441
+
442
def normalize_names_minimally(text):
    """Apply cheap heuristics so TTS does not skip acronyms or CamelCase words.

    1. ALL-CAPS runs are spelled letter by letter: "AI" -> "A I".
    2. CamelCase boundaries are split: "OpenAI" -> "Open AI".

    Removed a dead no-op (`text.replace("Kokoro", "Kokoro")`) that replaced a
    string with itself; inject IPA hints client-side if needed instead.
    """
    # Spell out ALL-CAPS words letter by letter: "AI" -> "A I"
    text = re.sub(r"\b([A-Z]{2,})\b", lambda m: " ".join(m.group(1)), text)
    # Split lower->Upper boundaries: "OpenAI" -> "Open AI"
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    return text
451
 
452
def synthesize_one_chunk(chunk, voice_id, speed):
    """Synthesize one pre-split text chunk with Kokoro.

    Returns float32 audio (silence-trimmed, crossfaded across internal
    segments) or None when the pipeline produced no audio.
    NOTE(review): relies on module globals PIPELINE, _to_numpy_audio,
    trim_silence and crossfade_concat defined elsewhere in this file.
    """
    # Make sure no nested splitting happens inside a chunk
    with torch.inference_mode():
        gen = PIPELINE(
            chunk,
            voice=voice_id,
            speed=float(speed),
            split_pattern=r"\n+",  # chunk text has no newlines in practice
        )
        # gen yields (gs, ps, audio)
        out_audio = None
        for _, _, audio in gen:
            audio_np = _to_numpy_audio(audio).astype(np.float32)
            audio_np = trim_silence(audio_np)
            # ~50 ms crossfade at 24 kHz between successive segments
            out_audio = crossfade_concat(out_audio, audio_np, overlap=1200)
    return out_audio
468
+
469
+ # -------------------------
470
+ # Warmup to remove cold start latency
471
+ # -------------------------
472
def warmup():
    """Synthesize a tiny utterance once so the first real request skips cold-start cost.

    Best-effort: failures are logged, never raised.
    """
    try:
        t0 = time.time()
        _ = synthesize_one_chunk("Warmup.", DEFAULT_VOICE, 1.0)
        dt = time.time() - t0
        print(f"βœ… Warmup done in {dt:.2f}s")
    except Exception as e:
        # Warmup is an optimization only; boot must continue regardless.
        print(f"⚠️ Warmup failed: {e}")
480
+
481
+ # Run warmup in background thread once
482
+ WARMUP_EXECUTOR = ThreadPoolExecutor(max_workers=1)
483
+ WARMUP_EXECUTOR.submit(warmup)
484
+
485
+ # -------------------------
486
+ # Streaming strategy
487
+ # -------------------------
488
def stream_generator(text, voice_ui, speed):
    """Gradio streaming callback: yield (sample_rate, int16 PCM) tuples.

    The first synthesized chunk is yielded immediately for low perceived
    latency; later chunks are buffered to ~0.9 s before yielding to reduce
    playback gaps from many tiny packets.
    NOTE(review): depends on module globals VOICE_CHOICES, DEFAULT_VOICE,
    SAMPLE_RATE and the helper functions defined elsewhere in this file.
    """
    voice_id = VOICE_CHOICES.get(voice_ui, DEFAULT_VOICE)
    text = normalize_names_minimally(text)

    print("--- START UI STREAM ---")
    first = True

    # Buffer audio after the first packet to reduce gaps from too many tiny yields
    buffer_audio = None
    buffer_min_seconds = 0.9

    for chunk_idx, chunk in enumerate(tuned_splitter(text)):
        t0 = time.time()
        audio_f32 = synthesize_one_chunk(chunk, voice_id, speed)
        if audio_f32 is None or len(audio_f32) == 0:
            continue

        dt = time.time() - t0
        print(f"⚑ UI chunk {chunk_idx}: {len(chunk)} chars in {dt:.2f}s")

        if first:
            # First packet: yield immediately for low perceived latency
            first = False
            yield (SAMPLE_RATE, _float_to_int16(audio_f32))
            continue

        buffer_audio = crossfade_concat(buffer_audio, audio_f32, overlap=1200)
        if buffer_audio is not None:
            if len(buffer_audio) >= int(buffer_min_seconds * SAMPLE_RATE):
                yield (SAMPLE_RATE, _float_to_int16(buffer_audio))
                buffer_audio = None

    # Flush whatever is left in the buffer at end of stream.
    if buffer_audio is not None and len(buffer_audio) > 0:
        yield (SAMPLE_RATE, _float_to_int16(buffer_audio))

    print("--- END UI STREAM ---")
524
+
525
+ # -------------------------
526
+ # API (FastAPI + WS)
527
+ # -------------------------
528
  api = FastAPI()
529
 
530
+ # One inference worker is the right call on 2 vCPU
531
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
532
+ INFERENCE_QUEUE = asyncio.Queue()
533
 
534
async def audio_engine_loop():
    """Single consumer loop: pull (text, voice, speed, ws) jobs off
    INFERENCE_QUEUE, synthesize in the one-thread executor, and send the
    resulting PCM bytes to the requesting websocket.

    NOTE(review): depends on module globals INFERENCE_QUEUE,
    INFERENCE_EXECUTOR and synthesize_one_chunk defined elsewhere in this file.
    """
    print("⚑ API AUDIO PIPELINE STARTED")
    loop = asyncio.get_running_loop()

    while True:
        job = await INFERENCE_QUEUE.get()
        text, voice_id, speed, ws = job

        try:
            # Skip jobs whose client already disconnected
            # (client_state.value > 1 presumably means closing/closed — verify
            # against starlette WebSocketState).
            if ws.client_state.value > 1:
                continue

            # Run synthesis in the single worker thread
            audio_f32 = await loop.run_in_executor(
                INFERENCE_EXECUTOR,
                lambda: synthesize_one_chunk(text, voice_id, speed),
            )

            if audio_f32 is None or len(audio_f32) == 0:
                continue

            pcm = _float_to_int16(audio_f32).tobytes()
            try:
                await ws.send_bytes(pcm)
            except Exception:
                # Client may have vanished mid-send; drop the frame silently.
                pass

        except Exception as e:
            # Keep the engine loop alive no matter what a single job does.
            print(f"API Engine Error: {e}")
563
 
564
  @api.on_event("startup")
565
  async def startup():
 
 
566
  asyncio.create_task(audio_engine_loop())
567
 
568
  @api.websocket("/ws/audio")
569
  async def websocket_endpoint(ws: WebSocket):
570
  await ws.accept()
571
 
572
+ voice_id = DEFAULT_VOICE
573
  speed = 1.0
574
+ loop = asyncio.get_running_loop()
575
 
576
  print(f"βœ… Client connected: {ws.client}")
577
 
 
590
  try:
591
  data = await ws.receive_json()
592
  except WebSocketDisconnect:
 
593
  break
594
+ except Exception:
 
595
  break
596
 
597
  if "config" in data:
598
+ voice_ui = data.get("voice", DEFAULT_VOICE_UI)
599
+ voice_id = VOICE_CHOICES.get(voice_ui, DEFAULT_VOICE)
600
  speed = float(data.get("speed", speed))
601
 
602
  if "text" in data:
603
+ raw = data["text"]
604
+ raw = normalize_names_minimally(raw)
605
+
606
+ # First chunk tiny, rest larger, same as UI
607
+ for chunk in tuned_splitter(raw):
608
+ if not chunk.strip():
609
+ continue
610
+ await INFERENCE_QUEUE.put((chunk, voice_id, speed, ws))
611
 
612
  if "flush" in data:
613
  pass
614
 
615
+ except Exception as e:
616
+ print(f"πŸ”₯ Critical WS Error: {e}")
617
  finally:
618
  heartbeat_task.cancel()
619
 
620
+ # -------------------------
621
+ # Gradio UI
622
+ # -------------------------
623
  with gr.Blocks(title="Kokoro TTS") as app:
624
  gr.Markdown("## ⚑ Kokoro-82M (Official Pipeline, Low Latency)")
625
  with gr.Row():
626
  with gr.Column():
627
  text_in = gr.Textbox(
628
  label="Input Text",
629
+ lines=4,
630
+ value="The system is live. Use the UI or connect to /ws/audio.",
 
 
 
 
 
631
  )
632
+ voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value=DEFAULT_VOICE_UI, label="Voice")
633
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
634
  btn = gr.Button("Generate", variant="primary")
635
  with gr.Column():
636
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
637
 
638
+ btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
639
 
640
  final_app = gr.mount_gradio_app(api, app, path="/")
641