auralodyssey committed on
Commit
20fe4d6
·
verified ·
1 Parent(s): 83977c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -150
app.py CHANGED
@@ -295,8 +295,8 @@
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
- import time
299
  import re
 
300
  import asyncio
301
  from concurrent.futures import ThreadPoolExecutor
302
 
@@ -308,30 +308,31 @@ import uvicorn
308
  import torch
309
  from kokoro import KPipeline
310
 
311
- # Optional speed boost on HF Linux
 
 
 
 
 
 
312
  try:
313
- import uvloop # type: ignore
314
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
315
  except Exception:
316
  pass
317
 
318
- print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
319
-
320
- # Keep CPU threads predictable
321
  try:
322
- torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
323
- torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
324
  except Exception:
325
  pass
326
 
327
- # ------------------------------------------------------------
328
- # OFFICIAL PIPELINES (per your pasted docs)
329
- # ------------------------------------------------------------
330
- PIPELINES = {
331
- "a": KPipeline(lang_code="a"), # ๐Ÿ‡บ๐Ÿ‡ธ American English
332
- "b": KPipeline(lang_code="b"), # ๐Ÿ‡ฌ๐Ÿ‡ง British English
333
- }
334
 
 
 
 
335
  VOICE_CHOICES = {
336
  "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
337
  "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
@@ -347,88 +348,61 @@ VOICE_CHOICES = {
347
 
348
  def voice_to_lang_code(voice_code: str) -> str:
349
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
350
- return "b"
351
- return "a"
 
 
 
 
 
 
 
 
352
 
353
- # ------------------------------------------------------------
354
- # TEXT NORMALIZATION (stays within the docs you pasted)
355
- # Docs show: [Kokoro](/kหˆOkษ™ษนO/)
356
- # ------------------------------------------------------------
357
  def normalize_text(text: str) -> str:
358
  if not text:
359
- return text
360
  return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
361
 
362
- # ------------------------------------------------------------
363
- # FAST-FIRST-AUDIO SPLITTER (your old technique)
364
- # Progressive thresholds so first chunk is quick.
365
- # Also includes a fallback to cut long text even without punctuation.
366
- # ------------------------------------------------------------
367
- _PUNCT_END = re.compile(r"[.,!?;:\n]$")
 
368
 
369
- def tuned_splitter(text: str):
370
- text = (text or "").strip()
371
  if not text:
372
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
- parts = re.split(r"([.,!?;:\n]+)", text)
375
- buffer = ""
376
- chunk_count = 0
377
-
378
- def threshold_for(n: int) -> int:
379
- if n == 0:
380
- return 60 # fast first audio
381
- if n == 1:
382
- return 120
383
- if n == 2:
384
- return 180
385
- return 260
386
-
387
- for part in parts:
388
- buffer += part
389
-
390
- threshold = threshold_for(chunk_count)
391
-
392
- # Emit when punctuation boundary is hit and buffer is big enough
393
- if _PUNCT_END.search(buffer) and len(buffer) >= threshold:
394
- out = buffer.strip()
395
- if out:
396
- yield out
397
- chunk_count += 1
398
- buffer = ""
399
- continue
400
-
401
- # Fallback: if no punctuation for too long, cut at last space
402
- hard_max = 320 if chunk_count == 0 else 520
403
- if len(buffer) >= hard_max:
404
- cut = buffer.rfind(" ")
405
- if cut > 40:
406
- out = buffer[:cut].strip()
407
- rest = buffer[cut:].strip()
408
- if out:
409
- yield out
410
- chunk_count += 1
411
- buffer = rest
412
- else:
413
- out = buffer.strip()
414
- if out:
415
- yield out
416
- chunk_count += 1
417
- buffer = ""
418
-
419
- if buffer.strip():
420
- yield buffer.strip()
421
-
422
- # ------------------------------------------------------------
423
- # AUDIO CONVERSION FIX
424
- # Fixes: "'Tensor' object has no attribute 'astype'"
425
- # ------------------------------------------------------------
426
  def audio_to_int16_np(audio):
427
  if isinstance(audio, torch.Tensor):
428
  audio = audio.detach().cpu()
429
  audio = torch.clamp(audio, -1.0, 1.0)
430
- audio_i16 = (audio * 32767.0).to(torch.int16)
431
- return audio_i16.numpy()
432
 
433
  audio = np.asarray(audio)
434
  audio = np.clip(audio, -1.0, 1.0)
@@ -437,88 +411,121 @@ def audio_to_int16_np(audio):
437
  def audio_to_pcm_bytes(audio) -> bytes:
438
  return audio_to_int16_np(audio).tobytes()
439
 
440
- # ------------------------------------------------------------
441
- # OFFICIAL GENERATION (exact pattern from your docs)
442
  # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
443
- # ------------------------------------------------------------
444
- def kokoro_generate(chunk: str, voice_code: str, speed: float):
445
  lang_code = voice_to_lang_code(voice_code)
446
  pipeline = PIPELINES[lang_code]
 
447
 
448
- generator = pipeline(
449
- chunk,
450
- voice=voice_code,
451
- speed=float(speed),
452
- split_pattern=r"\n+",
453
- )
454
- for _, _, audio in generator:
455
- yield audio
456
-
457
- # ------------------------------------------------------------
458
- # WARMUP
459
- # Moves the first-call latency to startup instead of first user request.
460
- # ------------------------------------------------------------
 
 
 
461
  def warmup():
462
  try:
463
- for _ in kokoro_generate("Hello.", "af_bella", 1.0):
 
464
  break
465
- print("✅ WARMUP DONE")
466
  except Exception as e:
467
  print(f"โš ๏ธ WARMUP FAILED: {e}")
468
 
469
- # ------------------------------------------------------------
470
- # GRADIO STREAM
471
- # ------------------------------------------------------------
472
- def gradio_stream_generator(text, voice_name, speed):
473
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
474
  text = normalize_text(text)
475
 
476
- print("--- START UI STREAM ---")
477
- for i, chunk in enumerate(tuned_splitter(text)):
478
- t0 = time.time()
479
- for audio in kokoro_generate(chunk, voice_code, speed):
480
- dur = time.time() - t0
481
- print(f"โšก UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
482
- yield 24000, audio_to_int16_np(audio)
483
- print("--- END UI STREAM ---")
484
-
485
- # ------------------------------------------------------------
486
- # FASTAPI + WEBSOCKET QUEUE
487
- # ------------------------------------------------------------
 
 
488
  api = FastAPI()
 
489
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
490
- INFERENCE_QUEUE = asyncio.Queue()
491
 
492
  async def audio_engine_loop():
493
  print("⚡ API AUDIO PIPELINE STARTED")
494
  loop = asyncio.get_running_loop()
495
 
496
  while True:
497
- ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
498
- try:
499
- if ws.client_state.value > 1:
500
- continue
501
 
502
- def _run_and_pack():
503
- frames = []
504
- for audio in kokoro_generate(chunk, voice_code, speed):
505
- frames.append(audio_to_pcm_bytes(audio))
506
- return frames
507
 
508
- frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run_and_pack)
509
 
510
- for frame in frames:
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  try:
512
- await ws.send_bytes(frame)
513
  except Exception:
514
- break
515
 
516
- except Exception as e:
517
- print(f"API Engine Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  @api.on_event("startup")
520
  async def startup():
521
- # Warmup in executor so startup does not block event loop
522
  loop = asyncio.get_running_loop()
523
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
524
  asyncio.create_task(audio_engine_loop())
@@ -559,25 +566,21 @@ async def websocket_endpoint(ws: WebSocket):
559
  speed = float(data.get("speed", speed))
560
 
561
  if "text" in data:
562
- text = normalize_text(data["text"])
563
- # Enqueue fast first chunk first
564
- for chunk in tuned_splitter(text):
565
- if chunk.strip():
566
- await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
567
 
568
  if "flush" in data:
569
  pass
570
 
571
- except Exception as e:
572
- print(f"🔥 Critical WS Error: {e}")
573
  finally:
574
  heartbeat_task.cancel()
575
 
576
- # ------------------------------------------------------------
577
- # GRADIO UI
578
- # ------------------------------------------------------------
579
  with gr.Blocks(title="Kokoro TTS") as app:
580
- gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Fast First Audio)")
581
  with gr.Row():
582
  with gr.Column():
583
  text_in = gr.Textbox(
@@ -595,7 +598,7 @@ with gr.Blocks(title="Kokoro TTS") as app:
595
  with gr.Column():
596
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
597
 
598
- btn.click(gradio_stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
599
 
600
  final_app = gr.mount_gradio_app(api, app, path="/")
601
 
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
 
298
  import re
299
+ import time
300
  import asyncio
301
  from concurrent.futures import ThreadPoolExecutor
302
 
 
308
  import torch
309
  from kokoro import KPipeline
310
 
311
+ # ----------------------------
312
+ # HARD LIMIT CPU THREADS (2 vCPU box)
313
+ # ----------------------------
314
+ os.environ.setdefault("OMP_NUM_THREADS", "2")
315
+ os.environ.setdefault("MKL_NUM_THREADS", "2")
316
+ os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
317
+
318
  try:
319
+ torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
320
+ torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
321
  except Exception:
322
  pass
323
 
324
+ # Optional: uvloop for faster event loop on HF Linux
 
 
325
  try:
326
+ import uvloop # type: ignore
327
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
328
  except Exception:
329
  pass
330
 
331
+ print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 
 
 
 
 
 
332
 
333
+ # ----------------------------
334
+ # VOICES
335
+ # ----------------------------
336
  VOICE_CHOICES = {
337
  "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
338
  "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
 
348
 
349
  def voice_to_lang_code(voice_code: str) -> str:
350
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
351
+ return "b" # British
352
+ return "a" # American
353
+
354
+ # ----------------------------
355
+ # PIPELINES (keep hot in RAM)
356
+ # ----------------------------
357
+ PIPELINES = {
358
+ "a": KPipeline(lang_code="a"),
359
+ "b": KPipeline(lang_code="b"),
360
+ }
361
 
362
+ # ----------------------------
363
+ # TEXT NORMALIZATION (matches your pasted official docs)
364
+ # ----------------------------
 
365
  def normalize_text(text: str) -> str:
366
  if not text:
367
+ return ""
368
  return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
369
 
370
+ # ----------------------------
371
+ # LOW LATENCY SEGMENTATION
372
+ # One pipeline call per request.
373
+ # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
374
+ # We also force a small first segment for fast first audio.
375
+ # ----------------------------
376
+ _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
377
 
378
+ def inject_newlines_for_fast_stream(text: str) -> str:
379
+ text = normalize_text(text).strip()
380
  if not text:
381
+ return ""
382
+
383
+ # Sentence boundaries -> newline so official split_pattern can segment
384
+ text = _SENT_BOUNDARY.sub(r"\1\n", text)
385
+
386
+ # Also split on existing multi-newlines
387
+ text = re.sub(r"\n{3,}", "\n\n", text)
388
+
389
+ # Guarantee a small first segment for low time-to-first-audio
390
+ if "\n" not in text and len(text) > 90:
391
+ cut = text.rfind(" ", 0, 70)
392
+ if cut < 35:
393
+ cut = 70
394
+ text = text[:cut].strip() + "\n" + text[cut:].strip()
395
+
396
+ return text
397
 
398
+ # ----------------------------
399
+ # AUDIO CONVERSION (fast, safe)
400
+ # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  def audio_to_int16_np(audio):
402
  if isinstance(audio, torch.Tensor):
403
  audio = audio.detach().cpu()
404
  audio = torch.clamp(audio, -1.0, 1.0)
405
+ return (audio * 32767.0).to(torch.int16).numpy()
 
406
 
407
  audio = np.asarray(audio)
408
  audio = np.clip(audio, -1.0, 1.0)
 
411
  def audio_to_pcm_bytes(audio) -> bytes:
412
  return audio_to_int16_np(audio).tobytes()
413
 
414
+ # ----------------------------
415
+ # OFFICIAL GENERATION PATH (single pipeline call)
416
  # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
417
+ # ----------------------------
418
+ def kokoro_generator_full(text: str, voice_code: str, speed: float):
419
  lang_code = voice_to_lang_code(voice_code)
420
  pipeline = PIPELINES[lang_code]
421
+ text = inject_newlines_for_fast_stream(text)
422
 
423
+ if not text:
424
+ return
425
+
426
+ with torch.inference_mode():
427
+ generator = pipeline(
428
+ text,
429
+ voice=voice_code,
430
+ speed=float(speed),
431
+ split_pattern=r"\n+",
432
+ )
433
+ for _, _, audio in generator:
434
+ yield audio
435
+
436
+ # ----------------------------
437
+ # WARMUP (pay cold-start cost at boot)
438
+ # ----------------------------
439
  def warmup():
440
  try:
441
+ t0 = time.time()
442
+ for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
443
  break
444
+ print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
445
  except Exception as e:
446
  print(f"โš ๏ธ WARMUP FAILED: {e}")
447
 
448
+ # ----------------------------
449
+ # GRADIO UI STREAM
450
+ # ----------------------------
451
+ def gradio_stream(text, voice_name, speed):
452
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
453
  text = normalize_text(text)
454
 
455
+ i = 0
456
+ t0 = time.time()
457
+ for audio in kokoro_generator_full(text, voice_code, speed):
458
+ if i == 0:
459
+ print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
460
+ i += 1
461
+ yield 24000, audio_to_int16_np(audio)
462
+
463
+ # ----------------------------
464
+ # FASTAPI WS ENGINE
465
+ # Single worker thread for actual generation.
466
+ # Stream frames to client as soon as they exist.
467
+ # No buffering a full list before sending.
468
+ # ----------------------------
469
  api = FastAPI()
470
+
471
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
472
+ INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
473
 
474
  async def audio_engine_loop():
475
  print("⚡ API AUDIO PIPELINE STARTED")
476
  loop = asyncio.get_running_loop()
477
 
478
  while True:
479
+ ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
 
 
 
480
 
481
+ # Skip dead clients early
482
+ if ws.client_state.value > 1:
483
+ continue
 
 
484
 
485
+ frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
486
 
487
+ def _worker():
488
+ try:
489
+ for audio in kokoro_generator_full(text, voice_code, speed):
490
+ b = audio_to_pcm_bytes(audio)
491
+ # backpressure aware
492
+ while True:
493
+ try:
494
+ loop.call_soon_threadsafe(frame_q.put_nowait, b)
495
+ break
496
+ except Exception:
497
+ time.sleep(0.001)
498
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
499
+ except Exception as e:
500
+ print(f"API Worker Error: {e}")
501
  try:
502
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
503
  except Exception:
504
+ pass
505
 
506
+ INFERENCE_EXECUTOR.submit(_worker)
507
+
508
+ first_sent = False
509
+ started = time.time()
510
+
511
+ while True:
512
+ frame = await frame_q.get()
513
+ if frame is None:
514
+ break
515
+
516
+ if ws.client_state.value > 1:
517
+ break
518
+
519
+ try:
520
+ await ws.send_bytes(frame)
521
+ if not first_sent:
522
+ print(f"⚡ API first audio in {time.time() - started:.2f}s")
523
+ first_sent = True
524
+ except Exception:
525
+ break
526
 
527
  @api.on_event("startup")
528
  async def startup():
 
529
  loop = asyncio.get_running_loop()
530
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
531
  asyncio.create_task(audio_engine_loop())
 
566
  speed = float(data.get("speed", speed))
567
 
568
  if "text" in data:
569
+ text = normalize_text(data.get("text", ""))
570
+ if text.strip():
571
+ await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
 
 
572
 
573
  if "flush" in data:
574
  pass
575
 
 
 
576
  finally:
577
  heartbeat_task.cancel()
578
 
579
+ # ----------------------------
580
+ # GRADIO APP
581
+ # ----------------------------
582
  with gr.Blocks(title="Kokoro TTS") as app:
583
+ gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
584
  with gr.Row():
585
  with gr.Column():
586
  text_in = gr.Textbox(
 
598
  with gr.Column():
599
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
600
 
601
+ btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
602
 
603
  final_app = gr.mount_gradio_app(api, app, path="/")
604