auralodyssey committed on
Commit
7576e85
·
verified ·
1 Parent(s): 4daf7c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -181
app.py CHANGED
@@ -295,12 +295,9 @@
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
- import re
299
- import json
300
  import time
 
301
  import asyncio
302
- import uvloop
303
- from functools import lru_cache
304
  from concurrent.futures import ThreadPoolExecutor
305
 
306
  import numpy as np
@@ -309,217 +306,145 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
309
  import uvicorn
310
 
311
  import torch
312
- import soundfile as sf
313
- from huggingface_hub import hf_hub_download
314
  from kokoro import KPipeline
315
 
316
- # -----------------------------
317
- # HF SPACE REALITY SETTINGS
318
- # -----------------------------
319
- # Free CPU Basic is small, so keep concurrency controlled.
320
- torch.set_num_threads(max(1, int(os.environ.get("TORCH_NUM_THREADS", "2"))))
321
-
322
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
323
 
324
  print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
325
 
326
- # -----------------------------
327
- # VOICES
328
- # -----------------------------
329
- VOICE_CHOICES = {
330
- "🇺🇸 🚺 Heart": "af_heart",
331
- "🇺🇸 🚺 Bella": "af_bella",
332
- "🇺🇸 🚺 Nicole": "af_nicole",
333
- "🇺🇸 🚺 Aoede": "af_aoede",
334
- "🇺🇸 🚺 Kore": "af_kore",
335
- "🇺🇸 🚺 Sarah": "af_sarah",
336
- "🇺🇸 🚺 Nova": "af_nova",
337
- "🇺🇸 🚺 Sky": "af_sky",
338
- "🇺🇸 🚺 Alloy": "af_alloy",
339
- "🇺🇸 🚺 Jessica": "af_jessica",
340
- "🇺🇸 🚺 River": "af_river",
341
- "🇺🇸 🚹 Michael": "am_michael",
342
- "🇺🇸 🚹 Fenrir": "am_fenrir",
343
- "🇺🇸 🚹 Puck": "am_puck",
344
- "🇺🇸 🚹 Echo": "am_echo",
345
- "🇺🇸 🚹 Eric": "am_eric",
346
- "🇺🇸 🚹 Liam": "am_liam",
347
- "🇺🇸 🚹 Onyx": "am_onyx",
348
- "🇺🇸 🚹 Santa": "am_santa",
349
- "🇺🇸 🚹 Adam": "am_adam",
350
- "🇬🇧 🚺 Emma": "bf_emma",
351
- "🇬🇧 🚺 Isabella": "bf_isabella",
352
- "🇬🇧 🚺 Alice": "bf_alice",
353
- "🇬🇧 🚺 Lily": "bf_lily",
354
- "🇬🇧 🚹 George": "bm_george",
355
- "🇬🇧 🚹 Fable": "bm_fable",
356
- "🇬🇧 🚹 Lewis": "bm_lewis",
357
- "🇬🇧 🚹 Daniel": "bm_daniel",
358
- }
359
-
360
- # Kokoro official repo for weights + voices
361
- KOKORO_REPO = "hexgrad/Kokoro-82M"
362
-
363
- # -----------------------------
364
- # PIPELINES
365
- # lang_code must match voice family.
366
- # -----------------------------
367
  PIPELINES = {
368
- "a": KPipeline(lang_code="a"), # American English
369
- "b": KPipeline(lang_code="b"), # British English
370
  }
371
 
372
- # -----------------------------
373
- # OPTIONAL: preload spacy model if present
374
- # prevents runtime download surprises
375
- # -----------------------------
376
- try:
377
- import spacy
378
- spacy.load("en_core_web_sm")
379
- except Exception:
380
- pass
381
-
382
- # -----------------------------
383
- # VOICE CACHE (torch tensors)
384
- # -----------------------------
385
- VOICE_TENSOR_CACHE = {}
386
 
387
  def voice_to_lang_code(voice_code: str) -> str:
388
- # af_ / am_ => 'a', bf_ / bm_ => 'b'
389
- if voice_code.startswith("b"):
390
  return "b"
391
  return "a"
392
 
393
- def get_voice_tensor(voice_code: str):
394
- if voice_code in VOICE_TENSOR_CACHE:
395
- return VOICE_TENSOR_CACHE[voice_code]
396
-
397
- path = hf_hub_download(
398
- repo_id=KOKORO_REPO,
399
- filename=f"voices/{voice_code}.pt",
400
- )
401
- # weights_only True is recommended by torch warning text in your logs
402
- vt = torch.load(path, map_location="cpu", weights_only=True)
403
- VOICE_TENSOR_CACHE[voice_code] = vt
404
- return vt
405
-
406
- # -----------------------------
407
- # TEXT NORMALIZATION
408
- # Stops “skipping” for many brand names by avoiding OOD token collapse.
409
- # Also makes acronyms pronounceable.
410
- # -----------------------------
411
- _ACRONYM_RE = re.compile(r"\b([A-Z]{2,})\b")
412
- _CAMEL_RE = re.compile(r"([a-z])([A-Z])")
413
- _DIGIT_WORD_RE = re.compile(r"\b(\d+)([A-Za-z]+)\b")
414
-
415
- def normalize_text_for_kokoro(text: str) -> str:
416
  if not text:
417
  return text
418
-
419
- # Keep your special Kokoro pronunciation trick
420
  text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
421
-
422
- # Split CamelCase: OpenAI -> Open AI
423
- text = _CAMEL_RE.sub(r"\1 \2", text)
424
-
425
- # Handle 2FA -> "2 F A" (first split digits+letters)
426
- text = _DIGIT_WORD_RE.sub(r"\1 \2", text)
427
-
428
- # Acronyms: API -> "A P I"
429
- def _spell(m):
430
- s = m.group(1)
431
- return " ".join(list(s))
432
- text = _ACRONYM_RE.sub(_spell, text)
433
-
434
  return text
435
 
436
- # -----------------------------
437
  # CHUNKING
438
- # Fewer micro-chunks reduces stalls under load.
439
- # -----------------------------
440
  _SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
441
 
442
- def chunk_text(text: str, min_chars: int = 240, max_chars: int = 520):
443
- text = text.strip()
444
  if not text:
445
  return
 
446
 
447
- parts = _SENT_SPLIT.split(text)
448
  buf = ""
449
  for p in parts:
450
- if not p:
 
 
 
 
 
 
451
  continue
452
- if len(buf) + len(p) + 1 <= max_chars:
453
- buf = (buf + " " + p).strip()
454
- if len(buf) < min_chars:
455
- continue
456
- yield buf
457
- buf = ""
458
- else:
459
- if buf:
460
- yield buf
461
- buf = p.strip()
462
- if len(buf) >= min_chars:
463
- yield buf
464
- buf = ""
465
 
466
  if buf:
467
  yield buf
468
 
469
- # -----------------------------
470
- # AUDIO UTILS
471
- # Avoid trimming per-chunk to prevent audible “missing” regions.
472
- # Do optional gentle trim only on final concatenated output if needed.
473
- # -----------------------------
474
- def float_to_int16(audio_f32: np.ndarray) -> np.ndarray:
475
- audio_f32 = np.clip(audio_f32, -1.0, 1.0)
476
- return (audio_f32 * 32767.0).astype(np.int16)
477
-
478
- # -----------------------------
479
- # CORE SYNTH
480
- # Uses official generator API.
481
- # -----------------------------
482
- def kokoro_generate_stream(text: str, voice_code: str, speed: float):
 
 
 
 
 
 
 
 
 
 
483
  lang_code = voice_to_lang_code(voice_code)
484
  pipeline = PIPELINES[lang_code]
485
- voice_tensor = get_voice_tensor(voice_code)
486
 
487
- # We already chunk ourselves, so keep split_pattern simple.
488
- # If you pass a strong splitter here, you will double-split and create micro audio pieces.
489
  generator = pipeline(
490
- text,
491
- voice=voice_tensor,
492
  speed=float(speed),
493
- split_pattern=r"$^", # split nothing
494
  )
495
 
496
  for _, _, audio in generator:
497
- # audio is float array at 24kHz
498
  yield audio
499
 
500
- # -----------------------------
501
  # GRADIO STREAM
502
- # -----------------------------
503
  def gradio_stream_generator(text, voice_name, speed):
504
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
505
- text = normalize_text_for_kokoro(text)
506
 
507
- # warm voice cache
508
- get_voice_tensor(voice_code)
509
-
510
- for i, chunk in enumerate(chunk_text(text)):
511
  t0 = time.time()
512
- # generator yields 1 item because split_pattern disables splitting
513
- for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
514
  dur = time.time() - t0
515
  print(f"⚡ UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
516
- yield 24000, float_to_int16(audio_f32)
 
517
 
518
- # -----------------------------
519
- # FASTAPI WS
520
- # -----------------------------
 
521
  api = FastAPI()
522
-
523
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
524
  INFERENCE_QUEUE = asyncio.Queue()
525
 
@@ -534,13 +459,13 @@ async def audio_engine_loop():
534
  if ws.client_state.value > 1:
535
  continue
536
 
537
- def _run():
538
- out = []
539
- for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
540
- out.append(float_to_int16(audio_f32).tobytes())
541
- return out
542
 
543
- frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run)
544
 
545
  for frame in frames:
546
  try:
@@ -589,13 +514,10 @@ async def websocket_endpoint(ws: WebSocket):
589
  voice_name = data.get("voice", "🇺🇸 🚺 Bella")
590
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
591
  speed = float(data.get("speed", speed))
592
- get_voice_tensor(voice_code)
593
 
594
  if "text" in data:
595
- raw = data["text"]
596
- text = normalize_text_for_kokoro(raw)
597
- # Bigger chunks reduces stalls under load
598
- for chunk in chunk_text(text):
599
  if chunk.strip():
600
  await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
601
 
@@ -607,17 +529,17 @@ async def websocket_endpoint(ws: WebSocket):
607
  finally:
608
  heartbeat_task.cancel()
609
 
610
- # -----------------------------
611
  # GRADIO UI
612
- # -----------------------------
613
  with gr.Blocks(title="Kokoro TTS") as app:
614
- gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, HF CPU-friendly)")
615
  with gr.Row():
616
  with gr.Column():
617
  text_in = gr.Textbox(
618
  label="Input Text",
619
  lines=3,
620
- value="The system is live. Use the UI or connect to /ws/audio.",
621
  )
622
  voice_in = gr.Dropdown(
623
  list(VOICE_CHOICES.keys()),
@@ -634,4 +556,4 @@ with gr.Blocks(title="Kokoro TTS") as app:
634
  final_app = gr.mount_gradio_app(api, app, path="/")
635
 
636
  if __name__ == "__main__":
637
- uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
 
 
298
  import time
299
+ import re
300
  import asyncio
 
 
301
  from concurrent.futures import ThreadPoolExecutor
302
 
303
  import numpy as np
 
306
  import uvicorn
307
 
308
  import torch
 
 
309
  from kokoro import KPipeline
310
 
# Optional speed boost on HF Linux: use uvloop's event loop policy when the
# package is installed. Only a missing package is tolerated; any other
# failure (for example while installing the policy) is allowed to surface
# instead of being silently swallowed.
try:
    import uvloop  # type: ignore
except ImportError:
    pass
else:
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
317
 
318
  print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
319
 
320
+ # ------------------------------------------------------------
321
+ # OFFICIAL PIPELINES (language codes per the Kokoro docs)
322
+ # 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
323
+ # ------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# One KPipeline per supported language code, constructed once at import time.
# Keys are the lang_code values: "a" (American English), "b" (British English).
PIPELINES = {code: KPipeline(lang_code=code) for code in ("a", "b")}
328
 
# Display label (as shown in the UI dropdown) -> Kokoro voice id.
# Insertion order is the order in which the labels are listed.
VOICE_CHOICES = {
    "🇺🇸 🚺 Heart": "af_heart",
    "🇺🇸 🚺 Bella": "af_bella",
    "🇺🇸 🚺 Nicole": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
 
 
341
 
def voice_to_lang_code(voice_code: str) -> str:
    """Return the pipeline language code for a Kokoro voice id.

    Voice ids starting with ``bf_`` or ``bm_`` are British and map to
    ``"b"``; every other id maps to ``"a"``.
    """
    # str.startswith accepts a tuple, so one call covers both prefixes.
    return "b" if voice_code.startswith(("bf_", "bm_")) else "a"
347
 
348
+ # ------------------------------------------------------------
349
+ # TEXT HELPERS (following the markup format shown in the Kokoro docs)
350
+ # Use IPA markup like: [Kokoro](/kˈOkəɹO/)
351
+ # ------------------------------------------------------------
# Replacement markup giving the IPA pronunciation for the word "Kokoro".
_KOKORO_IPA = "[Kokoro](/kˈOkəɹO/)"

# Matches "Kokoro" unless it is immediately followed by "](", i.e. unless it
# is already the tail of a [label](/ipa/) markup span.
_KOKORO_PLAIN = re.compile(r"Kokoro(?!\]\()")


def normalize_text(text: str) -> str:
    """Wrap each plain occurrence of "Kokoro" in IPA pronunciation markup.

    Occurrences that already end a ``[...](`` markup label are left alone,
    so the function is idempotent and user-supplied markup is not nested
    inside a second pair of brackets.

    Empty or ``None`` input is returned unchanged.
    """
    if not text:
        return text
    return _KOKORO_PLAIN.sub(_KOKORO_IPA, text)
358
 
359
+ # ------------------------------------------------------------
360
  # CHUNKING
361
+ # Main goal: avoid tiny chunks that cause audible discontinuity.
362
+ # ------------------------------------------------------------
# Split after sentence-ending punctuation followed by whitespace, or on runs
# of newlines.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")


def tuned_splitter(text: str, *, min_chars: int = 220):
    """Yield chunks of *text* built from whole sentences.

    The text is split with ``_SENT_SPLIT``; consecutive pieces are joined
    with a single space until the running chunk holds at least *min_chars*
    characters, at which point the next piece starts a new chunk. A chunk is
    never cut inside a piece, so chunks may exceed *min_chars*, and the last
    chunk may be shorter.

    Args:
        text: Input text; ``None`` or blank input yields nothing.
        min_chars: Length a chunk must reach before it is emitted
            (keyword-only; the default keeps the previous fixed threshold).

    Yields:
        Non-empty, stripped chunk strings in input order.
    """
    text = (text or "").strip()
    if not text:
        return

    buf = ""
    for piece in _SENT_SPLIT.split(text):
        piece = piece.strip()
        if not piece:
            continue
        if not buf:
            buf = piece
        elif len(buf) < min_chars:
            # Grow chunks to reduce boundary artifacts.
            buf = f"{buf} {piece}"
        else:
            yield buf
            buf = piece

    if buf:
        yield buf
387
 
388
+ # ------------------------------------------------------------
389
+ # AUDIO CONVERSION FIX
390
+ # Fixes: "'Tensor' object has no attribute 'astype'"
391
+ # ------------------------------------------------------------
def audio_to_int16_np(audio):
    """Convert float audio samples to a NumPy int16 array.

    Accepts a ``torch.Tensor`` or anything ``np.asarray`` understands.
    Samples are clipped to [-1.0, 1.0] and scaled by 32767; the cast to
    int16 truncates toward zero.

    Non-finite samples are sanitized first (NaN -> 0, +inf -> 1, -inf -> -1)
    because casting NaN/inf to an integer type gives undefined results.
    """
    if isinstance(audio, torch.Tensor):
        # Detach from autograd and move to host memory before handing the
        # data to NumPy; .float() also covers dtypes NumPy cannot represent.
        audio = audio.detach().cpu().float().numpy()

    samples = np.nan_to_num(np.asarray(audio), nan=0.0, posinf=1.0, neginf=-1.0)
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767.0).astype(np.int16)
403
+
def audio_to_pcm_bytes(audio) -> bytes:
    """Return *audio* as the raw bytes of its int16 sample array."""
    pcm = audio_to_int16_np(audio)
    return pcm.tobytes()
406
+
407
+ # ------------------------------------------------------------
408
+ # OFFICIAL GENERATION (per the Kokoro docs)
409
+ # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
410
+ # ------------------------------------------------------------
def kokoro_generate(chunk: str, voice_code: str, speed: float):
    """Yield the audio of every segment the pipeline produces for *chunk*.

    The pipeline is picked from ``PIPELINES`` using the language code derived
    from *voice_code*, and is invoked with the voice id, the speed coerced to
    ``float``, and a newline-based ``split_pattern``.
    """
    pipeline = PIPELINES[voice_to_lang_code(voice_code)]

    # The caller's own splitter already breaks text on sentences/newlines,
    # so the pipeline-side split stays light.
    segments = pipeline(
        chunk,
        voice=voice_code,
        speed=float(speed),
        split_pattern=r"\n+",
    )

    # Each result unpacks into three fields; only the last (audio) is used.
    # Presumably (graphemes, phonemes, audio) - confirm against Kokoro docs.
    for _gs, _ps, audio in segments:
        yield audio
426
 
427
+ # ------------------------------------------------------------
428
  # GRADIO STREAM
429
+ # ------------------------------------------------------------
def gradio_stream_generator(text, voice_name, speed):
    """Stream synthesized audio for *text* as ``(24000, int16 array)`` tuples.

    *voice_name* is looked up in ``VOICE_CHOICES``; an unknown name is passed
    through unchanged as the voice id. The text is normalized, split into
    chunks, and each audio segment is yielded as soon as it is generated.
    Progress is logged with ``print``.
    """
    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
    text = normalize_text(text)

    print("--- START UI STREAM ---")
    for i, chunk in enumerate(tuned_splitter(text)):
        # perf_counter is monotonic, so the logged duration cannot be skewed
        # by wall-clock adjustments. The timer starts once per chunk, so the
        # value logged for later segments of the same chunk is cumulative.
        started = time.perf_counter()
        for audio in kokoro_generate(chunk, voice_code, speed):
            dur = time.perf_counter() - started
            print(f"⚡ UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
            yield 24000, audio_to_int16_np(audio)
    print("--- END UI STREAM ---")
442
 
443
+ # ------------------------------------------------------------
444
+ # FASTAPI + WEBSOCKET QUEUE
445
+ # Keep it single-file on CPU to stay stable under load.
446
+ # ------------------------------------------------------------
447
  api = FastAPI()
 
448
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
449
  INFERENCE_QUEUE = asyncio.Queue()
450
 
 
459
  if ws.client_state.value > 1:
460
  continue
461
 
462
+ def _run_and_pack():
463
+ frames = []
464
+ for audio in kokoro_generate(chunk, voice_code, speed):
465
+ frames.append(audio_to_pcm_bytes(audio))
466
+ return frames
467
 
468
+ frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run_and_pack)
469
 
470
  for frame in frames:
471
  try:
 
514
  voice_name = data.get("voice", "🇺🇸 🚺 Bella")
515
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
516
  speed = float(data.get("speed", speed))
 
517
 
518
  if "text" in data:
519
+ text = normalize_text(data["text"])
520
+ for chunk in tuned_splitter(text):
 
 
521
  if chunk.strip():
522
  await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
523
 
 
529
  finally:
530
  heartbeat_task.cancel()
531
 
532
+ # ------------------------------------------------------------
533
  # GRADIO UI
534
+ # ------------------------------------------------------------
535
  with gr.Blocks(title="Kokoro TTS") as app:
536
+ gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline)")
537
  with gr.Row():
538
  with gr.Column():
539
  text_in = gr.Textbox(
540
  label="Input Text",
541
  lines=3,
542
+ value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
543
  )
544
  voice_in = gr.Dropdown(
545
  list(VOICE_CHOICES.keys()),
 
556
  final_app = gr.mount_gradio_app(api, app, path="/")
557
 
558
  if __name__ == "__main__":
559
+ uvicorn.run(final_app, host="0.0.0.0", port=7860)