auralodyssey committed on
Commit
4daf7c6
Β·
verified Β·
1 Parent(s): 38881c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -196
app.py CHANGED
@@ -296,219 +296,228 @@
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
  import re
 
299
  import time
300
  import asyncio
301
  import uvloop
 
 
 
302
  import numpy as np
303
  import gradio as gr
304
- import torch
305
- from functools import lru_cache
306
- from huggingface_hub import hf_hub_download
307
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
308
  import uvicorn
309
- from concurrent.futures import ThreadPoolExecutor
310
 
311
- # Force Gradio SSR off to avoid Node proxy issues on Spaces
312
- # You can also set GRADIO_SSR_MODE=0 in Space Variables
313
- os.environ.setdefault("GRADIO_SSR_MODE", "0")
 
314
 
315
- # -----------------------
316
- # HF free tier constraints
317
- # -----------------------
318
- # CPU Basic = 2 vCPU, 16GB RAM. Expect queuing under load. :contentReference[oaicite:5]{index=5}
 
319
 
320
- # -----------------------
321
- # Kokoro official pipeline
322
- # -----------------------
323
- from kokoro import KPipeline
324
 
325
- # -----------------------
326
- # Voice UI (same mapping)
327
- # -----------------------
 
 
328
  VOICE_CHOICES = {
329
- 'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
330
- 'πŸ‡ΊπŸ‡Έ 🚺 Aoede': 'af_aoede', 'πŸ‡ΊπŸ‡Έ 🚺 Kore': 'af_kore', 'πŸ‡ΊπŸ‡Έ 🚺 Sarah': 'af_sarah',
331
- 'πŸ‡ΊπŸ‡Έ 🚺 Nova': 'af_nova', 'πŸ‡ΊπŸ‡Έ 🚺 Sky': 'af_sky', 'πŸ‡ΊπŸ‡Έ 🚺 Alloy': 'af_alloy',
332
- 'πŸ‡ΊπŸ‡Έ 🚺 Jessica': 'af_jessica', 'πŸ‡ΊπŸ‡Έ 🚺 River': 'af_river', 'πŸ‡ΊπŸ‡Έ 🚹 Michael': 'am_michael',
333
- 'πŸ‡ΊπŸ‡Έ 🚹 Fenrir': 'am_fenrir', 'πŸ‡ΊπŸ‡Έ 🚹 Puck': 'am_puck', 'πŸ‡ΊπŸ‡Έ 🚹 Echo': 'am_echo',
334
- 'πŸ‡ΊπŸ‡Έ 🚹 Eric': 'am_eric', 'πŸ‡ΊπŸ‡Έ 🚹 Liam': 'am_liam', 'πŸ‡ΊπŸ‡Έ 🚹 Onyx': 'am_onyx',
335
- 'πŸ‡ΊπŸ‡Έ 🚹 Santa': 'am_santa', 'πŸ‡ΊπŸ‡Έ 🚹 Adam': 'am_adam', 'πŸ‡¬πŸ‡§ 🚺 Emma': 'bf_emma',
336
- 'πŸ‡¬πŸ‡§ 🚺 Isabella': 'bf_isabella', 'πŸ‡¬πŸ‡§ 🚺 Alice': 'bf_alice', 'πŸ‡¬πŸ‡§ 🚺 Lily': 'bf_lily',
337
- 'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
338
- 'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  }
340
 
341
- VOICE_REPO = "hexgrad/Kokoro-82M" # voices/*.pt live here :contentReference[oaicite:6]{index=6}
 
342
 
343
- print("πŸš€ BOOTING KOKORO (OFFICIAL PIPELINE)")
344
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 
 
 
 
 
345
 
346
- # Torch CPU tuning for HF 2 vCPU
347
- torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "2")))
348
- torch.set_num_interop_threads(int(os.getenv("TORCH_NUM_INTEROP_THREADS", "1")))
349
- torch.backends.mkldnn.enabled = True
 
 
 
 
 
 
 
 
 
 
350
 
351
- # Use American English by default to match your US voices
352
- PIPELINE = KPipeline(lang_code="a")
 
 
 
353
 
354
- # Voice cache (load once, reuse)
355
- VOICE_TENSOR_CACHE = {}
 
356
 
357
- def get_voice_tensor(voice_name: str):
358
- code = VOICE_CHOICES.get(voice_name, voice_name)
359
- if code not in VOICE_TENSOR_CACHE:
360
- path = hf_hub_download(repo_id=VOICE_REPO, filename=f"voices/{code}.pt")
361
- VOICE_TENSOR_CACHE[code] = torch.load(path, map_location="cpu")
362
- return VOICE_TENSOR_CACHE[code]
363
-
364
- # -----------------------
365
- # Text normalization to stop name skipping
366
- # -----------------------
367
- _ACRONYM = re.compile(r"\b[A-Z]{2,}\b")
368
- _CAMEL = re.compile(r"\b([A-Z][a-z]+)([A-Z][A-Za-z]+)\b")
369
- _VER = re.compile(r"\b(v|V)(\d+(\.\d+)*)\b")
370
- _GPT = re.compile(r"\bGPT[- ]?(\d+)\b", re.IGNORECASE)
371
-
372
- def normalize_for_tts(text: str) -> str:
 
 
 
373
  if not text:
374
  return text
375
 
376
- # Preserve Kokoro IPA hint pattern shown in model card :contentReference[oaicite:7]{index=7}
377
  text = text.replace("Kokoro", "[Kokoro](/kˈOkΙ™ΙΉO/)")
378
 
379
- # Turn GPT-5 into "G P T 5"
380
- text = _GPT.sub(lambda m: "G P T " + m.group(1), text)
381
-
382
- # Turn v1.0 into "version 1.0"
383
- text = _VER.sub(lambda m: "version " + m.group(2), text)
384
 
385
- # Split CamelCase: OpenAI -> Open AI, DeepInfra -> Deep Infra
386
- # Repeat a few times to handle longer chains
387
- for _ in range(3):
388
- text2 = _CAMEL.sub(r"\1 \2", text)
389
- if text2 == text:
390
- break
391
- text = text2
392
 
393
- # Spell acronyms: YC -> Y C, EF -> E F
394
- text = _ACRONYM.sub(lambda m: " ".join(list(m.group(0))), text)
 
 
 
395
 
396
  return text
397
 
398
- # -----------------------
399
- # Safer trimming and anti-gap padding
400
- # -----------------------
401
- def trim_silence(audio: np.ndarray, threshold=0.003):
402
- if audio.size == 0:
403
- return audio
404
- mask = np.abs(audio) > threshold
405
- if not np.any(mask):
406
- return audio
407
- start = int(np.argmax(mask))
408
- end = int(len(mask) - np.argmax(mask[::-1]))
409
- # Keep a little context so words do not get clipped
410
- pad = 120
411
- return audio[max(0, start - pad): min(len(audio), end + pad)]
412
-
413
- SAMPLE_RATE = 24000
414
- INTER_CHUNK_SIL_MS = 40 # reduces β€œteleport” effect between chunks
415
-
416
- def wav_chunk_from_text(text: str, voice_name: str, speed: float):
417
- text = normalize_for_tts(text).strip()
418
- if not text:
419
- return None
420
-
421
- voice_tensor = get_voice_tensor(voice_name)
422
 
423
- # Do not let kokoro split again, you already split upstream
424
- gen = PIPELINE(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  text,
426
  voice=voice_tensor,
427
  speed=float(speed),
428
- split_pattern=r"$^"
429
  )
430
 
431
- # pipeline yields (gs, ps, audio) :contentReference[oaicite:8]{index=8}
432
- try:
433
- _, _, audio = next(iter(gen))
434
- except StopIteration:
435
- return None
436
-
437
- # audio is float32 [-1,1]
438
- audio = np.asarray(audio, dtype=np.float32)
439
-
440
- # For very short chunks, trimming can remove quiet consonants
441
- if len(text) >= 40:
442
- audio = trim_silence(audio, threshold=0.003)
443
-
444
- # Add a tiny silence buffer to hide boundary artifacts
445
- if INTER_CHUNK_SIL_MS > 0:
446
- pad = np.zeros(int(SAMPLE_RATE * (INTER_CHUNK_SIL_MS / 1000.0)), dtype=np.float32)
447
- audio = np.concatenate([audio, pad], axis=0)
448
-
449
- pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
450
- return SAMPLE_RATE, pcm
451
-
452
- # -----------------------
453
- # Your tuned splitter, kept
454
- # -----------------------
455
- def tuned_splitter(text):
456
- chunks = re.split(r'([.,!?;:\n]+)', text)
457
- buffer = ""
458
- chunk_count = 0
459
- for part in chunks:
460
- buffer += part
461
- if chunk_count == 0:
462
- threshold = 50
463
- elif chunk_count == 1:
464
- threshold = 100
465
- elif chunk_count == 2:
466
- threshold = 150
467
- else:
468
- threshold = 250
469
- if re.search(r'[.,!?;:\n]$', buffer) and len(buffer) >= threshold:
470
- if buffer.strip():
471
- yield buffer
472
- chunk_count += 1
473
- buffer = ""
474
- if buffer.strip():
475
- yield buffer.strip()
476
-
477
- # -----------------------
478
- # Streaming generator (Gradio UI)
479
- # -----------------------
480
- def stream_generator(text, voice_name, speed):
481
- get_voice_tensor(voice_name)
482
- for i, chunk in enumerate(tuned_splitter(text)):
483
  t0 = time.time()
484
- out = wav_chunk_from_text(chunk, voice_name, speed)
485
- if out:
486
  dur = time.time() - t0
487
- print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
488
- yield out
489
-
490
- # -----------------------
491
- # Gradio UI
492
- # -----------------------
493
- with gr.Blocks(title="Kokoro TTS", ssr_mode=False) as app:
494
- gr.Markdown("## ⚑ Kokoro-82M (Official Pipeline, Streamed)")
495
- with gr.Row():
496
- with gr.Column():
497
- text_in = gr.Textbox(
498
- label="Input Text",
499
- lines=3,
500
- value="The system is live. Use the UI or connect to /ws/audio."
501
- )
502
- voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
503
- speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
504
- btn = gr.Button("Generate", variant="primary")
505
- with gr.Column():
506
- audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
507
- btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
508
 
509
- # -----------------------
510
- # FastAPI + WebSocket (kept)
511
- # -----------------------
512
  api = FastAPI()
513
 
514
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
@@ -519,24 +528,25 @@ async def audio_engine_loop():
519
  loop = asyncio.get_running_loop()
520
 
521
  while True:
522
- voice_name, speed, chunk, ws = await INFERENCE_QUEUE.get()
 
523
  try:
524
  if ws.client_state.value > 1:
525
  continue
526
 
527
- # Run CPU-heavy synth in the executor so WS stays responsive
528
- out = await loop.run_in_executor(
529
- INFERENCE_EXECUTOR,
530
- lambda: wav_chunk_from_text(chunk, voice_name, speed)
531
- )
532
 
533
- if out is None:
534
- continue
535
 
536
- sr, pcm = out
537
- # Send metadata once per chunk so client can validate format
538
- await ws.send_json({"type": "chunk", "sr": sr, "format": "pcm_s16le"})
539
- await ws.send_bytes(pcm.tobytes())
 
540
 
541
  except Exception as e:
542
  print(f"API Engine Error: {e}")
@@ -549,7 +559,7 @@ async def startup():
549
  async def websocket_endpoint(ws: WebSocket):
550
  await ws.accept()
551
 
552
- voice_name = 'πŸ‡ΊπŸ‡Έ 🚺 Bella'
553
  speed = 1.0
554
 
555
  print(f"βœ… Client connected: {ws.client}")
@@ -576,23 +586,52 @@ async def websocket_endpoint(ws: WebSocket):
576
  break
577
 
578
  if "config" in data:
579
- voice_name = data.get("voice", voice_name)
 
580
  speed = float(data.get("speed", speed))
581
- get_voice_tensor(voice_name)
582
 
583
  if "text" in data:
584
- text = data["text"]
585
- for chunk in tuned_splitter(text):
 
 
586
  if chunk.strip():
587
- await INFERENCE_QUEUE.put((voice_name, speed, chunk, ws))
 
 
 
588
 
589
  except Exception as e:
590
  print(f"πŸ”₯ Critical WS Error: {e}")
591
  finally:
592
  heartbeat_task.cancel()
593
 
594
- # Mount gradio onto FastAPI, SSR off to avoid Node proxy issues :contentReference[oaicite:9]{index=9}
595
- final_app = gr.mount_gradio_app(api, app, path="/", ssr_mode=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  if __name__ == "__main__":
598
- uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
  import re
299
+ import json
300
  import time
301
  import asyncio
302
  import uvloop
303
+ from functools import lru_cache
304
+ from concurrent.futures import ThreadPoolExecutor
305
+
306
  import numpy as np
307
  import gradio as gr
 
 
 
308
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
309
  import uvicorn
 
310
 
311
+ import torch
312
+ import soundfile as sf
313
+ from huggingface_hub import hf_hub_download
314
+ from kokoro import KPipeline
315
 
316
# -----------------------------
# HF SPACE REALITY SETTINGS
# -----------------------------
# Free CPU Basic is small, so keep concurrency controlled.
# Cap PyTorch's intra-op thread pool; TORCH_NUM_THREADS overrides, floor of 1.
torch.set_num_threads(max(1, int(os.environ.get("TORCH_NUM_THREADS", "2"))))

# Use uvloop as the asyncio event loop implementation for faster I/O.
# NOTE(review): this must run before any event loop is created to take effect.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

print("πŸš€ BOOTING KOKORO (OFFICIAL PIPELINE)")
325
+
326
# -----------------------------
# VOICES
# -----------------------------
# UI display label -> Kokoro voice id.
# Prefix encodes family: af_/am_ = American female/male, bf_/bm_ = British female/male.
VOICE_CHOICES = {
    "πŸ‡ΊπŸ‡Έ 🚺 Heart": "af_heart",
    "πŸ‡ΊπŸ‡Έ 🚺 Bella": "af_bella",
    "πŸ‡ΊπŸ‡Έ 🚺 Nicole": "af_nicole",
    "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
    "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
    "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
    "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
    "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
    "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
    "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
    "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
    "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
    "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
    "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
    "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
    "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
    "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
    "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
    "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
    "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
    "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
    "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
    "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
    "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
    "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
    "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
    "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
    "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}
359
 
360
# Kokoro official repo for weights + voices (voices/*.pt files).
KOKORO_REPO = "hexgrad/Kokoro-82M"

# -----------------------------
# PIPELINES
# One KPipeline per language family; lang_code must match the voice family
# ('a' for af_/am_ voices, 'b' for bf_/bm_ voices) — see voice_to_lang_code below.
# -----------------------------
PIPELINES = {
    "a": KPipeline(lang_code="a"),  # American English
    "b": KPipeline(lang_code="b"),  # British English
}
371
 
372
# -----------------------------
# OPTIONAL: preload spacy model if present
# prevents runtime download surprises
# -----------------------------
# Best-effort: if spacy or the model is missing, continue without it.
# NOTE(review): assumes some dependency lazily loads en_core_web_sm — confirm.
try:
    import spacy
    spacy.load("en_core_web_sm")
except Exception:
    pass
381
+
382
# -----------------------------
# VOICE CACHE (torch tensors)
# -----------------------------
# voice_code -> loaded voice tensor; populated lazily by get_voice_tensor().
VOICE_TENSOR_CACHE = {}
386
 
387
def voice_to_lang_code(voice_code: str) -> str:
    """Map a Kokoro voice id to its pipeline lang_code.

    Voice ids starting with 'b' (bf_/bm_) are British English ('b');
    everything else is treated as American English ('a').
    """
    return "b" if voice_code.startswith("b") else "a"
392
 
393
def get_voice_tensor(voice_code: str):
    """Return the voice tensor for *voice_code*, downloading and caching on first use.

    The tensor is fetched from KOKORO_REPO (voices/<code>.pt), loaded on CPU,
    and memoized in VOICE_TENSOR_CACHE for subsequent calls.
    """
    try:
        return VOICE_TENSOR_CACHE[voice_code]
    except KeyError:
        pass

    weights_path = hf_hub_download(
        repo_id=KOKORO_REPO,
        filename=f"voices/{voice_code}.pt",
    )
    # weights_only=True: safer torch.load, as recommended by the torch warning.
    tensor = torch.load(weights_path, map_location="cpu", weights_only=True)
    VOICE_TENSOR_CACHE[voice_code] = tensor
    return tensor
405
+
406
+ # -----------------------------
407
+ # TEXT NORMALIZATION
408
+ # Stops β€œskipping” for many brand names by avoiding OOD token collapse.
409
+ # Also makes acronyms pronounceable.
410
+ # -----------------------------
411
+ _ACRONYM_RE = re.compile(r"\b([A-Z]{2,})\b")
412
+ _CAMEL_RE = re.compile(r"([a-z])([A-Z])")
413
+ _DIGIT_WORD_RE = re.compile(r"\b(\d+)([A-Za-z]+)\b")
414
+
415
+ def normalize_text_for_kokoro(text: str) -> str:
416
  if not text:
417
  return text
418
 
419
+ # Keep your special Kokoro pronunciation trick
420
  text = text.replace("Kokoro", "[Kokoro](/kˈOkΙ™ΙΉO/)")
421
 
422
+ # Split CamelCase: OpenAI -> Open AI
423
+ text = _CAMEL_RE.sub(r"\1 \2", text)
 
 
 
424
 
425
+ # Handle 2FA -> "2 F A" (first split digits+letters)
426
+ text = _DIGIT_WORD_RE.sub(r"\1 \2", text)
 
 
 
 
 
427
 
428
+ # Acronyms: API -> "A P I"
429
+ def _spell(m):
430
+ s = m.group(1)
431
+ return " ".join(list(s))
432
+ text = _ACRONYM_RE.sub(_spell, text)
433
 
434
  return text
435
 
436
# -----------------------------
# CHUNKING
# Fewer micro-chunks reduces stalls under load.
# -----------------------------
# Split after sentence-final punctuation, or on newline runs.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")

def chunk_text(text: str, min_chars: int = 240, max_chars: int = 520):
    """Yield sentence-aligned chunks of *text* between min_chars and max_chars.

    Sentences are accumulated until the buffer reaches min_chars; a sentence
    that would push the buffer past max_chars flushes the buffer first. A
    single sentence longer than max_chars is yielded whole (never split).
    Empty/whitespace-only input yields nothing.
    """
    text = text.strip()
    if not text:
        return

    pending = ""
    for sentence in _SENT_SPLIT.split(text):
        if not sentence:
            continue
        if len(pending) + len(sentence) + 1 <= max_chars:
            # Sentence fits: append, then flush once big enough.
            pending = (pending + " " + sentence).strip()
            if len(pending) >= min_chars:
                yield pending
                pending = ""
        else:
            # Sentence does not fit: flush what we have, start fresh.
            if pending:
                yield pending
            pending = sentence.strip()
            if len(pending) >= min_chars:
                yield pending
                pending = ""

    if pending:
        yield pending
+
469
+ # -----------------------------
470
+ # AUDIO UTILS
471
+ # Avoid trimming per-chunk to prevent audible β€œmissing” regions.
472
+ # Do optional gentle trim only on final concatenated output if needed.
473
+ # -----------------------------
474
def float_to_int16(audio_f32: np.ndarray) -> np.ndarray:
    """Convert float audio in [-1, 1] to int16 PCM, clamping out-of-range samples."""
    return (np.clip(audio_f32, -1.0, 1.0) * 32767.0).astype(np.int16)
477
+
478
+ # -----------------------------
479
+ # CORE SYNTH
480
+ # Uses official generator API. :contentReference[oaicite:8]{index=8}
481
+ # -----------------------------
482
def kokoro_generate_stream(text: str, voice_code: str, speed: float):
    """Yield raw float audio arrays for *text* via the official Kokoro pipeline.

    Picks the pipeline matching the voice family and streams whatever the
    generator produces. The pipeline yields (graphemes, phonemes, audio)
    tuples; only the audio is forwarded (float samples, 24 kHz per the
    callers' usage).
    """
    pipeline = PIPELINES[voice_to_lang_code(voice_code)]
    style = get_voice_tensor(voice_code)

    # Chunking already happened upstream; a match-nothing split_pattern
    # stops the pipeline from re-splitting into micro audio pieces.
    stream = pipeline(
        text,
        voice=style,
        speed=float(speed),
        split_pattern=r"$^",  # split nothing
    )
    for _, _, audio in stream:
        yield audio
499
+
500
+ # -----------------------------
501
+ # GRADIO STREAM
502
+ # -----------------------------
503
def gradio_stream_generator(text, voice_name, speed):
    """Gradio streaming callback: yield (sample_rate, int16 PCM) per text chunk.

    Resolves the UI voice label to a voice id (unknown labels pass through
    as-is), normalizes the text, warms the voice cache, then synthesizes
    each chunk and streams it as 24 kHz int16 audio.
    """
    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
    normalized = normalize_text_for_kokoro(text)

    # Warm the voice cache so the first chunk does not pay the download cost.
    get_voice_tensor(voice_code)

    for idx, piece in enumerate(chunk_text(normalized)):
        started = time.time()
        # split_pattern disables re-splitting in kokoro_generate_stream, so
        # this inner loop normally produces a single item per chunk.
        for audio_f32 in kokoro_generate_stream(piece, voice_code, speed):
            elapsed = time.time() - started
            print(f"⚑ UI chunk {idx}: {len(piece)} chars in {elapsed:.2f}s")
            yield 24000, float_to_int16(audio_f32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
+ # -----------------------------
519
+ # FASTAPI WS
520
+ # -----------------------------
521
  api = FastAPI()
522
 
523
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
528
  loop = asyncio.get_running_loop()
529
 
530
  while True:
531
+ ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
532
+
533
  try:
534
  if ws.client_state.value > 1:
535
  continue
536
 
537
+ def _run():
538
+ out = []
539
+ for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
540
+ out.append(float_to_int16(audio_f32).tobytes())
541
+ return out
542
 
543
+ frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run)
 
544
 
545
+ for frame in frames:
546
+ try:
547
+ await ws.send_bytes(frame)
548
+ except Exception:
549
+ break
550
 
551
  except Exception as e:
552
  print(f"API Engine Error: {e}")
 
559
  async def websocket_endpoint(ws: WebSocket):
560
  await ws.accept()
561
 
562
+ voice_code = "af_bella"
563
  speed = 1.0
564
 
565
  print(f"βœ… Client connected: {ws.client}")
 
586
  break
587
 
588
  if "config" in data:
589
+ voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
590
+ voice_code = VOICE_CHOICES.get(voice_name, voice_name)
591
  speed = float(data.get("speed", speed))
592
+ get_voice_tensor(voice_code)
593
 
594
  if "text" in data:
595
+ raw = data["text"]
596
+ text = normalize_text_for_kokoro(raw)
597
+ # Bigger chunks reduces stalls under load
598
+ for chunk in chunk_text(text):
599
  if chunk.strip():
600
+ await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
601
+
602
+ if "flush" in data:
603
+ pass
604
 
605
  except Exception as e:
606
  print(f"πŸ”₯ Critical WS Error: {e}")
607
  finally:
608
  heartbeat_task.cancel()
609
 
610
# -----------------------------
# GRADIO UI
# -----------------------------
# Simple two-column layout: inputs on the left, streamed audio on the right.
with gr.Blocks(title="Kokoro TTS") as app:
    gr.Markdown("## ⚑ Kokoro-82M (Official Pipeline, HF CPU-friendly)")
    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Input Text",
                lines=3,
                value="The system is live. Use the UI or connect to /ws/audio.",
            )
            voice_in = gr.Dropdown(
                list(VOICE_CHOICES.keys()),
                value="πŸ‡ΊπŸ‡Έ 🚺 Bella",
                label="Voice",
            )
            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
            btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            # streaming=True lets the generator callback push audio incrementally.
            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")

    btn.click(gradio_stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])

# Mount the Gradio app onto the FastAPI instance at the root path.
final_app = gr.mount_gradio_app(api, app, path="/")
635
 
636
if __name__ == "__main__":
    # Direct-run entry point; 7860 is the standard HF Spaces port.
    uvicorn.run(final_app, host="0.0.0.0", port=7860)