Spaces:

auralodyssey
/

api

Running

App Files Files Community

auralodyssey committed on Jan 6

Commit

f78ae4b

verified ·

1 Parent(s): 03d1b02

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -366

app.py CHANGED Viewed

@@ -295,43 +295,44 @@
 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
-import json
-import time
 import re
 import asyncio
-import threading
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
-import onnxruntime as ort
-from huggingface_hub import hf_hub_download
-from misaki import en
-from functools import lru_cache
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
-# =========================================================
-# HF CPU BOX TUNING (2 vCPU)
-# =========================================================
 os.environ.setdefault("OMP_NUM_THREADS", "2")
 os.environ.setdefault("MKL_NUM_THREADS", "2")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
 try:
     import uvloop  # type: ignore
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 except Exception:
     pass
-SAMPLE_RATE = 24000
-# =========================================================
-# ONNX KOKORO CONFIG (YOUR ONNX STYLE)
-# =========================================================
-MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
-MODEL_FILE = "onnx/model.onnx"
-TOKENIZER_FILE = "tokenizer.json"
 VOICE_CHOICES = {
     "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
     "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
@@ -344,333 +345,157 @@ VOICE_CHOICES = {
     "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
     "🇬🇧 🚹 Daniel": "bm_daniel",
 }
-ALLOWED_VOICE_IDS = set(VOICE_CHOICES.values())
-# ✅ DEFAULT VOICE = ONYX
-DEFAULT_VOICE_ID = "am_onyx"
-DEFAULT_SPEED = 1.0
-print("🚀 BOOTING ONNX KOKORO API (LOW LATENCY, API ONLY)")
-# =========================================================
-# 1) G2P
-# =========================================================
-G2P = en.G2P(trf=False, british=False, fallback=None)
-# =========================================================
-# 2) TOKENIZER
-# =========================================================
-vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
-with open(vocab_path, "r", encoding="utf-8") as f:
-    data = json.load(f)
-TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
-# =========================================================
-# 3) VOICES (LAZY LOAD, CACHE)
-# =========================================================
-VOICE_CACHE = {}  # voice_id -> np.ndarray (T,1,256)
-def _load_voice_bin(voice_id: str) -> np.ndarray:
-    path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{voice_id}.bin")
-    return np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
-def get_voice(voice_id_or_label: str) -> np.ndarray:
-    vid = VOICE_CHOICES.get(voice_id_or_label, voice_id_or_label).strip()
-    if vid not in ALLOWED_VOICE_IDS:
-        vid = DEFAULT_VOICE_ID
-    if vid not in VOICE_CACHE:
-        try:
-            print(f"⬇️ Loading Voice: {vid}")
-            VOICE_CACHE[vid] = _load_voice_bin(vid)
-        except Exception:
-            if "af_bella" not in VOICE_CACHE:
-                print("⚠️ Voice load failed, falling back to af_bella")
-                VOICE_CACHE["af_bella"] = _load_voice_bin("af_bella")
-            return VOICE_CACHE["af_bella"]
-    return VOICE_CACHE[vid]
-# =========================================================
-# 4) ONNX SESSION (TUNED FOR 2 vCPU)
-# =========================================================
-model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
-sess_options = ort.SessionOptions()
-sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
-# On 2 vCPU, keep it tight
-sess_options.intra_op_num_threads = int(os.environ.get("ORT_INTRA_OP_THREADS", "2"))
-sess_options.inter_op_num_threads = int(os.environ.get("ORT_INTER_OP_THREADS", "1"))
-SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
-print("✅ ONNX SESSION READY")
-# =========================================================
-# TEXT QUALITY FIXES (NAMES, ACRONYMS, CAMELCASE)
-# =========================================================
-RE_ALLCAPS = re.compile(r"\b([A-Z]{2,})\b")
-RE_CAMEL = re.compile(r"([a-z])([A-Z])")
-RE_SENT_SPLIT = re.compile(r'([.,!?;:\n]+)')
-def normalize_names(text: str) -> str:
-    if not text:
-        return ""
-    # AI -> A I
-    text = RE_ALLCAPS.sub(lambda m: " ".join(list(m.group(1))), text)
-    # OpenAI -> Open AI
-    text = RE_CAMEL.sub(r"\1 \2", text)
-    return text
-@lru_cache(maxsize=10000)
-def get_tokens_cached(text: str):
-    # Your IPA hint behavior from v1
-    if "Kokoro" in text:
-        text = text.replace("Kokoro", "kˈOkəɹO")
-    phonemes, _ = G2P(text)
-    return tuple(TOKENIZER.get(p, 0) for p in phonemes)
-def tuned_splitter(text: str):
-    # Fast first audio, bigger later chunks
-    parts = RE_SENT_SPLIT.split(text)
-    buf = ""
-    chunk_idx = 0
-    for p in parts:
-        if p is None:
-            continue
-        buf += p
-        if chunk_idx == 0:
-            threshold = 60
-        elif chunk_idx == 1:
-            threshold = 120
-        elif chunk_idx == 2:
-            threshold = 180
-        else:
-            threshold = 280
-        if buf and re.search(r"[.,!?;:\n]$", buf) and len(buf) >= threshold:
-            s = buf.strip()
-            if s:
-                yield s
-                chunk_idx += 1
-            buf = ""
-    s = buf.strip()
-    if s:
-        yield s
-# =========================================================
-# AUDIO POST (LESS AGGRESSIVE TRIM + CROSSFADE TO REMOVE "DROPS")
-# =========================================================
-def trim_leading(audio_f32: np.ndarray, threshold=0.01, pad=80) -> np.ndarray:
-    if audio_f32.size == 0:
-        return audio_f32
-    mask = np.abs(audio_f32) > threshold
-    if not np.any(mask):
-        return audio_f32
-    start = int(np.argmax(mask))
-    start = max(0, start - pad)
-    return audio_f32[start:]
-def trim_trailing(audio_f32: np.ndarray, threshold=0.01, pad=120) -> np.ndarray:
-    if audio_f32.size == 0:
-        return audio_f32
-    mask = np.abs(audio_f32) > threshold
-    if not np.any(mask):
-        return audio_f32
-    end = int(len(mask) - np.argmax(mask[::-1]))
-    end = min(len(audio_f32), end + pad)
-    return audio_f32[:end]
-def float_to_pcm_bytes(audio_f32: np.ndarray) -> bytes:
-    audio_f32 = np.clip(audio_f32, -1.0, 1.0).astype(np.float32)
-    pcm = (audio_f32 * 32767.0).astype(np.int16)
-    return pcm.tobytes()
-def crossfade_bytes_stream(chunks_f32, overlap=1200):
-    """
-    overlap=1200 samples ~= 50ms at 24kHz
-    We hold the last overlap of each chunk, blend into next chunk head,
-    then stream without clicks or "drops".
-    """
-    prev_tail = None
-    for i, a in enumerate(chunks_f32):
-        if a is None or a.size == 0:
-            continue
-        if prev_tail is None:
-            if a.size <= overlap * 2:
-                yield float_to_pcm_bytes(a)
-                prev_tail = None
-                continue
-            body = a[:-overlap]
-            prev_tail = a[-overlap:]
-            yield float_to_pcm_bytes(body)
-            continue
-        if a.size < overlap:
-            # too small, just append
-            blended = np.concatenate([prev_tail, a])
-            prev_tail = None
-            yield float_to_pcm_bytes(blended)
-            continue
-        fade_out = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
-        fade_in = 1.0 - fade_out
-        head = a[:overlap]
-        blended = (prev_tail * fade_out) + (head * fade_in)
-        if a.size <= overlap * 2:
-            # nothing meaningful to hold
-            out = np.concatenate([blended, a[overlap:]])
-            prev_tail = None
-            yield float_to_pcm_bytes(out)
-            continue
-        mid = a[overlap:-overlap]
-        prev_tail = a[-overlap:]
-        out = np.concatenate([blended, mid])
-        yield float_to_pcm_bytes(out)
-    if prev_tail is not None and prev_tail.size > 0:
-        yield float_to_pcm_bytes(prev_tail)
-# =========================================================
-# ONNX INFER (FAST)
-# =========================================================
-def infer_tokens(tokens, voice_vec, speed: float):
-    ids = tokens[:510]
-    if not ids:
-        return None
-    # voice_vec shape: (T,1,256)
-    style = voice_vec[min(len(ids), voice_vec.shape[0] - 1)]  # -> (1,256)
-    audio = SESSION.run(
-        None,
-        {
-            "input_ids": np.array([[0, *ids, 0]], dtype=np.int64),
-            "style": style,
-            "speed": np.array([float(speed)], dtype=np.float32),
-        },
-    )[0]  # expected shape: (1, N)
-    out = audio[0].astype(np.float32, copy=False)
-    return out
-# =========================================================
-# API ONLY (FASTAPI + WS)
-# =========================================================
-api = FastAPI()
-# Single worker thread for full job generation (tokens + onnx + crossfade)
-INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
-# Queue of jobs: each job is 1 full text for 1 websocket
-JOB_QUEUE: asyncio.Queue = asyncio.Queue(maxsize=64)
-def resolve_voice(value: str) -> str:
-    if not value:
-        return DEFAULT_VOICE_ID
-    v = VOICE_CHOICES.get(value, value).strip()
-    if v not in ALLOWED_VOICE_IDS:
-        return DEFAULT_VOICE_ID
-    return v
-@api.get("/health")
-async def health():
-    return {
-        "ok": True,
-        "engine": "onnxruntime",
-        "sample_rate": SAMPLE_RATE,
-        "default_voice": DEFAULT_VOICE_ID,
-    }
-def warmup_once():
     try:
-        get_voice(DEFAULT_VOICE_ID)
-        tokens = get_tokens_cached("Hello.")  # cached tuple
-        _ = infer_tokens(tokens, VOICE_CACHE[DEFAULT_VOICE_ID], 1.0)
-        print("✅ WARMUP OK")
     except Exception as e:
         print(f"⚠️ WARMUP FAILED: {e}")
-@api.on_event("startup")
-async def startup():
-    loop = asyncio.get_running_loop()
-    await loop.run_in_executor(INFERENCE_EXECUTOR, warmup_once)
-    asyncio.create_task(engine_loop())
-async def engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
     while True:
-        ws, voice_id, speed, text = await JOB_QUEUE.get()
         if ws.client_state.value > 1:
             continue
-        # This queue carries PCM frames from the worker thread back to asyncio
-        frame_q: asyncio.Queue = asyncio.Queue(maxsize=8)
-        stop_flag = threading.Event()
-        def _worker_full_job():
             try:
-                t0 = time.time()
-                voice_vec = get_voice(voice_id)
-                # Build per-chunk float32 audio list, with light leading trim
-                audio_chunks = []
-                first = True
-                for chunk in tuned_splitter(text):
-                    if stop_flag.is_set():
-                        break
-                    # tokenize (cached)
-                    tokens = get_tokens_cached(chunk)
-                    if not tokens:
-                        continue
-                    a = infer_tokens(tokens, voice_vec, speed)
-                    if a is None or a.size == 0:
-                        continue
-                    # do NOT aggressively trim every chunk, only leading a bit
-                    if first:
-                        a = trim_leading(a, threshold=0.01, pad=120)
-                        first = False
-                    else:
-                        a = trim_leading(a, threshold=0.01, pad=60)
-                    audio_chunks.append(a)
-                    # Push first audio as soon as we have it, no waiting for the full list
-                    if len(audio_chunks) == 1:
-                        for frame in crossfade_bytes_stream(audio_chunks, overlap=1200):
-                            loop.call_soon_threadsafe(frame_q.put_nowait, frame)
-                        audio_chunks.clear()
-                # Flush remaining with crossfade
-                if not stop_flag.is_set():
-                    if audio_chunks:
-                        # trim trailing only at the very end to avoid cutting words mid stream
-                        audio_chunks[-1] = trim_trailing(audio_chunks[-1], threshold=0.01, pad=160)
-                        for frame in crossfade_bytes_stream(audio_chunks, overlap=1200):
-                            loop.call_soon_threadsafe(frame_q.put_nowait, frame)
                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
-                dt = time.time() - t0
-                print(f"✅ job done in {dt:.2f}s")
             except Exception as e:
                 print(f"API Worker Error: {e}")
                 try:
@@ -678,7 +503,7 @@ async def engine_loop():
                 except Exception:
                     pass
-        INFERENCE_EXECUTOR.submit(_worker_full_job)
         first_sent = False
         started = time.time()
@@ -689,25 +514,28 @@ async def engine_loop():
                 break
             if ws.client_state.value > 1:
-                stop_flag.set()
                 break
             try:
                 await ws.send_bytes(frame)
                 if not first_sent:
                     first_sent = True
-                    print(f"⚡ first audio sent in {time.time() - started:.2f}s")
             except Exception:
-                stop_flag.set()
                 break
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
-    # per-connection state
-    voice_id = DEFAULT_VOICE_ID  # ✅ default Onyx
-    speed = DEFAULT_SPEED
     print(f"✅ Client connected: {ws.client}")
@@ -726,51 +554,53 @@ async def websocket_endpoint(ws: WebSocket):
             try:
                 data = await ws.receive_json()
             except WebSocketDisconnect:
-                print("❌ Client disconnected")
                 break
-            except Exception:
                 break
-            # client config
-            if "config" in data or data.get("type") == "config":
-                voice_id = resolve_voice(str(data.get("voice", voice_id)))
-                try:
-                    speed = float(data.get("speed", speed))
-                except Exception:
-                    speed = DEFAULT_SPEED
-                # preload voice immediately so the next text has no voice load delay
-                try:
-                    get_voice(voice_id)
-                except Exception:
-                    voice_id = DEFAULT_VOICE_ID
-                    get_voice(voice_id)
-            # client text
-            if "text" in data or data.get("type") == "text":
-                raw = str(data.get("text", ""))
-                raw = raw.strip()
-                if not raw:
-                    continue
-                # name + acronym fix so it stops skipping brands and people names
-                raw = normalize_names(raw)
-                # hard cap to prevent one user blocking the box forever
-                if len(raw) > 6000:
-                    await ws.send_json({"type": "error", "message": "text_too_long", "max_chars": 6000})
-                    continue
-                try:
-                    JOB_QUEUE.put_nowait((ws, voice_id, speed, raw))
-                except asyncio.QueueFull:
-                    await ws.send_json({"type": "error", "message": "server_busy"})
-            if "flush" in data or data.get("type") == "flush":
-                await ws.send_json({"type": "flushed"})
     finally:
         heartbeat_task.cancel()
 if __name__ == "__main__":
-    uvicorn.run(api, host="0.0.0.0", port=7860)

 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
 import re
+import time
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
+import gradio as gr
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
+import torch
+from kokoro import KPipeline
+# ----------------------------
+# HARD LIMIT CPU THREADS (2 vCPU box)
+# ----------------------------
 os.environ.setdefault("OMP_NUM_THREADS", "2")
 os.environ.setdefault("MKL_NUM_THREADS", "2")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
+try:
+    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
+    torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
+except Exception:
+    pass
+# Optional: uvloop for faster event loop on HF Linux
 try:
     import uvloop  # type: ignore
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 except Exception:
     pass
+print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
+# ----------------------------
+# VOICES
+# ----------------------------
 VOICE_CHOICES = {
     "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
     "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
     "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
     "🇬🇧 🚹 Daniel": "bm_daniel",
 }
def voice_to_lang_code(voice_code: str) -> str:
    """Return the Kokoro language code for a voice id.

    Voice ids beginning with ``bf_`` or ``bm_`` map to ``"b"`` (British);
    every other id, including the empty string, maps to ``"a"`` (American).
    """
    # str.startswith accepts a tuple, so one call covers both British prefixes.
    if voice_code.startswith(("bf_", "bm_")):
        return "b"  # British
    return "a"      # American
+# ----------------------------
+# PIPELINES (keep hot in RAM)
+# ----------------------------
# One KPipeline per language code, kept alive for the lifetime of the process.
# The British pipeline reuses the model object created by the American one
# instead of constructing a second copy; KPipeline accepts an existing model
# through its ``model`` argument (see the kokoro KPipeline documentation).
_PIPELINE_A = KPipeline(lang_code="a")
PIPELINES = {
    "a": _PIPELINE_A,
    "b": KPipeline(lang_code="b", model=_PIPELINE_A.model),
}
+# ----------------------------
+# TEXT NORMALIZATION (matches your pasted official docs)
+# ----------------------------
# Matches "Kokoro" unless it is already written in link form, i.e. directly
# preceded by "[" AND directly followed by "](".
_BARE_KOKORO = re.compile(r"(?<!\[)Kokoro|Kokoro(?!\]\()")


def normalize_text(text: str) -> str:
    """Rewrite each bare "Kokoro" as the pronunciation link "[Kokoro](/kˈOkəɹO/)".

    The function is idempotent: occurrences that already appear as
    ``[Kokoro](...)`` are left untouched. This matters because the request
    handlers normalize the text and the generation path normalizes it again;
    a plain ``str.replace`` would wrap the word a second time on that pass.

    Returns an empty string for empty or ``None`` input.
    """
    if not text:
        return ""
    return _BARE_KOKORO.sub("[Kokoro](/kˈOkəɹO/)", text)
+# ----------------------------
+# LOW LATENCY SEGMENTATION
+# One pipeline call per request.
+# We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
+# We also force a small first segment for fast first audio.
+# ----------------------------
+_SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
def inject_newlines_for_fast_stream(text: str) -> str:
    """Insert newlines into ``text`` so a newline-based splitter can segment it.

    The text is normalized and stripped first; empty input yields ``""``.
    Sentence-ending punctuation followed by whitespace is turned into
    "punctuation + newline". If the result contains no newline at all and is
    longer than 90 characters, one newline is forced near the start so the
    first segment stays short.
    """
    prepared = normalize_text(text).strip()
    if not prepared:
        return ""

    # Replace the whitespace after each sentence boundary with one newline.
    prepared = _SENT_BOUNDARY.sub(r"\1\n", prepared)
    # Collapse runs of three or more newlines down to exactly two.
    prepared = re.sub(r"\n{3,}", "\n\n", prepared)

    # Only newline-free text longer than 90 characters gets a forced early cut.
    if len(prepared) <= 90 or "\n" in prepared:
        return prepared

    # Prefer the last space before column 70; if there is none at or after
    # column 35, cut at column 70 regardless of word boundaries.
    split_at = prepared.rfind(" ", 0, 70)
    if split_at < 35:
        split_at = 70
    head, tail = prepared[:split_at], prepared[split_at:]
    return head.strip() + "\n" + tail.strip()
+# ----------------------------
+# AUDIO CONVERSION (fast, safe)
+# ----------------------------
def audio_to_int16_np(audio):
    """Convert float audio to a NumPy ``int16`` array.

    Accepts a ``torch.Tensor`` or anything ``np.asarray`` can wrap. Samples
    are clamped to [-1.0, 1.0], scaled by 32767 and cast to int16.
    """
    if not isinstance(audio, torch.Tensor):
        samples = np.clip(np.asarray(audio), -1.0, 1.0)
        return (samples * 32767.0).astype(np.int16)

    # Tensor path: drop autograd tracking and move to CPU before converting.
    clamped = audio.detach().cpu().clamp(-1.0, 1.0)
    scaled = clamped * 32767.0
    return scaled.to(torch.int16).numpy()
def audio_to_pcm_bytes(audio) -> bytes:
    """Return ``audio`` as raw bytes of its int16 sample array."""
    pcm = audio_to_int16_np(audio)
    return pcm.tobytes()
+# ----------------------------
+# OFFICIAL GENERATION PATH (single pipeline call)
+# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
+# ----------------------------
def kokoro_generator_full(text: str, voice_code: str, speed: float):
    """Yield one audio chunk per segment produced by a single pipeline call.

    The pipeline is chosen from the voice's language code, and the text is
    run through ``inject_newlines_for_fast_stream`` before synthesis. Nothing
    is yielded when the prepared text is empty.
    """
    pipe = PIPELINES[voice_to_lang_code(voice_code)]
    prepared = inject_newlines_for_fast_stream(text)
    if not prepared:
        return

    with torch.inference_mode():
        segments = pipe(
            prepared,
            voice=voice_code,
            speed=float(speed),
            split_pattern=r"\n+",
        )
        # Each result unpacks to three items; only the last (audio) is used.
        for _graphemes, _phonemes, audio in segments:
            yield audio
+# ----------------------------
+# WARMUP (pay cold-start cost at boot)
+# ----------------------------
def warmup():
    """Synthesize the first segment of a short phrase and log how long it took.

    Any failure is logged and swallowed, so a failed warmup does not raise.
    """
    try:
        began = time.time()
        # Only the first chunk is pulled; the rest of the generator is dropped.
        next(iter(kokoro_generator_full("Hello.", "af_bella", 1.0)), None)
        print(f"✅ WARMUP DONE in {time.time() - began:.2f}s")
    except Exception as e:
        print(f"⚠️ WARMUP FAILED: {e}")
+# ----------------------------
+# GRADIO UI STREAM
+# ----------------------------
def gradio_stream(text, voice_name, speed):
    """Stream synthesized audio to the Gradio UI.

    ``voice_name`` may be a display label from ``VOICE_CHOICES`` or a raw
    voice id; unknown labels are passed through unchanged. Yields
    ``(24000, int16_array)`` tuples, one per generated chunk, and logs the
    time to the first chunk.
    """
    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
    started = time.time()
    # The raw text is passed on as-is: kokoro_generator_full already normalizes
    # it (via inject_newlines_for_fast_stream). Normalizing here as well made
    # the "Kokoro" replacement run twice on the same text.
    for index, audio in enumerate(kokoro_generator_full(text, voice_code, speed)):
        if index == 0:
            print(f"⚡ UI first audio in {time.time() - started:.2f}s")
        yield 24000, audio_to_int16_np(audio)
+# ----------------------------
+# FASTAPI WS ENGINE
+# Single worker thread for actual generation.
+# Stream frames to client as soon as they exist.
+# No buffering a full list before sending.
+# ----------------------------
+api = FastAPI()
+INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
+INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
+async def audio_engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
     while True:
+        ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
+        # Skip dead clients early
         if ws.client_state.value > 1:
             continue
+        frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
+        def _worker():
             try:
+                for audio in kokoro_generator_full(text, voice_code, speed):
+                    b = audio_to_pcm_bytes(audio)
+                    # backpressure aware
+                    while True:
+                        try:
+                            loop.call_soon_threadsafe(frame_q.put_nowait, b)
+                            break
+                        except Exception:
+                            time.sleep(0.001)
                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
             except Exception as e:
                 print(f"API Worker Error: {e}")
                 try:
                 except Exception:
                     pass
+        INFERENCE_EXECUTOR.submit(_worker)
         first_sent = False
         started = time.time()
                 break
             if ws.client_state.value > 1:
                 break
             try:
                 await ws.send_bytes(frame)
                 if not first_sent:
+                    print(f"⚡ API first audio in {time.time() - started:.2f}s")
                     first_sent = True
             except Exception:
                 break
@api.on_event("startup")
async def startup():
    """Warm the model on the inference thread, then start the engine loop."""
    loop = asyncio.get_running_loop()
    # Run the warmup on the inference executor so it does not block the loop.
    await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
    # Keep a strong reference to the task: the event loop itself only holds a
    # weak one, so an unreferenced background task may be garbage-collected
    # while it is still running.
    api.state.engine_task = asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
+    voice_code = "af_bella"
+    speed = 1.0
     print(f"✅ Client connected: {ws.client}")
             try:
                 data = await ws.receive_json()
             except WebSocketDisconnect:
+                print("❌ Client disconnected cleanly")
                 break
+            except Exception as e:
+                print(f"⚠️ Connection lost: {e}")
                 break
+            if "config" in data:
+                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
+                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+                speed = float(data.get("speed", speed))
+            if "text" in data:
+                text = normalize_text(data.get("text", ""))
+                if text.strip():
+                    await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
+            if "flush" in data:
+                pass
     finally:
         heartbeat_task.cancel()
+# ----------------------------
+# GRADIO APP
+# ----------------------------
+with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
+    with gr.Row():
+        with gr.Column():
+            text_in = gr.Textbox(
+                label="Input Text",
+                lines=3,
+                value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
+            )
+            voice_in = gr.Dropdown(
+                list(VOICE_CHOICES.keys()),
+                value="🇺🇸 🚺 Bella",
+                label="Voice",
+            )
+            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+            btn = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+    btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
+final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
+    uvicorn.run(final_app, host="0.0.0.0", port=7860)