Spaces:

auralodyssey
/

api

Sleeping

App Files Files Community

auralodyssey committed on Jan 7

Commit

d638591

verified ·

1 Parent(s): 89cbd38

Update app.py

Browse files

Files changed (1) hide show

app.py +638 -638

app.py CHANGED Viewed

@@ -295,767 +295,767 @@
 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 #OLD KOKORO CHATGPT CODE
-# import os
-# import re
-# import time
-# import asyncio
-# from concurrent.futures import ThreadPoolExecutor
-# import numpy as np
-# import gradio as gr
-# from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-# import uvicorn
-# import torch
-# from kokoro import KPipeline
-# # ----------------------------
-# # HARD LIMIT CPU THREADS (2 vCPU box)
-# # ----------------------------
-# os.environ.setdefault("OMP_NUM_THREADS", "2")
-# os.environ.setdefault("MKL_NUM_THREADS", "2")
-# os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
-# try:
-#     torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
-#     torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
-# except Exception:
-#     pass
-# # Optional: uvloop for faster event loop on HF Linux
-# try:
-#     import uvloop  # type: ignore
-#     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-# except Exception:
-#     pass
-# print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
-# # ----------------------------
-# # VOICES
-# # ----------------------------
-# VOICE_CHOICES = {
-#     "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
-#     "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
-#     "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
-#     "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
-#     "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
-#     "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
-#     "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
-#     "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
-#     "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
-#     "🇬🇧 🚹 Daniel": "bm_daniel",
-# }
-# def voice_to_lang_code(voice_code: str) -> str:
-#     if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
-#         return "b"  # British
-#     return "a"      # American
-# # ----------------------------
-# # PIPELINES (keep hot in RAM)
-# # ----------------------------
-# PIPELINES = {
-#     "a": KPipeline(lang_code="a"),
-#     "b": KPipeline(lang_code="b"),
-# }
-# # ----------------------------
-# # TEXT NORMALIZATION (matches your pasted official docs)
-# # ----------------------------
-# def normalize_text(text: str) -> str:
-#     if not text:
-#         return ""
-#     return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-# # ----------------------------
-# # LOW LATENCY SEGMENTATION
-# # One pipeline call per request.
-# # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
-# # We also force a small first segment for fast first audio.
-# # ----------------------------
-# _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
-# def inject_newlines_for_fast_stream(text: str) -> str:
-#     text = normalize_text(text).strip()
-#     if not text:
-#         return ""
-#     # Sentence boundaries -> newline so official split_pattern can segment
-#     text = _SENT_BOUNDARY.sub(r"\1\n", text)
-#     # Also split on existing multi-newlines
-#     text = re.sub(r"\n{3,}", "\n\n", text)
-#     # Guarantee a small first segment for low time-to-first-audio
-#     if "\n" not in text and len(text) > 90:
-#         cut = text.rfind(" ", 0, 70)
-#         if cut < 35:
-#             cut = 70
-#         text = text[:cut].strip() + "\n" + text[cut:].strip()
-#     return text
-# # ----------------------------
-# # AUDIO CONVERSION (fast, safe)
-# # ----------------------------
-# def audio_to_int16_np(audio):
-#     if isinstance(audio, torch.Tensor):
-#         audio = audio.detach().cpu()
-#         audio = torch.clamp(audio, -1.0, 1.0)
-#         return (audio * 32767.0).to(torch.int16).numpy()
-#     audio = np.asarray(audio)
-#     audio = np.clip(audio, -1.0, 1.0)
-#     return (audio * 32767.0).astype(np.int16)
-# def audio_to_pcm_bytes(audio) -> bytes:
-#     return audio_to_int16_np(audio).tobytes()
-# # ----------------------------
-# # OFFICIAL GENERATION PATH (single pipeline call)
-# # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
-# # ----------------------------
-# def kokoro_generator_full(text: str, voice_code: str, speed: float):
-#     lang_code = voice_to_lang_code(voice_code)
-#     pipeline = PIPELINES[lang_code]
-#     text = inject_newlines_for_fast_stream(text)
-#     if not text:
-#         return
-#     with torch.inference_mode():
-#         generator = pipeline(
-#             text,
-#             voice=voice_code,
-#             speed=float(speed),
-#             split_pattern=r"\n+",
-#         )
-#         for _, _, audio in generator:
-#             yield audio
-# # ----------------------------
-# # WARMUP (pay cold-start cost at boot)
-# # ----------------------------
-# def warmup():
-#     try:
-#         t0 = time.time()
-#         for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
-#             break
-#         print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
-#     except Exception as e:
-#         print(f"⚠️ WARMUP FAILED: {e}")
-# # ----------------------------
-# # GRADIO UI STREAM
-# # ----------------------------
-# def gradio_stream(text, voice_name, speed):
-#     voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-#     text = normalize_text(text)
-#     i = 0
-#     t0 = time.time()
-#     for audio in kokoro_generator_full(text, voice_code, speed):
-#         if i == 0:
-#             print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
-#         i += 1
-#         yield 24000, audio_to_int16_np(audio)
-# # ----------------------------
-# # FASTAPI WS ENGINE
-# # Single worker thread for actual generation.
-# # Stream frames to client as soon as they exist.
-# # No buffering a full list before sending.
-# # ----------------------------
-# api = FastAPI()
-# INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
-# INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
-# async def audio_engine_loop():
-#     print("⚡ API AUDIO PIPELINE STARTED")
-#     loop = asyncio.get_running_loop()
-#     while True:
-#         ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
-#         # Skip dead clients early
-#         if ws.client_state.value > 1:
-#             continue
-#         frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
-#         def _worker():
-#             try:
-#                 for audio in kokoro_generator_full(text, voice_code, speed):
-#                     b = audio_to_pcm_bytes(audio)
-#                     # backpressure aware
-#                     while True:
-#                         try:
-#                             loop.call_soon_threadsafe(frame_q.put_nowait, b)
-#                             break
-#                         except Exception:
-#                             time.sleep(0.001)
-#                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
-#             except Exception as e:
-#                 print(f"API Worker Error: {e}")
-#                 try:
-#                     loop.call_soon_threadsafe(frame_q.put_nowait, None)
-#                 except Exception:
-#                     pass
-#         INFERENCE_EXECUTOR.submit(_worker)
-#         first_sent = False
-#         started = time.time()
-#         while True:
-#             frame = await frame_q.get()
-#             if frame is None:
-#                 break
-#             if ws.client_state.value > 1:
-#                 break
-#             try:
-#                 await ws.send_bytes(frame)
-#                 if not first_sent:
-#                     print(f"⚡ API first audio in {time.time() - started:.2f}s")
-#                     first_sent = True
-#             except Exception:
-#                 break
-# @api.on_event("startup")
-# async def startup():
-#     loop = asyncio.get_running_loop()
-#     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
-#     asyncio.create_task(audio_engine_loop())
-# @api.websocket("/ws/audio")
-# async def websocket_endpoint(ws: WebSocket):
-#     await ws.accept()
-#     voice_code = "af_bella"
-#     speed = 1.0
-#     print(f"✅ Client connected: {ws.client}")
-#     async def keep_alive():
-#         while True:
-#             try:
-#                 await asyncio.sleep(15)
-#                 await ws.send_json({"type": "ping"})
-#             except Exception:
-#                 break
-#     heartbeat_task = asyncio.create_task(keep_alive())
-#     try:
-#         while True:
-#             try:
-#                 data = await ws.receive_json()
-#             except WebSocketDisconnect:
-#                 print("❌ Client disconnected cleanly")
-#                 break
-#             except Exception as e:
-#                 print(f"⚠️ Connection lost: {e}")
-#                 break
-#             if "config" in data:
-#                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
-#                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-#                 speed = float(data.get("speed", speed))
-#             if "text" in data:
-#                 text = normalize_text(data.get("text", ""))
-#                 if text.strip():
-#                     await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
-#             if "flush" in data:
-#                 pass
-#     finally:
-#         heartbeat_task.cancel()
-# # ----------------------------
-# # GRADIO APP
-# # ----------------------------
-# with gr.Blocks(title="Kokoro TTS") as app:
-#     gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
-#     with gr.Row():
-#         with gr.Column():
-#             text_in = gr.Textbox(
-#                 label="Input Text",
-#                 lines=3,
-#                 value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
-#             )
-#             voice_in = gr.Dropdown(
-#                 list(VOICE_CHOICES.keys()),
-#                 value="🇺🇸 🚺 Bella",
-#                 label="Voice",
-#             )
-#             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
-#             btn = gr.Button("Generate", variant="primary")
-#         with gr.Column():
-#             audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
-#     btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
-# final_app = gr.mount_gradio_app(api, app, path="/")
-# if __name__ == "__main__":
-#     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 #claude code
-"""
-Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
-============================================================
-Fixes:
-- Backpressure loop timeout prevents worker thread hang
-- Parallel inference workers (2, one per vCPU)
-- Proper error handling with traceback logging
-- Generation timeout to prevent infinite hangs
-- Memory-optimized with periodic garbage collection
-- Aggressive batching for throughput
-"""
-import os
-import re
-import gc
-import time
-import asyncio
-import traceback
-from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
-import numpy as np
-import gradio as gr
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-import uvicorn
-import torch
-from kokoro import KPipeline
-# ----------------------------
-# MAXIMIZE 2 vCPU UTILIZATION
-# ----------------------------
-CPU_COUNT = 2
-os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
-os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
-os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
-os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
-try:
-    torch.set_num_threads(CPU_COUNT)
-    torch.set_num_interop_threads(CPU_COUNT)
-except Exception:
-    pass
-# Use uvloop for faster async on Linux
-try:
-    import uvloop
-    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-    print("✅ Using uvloop for faster async")
-except ImportError:
-    print("⚠️ uvloop not available, using default event loop")
-print(f"🚀 BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
-# ----------------------------
-# CONFIGURATION
-# ----------------------------
-GENERATION_TIMEOUT_SECONDS = 60  # Max time for a single TTS generation
-BACKPRESSURE_TIMEOUT_MS = 10000  # Max wait for queue space (10 seconds)
-WORKER_COUNT = 2  # One per vCPU for parallel processing
-QUEUE_MAXSIZE = 12  # Buffer more frames for smoother streaming
-# ----------------------------
-# VOICES
-# ----------------------------
-VOICE_CHOICES = {
-    "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
-    "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
-    "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
-    "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
-    "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
-    "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
-    "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
-    "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
-    "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
-    "🇬🇧 🚹 Daniel": "bm_daniel",
-}
-def voice_to_lang_code(voice_code: str) -> str:
-    if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
-        return "b"  # British
-    return "a"  # American
-# ----------------------------
-# PIPELINES (hot in RAM - uses ~2GB per pipeline)
-# With 16GB RAM we can comfortably hold both
-# ----------------------------
-print("📦 Loading Kokoro pipelines into RAM...")
-PIPELINES = {
-    "a": KPipeline(lang_code="a"),
-    "b": KPipeline(lang_code="b"),
-}
-print(f"✅ Pipelines loaded. Memory usage: ~4GB for models")
-# ----------------------------
-# TEXT NORMALIZATION
-# ----------------------------
-def normalize_text(text: str) -> str:
-    if not text:
-        return ""
-    # Kokoro pronunciation helper
-    text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-    return text
-# ----------------------------
-# FAST SEGMENTATION FOR STREAMING
-# ----------------------------
-_SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
-def inject_newlines_for_fast_stream(text: str) -> str:
-    text = normalize_text(text).strip()
-    if not text:
-        return ""
-    # Sentence boundaries -> newline for pipeline segmentation
-    text = _SENT_BOUNDARY.sub(r"\1\n", text)
-    # Normalize excessive newlines
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    # Guarantee a small first segment for low time-to-first-audio
-    if "\n" not in text and len(text) > 90:
-        cut = text.rfind(" ", 0, 70)
-        if cut < 35:
-            cut = 70
-        text = text[:cut].strip() + "\n" + text[cut:].strip()
-    return text
-# ----------------------------
-# AUDIO CONVERSION (optimized)
-# ----------------------------
-def audio_to_int16_np(audio):
-    if isinstance(audio, torch.Tensor):
-        audio = audio.detach().cpu()
-        audio = torch.clamp(audio, -1.0, 1.0)
-        return (audio * 32767.0).to(torch.int16).numpy()
-    audio = np.asarray(audio, dtype=np.float32)
-    audio = np.clip(audio, -1.0, 1.0)
-    return (audio * 32767.0).astype(np.int16)
-def audio_to_pcm_bytes(audio) -> bytes:
-    return audio_to_int16_np(audio).tobytes()
-# ----------------------------
-# GENERATION WITH TIMEOUT
-# ----------------------------
-def kokoro_generator_full(text: str, voice_code: str, speed: float):
-    """
-    Generate audio chunks from text using Kokoro pipeline.
-    Yields audio tensors for each segment.
-    """
-    lang_code = voice_to_lang_code(voice_code)
-    pipeline = PIPELINES[lang_code]
-    text = inject_newlines_for_fast_stream(text)
-    if not text:
-        return
-    chunk_count = 0
-    start_time = time.time()
-    try:
-        with torch.inference_mode():
-            generator = pipeline(
-                text,
-                voice=voice_code,
-                speed=float(speed),
-                split_pattern=r"\n+",
-            )
-            for _, _, audio in generator:
-                chunk_count += 1
-                elapsed = time.time() - start_time
-                # Timeout protection
-                if elapsed > GENERATION_TIMEOUT_SECONDS:
-                    print(f"⚠️ Generation timeout after {elapsed:.1f}s, {chunk_count} chunks")
-                    break
-                yield audio
-        print(f"✅ Generated {chunk_count} chunks in {time.time() - start_time:.2f}s")
-    except Exception as e:
-        print(f"❌ Generation error: {e}")
-        traceback.print_exc()
-    finally:
-        # Periodic garbage collection to prevent memory buildup
-        if chunk_count > 10:
-            gc.collect()
-# ----------------------------
-# WARMUP (preload models)
-# ----------------------------
-def warmup():
-    try:
-        t0 = time.time()
-        for _ in kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0):
-            break
-        print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
-    except Exception as e:
-        print(f"⚠️ WARMUP FAILED: {e}")
-        traceback.print_exc()
-# ----------------------------
-# GRADIO UI STREAM
-# ----------------------------
-def gradio_stream(text, voice_name, speed):
-    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-    text = normalize_text(text)
-    i = 0
-    t0 = time.time()
-    for audio in kokoro_generator_full(text, voice_code, speed):
-        if i == 0:
-            print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
-        i += 1
-        yield 24000, audio_to_int16_np(audio)
-# ----------------------------
-# FASTAPI WEBSOCKET ENGINE
-# ----------------------------
-api = FastAPI()
-# Use multiple workers for parallel inference
-INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
-INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
-async def audio_engine_loop():
-    """
-    Main audio processing loop.
-    Pulls requests from queue and streams audio back to clients.
-    """
-    print(f"⚡ API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
-    loop = asyncio.get_running_loop()
-    while True:
-        try:
-            ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
-        except Exception as e:
-            print(f"⚠️ Queue get error: {e}")
-            continue
-        # Skip dead clients early
-        try:
-            if ws.client_state.value > 1:
-                print("⏭️ Skipping dead client")
-                continue
-        except Exception:
-            continue
-        frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
-        generation_id = id(ws)
-        def _worker():
-            """Worker thread for audio generation."""
-            chunk_count = 0
-            start_time = time.time()
-            try:
-                print(f"🔊 [{generation_id}] Starting TTS: {text[:50]}...")
-                for audio in kokoro_generator_full(text, voice_code, speed):
-                    b = audio_to_pcm_bytes(audio)
-                    chunk_count += 1
-                    if chunk_count == 1:
-                        print(f"⚡ [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
-                    # Backpressure with TIMEOUT to prevent infinite hang
-                    attempts = 0
-                    max_attempts = BACKPRESSURE_TIMEOUT_MS  # 10 seconds at 1ms/attempt
-                    while attempts < max_attempts:
-                        try:
-                            loop.call_soon_threadsafe(frame_q.put_nowait, b)
-                            break
-                        except asyncio.QueueFull:
-                            time.sleep(0.001)
-                            attempts += 1
-                    else:
-                        # Timeout reached - client too slow or disconnected
-                        print(f"⚠️ [{generation_id}] Backpressure timeout after {attempts}ms - aborting")
-                        break
-                # Send completion signal
-                loop.call_soon_threadsafe(frame_q.put_nowait, None)
-                print(f"✅ [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
-            except Exception as e:
-                print(f"❌ [{generation_id}] Worker error: {e}")
-                traceback.print_exc()
-                try:
-                    loop.call_soon_threadsafe(frame_q.put_nowait, None)
-                except Exception:
-                    pass
-        # Submit to executor
-        INFERENCE_EXECUTOR.submit(_worker)
-        # Stream frames to client
-        first_sent = False
-        started = time.time()
-        frames_sent = 0
-        while True:
-            try:
-                # Timeout on frame retrieval to prevent infinite hang
-                frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
-            except asyncio.TimeoutError:
-                print(f"⚠️ [{generation_id}] Frame queue timeout - no data for 30s")
-                break
-            if frame is None:
-                break
-            # Check client still alive
-            try:
-                if ws.client_state.value > 1:
-                    print(f"⏭️ [{generation_id}] Client disconnected mid-stream")
-                    break
-            except Exception:
-                break
-            try:
-                await ws.send_bytes(frame)
-                frames_sent += 1
-                if not first_sent:
-                    print(f"⚡ [{generation_id}] First audio sent in {time.time() - started:.2f}s")
-                    first_sent = True
-            except Exception as e:
-                print(f"⚠️ [{generation_id}] Send failed: {e}")
-                break
-        print(f"📤 [{generation_id}] Streaming complete: {frames_sent} frames sent")
-@api.on_event("startup")
-async def startup():
-    loop = asyncio.get_running_loop()
-    # Warmup in executor to not block startup
-    await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
-    # Start the audio engine loop
-    asyncio.create_task(audio_engine_loop())
-    print("🚀 Server ready!")
-@api.websocket("/ws/audio")
-async def websocket_endpoint(ws: WebSocket):
-    await ws.accept()
-    voice_code = "af_bella"
-    speed = 1.0
-    client_id = id(ws)
-    print(f"✅ [{client_id}] Client connected: {ws.client}")
-    async def keep_alive():
-        """Send periodic pings to keep connection alive."""
-        while True:
-            try:
-                await asyncio.sleep(15)
-                await ws.send_json({"type": "ping"})
-            except Exception:
-                break
-    heartbeat_task = asyncio.create_task(keep_alive())
-    try:
-        while True:
-            try:
-                data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
-            except asyncio.TimeoutError:
-                print(f"⏱️ [{client_id}] Connection timeout - no messages for 120s")
-                break
-            except WebSocketDisconnect:
-                print(f"❌ [{client_id}] Client disconnected cleanly")
-                break
-            except Exception as e:
-                print(f"⚠️ [{client_id}] Connection error: {e}")
-                break
-            # Handle config updates
-            if "config" in data:
-                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
-                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-                speed = float(data.get("speed", speed))
-                print(f"🎛️ [{client_id}] Config: voice={voice_code}, speed={speed}")
-            # Handle text-to-speech request
-            if "text" in data:
-                text = normalize_text(data.get("text", ""))
-                if text.strip():
-                    print(f"📥 [{client_id}] TTS request: {text[:50]}...")
-                    await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
-            # Handle flush (no-op for now, could clear queue)
-            if "flush" in data:
-                pass
-    finally:
-        heartbeat_task.cancel()
-        print(f"👋 [{client_id}] Connection closed")
-# ----------------------------
-# HEALTH CHECK ENDPOINT
-# ----------------------------
-@api.get("/health")
-async def health_check():
-    return {
-        "status": "healthy",
-        "workers": WORKER_COUNT,
-        "queue_size": INFERENCE_QUEUE.qsize(),
-    }
-# ----------------------------
-# GRADIO APP
-# ----------------------------
-with gr.Blocks(title="Kokoro TTS") as app:
-    gr.Markdown("## ⚡ Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
-    gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
-    with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(
-                label="Input Text",
-                lines=3,
-                value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
-            )
-            voice_in = gr.Dropdown(
-                list(VOICE_CHOICES.keys()),
-                value="🇺🇸 🚺 Bella",
-                label="Voice",
-            )
-            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
-            btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
-    btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
-final_app = gr.mount_gradio_app(api, app, path="/")
-if __name__ == "__main__":
-    uvicorn.run(
-        final_app,
-        host="0.0.0.0",
-        port=7860,
-        workers=1,  # Single process, multiple threads
-        log_level="info",
-    )

 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 #OLD KOKORO CHATGPT CODE
+import os
+import re
+import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import gradio as gr
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+import uvicorn
+import torch
+from kokoro import KPipeline
+# ----------------------------
+# HARD LIMIT CPU THREADS (2 vCPU box)
+# ----------------------------
+# setdefault keeps any value that is already exported in the environment.
+# NOTE(review): these variables are set after numpy and torch were imported
+# above; presumably the native thread pools read them at load time, so they
+# may have no effect here - confirm, or move this block above the imports.
+os.environ.setdefault("OMP_NUM_THREADS", "2")
+os.environ.setdefault("MKL_NUM_THREADS", "2")
+os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
+try:
+    # Thread counts can be overridden via TORCH_NUM_THREADS / TORCH_NUM_INTEROP_THREADS.
+    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
+    torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
+except Exception:
+    # Best effort: failing to apply the limits must not stop the app from booting.
+    pass
+# Optional: uvloop for faster event loop on HF Linux
+try:
+    import uvloop  # type: ignore
+    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+except Exception:
+    # uvloop is optional; the default asyncio event loop is used when it is unavailable.
+    pass
+print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
+# ----------------------------
+# VOICES
+# Display label (flag + gender emoji + name) -> Kokoro voice id.
+# The id prefix selects the pipeline: "bf_"/"bm_" ids are treated as British,
+# every other id as American (see voice_to_lang_code).
+# ----------------------------
+VOICE_CHOICES = {
+    "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
+    "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
+    "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
+    "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
+    "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
+    "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
+    "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
+    "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
+    "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
+    "🇬🇧 🚹 Daniel": "bm_daniel",
+}
+def voice_to_lang_code(voice_code: str) -> str:
+    """Return the Kokoro language code for a voice id.
+    Ids prefixed with ``bf_`` or ``bm_`` map to ``"b"`` (British); every other
+    id maps to ``"a"`` (American).
+    """
+    is_british = voice_code.startswith(("bf_", "bm_"))
+    return "b" if is_british else "a"
+# ----------------------------
+# PIPELINES (keep hot in RAM)
+# One KPipeline per language code ("a", "b"), constructed once at import time
+# and looked up by kokoro_generator_full on every request.
+# ----------------------------
+PIPELINES = {
+    "a": KPipeline(lang_code="a"),
+    "b": KPipeline(lang_code="b"),
+}
+# ----------------------------
+# TEXT NORMALIZATION (matches the official Kokoro docs)
+# ----------------------------
+_KOKORO_MARKUP = "[Kokoro](/kˈOkəɹO/)"
+# Matches "Kokoro" only when it is NOT already followed by its pronunciation markup.
+_BARE_KOKORO = re.compile(r"Kokoro(?!\]\(/kˈOkəɹO/\))")
+def normalize_text(text: str) -> str:
+    """Attach pronunciation markup to every bare occurrence of "Kokoro".
+    The function is idempotent: occurrences that already carry the markup are
+    left alone. This matters because callers normalize the text and then pass
+    it to kokoro_generator_full, whose segmentation step normalizes it again;
+    a plain ``str.replace`` would wrap the word twice and corrupt the markup.
+    Args:
+        text: Raw input text; ``None`` or an empty string is accepted.
+    Returns:
+        The annotated text, or ``""`` when `text` is falsy.
+    """
+    if not text:
+        return ""
+    return _BARE_KOKORO.sub(_KOKORO_MARKUP, text)
+# ----------------------------
+# LOW LATENCY SEGMENTATION
+# One pipeline call per request.
+# We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
+# We also force a small first segment for fast first audio.
+# ----------------------------
+_SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
+def inject_newlines_for_fast_stream(text: str) -> str:
+    """Prepare text so the pipeline's ``\\n+`` split yields small segments.
+    The text is normalized and stripped, whitespace after sentence punctuation
+    (``. ! ? ; :``) is replaced by a newline, and runs of three or more
+    newlines are collapsed to two. If the result still has no newline and is
+    longer than 90 characters, a single break is forced near the start so the
+    first segment stays short.
+    Returns:
+        The segmented text, or ``""`` when nothing is left after stripping.
+    """
+    cleaned = normalize_text(text).strip()
+    if not cleaned:
+        return ""
+    # Turn sentence boundaries into newlines, then collapse long newline runs.
+    segmented = _SENT_BOUNDARY.sub(r"\1\n", cleaned)
+    segmented = re.sub(r"\n{3,}", "\n\n", segmented)
+    if "\n" in segmented or len(segmented) <= 90:
+        return segmented
+    # No natural boundary: break at the last space before column 70, or hard
+    # at column 70 when that space is missing or falls before column 35.
+    split_at = segmented.rfind(" ", 0, 70)
+    if split_at < 35:
+        split_at = 70
+    head, tail = segmented[:split_at], segmented[split_at:]
+    return head.strip() + "\n" + tail.strip()
+# ----------------------------
+# AUDIO CONVERSION (fast, safe)
+# ----------------------------
+def audio_to_int16_np(audio):
+    """Convert float audio to an int16 NumPy array.
+    Samples are clamped to [-1.0, 1.0] and scaled by 32767 before the cast.
+    Torch tensors are detached and moved to the CPU first; any other input is
+    converted with ``np.asarray``.
+    """
+    if not isinstance(audio, torch.Tensor):
+        samples = np.clip(np.asarray(audio), -1.0, 1.0)
+        return (samples * 32767.0).astype(np.int16)
+    samples = torch.clamp(audio.detach().cpu(), -1.0, 1.0)
+    return (samples * 32767.0).to(torch.int16).numpy()
+def audio_to_pcm_bytes(audio) -> bytes:
+    """Return the raw bytes of `audio` after conversion to int16 samples."""
+    pcm = audio_to_int16_np(audio)
+    return pcm.tobytes()
+# ----------------------------
+# OFFICIAL GENERATION PATH (single pipeline call)
+# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
+# ----------------------------
+def kokoro_generator_full(text: str, voice_code: str, speed: float):
+    """Yield the audio of each segment of `text`, using one pipeline call.
+    The pipeline is chosen from the voice id, the text is segmented with
+    inject_newlines_for_fast_stream, and the pipeline is asked to split on
+    ``\\n+``. Nothing is yielded when the prepared text is empty.
+    Args:
+        text: Text to synthesize.
+        voice_code: Kokoro voice id, e.g. ``"af_bella"``.
+        speed: Speed factor; coerced with ``float()``.
+    """
+    pipeline = PIPELINES[voice_to_lang_code(voice_code)]
+    prepared = inject_newlines_for_fast_stream(text)
+    if not prepared:
+        return
+    with torch.inference_mode():
+        segments = pipeline(
+            prepared,
+            voice=voice_code,
+            speed=float(speed),
+            split_pattern=r"\n+",
+        )
+        for segment in segments:
+            # Each result unpacks to three items; only the third (audio) is used.
+            _, _, audio = segment
+            yield audio
+# ----------------------------
+# WARMUP (pay cold-start cost at boot)
+# ----------------------------
+def warmup():
+    """Synthesize the first chunk of a short phrase and log how long it took.
+    Any failure is reported on stdout and swallowed so boot can continue.
+    """
+    try:
+        began = time.time()
+        # Pull only the first chunk; the rest of the stream is not needed.
+        next(kokoro_generator_full("Hello.", "af_bella", 1.0), None)
+        print(f"✅ WARMUP DONE in {time.time() - began:.2f}s")
+    except Exception as e:
+        print(f"⚠️ WARMUP FAILED: {e}")
+# ----------------------------
+# GRADIO UI STREAM
+# ----------------------------
+def gradio_stream(text, voice_name, speed):
+    """Stream synthesized audio to the Gradio UI as ``(sample_rate, int16 array)`` pairs.
+    Args:
+        text: Text entered in the UI.
+        voice_name: Display label from VOICE_CHOICES; an unknown label is used
+            as the voice id itself.
+        speed: Speed factor from the slider.
+    Yields:
+        ``(24000, samples)`` for every generated segment.
+    """
+    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+    # The text is passed through untouched: kokoro_generator_full already
+    # normalizes it, and normalizing a second time here risks re-wrapping
+    # words that were already given pronunciation markup.
+    started = time.time()
+    for index, audio in enumerate(kokoro_generator_full(text, voice_code, speed)):
+        if index == 0:
+            print(f"⚡ UI first audio in {time.time() - started:.2f}s")
+        yield 24000, audio_to_int16_np(audio)
+# ----------------------------
+# FASTAPI WS ENGINE
+# Single worker thread for actual generation.
+# Stream frames to client as soon as they exist.
+# No buffering a full list before sending.
+# ----------------------------
+# FastAPI application: the startup hook and the /ws/audio route are registered
+# on it, and the Gradio UI is mounted onto it at the bottom of the file.
+api = FastAPI()
+# One worker thread only: warmup and every synthesis job are submitted here,
+# so generation jobs run strictly one after another.
+INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
+# Pending requests as (ws, voice_code, speed, text) tuples; filled by the
+# websocket endpoint and drained by audio_engine_loop. Unbounded.
+INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
+async def audio_engine_loop():
+    print("⚡ API AUDIO PIPELINE STARTED")
+    loop = asyncio.get_running_loop()
+    while True:
+        ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
+        # Skip dead clients early
+        if ws.client_state.value > 1:
+            continue
+        frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
+        def _worker():
+            try:
+                for audio in kokoro_generator_full(text, voice_code, speed):
+                    b = audio_to_pcm_bytes(audio)
+                    # backpressure aware
+                    while True:
+                        try:
+                            loop.call_soon_threadsafe(frame_q.put_nowait, b)
+                            break
+                        except Exception:
+                            time.sleep(0.001)
+                loop.call_soon_threadsafe(frame_q.put_nowait, None)
+            except Exception as e:
+                print(f"API Worker Error: {e}")
+                try:
+                    loop.call_soon_threadsafe(frame_q.put_nowait, None)
+                except Exception:
+                    pass
+        INFERENCE_EXECUTOR.submit(_worker)
+        first_sent = False
+        started = time.time()
+        while True:
+            frame = await frame_q.get()
+            if frame is None:
+                break
+            if ws.client_state.value > 1:
+                break
+            try:
+                await ws.send_bytes(frame)
+                if not first_sent:
+                    print(f"⚡ API first audio in {time.time() - started:.2f}s")
+                    first_sent = True
+            except Exception:
+                break
+@api.on_event("startup")
+async def startup():
+    loop = asyncio.get_running_loop()
+    await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
+    asyncio.create_task(audio_engine_loop())
+@api.websocket("/ws/audio")
+async def websocket_endpoint(ws: WebSocket):
+    await ws.accept()
+    voice_code = "af_bella"
+    speed = 1.0
+    print(f"✅ Client connected: {ws.client}")
+    async def keep_alive():
+        while True:
+            try:
+                await asyncio.sleep(15)
+                await ws.send_json({"type": "ping"})
+            except Exception:
+                break
+    heartbeat_task = asyncio.create_task(keep_alive())
+    try:
+        while True:
+            try:
+                data = await ws.receive_json()
+            except WebSocketDisconnect:
+                print("❌ Client disconnected cleanly")
+                break
+            except Exception as e:
+                print(f"⚠️ Connection lost: {e}")
+                break
+            if "config" in data:
+                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
+                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+                speed = float(data.get("speed", speed))
+            if "text" in data:
+                text = normalize_text(data.get("text", ""))
+                if text.strip():
+                    await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
+            if "flush" in data:
+                pass
+    finally:
+        heartbeat_task.cancel()
+# ----------------------------
+# GRADIO APP
+# ----------------------------
+# Demo UI: text, voice and speed inputs on the left, a streaming audio player
+# on the right. The button streams gradio_stream's output into the player.
+with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
+    with gr.Row():
+        with gr.Column():
+            text_in = gr.Textbox(
+                label="Input Text",
+                lines=3,
+                value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
+            )
+            # The dropdown shows the display names (keys of VOICE_CHOICES);
+            # gradio_stream maps the selection to a voice code.
+            voice_in = gr.Dropdown(
+                list(VOICE_CHOICES.keys()),
+                value="🇺🇸 🚺 Bella",
+                label="Voice",
+            )
+            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+            btn = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+    btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
+# Mount the Gradio UI on the FastAPI app at "/", so the UI and the /ws/audio
+# websocket route are served by the same application object.
+final_app = gr.mount_gradio_app(api, app, path="/")
+# Running the file directly serves the combined app on all interfaces, port 7860.
+if __name__ == "__main__":
+    uvicorn.run(final_app, host="0.0.0.0", port=7860)
 #claude code
+# """
+# Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
+# ============================================================
+# Fixes:
+# - Backpressure loop timeout prevents worker thread hang
+# - Parallel inference workers (2, one per vCPU)
+# - Proper error handling with traceback logging
+# - Generation timeout to prevent infinite hangs
+# - Memory-optimized with periodic garbage collection
+# - Aggressive batching for throughput
+# """
+# import os
+# import re
+# import gc
+# import time
+# import asyncio
+# import traceback
+# from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
+# import numpy as np
+# import gradio as gr
+# from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+# import uvicorn
+# import torch
+# from kokoro import KPipeline
+# # ----------------------------
+# # MAXIMIZE 2 vCPU UTILIZATION
+# # ----------------------------
+# CPU_COUNT = 2
+# os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
+# os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
+# os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
+# os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
+# try:
+#     torch.set_num_threads(CPU_COUNT)
+#     torch.set_num_interop_threads(CPU_COUNT)
+# except Exception:
+#     pass
+# # Use uvloop for faster async on Linux
+# try:
+#     import uvloop
+#     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+#     print("✅ Using uvloop for faster async")
+# except ImportError:
+#     print("⚠️ uvloop not available, using default event loop")
+# print(f"🚀 BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
+# # ----------------------------
+# # CONFIGURATION
+# # ----------------------------
+# GENERATION_TIMEOUT_SECONDS = 60  # Max time for a single TTS generation
+# BACKPRESSURE_TIMEOUT_MS = 10000  # Max wait for queue space (10 seconds)
+# WORKER_COUNT = 2  # One per vCPU for parallel processing
+# QUEUE_MAXSIZE = 12  # Buffer more frames for smoother streaming
+# # ----------------------------
+# # VOICES
+# # ----------------------------
+# VOICE_CHOICES = {
+#     "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
+#     "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
+#     "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
+#     "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
+#     "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
+#     "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
+#     "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
+#     "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
+#     "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
+#     "🇬🇧 🚹 Daniel": "bm_daniel",
+# }
+# def voice_to_lang_code(voice_code: str) -> str:
+#     if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
+#         return "b"  # British
+#     return "a"  # American
+# # ----------------------------
+# # PIPELINES (hot in RAM - uses ~2GB per pipeline)
+# # With 16GB RAM we can comfortably hold both
+# # ----------------------------
+# print("📦 Loading Kokoro pipelines into RAM...")
+# PIPELINES = {
+#     "a": KPipeline(lang_code="a"),
+#     "b": KPipeline(lang_code="b"),
+# }
+# print(f"✅ Pipelines loaded. Memory usage: ~4GB for models")
+# # ----------------------------
+# # TEXT NORMALIZATION
+# # ----------------------------
+# def normalize_text(text: str) -> str:
+#     if not text:
+#         return ""
+#     # Kokoro pronunciation helper
+#     text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
+#     return text
+# # ----------------------------
+# # FAST SEGMENTATION FOR STREAMING
+# # ----------------------------
+# _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
+# def inject_newlines_for_fast_stream(text: str) -> str:
+#     text = normalize_text(text).strip()
+#     if not text:
+#         return ""
+#     # Sentence boundaries -> newline for pipeline segmentation
+#     text = _SENT_BOUNDARY.sub(r"\1\n", text)
+#     # Normalize excessive newlines
+#     text = re.sub(r"\n{3,}", "\n\n", text)
+#     # Guarantee a small first segment for low time-to-first-audio
+#     if "\n" not in text and len(text) > 90:
+#         cut = text.rfind(" ", 0, 70)
+#         if cut < 35:
+#             cut = 70
+#         text = text[:cut].strip() + "\n" + text[cut:].strip()
+#     return text
+# # ----------------------------
+# # AUDIO CONVERSION (optimized)
+# # ----------------------------
+# def audio_to_int16_np(audio):
+#     if isinstance(audio, torch.Tensor):
+#         audio = audio.detach().cpu()
+#         audio = torch.clamp(audio, -1.0, 1.0)
+#         return (audio * 32767.0).to(torch.int16).numpy()
+#     audio = np.asarray(audio, dtype=np.float32)
+#     audio = np.clip(audio, -1.0, 1.0)
+#     return (audio * 32767.0).astype(np.int16)
+# def audio_to_pcm_bytes(audio) -> bytes:
+#     return audio_to_int16_np(audio).tobytes()
+# # ----------------------------
+# # GENERATION WITH TIMEOUT
+# # ----------------------------
+# def kokoro_generator_full(text: str, voice_code: str, speed: float):
+#     """
+#     Generate audio chunks from text using Kokoro pipeline.
+#     Yields audio tensors for each segment.
+#     """
+#     lang_code = voice_to_lang_code(voice_code)
+#     pipeline = PIPELINES[lang_code]
+#     text = inject_newlines_for_fast_stream(text)
+#     if not text:
+#         return
+#     chunk_count = 0
+#     start_time = time.time()
+#     try:
+#         with torch.inference_mode():
+#             generator = pipeline(
+#                 text,
+#                 voice=voice_code,
+#                 speed=float(speed),
+#                 split_pattern=r"\n+",
+#             )
+#             for _, _, audio in generator:
+#                 chunk_count += 1
+#                 elapsed = time.time() - start_time
+#                 # Timeout protection
+#                 if elapsed > GENERATION_TIMEOUT_SECONDS:
+#                     print(f"⚠️ Generation timeout after {elapsed:.1f}s, {chunk_count} chunks")
+#                     break
+#                 yield audio
+#         print(f"✅ Generated {chunk_count} chunks in {time.time() - start_time:.2f}s")
+#     except Exception as e:
+#         print(f"❌ Generation error: {e}")
+#         traceback.print_exc()
+#     finally:
+#         # Periodic garbage collection to prevent memory buildup
+#         if chunk_count > 10:
+#             gc.collect()
+# # ----------------------------
+# # WARMUP (preload models)
+# # ----------------------------
+# def warmup():
+#     try:
+#         t0 = time.time()
+#         for _ in kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0):
+#             break
+#         print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
+#     except Exception as e:
+#         print(f"⚠️ WARMUP FAILED: {e}")
+#         traceback.print_exc()
+# # ----------------------------
+# # GRADIO UI STREAM
+# # ----------------------------
+# def gradio_stream(text, voice_name, speed):
+#     voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+#     text = normalize_text(text)
+#     i = 0
+#     t0 = time.time()
+#     for audio in kokoro_generator_full(text, voice_code, speed):
+#         if i == 0:
+#             print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
+#         i += 1
+#         yield 24000, audio_to_int16_np(audio)
+# # ----------------------------
+# # FASTAPI WEBSOCKET ENGINE
+# # ----------------------------
+# api = FastAPI()
+# # Use multiple workers for parallel inference
+# INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
+# INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
+# async def audio_engine_loop():
+#     """
+#     Main audio processing loop.
+#     Pulls requests from queue and streams audio back to clients.
+#     """
+#     print(f"⚡ API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
+#     loop = asyncio.get_running_loop()
+#     while True:
+#         try:
+#             ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
+#         except Exception as e:
+#             print(f"⚠️ Queue get error: {e}")
+#             continue
+#         # Skip dead clients early
+#         try:
+#             if ws.client_state.value > 1:
+#                 print("⏭️ Skipping dead client")
+#                 continue
+#         except Exception:
+#             continue
+#         frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
+#         generation_id = id(ws)
+#         def _worker():
+#             """Worker thread for audio generation."""
+#             chunk_count = 0
+#             start_time = time.time()
+#             try:
+#                 print(f"🔊 [{generation_id}] Starting TTS: {text[:50]}...")
+#                 for audio in kokoro_generator_full(text, voice_code, speed):
+#                     b = audio_to_pcm_bytes(audio)
+#                     chunk_count += 1
+#                     if chunk_count == 1:
+#                         print(f"⚡ [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
+#                     # Backpressure with TIMEOUT to prevent infinite hang
+#                     attempts = 0
+#                     max_attempts = BACKPRESSURE_TIMEOUT_MS  # 10 seconds at 1ms/attempt
+#                     while attempts < max_attempts:
+#                         try:
+#                             loop.call_soon_threadsafe(frame_q.put_nowait, b)
+#                             break
+#                         except asyncio.QueueFull:
+#                             time.sleep(0.001)
+#                             attempts += 1
+#                     else:
+#                         # Timeout reached - client too slow or disconnected
+#                         print(f"⚠️ [{generation_id}] Backpressure timeout after {attempts}ms - aborting")
+#                         break
+#                 # Send completion signal
+#                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
+#                 print(f"✅ [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
+#             except Exception as e:
+#                 print(f"❌ [{generation_id}] Worker error: {e}")
+#                 traceback.print_exc()
+#                 try:
+#                     loop.call_soon_threadsafe(frame_q.put_nowait, None)
+#                 except Exception:
+#                     pass
+#         # Submit to executor
+#         INFERENCE_EXECUTOR.submit(_worker)
+#         # Stream frames to client
+#         first_sent = False
+#         started = time.time()
+#         frames_sent = 0
+#         while True:
+#             try:
+#                 # Timeout on frame retrieval to prevent infinite hang
+#                 frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
+#             except asyncio.TimeoutError:
+#                 print(f"⚠️ [{generation_id}] Frame queue timeout - no data for 30s")
+#                 break
+#             if frame is None:
+#                 break
+#             # Check client still alive
+#             try:
+#                 if ws.client_state.value > 1:
+#                     print(f"⏭️ [{generation_id}] Client disconnected mid-stream")
+#                     break
+#             except Exception:
+#                 break
+#             try:
+#                 await ws.send_bytes(frame)
+#                 frames_sent += 1
+#                 if not first_sent:
+#                     print(f"⚡ [{generation_id}] First audio sent in {time.time() - started:.2f}s")
+#                     first_sent = True
+#             except Exception as e:
+#                 print(f"⚠️ [{generation_id}] Send failed: {e}")
+#                 break
+#         print(f"📤 [{generation_id}] Streaming complete: {frames_sent} frames sent")
+# @api.on_event("startup")
+# async def startup():
+#     loop = asyncio.get_running_loop()
+#     # Warmup in executor to not block startup
+#     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
+#     # Start the audio engine loop
+#     asyncio.create_task(audio_engine_loop())
+#     print("🚀 Server ready!")
+# @api.websocket("/ws/audio")
+# async def websocket_endpoint(ws: WebSocket):
+#     await ws.accept()
+#     voice_code = "af_bella"
+#     speed = 1.0
+#     client_id = id(ws)
+#     print(f"✅ [{client_id}] Client connected: {ws.client}")
+#     async def keep_alive():
+#         """Send periodic pings to keep connection alive."""
+#         while True:
+#             try:
+#                 await asyncio.sleep(15)
+#                 await ws.send_json({"type": "ping"})
+#             except Exception:
+#                 break
+#     heartbeat_task = asyncio.create_task(keep_alive())
+#     try:
+#         while True:
+#             try:
+#                 data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
+#             except asyncio.TimeoutError:
+#                 print(f"⏱️ [{client_id}] Connection timeout - no messages for 120s")
+#                 break
+#             except WebSocketDisconnect:
+#                 print(f"❌ [{client_id}] Client disconnected cleanly")
+#                 break
+#             except Exception as e:
+#                 print(f"⚠️ [{client_id}] Connection error: {e}")
+#                 break
+#             # Handle config updates
+#             if "config" in data:
+#                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
+#                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+#                 speed = float(data.get("speed", speed))
+#                 print(f"🎛️ [{client_id}] Config: voice={voice_code}, speed={speed}")
+#             # Handle text-to-speech request
+#             if "text" in data:
+#                 text = normalize_text(data.get("text", ""))
+#                 if text.strip():
+#                     print(f"📥 [{client_id}] TTS request: {text[:50]}...")
+#                     await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
+#             # Handle flush (no-op for now, could clear queue)
+#             if "flush" in data:
+#                 pass
+#     finally:
+#         heartbeat_task.cancel()
+#         print(f"👋 [{client_id}] Connection closed")
+# # ----------------------------
+# # HEALTH CHECK ENDPOINT
+# # ----------------------------
+# @api.get("/health")
+# async def health_check():
+#     return {
+#         "status": "healthy",
+#         "workers": WORKER_COUNT,
+#         "queue_size": INFERENCE_QUEUE.qsize(),
+#     }
+# # ----------------------------
+# # GRADIO APP
+# # ----------------------------
+# with gr.Blocks(title="Kokoro TTS") as app:
+#     gr.Markdown("## ⚡ Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
+#     gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
+#     with gr.Row():
+#         with gr.Column():
+#             text_in = gr.Textbox(
+#                 label="Input Text",
+#                 lines=3,
+#                 value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
+#             )
+#             voice_in = gr.Dropdown(
+#                 list(VOICE_CHOICES.keys()),
+#                 value="🇺🇸 🚺 Bella",
+#                 label="Voice",
+#             )
+#             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+#             btn = gr.Button("Generate", variant="primary")
+#         with gr.Column():
+#             audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+#     btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
+# final_app = gr.mount_gradio_app(api, app, path="/")
+# if __name__ == "__main__":
+#     uvicorn.run(
+#         final_app,
+#         host="0.0.0.0",
+#         port=7860,
+#         workers=1,  # Single process, multiple threads
+#         log_level="info",
+#     )