Spaces:

auralodyssey
/

api

Running

App Files Files Community

auralodyssey committed on Jan 7

Commit

89cbd38

verified ·

1 Parent(s): f78ae4b

Update app.py

Browse files

Files changed (1) hide show

app.py +517 -62

app.py CHANGED Viewed

@@ -294,11 +294,337 @@
 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
 import re
 import time
 import asyncio
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import gradio as gr
@@ -309,26 +635,37 @@ import torch
 from kokoro import KPipeline
 # ----------------------------
-# HARD LIMIT CPU THREADS (2 vCPU box)
 # ----------------------------
-os.environ.setdefault("OMP_NUM_THREADS", "2")
-os.environ.setdefault("MKL_NUM_THREADS", "2")
-os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
 try:
-    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
-    torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
 except Exception:
     pass
-# Optional: uvloop for faster event loop on HF Linux
 try:
-    import uvloop  # type: ignore
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-except Exception:
-    pass
-print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 # ----------------------------
 # VOICES
@@ -349,29 +686,31 @@ VOICE_CHOICES = {
 def voice_to_lang_code(voice_code: str) -> str:
     if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
         return "b"  # British
-    return "a"      # American
 # ----------------------------
-# PIPELINES (keep hot in RAM)
 # ----------------------------
 PIPELINES = {
     "a": KPipeline(lang_code="a"),
     "b": KPipeline(lang_code="b"),
 }
 # ----------------------------
-# TEXT NORMALIZATION (matches your pasted official docs)
 # ----------------------------
 def normalize_text(text: str) -> str:
     if not text:
         return ""
-    return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
 # ----------------------------
-# LOW LATENCY SEGMENTATION
-# One pipeline call per request.
-# We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
-# We also force a small first segment for fast first audio.
 # ----------------------------
 _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
@@ -380,10 +719,10 @@ def inject_newlines_for_fast_stream(text: str) -> str:
     if not text:
         return ""
-    # Sentence boundaries -> newline so official split_pattern can segment
     text = _SENT_BOUNDARY.sub(r"\1\n", text)
-    # Also split on existing multi-newlines
     text = re.sub(r"\n{3,}", "\n\n", text)
     # Guarantee a small first segment for low time-to-first-audio
@@ -396,7 +735,7 @@ def inject_newlines_for_fast_stream(text: str) -> str:
     return text
 # ----------------------------
-# AUDIO CONVERSION (fast, safe)
 # ----------------------------
 def audio_to_int16_np(audio):
     if isinstance(audio, torch.Tensor):
@@ -404,7 +743,7 @@ def audio_to_int16_np(audio):
         audio = torch.clamp(audio, -1.0, 1.0)
         return (audio * 32767.0).to(torch.int16).numpy()
-    audio = np.asarray(audio)
     audio = np.clip(audio, -1.0, 1.0)
     return (audio * 32767.0).astype(np.int16)
@@ -412,10 +751,13 @@ def audio_to_pcm_bytes(audio) -> bytes:
     return audio_to_int16_np(audio).tobytes()
 # ----------------------------
-# OFFICIAL GENERATION PATH (single pipeline call)
-# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
 # ----------------------------
 def kokoro_generator_full(text: str, voice_code: str, speed: float):
     lang_code = voice_to_lang_code(voice_code)
     pipeline = PIPELINES[lang_code]
     text = inject_newlines_for_fast_stream(text)
@@ -423,27 +765,50 @@ def kokoro_generator_full(text: str, voice_code: str, speed: float):
     if not text:
         return
-    with torch.inference_mode():
-        generator = pipeline(
-            text,
-            voice=voice_code,
-            speed=float(speed),
-            split_pattern=r"\n+",
-        )
-        for _, _, audio in generator:
-            yield audio
 # ----------------------------
-# WARMUP (pay cold-start cost at boot)
 # ----------------------------
 def warmup():
     try:
         t0 = time.time()
-        for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
             break
         print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
     except Exception as e:
         print(f"⚠️ WARMUP FAILED: {e}")
 # ----------------------------
 # GRADIO UI STREAM
@@ -461,74 +826,134 @@ def gradio_stream(text, voice_name, speed):
         yield 24000, audio_to_int16_np(audio)
 # ----------------------------
-# FASTAPI WS ENGINE
-# Single worker thread for actual generation.
-# Stream frames to client as soon as they exist.
-# No buffering a full list before sending.
 # ----------------------------
 api = FastAPI()
-INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
 async def audio_engine_loop():
-    print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
     while True:
-        ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
         # Skip dead clients early
-        if ws.client_state.value > 1:
             continue
-        frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
         def _worker():
             try:
                 for audio in kokoro_generator_full(text, voice_code, speed):
                     b = audio_to_pcm_bytes(audio)
-                    # backpressure aware
-                    while True:
                         try:
                             loop.call_soon_threadsafe(frame_q.put_nowait, b)
                             break
-                        except Exception:
                             time.sleep(0.001)
                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
             except Exception as e:
-                print(f"API Worker Error: {e}")
                 try:
                     loop.call_soon_threadsafe(frame_q.put_nowait, None)
                 except Exception:
                     pass
         INFERENCE_EXECUTOR.submit(_worker)
         first_sent = False
         started = time.time()
         while True:
-            frame = await frame_q.get()
             if frame is None:
                 break
-            if ws.client_state.value > 1:
                 break
             try:
                 await ws.send_bytes(frame)
                 if not first_sent:
-                    print(f"⚡ API first audio in {time.time() - started:.2f}s")
                     first_sent = True
-            except Exception:
                 break
 @api.on_event("startup")
 async def startup():
     loop = asyncio.get_running_loop()
     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
     asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
@@ -536,10 +961,12 @@ async def websocket_endpoint(ws: WebSocket):
     voice_code = "af_bella"
     speed = 1.0
-    print(f"✅ Client connected: {ws.client}")
     async def keep_alive():
         while True:
             try:
                 await asyncio.sleep(15)
@@ -552,41 +979,63 @@ async def websocket_endpoint(ws: WebSocket):
     try:
         while True:
             try:
-                data = await ws.receive_json()
             except WebSocketDisconnect:
-                print("❌ Client disconnected cleanly")
                 break
             except Exception as e:
-                print(f"⚠️ Connection lost: {e}")
                 break
             if "config" in data:
                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
             if "text" in data:
                 text = normalize_text(data.get("text", ""))
                 if text.strip():
                     await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
             if "flush" in data:
                 pass
     finally:
         heartbeat_task.cancel()
 # ----------------------------
 # GRADIO APP
 # ----------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
-    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
                 lines=3,
-                value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
             )
             voice_in = gr.Dropdown(
                 list(VOICE_CHOICES.keys()),
@@ -603,4 +1052,10 @@ with gr.Blocks(title="Kokoro TTS") as app:
 final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
-    uvicorn.run(final_app, host="0.0.0.0", port=7860)

 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
+#OLD KOKORO CHATGPT CODE
+# import os
+# import re
+# import time
+# import asyncio
+# from concurrent.futures import ThreadPoolExecutor
+# import numpy as np
+# import gradio as gr
+# from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+# import uvicorn
+# import torch
+# from kokoro import KPipeline
+# # ----------------------------
+# # HARD LIMIT CPU THREADS (2 vCPU box)
+# # ----------------------------
+# os.environ.setdefault("OMP_NUM_THREADS", "2")
+# os.environ.setdefault("MKL_NUM_THREADS", "2")
+# os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
+# try:
+#     torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
+#     torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
+# except Exception:
+#     pass
+# # Optional: uvloop for faster event loop on HF Linux
+# try:
+#     import uvloop  # type: ignore
+#     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+# except Exception:
+#     pass
+# print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
+# # ----------------------------
+# # VOICES
+# # ----------------------------
+# VOICE_CHOICES = {
+#     "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
+#     "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
+#     "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
+#     "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
+#     "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
+#     "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
+#     "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
+#     "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
+#     "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
+#     "🇬🇧 🚹 Daniel": "bm_daniel",
+# }
+# def voice_to_lang_code(voice_code: str) -> str:
+#     if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
+#         return "b"  # British
+#     return "a"      # American
+# # ----------------------------
+# # PIPELINES (keep hot in RAM)
+# # ----------------------------
+# PIPELINES = {
+#     "a": KPipeline(lang_code="a"),
+#     "b": KPipeline(lang_code="b"),
+# }
+# # ----------------------------
+# # TEXT NORMALIZATION (matches your pasted official docs)
+# # ----------------------------
+# def normalize_text(text: str) -> str:
+#     if not text:
+#         return ""
+#     return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
+# # ----------------------------
+# # LOW LATENCY SEGMENTATION
+# # One pipeline call per request.
+# # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
+# # We also force a small first segment for fast first audio.
+# # ----------------------------
+# _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
+# def inject_newlines_for_fast_stream(text: str) -> str:
+#     text = normalize_text(text).strip()
+#     if not text:
+#         return ""
+#     # Sentence boundaries -> newline so official split_pattern can segment
+#     text = _SENT_BOUNDARY.sub(r"\1\n", text)
+#     # Also split on existing multi-newlines
+#     text = re.sub(r"\n{3,}", "\n\n", text)
+#     # Guarantee a small first segment for low time-to-first-audio
+#     if "\n" not in text and len(text) > 90:
+#         cut = text.rfind(" ", 0, 70)
+#         if cut < 35:
+#             cut = 70
+#         text = text[:cut].strip() + "\n" + text[cut:].strip()
+#     return text
+# # ----------------------------
+# # AUDIO CONVERSION (fast, safe)
+# # ----------------------------
+# def audio_to_int16_np(audio):
+#     if isinstance(audio, torch.Tensor):
+#         audio = audio.detach().cpu()
+#         audio = torch.clamp(audio, -1.0, 1.0)
+#         return (audio * 32767.0).to(torch.int16).numpy()
+#     audio = np.asarray(audio)
+#     audio = np.clip(audio, -1.0, 1.0)
+#     return (audio * 32767.0).astype(np.int16)
+# def audio_to_pcm_bytes(audio) -> bytes:
+#     return audio_to_int16_np(audio).tobytes()
+# # ----------------------------
+# # OFFICIAL GENERATION PATH (single pipeline call)
+# # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
+# # ----------------------------
+# def kokoro_generator_full(text: str, voice_code: str, speed: float):
+#     lang_code = voice_to_lang_code(voice_code)
+#     pipeline = PIPELINES[lang_code]
+#     text = inject_newlines_for_fast_stream(text)
+#     if not text:
+#         return
+#     with torch.inference_mode():
+#         generator = pipeline(
+#             text,
+#             voice=voice_code,
+#             speed=float(speed),
+#             split_pattern=r"\n+",
+#         )
+#         for _, _, audio in generator:
+#             yield audio
+# # ----------------------------
+# # WARMUP (pay cold-start cost at boot)
+# # ----------------------------
+# def warmup():
+#     try:
+#         t0 = time.time()
+#         for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
+#             break
+#         print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
+#     except Exception as e:
+#         print(f"⚠️ WARMUP FAILED: {e}")
+# # ----------------------------
+# # GRADIO UI STREAM
+# # ----------------------------
+# def gradio_stream(text, voice_name, speed):
+#     voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+#     text = normalize_text(text)
+#     i = 0
+#     t0 = time.time()
+#     for audio in kokoro_generator_full(text, voice_code, speed):
+#         if i == 0:
+#             print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
+#         i += 1
+#         yield 24000, audio_to_int16_np(audio)
+# # ----------------------------
+# # FASTAPI WS ENGINE
+# # Single worker thread for actual generation.
+# # Stream frames to client as soon as they exist.
+# # No buffering a full list before sending.
+# # ----------------------------
+# api = FastAPI()
+# INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
+# INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
+# async def audio_engine_loop():
+#     print("⚡ API AUDIO PIPELINE STARTED")
+#     loop = asyncio.get_running_loop()
+#     while True:
+#         ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
+#         # Skip dead clients early
+#         if ws.client_state.value > 1:
+#             continue
+#         frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
+#         def _worker():
+#             try:
+#                 for audio in kokoro_generator_full(text, voice_code, speed):
+#                     b = audio_to_pcm_bytes(audio)
+#                     # backpressure aware
+#                     while True:
+#                         try:
+#                             loop.call_soon_threadsafe(frame_q.put_nowait, b)
+#                             break
+#                         except Exception:
+#                             time.sleep(0.001)
+#                 loop.call_soon_threadsafe(frame_q.put_nowait, None)
+#             except Exception as e:
+#                 print(f"API Worker Error: {e}")
+#                 try:
+#                     loop.call_soon_threadsafe(frame_q.put_nowait, None)
+#                 except Exception:
+#                     pass
+#         INFERENCE_EXECUTOR.submit(_worker)
+#         first_sent = False
+#         started = time.time()
+#         while True:
+#             frame = await frame_q.get()
+#             if frame is None:
+#                 break
+#             if ws.client_state.value > 1:
+#                 break
+#             try:
+#                 await ws.send_bytes(frame)
+#                 if not first_sent:
+#                     print(f"⚡ API first audio in {time.time() - started:.2f}s")
+#                     first_sent = True
+#             except Exception:
+#                 break
+# @api.on_event("startup")
+# async def startup():
+#     loop = asyncio.get_running_loop()
+#     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
+#     asyncio.create_task(audio_engine_loop())
+# @api.websocket("/ws/audio")
+# async def websocket_endpoint(ws: WebSocket):
+#     await ws.accept()
+#     voice_code = "af_bella"
+#     speed = 1.0
+#     print(f"✅ Client connected: {ws.client}")
+#     async def keep_alive():
+#         while True:
+#             try:
+#                 await asyncio.sleep(15)
+#                 await ws.send_json({"type": "ping"})
+#             except Exception:
+#                 break
+#     heartbeat_task = asyncio.create_task(keep_alive())
+#     try:
+#         while True:
+#             try:
+#                 data = await ws.receive_json()
+#             except WebSocketDisconnect:
+#                 print("❌ Client disconnected cleanly")
+#                 break
+#             except Exception as e:
+#                 print(f"⚠️ Connection lost: {e}")
+#                 break
+#             if "config" in data:
+#                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
+#                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+#                 speed = float(data.get("speed", speed))
+#             if "text" in data:
+#                 text = normalize_text(data.get("text", ""))
+#                 if text.strip():
+#                     await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
+#             if "flush" in data:
+#                 pass
+#     finally:
+#         heartbeat_task.cancel()
+# # ----------------------------
+# # GRADIO APP
+# # ----------------------------
+# with gr.Blocks(title="Kokoro TTS") as app:
+#     gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
+#     with gr.Row():
+#         with gr.Column():
+#             text_in = gr.Textbox(
+#                 label="Input Text",
+#                 lines=3,
+#                 value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
+#             )
+#             voice_in = gr.Dropdown(
+#                 list(VOICE_CHOICES.keys()),
+#                 value="🇺🇸 🚺 Bella",
+#                 label="Voice",
+#             )
+#             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+#             btn = gr.Button("Generate", variant="primary")
+#         with gr.Column():
+#             audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+#     btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
+# final_app = gr.mount_gradio_app(api, app, path="/")
+# if __name__ == "__main__":
+#     uvicorn.run(final_app, host="0.0.0.0", port=7860)
+#claude code
+"""
+Kokoro TTS WebSocket Server - OPTIMIZED for 2 vCPU / 16GB RAM
+============================================================
+Fixes:
+- Backpressure loop timeout prevents worker thread hang
+- Thread pool sized for 2 inference workers (one per vCPU); the engine loop still streams one request at a time
+- Proper error handling with traceback logging
+- Generation timeout to prevent infinite hangs
+- Memory-optimized with periodic garbage collection
+- Aggressive batching for throughput
+"""
 import os
 import re
+import gc
 import time
 import asyncio
+import traceback
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
 import numpy as np
 import gradio as gr
 from kokoro import KPipeline
 # ----------------------------
+# MAXIMIZE 2 vCPU UTILIZATION
 # ----------------------------
+# Number of CPU threads every numeric backend below is pinned to.
+CPU_COUNT = 2
+# NOTE(review): these variables are assigned after `import numpy` has already
+# run above; a native library that reads them while it is being imported will
+# not see these values. Confirm the ordering is intended.
+os.environ["OMP_NUM_THREADS"] = str(CPU_COUNT)
+os.environ["MKL_NUM_THREADS"] = str(CPU_COUNT)
+os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_COUNT)
+os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_COUNT)
+# Best effort: any error raised while configuring torch threading is ignored.
 try:
+    torch.set_num_threads(CPU_COUNT)
+    torch.set_num_interop_threads(CPU_COUNT)
 except Exception:
     pass
+# Use uvloop for faster async on Linux
+# Only a missing uvloop package is tolerated; any other failure propagates.
 try:
+    import uvloop
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+    print("✅ Using uvloop for faster async")
+except ImportError:
+    print("⚠️ uvloop not available, using default event loop")
+print(f"🚀 BOOTING KOKORO - Optimized for {CPU_COUNT} vCPU / 16GB RAM")
+# ----------------------------
+# CONFIGURATION
+# ----------------------------
+# Wall-clock budget for one generation; kokoro_generator_full checks it after
+# each segment is produced and stops iterating once it is exceeded.
+GENERATION_TIMEOUT_SECONDS = 60  # Max time for a single TTS generation
+# How long a worker keeps trying to hand one frame to the frame queue.
+BACKPRESSURE_TIMEOUT_MS = 10000  # Max wait for queue space (10 seconds)
+# Size of INFERENCE_EXECUTOR's thread pool.
+# NOTE(review): audio_engine_loop finishes streaming one request before it
+# takes the next, so confirm a second worker is ever busy.
+WORKER_COUNT = 2  # One per vCPU for parallel processing
+# Capacity of the per-request asyncio.Queue that carries PCM frames.
+QUEUE_MAXSIZE = 12  # Buffer more frames for smoother streaming
 # ----------------------------
 # VOICES
 def voice_to_lang_code(voice_code: str) -> str:
     """Map a voice identifier to the key used to look up its pipeline.

     Codes starting with ``bf_`` or ``bm_`` select ``"b"`` (British); every
     other code, including the empty string, selects ``"a"`` (American).
     """
     # str.startswith accepts a tuple, so one call covers both prefixes.
     if voice_code.startswith(("bf_", "bm_")):
         return "b"  # British
     return "a"  # American
 # ----------------------------
+# PIPELINES (hot in RAM - uses ~2GB per pipeline)
+# With 16GB RAM we can comfortably hold both
 # ----------------------------
+print("📦 Loading Kokoro pipelines into RAM...")
 PIPELINES = {
     "a": KPipeline(lang_code="a"),
     "b": KPipeline(lang_code="b"),
 }
+print(f"✅ Pipelines loaded. Memory usage: ~4GB for models")
 # ----------------------------
+# TEXT NORMALIZATION
 # ----------------------------
 def normalize_text(text: str) -> str:
     """Attach the pronunciation markup to every bare "Kokoro" in ``text``.

     Returns ``""`` for empty or ``None`` input. The function is idempotent:
     an occurrence that already carries the markup is left untouched, so text
     that passes through here more than once is not wrapped a second time.
     """
     if not text:
         return ""
     markup = "[Kokoro](/kˈOkəɹO/)"
     # Try the already-annotated form first so it is replaced by itself; a
     # bare "Kokoro" falls through to the second alternative.
     pattern = re.escape(markup) + "|Kokoro"
     # A callable replacement keeps the markup free of escape processing.
     return re.sub(pattern, lambda _match: markup, text)
 # ----------------------------
+# FAST SEGMENTATION FOR STREAMING
 # ----------------------------
 _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
     if not text:
         return ""
+    # Sentence boundaries -> newline for pipeline segmentation
     text = _SENT_BOUNDARY.sub(r"\1\n", text)
+    # Normalize excessive newlines
     text = re.sub(r"\n{3,}", "\n\n", text)
     # Guarantee a small first segment for low time-to-first-audio
     return text
 # ----------------------------
+# AUDIO CONVERSION (optimized)
 # ----------------------------
 def audio_to_int16_np(audio):
     if isinstance(audio, torch.Tensor):
         audio = torch.clamp(audio, -1.0, 1.0)
         return (audio * 32767.0).to(torch.int16).numpy()
+    audio = np.asarray(audio, dtype=np.float32)
     audio = np.clip(audio, -1.0, 1.0)
     return (audio * 32767.0).astype(np.int16)
     return audio_to_int16_np(audio).tobytes()
 # ----------------------------
+# GENERATION WITH TIMEOUT
 # ----------------------------
 def kokoro_generator_full(text: str, voice_code: str, speed: float):
     """Stream synthesized audio for ``text``, one pipeline segment at a time.

     The pipeline is chosen via ``voice_to_lang_code(voice_code)`` and the
     text is passed through ``inject_newlines_for_fast_stream`` before being
     split on newlines. Nothing is yielded when the prepared text is empty.

     Iteration stops once GENERATION_TIMEOUT_SECONDS have elapsed since the
     call began; the segment that tripped the limit is not yielded. Exceptions
     raised during synthesis are printed with a traceback and end the stream
     instead of reaching the caller.
     """
     lang_key = voice_to_lang_code(voice_code)
     engine = PIPELINES[lang_key]
     prepared = inject_newlines_for_fast_stream(text)
     if not prepared:
         return

     produced = 0
     began = time.time()
     try:
         with torch.inference_mode():
             segments = engine(
                 prepared,
                 voice=voice_code,
                 speed=float(speed),
                 split_pattern=r"\n+",
             )
             for _, _, audio in segments:
                 produced += 1
                 waited = time.time() - began
                 # Enforce the time budget before handing the segment out.
                 if waited > GENERATION_TIMEOUT_SECONDS:
                     print(f"⚠️ Generation timeout after {waited:.1f}s, {produced} chunks")
                     break
                 yield audio
         print(f"✅ Generated {produced} chunks in {time.time() - began:.2f}s")
     except Exception as err:
         print(f"❌ Generation error: {err}")
         traceback.print_exc()
     finally:
         # Only long generations trigger an explicit collection pass.
         if produced > 10:
             gc.collect()
 # ----------------------------
+# WARMUP (preload models)
 # ----------------------------
 def warmup():
     """Pull a single chunk through the generation path and report timing.

     Only the first chunk is requested. Any exception is printed together
     with its traceback and is never raised to the caller.
     """
     try:
         began = time.time()
         # Ask for exactly one chunk; the rest of the stream is abandoned.
         next(kokoro_generator_full("Hello, this is a warmup test.", "af_bella", 1.0), None)
         print(f"✅ WARMUP DONE in {time.time() - began:.2f}s")
     except Exception as err:
         print(f"⚠️ WARMUP FAILED: {err}")
         traceback.print_exc()
 # ----------------------------
 # GRADIO UI STREAM
         yield 24000, audio_to_int16_np(audio)
 # ----------------------------
+# FASTAPI WEBSOCKET ENGINE
 # ----------------------------
 api = FastAPI()
+# Use multiple workers for parallel inference
+INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=WORKER_COUNT)
 INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
 def _enqueue_frame(loop, frame_q, cancelled, item):
     """Hand ``item`` to ``frame_q`` from a worker thread, honouring backpressure.

     Returns True once the item is in the queue. Returns False when the
     consumer has set ``cancelled`` or when no slot opened within
     BACKPRESSURE_TIMEOUT_MS.
     """
     # frame_q.put() runs on the loop thread and simply waits while the queue
     # is full. Scheduling put_nowait via call_soon_threadsafe would raise
     # QueueFull inside a loop callback, where this thread can never see it.
     pending = asyncio.run_coroutine_threadsafe(frame_q.put(item), loop)
     deadline = time.monotonic() + BACKPRESSURE_TIMEOUT_MS / 1000.0
     while True:
         if cancelled.is_set():
             pending.cancel()
             return False
         try:
             # Short waits keep the thread responsive to cancellation.
             pending.result(timeout=0.05)
             return True
         except FutureTimeoutError:
             if time.monotonic() >= deadline:
                 pending.cancel()
                 return False


 def _generate_into_queue(loop, frame_q, cancelled, generation_id, text, voice_code, speed):
     """Worker-thread body: synthesize ``text`` and feed PCM frames to ``frame_q``.

     Everything the worker needs arrives as an argument, so a worker that is
     still running cannot pick up the queue or text of a later request. A
     ``None`` sentinel is enqueued at the end unless the consumer is gone.
     """
     chunk_count = 0
     start_time = time.time()
     try:
         print(f"🔊 [{generation_id}] Starting TTS: {text[:50]}...")
         for audio in kokoro_generator_full(text, voice_code, speed):
             frame = audio_to_pcm_bytes(audio)
             chunk_count += 1
             if chunk_count == 1:
                 print(f"⚡ [{generation_id}] First chunk ready in {time.time() - start_time:.2f}s")
             if not _enqueue_frame(loop, frame_q, cancelled, frame):
                 print(f"⚠️ [{generation_id}] Consumer gone or backpressure timeout - aborting")
                 break
         print(f"✅ [{generation_id}] Completed: {chunk_count} chunks in {time.time() - start_time:.2f}s")
     except Exception as e:
         print(f"❌ [{generation_id}] Worker error: {e}")
         traceback.print_exc()
     finally:
         # Always try to release the consumer, whatever ended the generation.
         try:
             _enqueue_frame(loop, frame_q, cancelled, None)
         except Exception:
             pass


 async def audio_engine_loop():
     """Serve queued TTS requests and stream their audio to the WebSocket.

     Each item taken from INFERENCE_QUEUE is ``(ws, voice_code, speed, text)``.
     Generation runs on INFERENCE_EXECUTOR while this coroutine forwards the
     frames to ``ws``. Requests are handled strictly one after another: the
     next one is fetched only after the current stream has ended.
     """
     # Local import: threading is not among the imports visible at the top of
     # this module.
     import threading

     print(f"⚡ API AUDIO PIPELINE STARTED ({WORKER_COUNT} workers)")
     loop = asyncio.get_running_loop()
     while True:
         try:
             ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
         except Exception as e:
             print(f"⚠️ Queue get error: {e}")
             continue
         # Skip dead clients early
         try:
             if ws.client_state.value > 1:
                 print("⏭️ Skipping dead client")
                 continue
         except Exception:
             continue

         frame_q: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
         # Set when this coroutine stops consuming so the worker can bail out.
         cancelled = threading.Event()
         generation_id = id(ws)
         INFERENCE_EXECUTOR.submit(
             _generate_into_queue,
             loop, frame_q, cancelled, generation_id, text, voice_code, speed,
         )

         # Stream frames to client
         first_sent = False
         started = time.time()
         frames_sent = 0
         try:
             while True:
                 try:
                     # Bounded wait so a stalled worker cannot hang the engine.
                     frame = await asyncio.wait_for(frame_q.get(), timeout=30.0)
                 except asyncio.TimeoutError:
                     print(f"⚠️ [{generation_id}] Frame queue timeout - no data for 30s")
                     break
                 if frame is None:
                     break
                 # Check client still alive
                 try:
                     if ws.client_state.value > 1:
                         print(f"⏭️ [{generation_id}] Client disconnected mid-stream")
                         break
                 except Exception:
                     break
                 try:
                     await ws.send_bytes(frame)
                     frames_sent += 1
                     if not first_sent:
                         print(f"⚡ [{generation_id}] First audio sent in {time.time() - started:.2f}s")
                         first_sent = True
                 except Exception as e:
                     print(f"⚠️ [{generation_id}] Send failed: {e}")
                     break
         finally:
             # Tell the worker nobody is listening any more.
             cancelled.set()
         print(f"📤 [{generation_id}] Streaming complete: {frames_sent} frames sent")
 @api.on_event("startup")
 async def startup():
     """Run the warmup, then launch the background audio engine task.

     The warmup executes on INFERENCE_EXECUTOR, so the event loop is not
     blocked, but this handler still waits for it to finish before the engine
     loop is started.
     """
     loop = asyncio.get_running_loop()
     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
     # The event loop keeps only weak references to tasks; hold a strong one
     # on the app state so the engine task cannot be garbage-collected.
     api.state.audio_engine_task = asyncio.create_task(audio_engine_loop())
     print("🚀 Server ready!")
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
     voice_code = "af_bella"
     speed = 1.0
+    client_id = id(ws)
+    print(f"✅ [{client_id}] Client connected: {ws.client}")
     async def keep_alive():
+        """Send periodic pings to keep connection alive."""
         while True:
             try:
                 await asyncio.sleep(15)
     try:
         while True:
             try:
+                data = await asyncio.wait_for(ws.receive_json(), timeout=120.0)
+            except asyncio.TimeoutError:
+                print(f"⏱️ [{client_id}] Connection timeout - no messages for 120s")
+                break
             except WebSocketDisconnect:
+                print(f"❌ [{client_id}] Client disconnected cleanly")
                 break
             except Exception as e:
+                print(f"⚠️ [{client_id}] Connection error: {e}")
                 break
+            # Handle config updates
             if "config" in data:
                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
+                print(f"🎛️ [{client_id}] Config: voice={voice_code}, speed={speed}")
+            # Handle text-to-speech request
             if "text" in data:
                 text = normalize_text(data.get("text", ""))
                 if text.strip():
+                    print(f"📥 [{client_id}] TTS request: {text[:50]}...")
                     await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
+            # Handle flush (no-op for now, could clear queue)
             if "flush" in data:
                 pass
     finally:
         heartbeat_task.cancel()
+        print(f"👋 [{client_id}] Connection closed")
+# ----------------------------
+# HEALTH CHECK ENDPOINT
+# ----------------------------
@api.get("/health")
async def health_check():
    """Report a fixed healthy status, the worker count and the queue backlog."""
    pending_requests = INFERENCE_QUEUE.qsize()
    return {
        "status": "healthy",
        "workers": WORKER_COUNT,
        "queue_size": pending_requests,
    }
 # ----------------------------
 # GRADIO APP
 # ----------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Optimized for 2 vCPU / 16GB RAM)")
+    gr.Markdown("API: Connect to `/ws/audio` for real-time streaming")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
                 lines=3,
+                value="Hello! This is the Kokoro text-to-speech system. The server is optimized for low latency streaming.",
             )
             voice_in = gr.Dropdown(
                 list(VOICE_CHOICES.keys()),
 final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
+    uvicorn.run(
+        final_app,
+        host="0.0.0.0",
+        port=7860,
+        workers=1,  # Single process, multiple threads
+        log_level="info",
+    )