auralodyssey committed on
Commit
d87f15e
·
verified ·
1 Parent(s): eff63e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -365
app.py CHANGED
@@ -1,305 +1,12 @@
1
- # import os
2
- # import json
3
- # import time
4
- # import re
5
- # import numpy as np
6
- # import onnxruntime as ort
7
- # import gradio as gr
8
- # from huggingface_hub import hf_hub_download
9
- # from misaki import en
10
- # from functools import lru_cache
11
- # from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
- # import asyncio
13
- # import uvloop
14
- # import uvicorn
15
- # from concurrent.futures import ThreadPoolExecutor
16
-
17
- # # --- CONFIGURATION ---
18
- # MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
19
- # MODEL_FILE = "onnx/model.onnx"
20
- # TOKENIZER_FILE = "tokenizer.json"
21
-
22
- # # --- VOICE UI ---
23
- # VOICE_CHOICES = {
24
- # 'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
25
- # 'πŸ‡ΊπŸ‡Έ 🚺 Aoede': 'af_aoede', 'πŸ‡ΊπŸ‡Έ 🚺 Kore': 'af_kore', 'πŸ‡ΊπŸ‡Έ 🚺 Sarah': 'af_sarah',
26
- # 'πŸ‡ΊπŸ‡Έ 🚺 Nova': 'af_nova', 'πŸ‡ΊπŸ‡Έ 🚺 Sky': 'af_sky', 'πŸ‡ΊπŸ‡Έ 🚺 Alloy': 'af_alloy',
27
- # 'πŸ‡ΊπŸ‡Έ 🚺 Jessica': 'af_jessica', 'πŸ‡ΊπŸ‡Έ 🚺 River': 'af_river', 'πŸ‡ΊπŸ‡Έ 🚹 Michael': 'am_michael',
28
- # 'πŸ‡ΊπŸ‡Έ 🚹 Fenrir': 'am_fenrir', 'πŸ‡ΊπŸ‡Έ 🚹 Puck': 'am_puck', 'πŸ‡ΊπŸ‡Έ 🚹 Echo': 'am_echo',
29
- # 'πŸ‡ΊπŸ‡Έ 🚹 Eric': 'am_eric', 'πŸ‡ΊπŸ‡Έ 🚹 Liam': 'am_liam', 'πŸ‡ΊπŸ‡Έ 🚹 Onyx': 'am_onyx',
30
- # 'πŸ‡ΊπŸ‡Έ 🚹 Santa': 'am_santa', 'πŸ‡ΊπŸ‡Έ 🚹 Adam': 'am_adam', 'πŸ‡¬πŸ‡§ 🚺 Emma': 'bf_emma',
31
- # 'πŸ‡¬πŸ‡§ 🚺 Isabella': 'bf_isabella', 'πŸ‡¬πŸ‡§ 🚺 Alice': 'bf_alice', 'πŸ‡¬πŸ‡§ 🚺 Lily': 'bf_lily',
32
- # 'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
33
- # 'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
34
- # }
35
-
36
- # # --- ENGINE ---
37
- # print("πŸš€ BOOTING HIGH-RAM ENGINE...")
38
- # # Enable fast networking immediately
39
- # asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
40
-
41
- # # 1. Phonemizer
42
- # G2P = en.G2P(trf=False, british=False, fallback=None)
43
-
44
- # # 2. Tokenizer
45
- # vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
46
- # with open(vocab_path, "r", encoding="utf-8") as f:
47
- # data = json.load(f)
48
- # TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
49
-
50
- # # 3. Voices (Lazy Load)
51
- # VOICE_CACHE = {}
52
- # def get_voice(name):
53
- # code = VOICE_CHOICES.get(name, name)
54
- # if code not in VOICE_CACHE:
55
- # try:
56
- # print(f"⬇️ Loading Voice: {code}")
57
- # path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
58
- # VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
59
- # except:
60
- # if 'af_bella' not in VOICE_CACHE:
61
- # p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
62
- # VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
63
- # return VOICE_CACHE['af_bella']
64
- # return VOICE_CACHE[code]
65
-
66
- # # 4. ONNX Engine
67
- # model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
68
- # sess_options = ort.SessionOptions()
69
- # sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
70
- # sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
71
- # sess_options.intra_op_num_threads = 0
72
- # sess_options.inter_op_num_threads = 0
73
- # SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
74
- # print("βœ… ENGINE READY")
75
-
76
- # # --- CORE LOGIC (Shared by UI and API) ---
77
- # @lru_cache(maxsize=5000)
78
- # def get_tokens(text):
79
- # if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkΙ™ΙΉO")
80
- # phonemes, _ = G2P(text)
81
- # return [TOKENIZER.get(p, 0) for p in phonemes]
82
-
83
- # def trim_silence(audio, threshold=0.01):
84
- # if audio.size == 0: return audio
85
- # mask = np.abs(audio) > threshold
86
- # if not np.any(mask): return audio
87
- # start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
88
- # return audio[max(0, start-50) : min(len(audio), end+50)]
89
-
90
- # def infer(text, voice_name, speed):
91
- # if not text.strip(): return None
92
- # ids = get_tokens(text)[:510]
93
- # if not ids: return None
94
- # voice = get_voice(voice_name)
95
- # style = voice[min(len(ids), voice.shape[0]-1)]
96
- # try:
97
- # audio = SESSION.run(None, {
98
- # "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
99
- # "style": style,
100
- # "speed": np.array([speed], dtype=np.float32)
101
- # })[0]
102
- # return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
103
- # except: return None
104
-
105
- # def tuned_splitter(text):
106
- # chunks = re.split(r'([.,!?;:\n]+)', text)
107
- # buffer = ""
108
- # chunk_count = 0
109
- # for part in chunks:
110
- # buffer += part
111
- # if chunk_count == 0: threshold = 50
112
- # elif chunk_count == 1: threshold = 100
113
- # elif chunk_count == 2: threshold = 150
114
- # else: threshold = 250
115
- # if re.search(r'[.,!?;:\n]$', buffer) and len(buffer) >= threshold:
116
- # if buffer.strip():
117
- # yield buffer
118
- # chunk_count += 1
119
- # buffer = ""
120
- # if buffer.strip():
121
- # yield buffer.strip()
122
-
123
- # def stream_generator(text, voice_name, speed):
124
- # print("--- START STREAM ---")
125
- # get_voice(voice_name)
126
- # for i, chunk in enumerate(tuned_splitter(text)):
127
- # t0 = time.time()
128
- # audio = infer(chunk, voice_name, speed)
129
- # if audio:
130
- # dur = time.time() - t0
131
- # print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
132
- # yield audio
133
- # print("--- END STREAM ---")
134
-
135
- # # --- UI DEFINITION ---
136
- # with gr.Blocks(title="Kokoro TTS") as app:
137
- # gr.Markdown("## ⚑ Kokoro-82M (High-RAM Tuned)")
138
- # with gr.Row():
139
- # with gr.Column():
140
- # text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
141
- # voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
142
- # speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
143
- # btn = gr.Button("Generate", variant="primary")
144
- # with gr.Column():
145
- # audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
146
- # btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
147
-
148
- # # --- API INTEGRATION ---
149
- # # --- API INTEGRATION ---
150
- # from concurrent.futures import ThreadPoolExecutor
151
-
152
- # # 1. Define FastAPI
153
- # api = FastAPI()
154
-
155
- # # 2. Define Worker Pools
156
- # # We use max_workers=1 because ONNX is already multithreaded internally.
157
- # # Adding more workers on a 2 vCPU machine will actually SLOW it down due to context switching.
158
- # INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
159
- # G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
160
- # INFERENCE_QUEUE = asyncio.Queue()
161
-
162
- # # 3. Background Tasks
163
- # def g2p_task(text):
164
- # # Reuses the exact same G2P/Tokenizer logic as the UI
165
- # if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkΙ™ΙΉO")
166
- # phonemes, _ = G2P(text)
167
- # return [TOKENIZER.get(p, 0) for p in phonemes]
168
-
169
- # # This is the "Engine Room". It pulls tickets and cooks them one by one.
170
- # async def audio_engine_loop():
171
- # print("⚑ API AUDIO PIPELINE STARTED")
172
- # loop = asyncio.get_running_loop()
173
-
174
- # while True:
175
- # # Wait for a ticket (text tokens + websocket connection)
176
- # job = await INFERENCE_QUEUE.get()
177
- # tokens, style, speed, ws = job
178
-
179
- # try:
180
- # # Check if client is still connected before doing heavy math
181
- # # (FastAPI WS state: 1 = Connected, 2/3 = Closing/Closed)
182
- # if ws.client_state.value > 1:
183
- # continue
184
-
185
- # # Reuses the exact same SESSION as the UI
186
- # input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
187
- # style_vec = style[min(len(tokens), style.shape[0]-1)]
188
-
189
- # # --- CRITICAL FIX: Run blocking math in a separate thread ---
190
- # # This allows the main server to keep talking to the other 59 users
191
- # # while this calculation happens in the background.
192
- # audio = await loop.run_in_executor(
193
- # INFERENCE_EXECUTOR,
194
- # lambda: SESSION.run(None, {
195
- # "input_ids": input_ids,
196
- # "style": style_vec,
197
- # "speed": np.array([speed], dtype=np.float32)
198
- # })[0]
199
- # )
200
-
201
- # # Post-Process (Fast enough to run on main thread)
202
- # pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()
203
-
204
- # # Send audio back to the specific user who asked for it
205
- # try:
206
- # await ws.send_bytes(pcm_bytes)
207
- # except Exception:
208
- # # If sending fails, just move on. Don't crash the engine.
209
- # pass
210
-
211
- # except Exception as e:
212
- # print(f"API Engine Error: {e}")
213
-
214
- # @api.on_event("startup")
215
- # async def startup():
216
- # asyncio.create_task(audio_engine_loop())
217
-
218
- # # -------------------------------------------------------
219
- # # ROBUST WEBSOCKET ENDPOINT
220
- # # -------------------------------------------------------
221
- # @api.websocket("/ws/audio")
222
- # async def websocket_endpoint(ws: WebSocket):
223
- # await ws.accept()
224
-
225
- # # Defaults
226
- # voice_key = "af_bella"
227
- # speed = 1.0
228
- # loop = asyncio.get_running_loop()
229
-
230
- # print(f"βœ… Client connected: {ws.client}")
231
-
232
- # # --- HEARTBEAT KEEPER ---
233
- # # This prevents HF Nginx from killing the connection during silence.
234
- # async def keep_alive():
235
- # while True:
236
- # try:
237
- # await asyncio.sleep(15) # Send a ping every 15s
238
- # # We send a text frame as a ping. The browser ignores it or handles it.
239
- # await ws.send_json({"type": "ping"})
240
- # except:
241
- # break
242
-
243
- # heartbeat_task = asyncio.create_task(keep_alive())
244
-
245
- # try:
246
- # while True:
247
- # try:
248
- # # Wait for JSON command
249
- # data = await ws.receive_json()
250
- # except WebSocketDisconnect:
251
- # print("❌ Client disconnected cleanly")
252
- # break # BREAK THE LOOP
253
- # except Exception as e:
254
- # print(f"⚠️ Connection lost: {e}")
255
- # break # BREAK THE LOOP
256
-
257
- # # 1. Config Change
258
- # if "config" in data:
259
- # voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
260
- # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
261
- # get_voice(voice_name)
262
- # voice_key = voice_code
263
- # speed = float(data.get("speed", speed))
264
- # # print(f"βš™οΈ Config updated: {voice_key}") # Commented out to reduce log noise
265
-
266
- # # 2. Text Stream
267
- # if "text" in data:
268
- # text = data["text"]
269
- # # The splitter breaks "500 words" into small sentences.
270
- # # These small sentences are added to the queue instantly.
271
- # for chunk in tuned_splitter(text):
272
- # if chunk.strip():
273
- # # Run G2P in thread to avoid blocking input
274
- # tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
275
- # if tokens:
276
- # style = VOICE_CACHE.get(voice_key)
277
- # if style is None:
278
- # get_voice(voice_key)
279
- # style = VOICE_CACHE.get(voice_key)
280
-
281
- # # Put the ticket in the global queue
282
- # await INFERENCE_QUEUE.put((tokens, style, speed, ws))
283
-
284
- # if "flush" in data:
285
- # pass
286
-
287
- # except Exception as e:
288
- # print(f"πŸ”₯ Critical WS Error: {e}")
289
- # finally:
290
- # heartbeat_task.cancel() # Clean up the heartbeat task
291
-
292
- # # --- FINAL MOUNT ---
293
- # final_app = gr.mount_gradio_app(api, app, path="/")
294
-
295
- # if __name__ == "__main__":
296
- # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
  import json
299
  import time
300
  import re
301
  import numpy as np
 
302
  import gradio as gr
 
 
303
  from functools import lru_cache
304
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
305
  import asyncio
@@ -307,11 +14,10 @@ import uvloop
307
  import uvicorn
308
  from concurrent.futures import ThreadPoolExecutor
309
 
310
- # πŸ”₯ USE KOKORO PIPELINE INSTEAD OF RAW MISAKI
311
- from kokoro import KPipeline
312
-
313
  # --- CONFIGURATION ---
314
- SAMPLE_RATE = 24000
 
 
315
 
316
  # --- VOICE UI ---
317
  VOICE_CHOICES = {
@@ -328,50 +34,75 @@ VOICE_CHOICES = {
328
  }
329
 
330
  # --- ENGINE ---
331
- print("πŸš€ BOOTING KOKORO PIPELINE ENGINE...")
 
332
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
333
 
334
- # Initialize KPipeline - this handles espeak fallback automatically!
335
- PIPELINE = KPipeline(lang_code='a') # 'a' = American English
336
- print("βœ… KOKORO PIPELINE READY")
337
 
338
- # --- CORE LOGIC ---
339
- def generate_audio(text, voice_name, speed):
340
- """Generate audio using KPipeline - handles all phonemes properly!"""
341
- if not text or not text.strip():
342
- return None
343
-
344
- voice = VOICE_CHOICES.get(voice_name, voice_name)
345
-
346
- try:
347
- # KPipeline returns generator of (graphemes, phonemes, audio)
348
- audio_chunks = []
349
- for gs, ps, audio in PIPELINE(text, voice=voice, speed=speed):
350
- if audio is not None and len(audio) > 0:
351
- audio_chunks.append(audio)
352
-
353
- if not audio_chunks:
354
- return None
355
-
356
- # Concatenate all audio chunks
357
- full_audio = np.concatenate(audio_chunks)
358
- return full_audio
359
-
360
- except Exception as e:
361
- print(f"⚠️ Audio generation error: {e}")
362
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
  def trim_silence(audio, threshold=0.01):
365
- if audio is None or audio.size == 0:
366
- return audio
367
  mask = np.abs(audio) > threshold
368
- if not np.any(mask):
369
- return audio
370
  start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
371
  return audio[max(0, start-50) : min(len(audio), end+50)]
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  def tuned_splitter(text):
374
- """Split text into chunks for streaming"""
375
  chunks = re.split(r'([.,!?;:\n]+)', text)
376
  buffer = ""
377
  chunk_count = 0
@@ -390,26 +121,23 @@ def tuned_splitter(text):
390
  yield buffer.strip()
391
 
392
  def stream_generator(text, voice_name, speed):
393
- """Generate audio stream for Gradio UI"""
394
- print(f"--- START STREAM: {text[:50]}... ---")
395
  for i, chunk in enumerate(tuned_splitter(text)):
396
  t0 = time.time()
397
- audio = generate_audio(chunk, voice_name, speed)
398
- if audio is not None and len(audio) > 0:
399
- audio = trim_silence(audio)
400
  dur = time.time() - t0
401
  print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
402
- # Convert to int16 for audio output
403
- audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
404
- yield (SAMPLE_RATE, audio_int16)
405
  print("--- END STREAM ---")
406
 
407
  # --- UI DEFINITION ---
408
  with gr.Blocks(title="Kokoro TTS") as app:
409
- gr.Markdown("## ⚑ Kokoro-82M with KPipeline (Proper Name Support!)")
410
  with gr.Row():
411
  with gr.Column():
412
- text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! My name is Yaman and I work at Willo. Testing pronunciation of names!")
413
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
414
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
415
  btn = gr.Button("Generate", variant="primary")
@@ -418,59 +146,96 @@ with gr.Blocks(title="Kokoro TTS") as app:
418
  btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
419
 
420
  # --- API INTEGRATION ---
 
 
 
 
421
  api = FastAPI()
422
 
 
 
 
423
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
424
  INFERENCE_QUEUE = asyncio.Queue()
425
 
 
 
 
 
 
 
 
 
426
  async def audio_engine_loop():
427
- """Background worker that processes audio requests"""
428
  print("⚑ API AUDIO PIPELINE STARTED")
429
  loop = asyncio.get_running_loop()
430
 
431
  while True:
 
432
  job = await INFERENCE_QUEUE.get()
433
- text, voice, speed, ws = job
434
 
435
  try:
 
 
436
  if ws.client_state.value > 1:
437
  continue
438
 
439
- # Generate audio using KPipeline (in thread to not block)
 
 
 
 
 
 
440
  audio = await loop.run_in_executor(
441
- INFERENCE_EXECUTOR,
442
- lambda: generate_audio(text, voice, speed)
 
 
 
 
443
  )
444
 
445
- if audio is not None and len(audio) > 0:
446
- audio = trim_silence(audio)
447
- pcm_bytes = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
 
 
 
 
 
 
448
 
449
- try:
450
- await ws.send_bytes(pcm_bytes)
451
- except Exception:
452
- pass
453
-
454
  except Exception as e:
455
- print(f"⚠️ API Engine Error: {e}")
456
 
457
  @api.on_event("startup")
458
  async def startup():
459
  asyncio.create_task(audio_engine_loop())
460
 
 
 
 
461
  @api.websocket("/ws/audio")
462
  async def websocket_endpoint(ws: WebSocket):
463
  await ws.accept()
464
 
 
465
  voice_key = "af_bella"
466
  speed = 1.0
 
467
 
468
  print(f"βœ… Client connected: {ws.client}")
469
 
 
 
470
  async def keep_alive():
471
  while True:
472
  try:
473
- await asyncio.sleep(15)
 
474
  await ws.send_json({"type": "ping"})
475
  except:
476
  break
@@ -480,35 +245,49 @@ async def websocket_endpoint(ws: WebSocket):
480
  try:
481
  while True:
482
  try:
 
483
  data = await ws.receive_json()
484
  except WebSocketDisconnect:
485
  print("❌ Client disconnected cleanly")
486
- break
487
  except Exception as e:
488
  print(f"⚠️ Connection lost: {e}")
489
- break
490
 
 
491
  if "config" in data:
492
  voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
493
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
 
494
  voice_key = voice_code
495
  speed = float(data.get("speed", speed))
 
496
 
 
497
  if "text" in data:
498
  text = data["text"]
 
 
499
  for chunk in tuned_splitter(text):
500
  if chunk.strip():
501
- await INFERENCE_QUEUE.put((chunk, voice_key, speed, ws))
 
 
 
 
 
 
 
 
 
502
 
503
  if "flush" in data:
504
  pass
505
 
506
  except Exception as e:
507
  print(f"πŸ”₯ Critical WS Error: {e}")
508
- import traceback
509
- traceback.print_exc()
510
  finally:
511
- heartbeat_task.cancel()
512
 
513
  # --- FINAL MOUNT ---
514
  final_app = gr.mount_gradio_app(api, app, path="/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import time
4
  import re
5
  import numpy as np
6
+ import onnxruntime as ort
7
  import gradio as gr
8
+ from huggingface_hub import hf_hub_download
9
+ from misaki import en
10
  from functools import lru_cache
11
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
  import asyncio
 
14
  import uvicorn
15
  from concurrent.futures import ThreadPoolExecutor
16
 
 
 
 
17
  # --- CONFIGURATION ---
18
+ MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
19
+ MODEL_FILE = "onnx/model.onnx"
20
+ TOKENIZER_FILE = "tokenizer.json"
21
 
22
  # --- VOICE UI ---
23
  VOICE_CHOICES = {
 
34
  }
35
 
36
# --- ENGINE ---
print("πŸš€ BOOTING HIGH-RAM ENGINE...")
# Enable fast networking immediately: uvloop replaces the default asyncio event loop.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# 1. Phonemizer (grapheme-to-phoneme).
# trf=False skips the transformer model; british=False selects American English;
# fallback=None means no secondary phonemizer for out-of-vocabulary words.
G2P = en.G2P(trf=False, british=False, fallback=None)

# 2. Tokenizer: phoneme -> integer id mapping loaded from the repo's tokenizer.json.
vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
with open(vocab_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# Handle both tokenizer.json layouts: {"model": {"vocab": ...}} or a flat {"vocab": ...}.
TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
49
+
50
# 3. Voices (Lazy Load)
# Cache of voice style arrays keyed by voice code (e.g. "af_bella").
# The reshape below fixes the layout to (N, 1, 256): one 256-dim style row
# per token-length bucket.
VOICE_CACHE = {}

def get_voice(name):
    """Return the style array for *name* (UI label or raw voice code).

    Downloads the voice file on first use and caches it in VOICE_CACHE.
    On any download/parse failure, falls back to 'af_bella' so callers
    always receive a usable style array.
    """
    code = VOICE_CHOICES.get(name, name)
    if code not in VOICE_CACHE:
        try:
            print(f"⬇️ Loading Voice: {code}")
            path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
            VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
        except Exception as e:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
            # no longer swallowed, and the fallback is logged instead of silent.
            print(f"⚠️ Voice load failed for {code} ({e}); falling back to af_bella")
            if 'af_bella' not in VOICE_CACHE:
                p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
                VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
            return VOICE_CACHE['af_bella']
    return VOICE_CACHE[code]
65
+
66
# 4. ONNX Engine
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Disable spin-waiting so idle intra-op threads don't burn CPU between requests.
sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
# 0 = let onnxruntime choose thread counts from the available cores.
sess_options.intra_op_num_threads = 0
sess_options.inter_op_num_threads = 0
SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
print("βœ… ENGINE READY")
75
+
76
# --- CORE LOGIC (Shared by UI and API) ---
@lru_cache(maxsize=5000)
def get_tokens(text):
    """Phonemize *text* and map each phoneme to its vocab id (0 if unknown); LRU-cached."""
    # Hard-coded pronunciation override for the model's own name.
    text = text.replace("Kokoro", "kˈOkΙ™ΙΉO")
    phonemes, _ = G2P(text)
    return [TOKENIZER.get(ph, 0) for ph in phonemes]
82
 
83
def trim_silence(audio, threshold=0.01):
    """Trim leading/trailing samples whose magnitude is <= *threshold*.

    Keeps a 50-sample pad on each side. Returns the input unchanged when it
    is empty or entirely below the threshold.
    """
    if audio.size == 0:
        return audio
    loud = np.flatnonzero(np.abs(audio) > threshold)
    if loud.size == 0:
        return audio
    first, stop = loud[0], loud[-1] + 1
    lo = max(0, first - 50)
    hi = min(len(audio), stop + 50)
    return audio[lo:hi]
89
 
90
def infer(text, voice_name, speed):
    """Synthesize one chunk of *text*.

    Returns (24000, int16 ndarray) on success, or None for empty input,
    empty token lists, or inference failure.
    """
    if not text.strip():
        return None
    ids = get_tokens(text)[:510]  # model context cap (512 incl. the two pad tokens)
    if not ids:
        return None
    voice = get_voice(voice_name)
    # Style row is indexed by token count, clamped to the table size.
    style = voice[min(len(ids), voice.shape[0] - 1)]
    try:
        audio = SESSION.run(None, {
            "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
            "style": style,
            "speed": np.array([speed], dtype=np.float32),
        })[0]
        return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
    except Exception as e:
        # Narrowed from a bare `except`; log so synthesis failures aren't silent.
        print(f"⚠️ infer failed: {e}")
        return None
104
+
105
def tuned_splitter(text):
    """Yield punctuation-terminated chunks of *text* with growing size thresholds.

    The first chunks use small length limits (50/100/150 chars) so playback can
    start quickly; every later chunk uses 250 chars for throughput. Any trailing
    remainder is yielded stripped.
    """
    _EARLY_LIMITS = (50, 100, 150)
    buf = ""
    emitted = 0
    for piece in re.split(r'([.,!?;:\n]+)', text):
        buf += piece
        limit = _EARLY_LIMITS[emitted] if emitted < 3 else 250
        if len(buf) >= limit and re.search(r'[.,!?;:\n]$', buf) and buf.strip():
            yield buf
            emitted += 1
            buf = ""
    if buf.strip():
        yield buf.strip()
122
 
123
def stream_generator(text, voice_name, speed):
    """Gradio streaming callback: yield (sample_rate, int16 ndarray) per chunk."""
    print("--- START STREAM ---")
    # Warm the voice cache up front so the first chunk isn't delayed by a download.
    get_voice(voice_name)
    for i, chunk in enumerate(tuned_splitter(text)):
        started = time.time()
        result = infer(chunk, voice_name, speed)
        if result is not None:
            dur = time.time() - started
            print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
            yield result
    print("--- END STREAM ---")
134
 
135
  # --- UI DEFINITION ---
136
  with gr.Blocks(title="Kokoro TTS") as app:
137
+ gr.Markdown("## ⚑ Kokoro-82M (High-RAM Tuned)")
138
  with gr.Row():
139
  with gr.Column():
140
+ text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
141
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
142
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
143
  btn = gr.Button("Generate", variant="primary")
 
146
  btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
147
 
148
# --- API INTEGRATION ---
# (Duplicate section banner and the redundant mid-file
#  `from concurrent.futures import ThreadPoolExecutor` removed —
#  ThreadPoolExecutor is already imported at the top of the file.)

# 1. FastAPI app: hosts the WebSocket API; the Gradio UI is mounted onto it below.
api = FastAPI()

# 2. Worker pools.
# max_workers=1 because onnxruntime is already multithreaded internally;
# extra workers on a small vCPU machine only add context-switch overhead.
INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
INFERENCE_QUEUE = asyncio.Queue()

# 3. Background task: text -> token ids.
def g2p_task(text):
    """Tokenize *text* for the API path.

    Delegates to get_tokens so the API and UI share one G2P/tokenizer
    implementation — and its LRU cache — instead of duplicating the logic.
    """
    return get_tokens(text)
168
+
169
# The "Engine Room": pulls jobs off the global queue and synthesizes them one by one.
async def audio_engine_loop():
    """Forever-running consumer: (tokens, style, speed, ws) -> PCM bytes over ws."""
    print("⚑ API AUDIO PIPELINE STARTED")
    loop = asyncio.get_running_loop()

    while True:
        # Wait for a ticket (token ids + style array + speed + websocket connection).
        job = await INFERENCE_QUEUE.get()
        tokens, style, speed, ws = job

        try:
            # Skip the heavy math if the client already left.
            # (Starlette WS state: 1 = CONNECTED, 2/3 = closing/closed.)
            if ws.client_state.value > 1:
                continue

            # Same SESSION as the UI path; 510-token cap plus start/end pad tokens.
            input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
            # Style row indexed by token count, clamped to the table size.
            style_vec = style[min(len(tokens), style.shape[0]-1)]

            # Run the blocking ONNX call in a worker thread so the event loop
            # stays responsive to the other websocket clients.
            audio = await loop.run_in_executor(
                INFERENCE_EXECUTOR,
                lambda: SESSION.run(None, {
                    "input_ids": input_ids,
                    "style": style_vec,
                    "speed": np.array([speed], dtype=np.float32)
                })[0]
            )

            # Post-process on the event loop (cheap relative to inference).
            # NOTE(review): assumes audio[0] is a float waveform roughly in [-1, 1]
            # — matches the UI path's handling; confirm against the model spec.
            pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()

            # Deliver to the requesting client; a failed send must not kill the engine.
            try:
                await ws.send_bytes(pcm_bytes)
            except Exception:
                # If sending fails, just move on. Don't crash the engine.
                pass

        except Exception as e:
            # Keep the consumer alive no matter what a single job does.
            print(f"API Engine Error: {e}")
213
 
214
@api.on_event("startup")
async def startup():
    """Launch the background inference consumer once the event loop exists."""
    # NOTE(review): the Task handle is not stored; consider keeping a reference
    # so it cannot be garbage-collected and can be cancelled on shutdown.
    # on_event is deprecated in newer FastAPI in favor of lifespan handlers.
    asyncio.create_task(audio_engine_loop())
217
 
218
# -------------------------------------------------------
# ROBUST WEBSOCKET ENDPOINT
# -------------------------------------------------------
@api.websocket("/ws/audio")
async def websocket_endpoint(ws: WebSocket):
    """Per-client command loop: accepts JSON messages and enqueues synthesis jobs.

    Message shapes handled:
      {"config": ..., "voice": <UI label>, "speed": <float>}  -> update session defaults
      {"text": <str>}                                         -> split, tokenize, enqueue
      {"flush": ...}                                          -> currently a no-op
    """
    await ws.accept()

    # Per-connection defaults.
    voice_key = "af_bella"
    speed = 1.0
    loop = asyncio.get_running_loop()

    print(f"βœ… Client connected: {ws.client}")

    # --- HEARTBEAT KEEPER ---
    # Prevents the reverse proxy from killing the connection during silence.
    async def keep_alive():
        while True:
            try:
                await asyncio.sleep(15)  # Send a ping every 15s
                # A text frame serves as the ping; clients may ignore it.
                await ws.send_json({"type": "ping"})
            except:
                # Any failure (closed socket, cancellation) ends the heartbeat.
                break

    heartbeat_task = asyncio.create_task(keep_alive())

    try:
        while True:
            try:
                # Wait for the next JSON command from the client.
                data = await ws.receive_json()
            except WebSocketDisconnect:
                print("❌ Client disconnected cleanly")
                break  # BREAK THE LOOP
            except Exception as e:
                print(f"⚠️ Connection lost: {e}")
                break  # BREAK THE LOOP

            # 1. Config change: resolve the UI label to a voice code and prefetch it.
            if "config" in data:
                voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                get_voice(voice_name)
                voice_key = voice_code
                speed = float(data.get("speed", speed))

            # 2. Text stream: split into sentence-sized chunks and enqueue each
            # one immediately so synthesis overlaps with further input.
            if "text" in data:
                text = data["text"]
                for chunk in tuned_splitter(text):
                    if chunk.strip():
                        # Run G2P in a worker thread to avoid blocking input handling.
                        tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
                        if tokens:
                            style = VOICE_CACHE.get(voice_key)
                            if style is None:
                                # Lazy-load on first use of this voice.
                                get_voice(voice_key)
                                style = VOICE_CACHE.get(voice_key)

                            # Hand the ticket to the shared engine loop.
                            await INFERENCE_QUEUE.put((tokens, style, speed, ws))

            # 3. Flush: accepted but intentionally a no-op for now.
            if "flush" in data:
                pass

    except Exception as e:
        print(f"πŸ”₯ Critical WS Error: {e}")
    finally:
        heartbeat_task.cancel()  # Clean up the heartbeat task
291
 
292
# --- FINAL MOUNT ---
# Serve the Gradio UI at "/" on the same FastAPI app that exposes /ws/audio.
# NOTE(review): no `if __name__ == "__main__": uvicorn.run(...)` block here —
# presumably the hosting platform serves `final_app` directly; confirm.
final_app = gr.mount_gradio_app(api, app, path="/")