auralodyssey commited on
Commit
70de827
·
verified ·
1 Parent(s): ee7f838

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +527 -223
app.py CHANGED
@@ -1,25 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import json
3
- import time
4
  import re
 
 
 
 
 
5
  import numpy as np
6
- import onnxruntime as ort
7
  import gradio as gr
8
- from huggingface_hub import hf_hub_download
9
- from misaki import en
10
- from functools import lru_cache
11
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
- import asyncio
13
- import uvloop
14
  import uvicorn
15
- from concurrent.futures import ThreadPoolExecutor
16
 
17
- # --- CONFIGURATION ---
18
- MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
19
- MODEL_FILE = "onnx/model.onnx"
20
- TOKENIZER_FILE = "tokenizer.json"
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # --- VOICE UI ---
 
 
 
 
 
 
 
23
  VOICE_CHOICES = {
24
  '🇺🇸 🚺 Heart': 'af_heart', '🇺🇸 🚺 Bella': 'af_bella', '🇺🇸 🚺 Nicole': 'af_nicole',
25
  '🇺🇸 🚺 Aoede': 'af_aoede', '🇺🇸 🚺 Kore': 'af_kore', '🇺🇸 🚺 Sarah': 'af_sarah',
@@ -33,263 +344,256 @@ VOICE_CHOICES = {
33
  '🇬🇧 🚹 Daniel': 'bm_daniel',
34
  }
35
 
36
- # --- ENGINE ---
37
- print("🚀 BOOTING HIGH-RAM ENGINE...")
38
- # Enable fast networking immediately
39
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
40
-
41
- # 1. Phonemizer
42
- G2P = en.G2P(trf=False, british=False, fallback=None)
43
-
44
- # 2. Tokenizer
45
- vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
46
- with open(vocab_path, "r", encoding="utf-8") as f:
47
- data = json.load(f)
48
- TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
49
-
50
- # 3. Voices (Lazy Load)
51
- VOICE_CACHE = {}
52
- def get_voice(name):
53
- code = VOICE_CHOICES.get(name, name)
54
- if code not in VOICE_CACHE:
55
- try:
56
- print(f"⬇️ Loading Voice: {code}")
57
- path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
58
- VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
59
- except:
60
- if 'af_bella' not in VOICE_CACHE:
61
- p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
62
- VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
63
- return VOICE_CACHE['af_bella']
64
- return VOICE_CACHE[code]
65
-
66
- # 4. ONNX Engine
67
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
68
- sess_options = ort.SessionOptions()
69
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
70
- sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
71
- sess_options.intra_op_num_threads = 0
72
- sess_options.inter_op_num_threads = 0
73
- SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
74
- print("✅ ENGINE READY")
75
-
76
- # --- CORE LOGIC (Shared by UI and API) ---
77
- @lru_cache(maxsize=5000)
78
- def get_tokens(text):
79
- if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
80
- phonemes, _ = G2P(text)
81
- return [TOKENIZER.get(p, 0) for p in phonemes]
82
-
83
- def trim_silence(audio, threshold=0.01):
84
- if audio.size == 0: return audio
85
- mask = np.abs(audio) > threshold
86
- if not np.any(mask): return audio
87
- start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
88
- return audio[max(0, start-50) : min(len(audio), end+50)]
89
-
90
- def infer(text, voice_name, speed):
91
- if not text.strip(): return None
92
- ids = get_tokens(text)[:510]
93
- if not ids: return None
94
- voice = get_voice(voice_name)
95
- style = voice[min(len(ids), voice.shape[0]-1)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  try:
97
- audio = SESSION.run(None, {
98
- "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
99
- "style": style,
100
- "speed": np.array([speed], dtype=np.float32)
101
- })[0]
102
- return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
103
- except: return None
104
-
105
- def tuned_splitter(text):
106
- chunks = re.split(r'([.,!?;:\n]+)', text)
107
- buffer = ""
108
- chunk_count = 0
109
- for part in chunks:
110
- buffer += part
111
- if chunk_count == 0: threshold = 50
112
- elif chunk_count == 1: threshold = 100
113
- elif chunk_count == 2: threshold = 150
114
- else: threshold = 250
115
- if re.search(r'[.,!?;:\n]$', buffer) and len(buffer) >= threshold:
116
- if buffer.strip():
117
- yield buffer
118
- chunk_count += 1
119
- buffer = ""
120
- if buffer.strip():
121
- yield buffer.strip()
122
-
123
- def stream_generator(text, voice_name, speed):
124
- print("--- START STREAM ---")
125
- get_voice(voice_name)
126
- for i, chunk in enumerate(tuned_splitter(text)):
127
- t0 = time.time()
128
- audio = infer(chunk, voice_name, speed)
129
- if audio:
130
- dur = time.time() - t0
131
- print(f"⚡ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
132
- yield audio
133
- print("--- END STREAM ---")
134
-
135
- # --- UI DEFINITION ---
136
- with gr.Blocks(title="Kokoro TTS") as app:
137
- gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
138
- with gr.Row():
139
- with gr.Column():
140
- text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
141
- voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
142
- speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
143
- btn = gr.Button("Generate", variant="primary")
144
- with gr.Column():
145
- audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
146
- btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
147
 
148
- # --- API INTEGRATION ---
149
- # --- API INTEGRATION ---
150
- from concurrent.futures import ThreadPoolExecutor
151
 
152
- # 1. Define FastAPI
153
- api = FastAPI()
154
 
155
- # 2. Define Worker Pools
156
- # We use max_workers=1 because ONNX is already multithreaded internally.
157
- # Adding more workers on a 2 vCPU machine will actually SLOW it down due to context switching.
158
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
159
  G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
160
- INFERENCE_QUEUE = asyncio.Queue()
161
-
162
- # 3. Background Tasks
163
- def g2p_task(text):
164
- # Reuses the exact same G2P/Tokenizer logic as the UI
165
- if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
166
- phonemes, _ = G2P(text)
167
- return [TOKENIZER.get(p, 0) for p in phonemes]
168
 
169
- # This is the "Engine Room". It pulls tickets and cooks them one by one.
170
  async def audio_engine_loop():
171
  print("⚡ API AUDIO PIPELINE STARTED")
 
172
  loop = asyncio.get_running_loop()
173
-
174
  while True:
175
- # Wait for a ticket (text tokens + websocket connection)
176
  job = await INFERENCE_QUEUE.get()
177
- tokens, style, speed, ws = job
178
-
179
- try:
180
- # Check if client is still connected before doing heavy math
181
- # (FastAPI WS state: 1 = Connected, 2/3 = Closing/Closed)
 
 
182
  if ws.client_state.value > 1:
183
- continue
184
 
185
- # Reuses the exact same SESSION as the UI
186
- input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
187
- style_vec = style[min(len(tokens), style.shape[0]-1)]
188
-
189
- # --- CRITICAL FIX: Run blocking math in a separate thread ---
190
- # This allows the main server to keep talking to the other 59 users
191
- # while this calculation happens in the background.
192
- audio = await loop.run_in_executor(
193
- INFERENCE_EXECUTOR,
194
- lambda: SESSION.run(None, {
195
- "input_ids": input_ids,
196
- "style": style_vec,
197
- "speed": np.array([speed], dtype=np.float32)
198
- })[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  )
200
-
201
- # Post-Process (Fast enough to run on main thread)
202
- pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()
203
-
204
- # Send audio back to the specific user who asked for it
205
- try:
206
- await ws.send_bytes(pcm_bytes)
207
- except Exception:
208
- # If sending fails, just move on. Don't crash the engine.
209
- pass
210
-
211
- except Exception as e:
212
- print(f"API Engine Error: {e}")
213
 
214
  @api.on_event("startup")
215
  async def startup():
216
  asyncio.create_task(audio_engine_loop())
217
 
218
- # -------------------------------------------------------
219
- # ROBUST WEBSOCKET ENDPOINT
220
- # -------------------------------------------------------
221
  @api.websocket("/ws/audio")
222
  async def websocket_endpoint(ws: WebSocket):
223
  await ws.accept()
224
-
225
- # Defaults
226
- voice_key = "af_bella"
227
  speed = 1.0
228
  loop = asyncio.get_running_loop()
229
-
230
- print(f"✅ Client connected: {ws.client}")
231
 
232
- # --- HEARTBEAT KEEPER ---
233
- # This prevents HF Nginx from killing the connection during silence.
234
- async def keep_alive():
235
- while True:
236
- try:
237
- await asyncio.sleep(15) # Send a ping every 15s
238
- # We send a text frame as a ping. The browser ignores it or handles it.
239
- await ws.send_json({"type": "ping"})
240
- except:
241
- break
242
-
243
- heartbeat_task = asyncio.create_task(keep_alive())
244
 
245
  try:
246
  while True:
247
  try:
248
- # Wait for JSON command
249
  data = await ws.receive_json()
250
  except WebSocketDisconnect:
251
  print("❌ Client disconnected cleanly")
252
- break # BREAK THE LOOP
253
  except Exception as e:
254
  print(f"⚠️ Connection lost: {e}")
255
- break # BREAK THE LOOP
256
 
257
- # 1. Config Change
258
  if "config" in data:
259
  voice_name = data.get("voice", "🇺🇸 🚺 Bella")
260
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
261
- get_voice(voice_name)
262
- voice_key = voice_code
263
  speed = float(data.get("speed", speed))
264
- # print(f"⚙️ Config updated: {voice_key}") # Commented out to reduce log noise
265
-
266
- # 2. Text Stream
267
  if "text" in data:
268
  text = data["text"]
269
- # The splitter breaks "500 words" into small sentences.
270
- # These small sentences are added to the queue instantly.
271
- for chunk in tuned_splitter(text):
272
- if chunk.strip():
273
- # Run G2P in thread to avoid blocking input
274
- tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
275
- if tokens:
276
- style = VOICE_CACHE.get(voice_key)
277
- if style is None:
278
- get_voice(voice_key)
279
- style = VOICE_CACHE.get(voice_key)
280
-
281
- # Put the ticket in the global queue
282
- await INFERENCE_QUEUE.put((tokens, style, speed, ws))
283
-
 
 
 
 
 
 
 
 
 
284
  if "flush" in data:
 
285
  pass
286
 
287
  except Exception as e:
288
  print(f"🔥 Critical WS Error: {e}")
289
- finally:
290
- heartbeat_task.cancel() # Clean up the heartbeat task
291
 
292
- # --- FINAL MOUNT ---
293
  final_app = gr.mount_gradio_app(api, app, path="/")
294
 
295
  if __name__ == "__main__":
 
1
+ # import os
2
+ # import json
3
+ # import time
4
+ # import re
5
+ # import numpy as np
6
+ # import onnxruntime as ort
7
+ # import gradio as gr
8
+ # from huggingface_hub import hf_hub_download
9
+ # from misaki import en
10
+ # from functools import lru_cache
11
+ # from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
+ # import asyncio
13
+ # import uvloop
14
+ # import uvicorn
15
+ # from concurrent.futures import ThreadPoolExecutor
16
+
17
+ # # --- CONFIGURATION ---
18
+ # MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
19
+ # MODEL_FILE = "onnx/model.onnx"
20
+ # TOKENIZER_FILE = "tokenizer.json"
21
+
22
+ # # --- VOICE UI ---
23
+ # VOICE_CHOICES = {
24
+ # '🇺🇸 🚺 Heart': 'af_heart', '🇺🇸 🚺 Bella': 'af_bella', '🇺🇸 🚺 Nicole': 'af_nicole',
25
+ # '🇺🇸 🚺 Aoede': 'af_aoede', '🇺🇸 🚺 Kore': 'af_kore', '🇺🇸 🚺 Sarah': 'af_sarah',
26
+ # '🇺🇸 🚺 Nova': 'af_nova', '🇺🇸 🚺 Sky': 'af_sky', '🇺🇸 🚺 Alloy': 'af_alloy',
27
+ # '🇺🇸 🚺 Jessica': 'af_jessica', '🇺🇸 🚺 River': 'af_river', '🇺🇸 🚹 Michael': 'am_michael',
28
+ # '🇺🇸 🚹 Fenrir': 'am_fenrir', '🇺🇸 🚹 Puck': 'am_puck', '🇺🇸 🚹 Echo': 'am_echo',
29
+ # '🇺🇸 🚹 Eric': 'am_eric', '🇺🇸 🚹 Liam': 'am_liam', '🇺🇸 🚹 Onyx': 'am_onyx',
30
+ # '🇺🇸 🚹 Santa': 'am_santa', '🇺🇸 🚹 Adam': 'am_adam', '🇬🇧 🚺 Emma': 'bf_emma',
31
+ # '🇬🇧 🚺 Isabella': 'bf_isabella', '🇬🇧 🚺 Alice': 'bf_alice', '🇬🇧 🚺 Lily': 'bf_lily',
32
+ # '🇬🇧 🚹 George': 'bm_george', '🇬🇧 🚹 Fable': 'bm_fable', '🇬🇧 🚹 Lewis': 'bm_lewis',
33
+ # '🇬🇧 🚹 Daniel': 'bm_daniel',
34
+ # }
35
+
36
+ # # --- ENGINE ---
37
+ # print("🚀 BOOTING HIGH-RAM ENGINE...")
38
+ # # Enable fast networking immediately
39
+ # asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
40
+
41
+ # # 1. Phonemizer
42
+ # G2P = en.G2P(trf=False, british=False, fallback=None)
43
+
44
+ # # 2. Tokenizer
45
+ # vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
46
+ # with open(vocab_path, "r", encoding="utf-8") as f:
47
+ # data = json.load(f)
48
+ # TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
49
+
50
+ # # 3. Voices (Lazy Load)
51
+ # VOICE_CACHE = {}
52
+ # def get_voice(name):
53
+ # code = VOICE_CHOICES.get(name, name)
54
+ # if code not in VOICE_CACHE:
55
+ # try:
56
+ # print(f"⬇️ Loading Voice: {code}")
57
+ # path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
58
+ # VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
59
+ # except:
60
+ # if 'af_bella' not in VOICE_CACHE:
61
+ # p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
62
+ # VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
63
+ # return VOICE_CACHE['af_bella']
64
+ # return VOICE_CACHE[code]
65
+
66
+ # # 4. ONNX Engine
67
+ # model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
68
+ # sess_options = ort.SessionOptions()
69
+ # sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
70
+ # sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
71
+ # sess_options.intra_op_num_threads = 0
72
+ # sess_options.inter_op_num_threads = 0
73
+ # SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
74
+ # print("✅ ENGINE READY")
75
+
76
+ # # --- CORE LOGIC (Shared by UI and API) ---
77
+ # @lru_cache(maxsize=5000)
78
+ # def get_tokens(text):
79
+ # if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
80
+ # phonemes, _ = G2P(text)
81
+ # return [TOKENIZER.get(p, 0) for p in phonemes]
82
+
83
+ # def trim_silence(audio, threshold=0.01):
84
+ # if audio.size == 0: return audio
85
+ # mask = np.abs(audio) > threshold
86
+ # if not np.any(mask): return audio
87
+ # start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
88
+ # return audio[max(0, start-50) : min(len(audio), end+50)]
89
+
90
+ # def infer(text, voice_name, speed):
91
+ # if not text.strip(): return None
92
+ # ids = get_tokens(text)[:510]
93
+ # if not ids: return None
94
+ # voice = get_voice(voice_name)
95
+ # style = voice[min(len(ids), voice.shape[0]-1)]
96
+ # try:
97
+ # audio = SESSION.run(None, {
98
+ # "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
99
+ # "style": style,
100
+ # "speed": np.array([speed], dtype=np.float32)
101
+ # })[0]
102
+ # return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
103
+ # except: return None
104
+
105
+ # def tuned_splitter(text):
106
+ # chunks = re.split(r'([.,!?;:\n]+)', text)
107
+ # buffer = ""
108
+ # chunk_count = 0
109
+ # for part in chunks:
110
+ # buffer += part
111
+ # if chunk_count == 0: threshold = 50
112
+ # elif chunk_count == 1: threshold = 100
113
+ # elif chunk_count == 2: threshold = 150
114
+ # else: threshold = 250
115
+ # if re.search(r'[.,!?;:\n]$', buffer) and len(buffer) >= threshold:
116
+ # if buffer.strip():
117
+ # yield buffer
118
+ # chunk_count += 1
119
+ # buffer = ""
120
+ # if buffer.strip():
121
+ # yield buffer.strip()
122
+
123
+ # def stream_generator(text, voice_name, speed):
124
+ # print("--- START STREAM ---")
125
+ # get_voice(voice_name)
126
+ # for i, chunk in enumerate(tuned_splitter(text)):
127
+ # t0 = time.time()
128
+ # audio = infer(chunk, voice_name, speed)
129
+ # if audio:
130
+ # dur = time.time() - t0
131
+ # print(f"⚡ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
132
+ # yield audio
133
+ # print("--- END STREAM ---")
134
+
135
+ # # --- UI DEFINITION ---
136
+ # with gr.Blocks(title="Kokoro TTS") as app:
137
+ # gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
138
+ # with gr.Row():
139
+ # with gr.Column():
140
+ # text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
141
+ # voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
142
+ # speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
143
+ # btn = gr.Button("Generate", variant="primary")
144
+ # with gr.Column():
145
+ # audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
146
+ # btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
147
+
148
+ # # --- API INTEGRATION ---
149
+ # # --- API INTEGRATION ---
150
+ # from concurrent.futures import ThreadPoolExecutor
151
+
152
+ # # 1. Define FastAPI
153
+ # api = FastAPI()
154
+
155
+ # # 2. Define Worker Pools
156
+ # # We use max_workers=1 because ONNX is already multithreaded internally.
157
+ # # Adding more workers on a 2 vCPU machine will actually SLOW it down due to context switching.
158
+ # INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
159
+ # G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
160
+ # INFERENCE_QUEUE = asyncio.Queue()
161
+
162
+ # # 3. Background Tasks
163
+ # def g2p_task(text):
164
+ # # Reuses the exact same G2P/Tokenizer logic as the UI
165
+ # if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
166
+ # phonemes, _ = G2P(text)
167
+ # return [TOKENIZER.get(p, 0) for p in phonemes]
168
+
169
+ # # This is the "Engine Room". It pulls tickets and cooks them one by one.
170
+ # async def audio_engine_loop():
171
+ # print("⚡ API AUDIO PIPELINE STARTED")
172
+ # loop = asyncio.get_running_loop()
173
+
174
+ # while True:
175
+ # # Wait for a ticket (text tokens + websocket connection)
176
+ # job = await INFERENCE_QUEUE.get()
177
+ # tokens, style, speed, ws = job
178
+
179
+ # try:
180
+ # # Check if client is still connected before doing heavy math
181
+ # # (FastAPI WS state: 1 = Connected, 2/3 = Closing/Closed)
182
+ # if ws.client_state.value > 1:
183
+ # continue
184
+
185
+ # # Reuses the exact same SESSION as the UI
186
+ # input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
187
+ # style_vec = style[min(len(tokens), style.shape[0]-1)]
188
+
189
+ # # --- CRITICAL FIX: Run blocking math in a separate thread ---
190
+ # # This allows the main server to keep talking to the other 59 users
191
+ # # while this calculation happens in the background.
192
+ # audio = await loop.run_in_executor(
193
+ # INFERENCE_EXECUTOR,
194
+ # lambda: SESSION.run(None, {
195
+ # "input_ids": input_ids,
196
+ # "style": style_vec,
197
+ # "speed": np.array([speed], dtype=np.float32)
198
+ # })[0]
199
+ # )
200
+
201
+ # # Post-Process (Fast enough to run on main thread)
202
+ # pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()
203
+
204
+ # # Send audio back to the specific user who asked for it
205
+ # try:
206
+ # await ws.send_bytes(pcm_bytes)
207
+ # except Exception:
208
+ # # If sending fails, just move on. Don't crash the engine.
209
+ # pass
210
+
211
+ # except Exception as e:
212
+ # print(f"API Engine Error: {e}")
213
+
214
+ # @api.on_event("startup")
215
+ # async def startup():
216
+ # asyncio.create_task(audio_engine_loop())
217
+
218
+ # # -------------------------------------------------------
219
+ # # ROBUST WEBSOCKET ENDPOINT
220
+ # # -------------------------------------------------------
221
+ # @api.websocket("/ws/audio")
222
+ # async def websocket_endpoint(ws: WebSocket):
223
+ # await ws.accept()
224
+
225
+ # # Defaults
226
+ # voice_key = "af_bella"
227
+ # speed = 1.0
228
+ # loop = asyncio.get_running_loop()
229
+
230
+ # print(f"✅ Client connected: {ws.client}")
231
+
232
+ # # --- HEARTBEAT KEEPER ---
233
+ # # This prevents HF Nginx from killing the connection during silence.
234
+ # async def keep_alive():
235
+ # while True:
236
+ # try:
237
+ # await asyncio.sleep(15) # Send a ping every 15s
238
+ # # We send a text frame as a ping. The browser ignores it or handles it.
239
+ # await ws.send_json({"type": "ping"})
240
+ # except:
241
+ # break
242
+
243
+ # heartbeat_task = asyncio.create_task(keep_alive())
244
+
245
+ # try:
246
+ # while True:
247
+ # try:
248
+ # # Wait for JSON command
249
+ # data = await ws.receive_json()
250
+ # except WebSocketDisconnect:
251
+ # print("❌ Client disconnected cleanly")
252
+ # break # BREAK THE LOOP
253
+ # except Exception as e:
254
+ # print(f"⚠️ Connection lost: {e}")
255
+ # break # BREAK THE LOOP
256
+
257
+ # # 1. Config Change
258
+ # if "config" in data:
259
+ # voice_name = data.get("voice", "🇺🇸 🚺 Bella")
260
+ # voice_code = VOICE_CHOICES.get(voice_name, voice_name)
261
+ # get_voice(voice_name)
262
+ # voice_key = voice_code
263
+ # speed = float(data.get("speed", speed))
264
+ # # print(f"⚙️ Config updated: {voice_key}") # Commented out to reduce log noise
265
+
266
+ # # 2. Text Stream
267
+ # if "text" in data:
268
+ # text = data["text"]
269
+ # # The splitter breaks "500 words" into small sentences.
270
+ # # These small sentences are added to the queue instantly.
271
+ # for chunk in tuned_splitter(text):
272
+ # if chunk.strip():
273
+ # # Run G2P in thread to avoid blocking input
274
+ # tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
275
+ # if tokens:
276
+ # style = VOICE_CACHE.get(voice_key)
277
+ # if style is None:
278
+ # get_voice(voice_key)
279
+ # style = VOICE_CACHE.get(voice_key)
280
+
281
+ # # Put the ticket in the global queue
282
+ # await INFERENCE_QUEUE.put((tokens, style, speed, ws))
283
+
284
+ # if "flush" in data:
285
+ # pass
286
+
287
+ # except Exception as e:
288
+ # print(f"🔥 Critical WS Error: {e}")
289
+ # finally:
290
+ # heartbeat_task.cancel() # Clean up the heartbeat task
291
+
292
+ # # --- FINAL MOUNT ---
293
+ # final_app = gr.mount_gradio_app(api, app, path="/")
294
+
295
+ # if __name__ == "__main__":
296
+ # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
 
 
298
  import re
299
+ import time
300
+ import asyncio
301
+ from functools import lru_cache
302
+ from concurrent.futures import ThreadPoolExecutor
303
+
304
  import numpy as np
 
305
  import gradio as gr
 
 
 
306
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 
 
307
  import uvicorn
 
308
 
309
+ # Kokoro official inference lib (PyTorch)
310
+ from kokoro import KPipeline
311
+
312
+ # -----------------------------
313
+ # CONFIG
314
+ # -----------------------------
315
+ KOKORO_REPO_ID = os.getenv("KOKORO_REPO_ID", "hexgrad/Kokoro-82M")
316
+ AUDIO_SR = 24000
317
+
318
+ # Split early to reduce latency on long paragraphs
319
+ # Sentences or newlines
320
+ SPLIT_PATTERN = os.getenv("KOKORO_SPLIT_PATTERN", r"(?<=[.!?])\s+|\n+")
321
+
322
+ # Hard safety caps for HF free tier
323
+ MAX_QUEUE = int(os.getenv("MAX_QUEUE", "100"))
324
+ MAX_CHUNKS_PER_UTTERANCE = int(os.getenv("MAX_CHUNKS_PER_UTTERANCE", "120"))
325
 
326
+ # Keep CPU thread usage predictable on 2 vCPU
327
+ os.environ.setdefault("OMP_NUM_THREADS", "2")
328
+ os.environ.setdefault("MKL_NUM_THREADS", "2")
329
+ os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
330
+
331
+ # -----------------------------
332
+ # VOICES
333
+ # -----------------------------
334
  VOICE_CHOICES = {
335
  '🇺🇸 🚺 Heart': 'af_heart', '🇺🇸 🚺 Bella': 'af_bella', '🇺🇸 🚺 Nicole': 'af_nicole',
336
  '🇺🇸 🚺 Aoede': 'af_aoede', '🇺🇸 🚺 Kore': 'af_kore', '🇺🇸 🚺 Sarah': 'af_sarah',
 
344
  '🇬🇧 🚹 Daniel': 'bm_daniel',
345
  }
346
 
347
+ def _is_uk_voice(voice_code: str) -> bool:
348
+ return voice_code.startswith("bf_") or voice_code.startswith("bm_")
349
+
350
+ # -----------------------------
351
+ # BOOT
352
+ # -----------------------------
353
+ print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
354
+
355
+ # 1) One shared model instance for both pipelines (loads weights once)
356
+ PIPE_A = KPipeline(lang_code="a", repo_id=KOKORO_REPO_ID, trf=False, device="cpu")
357
+ MODEL = PIPE_A.model
358
+ PIPE_B = KPipeline(lang_code="b", repo_id=KOKORO_REPO_ID, trf=False, device="cpu", model=MODEL)
359
+
360
+ # 2) Quiet pipelines for fast G2P + chunking without inference
361
+ QUIET_A = KPipeline(lang_code="a", repo_id=KOKORO_REPO_ID, trf=False, model=False)
362
+ QUIET_B = KPipeline(lang_code="b", repo_id=KOKORO_REPO_ID, trf=False, model=False)
363
+
364
+ # 3) Voice cache (on device)
365
+ VOICE_PACK_CACHE = {}
366
+
367
def _pick_pipes(voice_code: str):
    """Select the (inference pipeline, quiet G2P pipeline) pair for a voice.

    British voices route to the lang_code="b" pipelines; every other voice
    uses the American ("a") pair.
    """
    wants_uk = _is_uk_voice(voice_code)
    return (PIPE_B, QUIET_B) if wants_uk else (PIPE_A, QUIET_A)
371
+
372
def get_voice_pack(voice_code: str):
    """Return the style-vector pack for *voice_code*, memoized in VOICE_PACK_CACHE.

    KPipeline keeps an internal voice cache as well, but pinning our own
    reference avoids re-resolving the voice on every request.
    """
    try:
        return VOICE_PACK_CACHE[voice_code]
    except KeyError:
        pipe, _ = _pick_pipes(voice_code)
        pack = pipe.load_voice(voice_code)
        VOICE_PACK_CACHE[voice_code] = pack
        return pack
380
+
381
# -----------------------------
# TEXT NORMALIZATION
# -----------------------------
# Markdown-link syntax KPipeline accepts for forcing a pronunciation.
_KOKORO_IPA = "[Kokoro](/kˈOkəɹO/)"

# Single-character substitutions for tokens the G2P tends to skip.
_CHAR_FIXES = str.maketrans({"&": " and ", "@": " at ", "_": " "})

def normalize_text(text: str) -> str:
    """Pre-clean *text* for G2P.

    Expands symbols (& -> and), splits CamelCase words, spells out short
    all-caps acronyms, and pins the pronunciation of "Kokoro". Returns ""
    for empty input; whitespace is collapsed to single spaces.
    """
    if not text:
        return ""

    cleaned = text.strip().translate(_CHAR_FIXES)
    # Break CamelCase apart to reduce OOD risk: OpenAI -> Open AI.
    cleaned = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", cleaned)
    # Spell out short acronyms letter by letter: CEO -> C E O.
    cleaned = re.sub(r"\b([A-Z]{2,6})\b", lambda m: " ".join(m.group(1)), cleaned)
    # Force the canonical pronunciation of the model's own name.
    cleaned = re.sub(r"\bKokoro\b", _KOKORO_IPA, cleaned)
    # Collapse any whitespace runs the substitutions introduced.
    return re.sub(r"\s+", " ", cleaned).strip()
409
+
410
+ # -----------------------------
411
+ # CHUNKING: text -> phoneme chunks
412
+ # -----------------------------
413
@lru_cache(maxsize=2000)
def _split_segments(text: str):
    """Split *text* on SPLIT_PATTERN into non-empty, stripped segments.

    Returns a tuple rather than a list: lru_cache hands the *same* cached
    object back to every caller, so the cached value must be immutable —
    otherwise one caller mutating the result would corrupt later cache hits.
    Callers only iterate the result, so this is backward-compatible.
    """
    parts = re.split(SPLIT_PATTERN, text)
    return tuple(p.strip() for p in parts if p and p.strip())
418
+
419
def text_to_phoneme_chunks(text: str, voice_code: str):
    """Convert raw text into a list of phoneme strings ready for inference.

    Uses the model-free ("quiet") pipeline for G2P + tokenization, so this
    is cheap enough to run in the G2P executor. Output is capped at
    MAX_CHUNKS_PER_UTTERANCE as a free-tier safety valve.
    """
    _, quiet = _pick_pipes(voice_code)
    normalized = normalize_text(text)
    if not normalized:
        return []

    phoneme_chunks = []
    for segment in _split_segments(normalized):
        # g2p yields (phoneme_str, tokens); only the token stream is needed.
        _, tokens = quiet.g2p(segment)
        # en_tokenize yields (graphemes, phonemes, token_chunk) triples.
        for _, phonemes, _ in quiet.en_tokenize(tokens):
            if not phonemes:
                continue
            phoneme_chunks.append(phonemes)
            if len(phoneme_chunks) >= MAX_CHUNKS_PER_UTTERANCE:
                return phoneme_chunks
    return phoneme_chunks
437
+
438
+ # -----------------------------
439
+ # INFERENCE: phonemes -> audio
440
+ # -----------------------------
441
def infer_phonemes(ps: str, voice_code: str, speed: float):
    """Synthesize one phoneme chunk into int16 PCM samples.

    Runs the shared Kokoro model via the voice-appropriate pipeline (same
    internal path as KPipeline.generate_from_tokens); the waveform is
    clipped to [-1, 1] and scaled to 16-bit.
    """
    pipe, _ = _pick_pipes(voice_code)
    voice_pack = get_voice_pack(voice_code)

    waveform = pipe.infer(ps, voice=voice_pack, speed=speed)

    # Depending on the kokoro version this is a torch tensor or an ndarray.
    try:
        import torch
        if torch.is_tensor(waveform):
            waveform = waveform.detach().cpu().numpy()
    except Exception:
        pass  # torch unavailable — assume numpy-compatible already

    samples = np.clip(np.asarray(waveform, dtype=np.float32), -1.0, 1.0)
    return (samples * 32767.0).astype(np.int16)
461
 
462
+ # -----------------------------
463
+ # EXECUTORS + QUEUE (HF free tier safe)
464
+ # -----------------------------
465
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
466
  G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
467
+ INFERENCE_QUEUE = asyncio.Queue(maxsize=MAX_QUEUE)
 
 
 
 
 
 
 
468
 
 
469
async def audio_engine_loop():
    """Single global consumer: pull jobs off INFERENCE_QUEUE and stream PCM.

    Each job is a dict: {"ws": WebSocket, "voice": str, "speed": float,
    "chunks": list of phoneme strings}. All chunks of one job are sent
    before the next job starts, so one utterance is never interleaved with
    another user's audio.

    Fix: the entire job is now processed inside try/except — previously a
    malformed job dict (KeyError) raised *outside* the inner try and killed
    this coroutine permanently, silencing the whole API.
    """
    print("⚡ API AUDIO PIPELINE STARTED")

    loop = asyncio.get_running_loop()

    while True:
        job = await INFERENCE_QUEUE.get()
        try:
            ws = job["ws"]
            voice_code = job["voice"]
            speed = job["speed"]

            for ps in job["chunks"]:
                # Stop early once the client is closing/closed
                # (FastAPI WS state: 1 = connected, 2/3 = closing/closed).
                if ws.client_state.value > 1:
                    break

                # Positional args instead of a closure: each chunk binds its
                # own `ps` (avoids the late-binding lambda pitfall, B023).
                pcm16 = await loop.run_in_executor(
                    INFERENCE_EXECUTOR, infer_phonemes, ps, voice_code, speed
                )
                await ws.send_bytes(pcm16.tobytes())
        except Exception as e:
            # A bad job or a dead socket must never kill the engine loop.
            print(f"API Engine Error: {e}")
        finally:
            INFERENCE_QUEUE.task_done()
+
496
+ # -----------------------------
497
+ # GRADIO UI (streaming)
498
+ # -----------------------------
499
def gradio_stream(text: str, voice_name: str, speed: float):
    """Gradio generator: yield (sample_rate, int16 PCM) per phoneme chunk."""
    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
    # Warm the voice pack up front so it isn't counted in per-chunk timing.
    get_voice_pack(voice_code)

    for i, ps in enumerate(text_to_phoneme_chunks(text, voice_code)):
        t0 = time.time()
        pcm16 = infer_phonemes(ps, voice_code, float(speed))
        dt = time.time() - t0
        print(f"⚡ UI chunk {i}: {len(ps)} phonemes in {dt:.2f}s")
        yield (AUDIO_SR, pcm16)
510
+
511
# Gradio front-end: a text box, voice/speed controls, and a streaming
# audio sink wired to gradio_stream.
with gr.Blocks(title="Kokoro TTS (Official)") as app:
    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, HF Free Tier Safe)")
    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(
                label="Input Text",
                lines=3,
                value="The system is live. Use Gradio for testing, or connect to /ws/audio for the API."
            )
            voice_dropdown = gr.Dropdown(list(VOICE_CHOICES.keys()), value="🇺🇸 🚺 Bella", label="Voice")
            speed_slider = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
            generate_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            stream_player = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
    # Chunks are yielded straight into the audio component as synthesized.
    generate_btn.click(gradio_stream, inputs=[input_box, voice_dropdown, speed_slider], outputs=[stream_player])
526
+
527
+ # -----------------------------
528
+ # FASTAPI + WEBSOCKET
529
+ # -----------------------------
530
api = FastAPI()

# Keep a strong reference to the engine task: the event loop holds only a
# weak reference to tasks, so discarding the create_task() result means the
# background pipeline can be garbage-collected and silently disappear.
_engine_task = None

@api.on_event("startup")
async def startup():
    """Launch the global audio engine consumer once the loop is running."""
    global _engine_task
    _engine_task = asyncio.create_task(audio_engine_loop())
535
 
 
 
 
536
@api.websocket("/ws/audio")
async def websocket_endpoint(ws: WebSocket):
    """Streaming TTS API over a WebSocket.

    Client -> server JSON messages:
      {"config": ..., "voice": <name or code>, "speed": <float>} — update
          per-connection voice/speed (voice pack is pre-warmed).
      {"text": <str>} — synthesize; audio comes back as binary frames of
          raw mono int16 PCM at AUDIO_SR.
      {"flush": ...} — no-op; buffering is client controlled.
    """
    await ws.accept()

    # Per-connection defaults.
    voice_code = "af_bella"
    speed = 1.0
    loop = asyncio.get_running_loop()

    print(f"✅ Client connected: {ws.client}")

    try:
        while True:
            try:
                data = await ws.receive_json()
            except WebSocketDisconnect:
                print("❌ Client disconnected cleanly")
                break
            except Exception as e:
                print(f"⚠️ Connection lost: {e}")
                break

            if "config" in data:
                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                speed = float(data.get("speed", speed))
                get_voice_pack(voice_code)  # pre-warm so the first chunk is fast

            if "text" in data:
                text = data["text"]

                # Build the whole utterance up front so chunks are never
                # interleaved across users. Positional executor args avoid
                # the late-binding lambda pitfall.
                phoneme_chunks = await loop.run_in_executor(
                    G2P_EXECUTOR, text_to_phoneme_chunks, text, voice_code
                )

                if not phoneme_chunks:
                    continue

                # BUG FIX: `await queue.put()` never raises QueueFull — it
                # blocks until space frees up, so the backpressure branch
                # below was dead code. put_nowait() raises immediately when
                # the queue is at MAX_QUEUE.
                try:
                    INFERENCE_QUEUE.put_nowait({
                        "ws": ws,
                        "voice": voice_code,
                        "speed": speed,
                        "chunks": phoneme_chunks,
                    })
                except asyncio.QueueFull:
                    # Hard backpressure on HF free tier
                    try:
                        await ws.send_json({"type": "error", "message": "Server busy. Try again."})
                    except Exception:
                        pass

            if "flush" in data:
                # Client controlled. No server side buffering needed here.
                pass

    except Exception as e:
        print(f"🔥 Critical WS Error: {e}")
 
 
595
 
596
+ # Mount gradio on FastAPI
597
  final_app = gr.mount_gradio_app(api, app, path="/")
598
 
599
  if __name__ == "__main__":