auralodyssey committed on
Commit
eff63e9
·
verified ·
1 Parent(s): b8af37a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -125
app.py CHANGED
@@ -294,26 +294,25 @@
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
-
298
  import os
299
  import json
300
  import time
301
  import re
302
  import numpy as np
303
- import onnxruntime as ort
304
  import gradio as gr
305
- from huggingface_hub import hf_hub_download
306
- from misaki import en
307
  from functools import lru_cache
308
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
309
  import asyncio
310
  import uvloop
311
  import uvicorn
312
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
313
  # --- CONFIGURATION ---
314
- MODEL_REPO = "onnx-community/Kokoro-82M-v1.0-ONNX"
315
- MODEL_FILE = "onnx/model.onnx"
316
- TOKENIZER_FILE = "tokenizer.json"
317
  # --- VOICE UI ---
318
  VOICE_CHOICES = {
319
  'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
@@ -327,96 +326,52 @@ VOICE_CHOICES = {
327
  'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
328
  'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
329
  }
 
330
  # --- ENGINE ---
331
- print("πŸš€ BOOTING HIGH-RAM ENGINE...")
332
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
333
- # 1. Phonemizer - Try with espeak fallback, fall back to None if it fails
334
- try:
335
- from misaki.espeak import EspeakFallback
336
- espeak_fallback = EspeakFallback()
337
- G2P = en.G2P(trf=False, british=False, fallback=espeak_fallback)
338
- print("βœ… G2P initialized with espeak fallback")
339
- except Exception as e:
340
- print(f"⚠️ Could not load espeak fallback: {e}")
341
- G2P = en.G2P(trf=False, british=False, fallback=None)
342
- print("βœ… G2P initialized without fallback")
343
- # 2. Tokenizer
344
- vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
345
- with open(vocab_path, "r", encoding="utf-8") as f:
346
- data = json.load(f)
347
- TOKENIZER = data["model"]["vocab"] if "model" in data else data.get("vocab", {})
348
- # 3. Voices (Lazy Load)
349
- VOICE_CACHE = {}
350
- def get_voice(name):
351
- code = VOICE_CHOICES.get(name, name)
352
- if code not in VOICE_CACHE:
353
- try:
354
- print(f"⬇️ Loading Voice: {code}")
355
- path = hf_hub_download(repo_id=MODEL_REPO, filename=f"voices/{code}.bin")
356
- VOICE_CACHE[code] = np.fromfile(path, dtype=np.float32).reshape(-1, 1, 256)
357
- except:
358
- if 'af_bella' not in VOICE_CACHE:
359
- p = hf_hub_download(repo_id=MODEL_REPO, filename="voices/af_bella.bin")
360
- VOICE_CACHE['af_bella'] = np.fromfile(p, dtype=np.float32).reshape(-1, 1, 256)
361
- return VOICE_CACHE['af_bella']
362
- return VOICE_CACHE[code]
363
- # 4. ONNX Engine
364
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
365
- sess_options = ort.SessionOptions()
366
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
367
- sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
368
- sess_options.intra_op_num_threads = 0
369
- sess_options.inter_op_num_threads = 0
370
- SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
371
- print("βœ… ENGINE READY")
372
- # --- CORE LOGIC (Shared by UI and API) ---
373
- def safe_g2p(text):
374
- """Safely convert text to phonemes, handling errors gracefully"""
375
  if not text or not text.strip():
376
- return []
377
 
378
- # Special replacements
379
- if "Kokoro" in text:
380
- text = text.replace("Kokoro", "kˈOkΙ™ΙΉO")
381
 
382
  try:
383
- phonemes, _ = G2P(text)
384
- # Filter out invalid tokens
385
- tokens = []
386
- for p in phonemes:
387
- token = TOKENIZER.get(p)
388
- if token is not None and token > 0:
389
- tokens.append(token)
390
- return tokens
 
 
 
 
 
391
  except Exception as e:
392
- print(f"⚠️ G2P error for '{text[:30]}...': {e}")
393
- return []
394
- @lru_cache(maxsize=5000)
395
- def get_tokens(text):
396
- return safe_g2p(text)
397
  def trim_silence(audio, threshold=0.01):
398
- if audio.size == 0: return audio
 
399
  mask = np.abs(audio) > threshold
400
- if not np.any(mask): return audio
 
401
  start, end = np.argmax(mask), len(mask) - np.argmax(mask[::-1])
402
  return audio[max(0, start-50) : min(len(audio), end+50)]
403
- def infer(text, voice_name, speed):
404
- if not text.strip(): return None
405
- ids = get_tokens(text)[:510]
406
- if not ids: return None
407
- voice = get_voice(voice_name)
408
- style = voice[min(len(ids), voice.shape[0]-1)]
409
- try:
410
- audio = SESSION.run(None, {
411
- "input_ids": np.array([[0] + ids + [0]], dtype=np.int64),
412
- "style": style,
413
- "speed": np.array([speed], dtype=np.float32)
414
- })[0]
415
- return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
416
- except Exception as e:
417
- print(f"⚠️ Inference error: {e}")
418
- return None
419
  def tuned_splitter(text):
 
420
  chunks = re.split(r'([.,!?;:\n]+)', text)
421
  buffer = ""
422
  chunk_count = 0
@@ -433,81 +388,85 @@ def tuned_splitter(text):
433
  buffer = ""
434
  if buffer.strip():
435
  yield buffer.strip()
 
436
  def stream_generator(text, voice_name, speed):
437
- print("--- START STREAM ---")
438
- get_voice(voice_name)
439
  for i, chunk in enumerate(tuned_splitter(text)):
440
  t0 = time.time()
441
- audio = infer(chunk, voice_name, speed)
442
- if audio:
 
443
  dur = time.time() - t0
444
  print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
445
- yield audio
 
 
446
  print("--- END STREAM ---")
 
447
  # --- UI DEFINITION ---
448
  with gr.Blocks(title="Kokoro TTS") as app:
449
- gr.Markdown("## ⚑ Kokoro-82M (High-RAM Tuned)")
450
  with gr.Row():
451
  with gr.Column():
452
- text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! This is a test of the Kokoro TTS system.")
453
  voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
454
  speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
455
  btn = gr.Button("Generate", variant="primary")
456
  with gr.Column():
457
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
458
  btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
 
459
  # --- API INTEGRATION ---
460
  api = FastAPI()
 
461
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
462
- G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
463
  INFERENCE_QUEUE = asyncio.Queue()
464
- def g2p_task(text):
465
- """Thread-safe G2P task"""
466
- return safe_g2p(text)
467
  async def audio_engine_loop():
 
468
  print("⚑ API AUDIO PIPELINE STARTED")
469
  loop = asyncio.get_running_loop()
470
 
471
  while True:
472
  job = await INFERENCE_QUEUE.get()
473
- tokens, style, speed, ws = job
474
 
475
  try:
476
  if ws.client_state.value > 1:
477
  continue
478
- input_ids = np.array([[0, *tokens[:510], 0]], dtype=np.int64)
479
- style_vec = style[min(len(tokens), style.shape[0]-1)]
480
-
481
  audio = await loop.run_in_executor(
482
- INFERENCE_EXECUTOR,
483
- lambda: SESSION.run(None, {
484
- "input_ids": input_ids,
485
- "style": style_vec,
486
- "speed": np.array([speed], dtype=np.float32)
487
- })[0]
488
  )
489
 
490
- pcm_bytes = (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16).tobytes()
491
-
492
- try:
493
- await ws.send_bytes(pcm_bytes)
494
- except Exception:
495
- pass
496
 
 
 
 
 
 
497
  except Exception as e:
498
  print(f"⚠️ API Engine Error: {e}")
 
499
  @api.on_event("startup")
500
  async def startup():
501
  asyncio.create_task(audio_engine_loop())
 
502
  @api.websocket("/ws/audio")
503
  async def websocket_endpoint(ws: WebSocket):
504
  await ws.accept()
505
 
506
  voice_key = "af_bella"
507
  speed = 1.0
508
- loop = asyncio.get_running_loop()
509
 
510
  print(f"βœ… Client connected: {ws.client}")
 
511
  async def keep_alive():
512
  while True:
513
  try:
@@ -517,6 +476,7 @@ async def websocket_endpoint(ws: WebSocket):
517
  break
518
 
519
  heartbeat_task = asyncio.create_task(keep_alive())
 
520
  try:
521
  while True:
522
  try:
@@ -527,10 +487,10 @@ async def websocket_endpoint(ws: WebSocket):
527
  except Exception as e:
528
  print(f"⚠️ Connection lost: {e}")
529
  break
 
530
  if "config" in data:
531
  voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
532
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
533
- get_voice(voice_name)
534
  voice_key = voice_code
535
  speed = float(data.get("speed", speed))
536
 
@@ -538,27 +498,20 @@ async def websocket_endpoint(ws: WebSocket):
538
  text = data["text"]
539
  for chunk in tuned_splitter(text):
540
  if chunk.strip():
541
- try:
542
- tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
543
- if tokens:
544
- style = VOICE_CACHE.get(voice_key)
545
- if style is None:
546
- get_voice(voice_key)
547
- style = VOICE_CACHE.get(voice_key)
548
-
549
- await INFERENCE_QUEUE.put((tokens, style, speed, ws))
550
- except Exception as e:
551
- print(f"⚠️ G2P task error: {e}")
552
 
553
  if "flush" in data:
554
  pass
 
555
  except Exception as e:
556
  print(f"πŸ”₯ Critical WS Error: {e}")
557
  import traceback
558
  traceback.print_exc()
559
  finally:
560
  heartbeat_task.cancel()
 
561
  # --- FINAL MOUNT ---
562
  final_app = gr.mount_gradio_app(api, app, path="/")
 
563
  if __name__ == "__main__":
564
  uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
294
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
 
297
  import os
298
  import json
299
  import time
300
  import re
301
  import numpy as np
 
302
  import gradio as gr
 
 
303
  from functools import lru_cache
304
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
305
  import asyncio
306
  import uvloop
307
  import uvicorn
308
  from concurrent.futures import ThreadPoolExecutor
309
+
310
+ # πŸ”₯ USE KOKORO PIPELINE INSTEAD OF RAW MISAKI
311
+ from kokoro import KPipeline
312
+
313
  # --- CONFIGURATION ---
314
+ SAMPLE_RATE = 24000
315
+
 
316
  # --- VOICE UI ---
317
  VOICE_CHOICES = {
318
  'πŸ‡ΊπŸ‡Έ 🚺 Heart': 'af_heart', 'πŸ‡ΊπŸ‡Έ 🚺 Bella': 'af_bella', 'πŸ‡ΊπŸ‡Έ 🚺 Nicole': 'af_nicole',
 
326
  'πŸ‡¬πŸ‡§ 🚹 George': 'bm_george', 'πŸ‡¬πŸ‡§ 🚹 Fable': 'bm_fable', 'πŸ‡¬πŸ‡§ 🚹 Lewis': 'bm_lewis',
327
  'πŸ‡¬πŸ‡§ 🚹 Daniel': 'bm_daniel',
328
  }
329
+
330
# --- ENGINE ---
print("πŸš€ BOOTING KOKORO PIPELINE ENGINE...")
# uvloop replaces the default asyncio event loop policy for the FastAPI/websocket side.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Initialize KPipeline - this handles espeak fallback automatically!
# Single shared pipeline instance, created once at import time and used by
# both the Gradio UI path and the API worker loop.
PIPELINE = KPipeline(lang_code='a') # 'a' = American English
print("βœ… KOKORO PIPELINE READY")
337
+
338
+ # --- CORE LOGIC ---
339
def generate_audio(text, voice_name, speed):
    """Synthesize *text* into one float waveform via the shared KPipeline.

    *voice_name* may be a UI display name (looked up in VOICE_CHOICES) or a
    raw voice code, which passes through unchanged. Returns a 1-D numpy
    float array, or None when the text is empty, no audio was produced, or
    synthesis raised.
    """
    if not text or not text.strip():
        return None

    # Display name -> voice code; unknown names are used as-is.
    voice = VOICE_CHOICES.get(voice_name, voice_name)

    try:
        # The pipeline yields (graphemes, phonemes, audio) per segment;
        # keep only the non-empty audio pieces.
        segments = [
            seg_audio
            for _, _, seg_audio in PIPELINE(text, voice=voice, speed=speed)
            if seg_audio is not None and len(seg_audio) > 0
        ]
        if not segments:
            return None
        return np.concatenate(segments)
    except Exception as e:
        print(f"⚠️ Audio generation error: {e}")
        return None
363
+
 
 
364
def trim_silence(audio, threshold=0.01):
    """Strip leading/trailing near-silence from a 1-D float audio array.

    Keeps a 50-sample pad on each side of the first/last sample whose
    magnitude exceeds *threshold*. The input is returned unchanged when it
    is None, empty, or entirely below the threshold.
    """
    if audio is None or audio.size == 0:
        return audio
    loud = np.abs(audio) > threshold
    if not np.any(loud):
        return audio
    first = np.argmax(loud)
    last = len(loud) - np.argmax(loud[::-1])
    lo = max(0, first - 50)
    hi = min(len(audio), last + 50)
    return audio[lo:hi]
372
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  def tuned_splitter(text):
374
+ """Split text into chunks for streaming"""
375
  chunks = re.split(r'([.,!?;:\n]+)', text)
376
  buffer = ""
377
  chunk_count = 0
 
388
  buffer = ""
389
  if buffer.strip():
390
  yield buffer.strip()
391
+
392
def stream_generator(text, voice_name, speed):
    """Yield (sample_rate, int16 PCM array) tuples chunk-by-chunk for Gradio.

    Text is split by tuned_splitter; chunks that produce no audio are
    skipped silently.
    """
    print(f"--- START STREAM: {text[:50]}... ---")
    for i, chunk in enumerate(tuned_splitter(text)):
        t0 = time.time()
        audio = generate_audio(chunk, voice_name, speed)
        if audio is None or len(audio) == 0:
            continue
        audio = trim_silence(audio)
        dur = time.time() - t0
        print(f"⚑ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
        # Scale float [-1, 1] samples to 16-bit PCM for the audio widget.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        yield (SAMPLE_RATE, pcm)
    print("--- END STREAM ---")
406
+
407
  # --- UI DEFINITION ---
408
with gr.Blocks(title="Kokoro TTS") as app:
    gr.Markdown("## ⚑ Kokoro-82M with KPipeline (Proper Name Support!)")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            text_box = gr.Textbox(label="Input Text", lines=3, value="Hello! My name is Yaman and I work at Willo. Testing pronunciation of names!")
            voice_dd = gr.Dropdown(list(VOICE_CHOICES.keys()), value='πŸ‡ΊπŸ‡Έ 🚺 Bella', label="Voice")
            speed_slider = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
            generate_btn = gr.Button("Generate", variant="primary")
        # Right column: streaming audio player.
        with gr.Column():
            audio_player = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
    generate_btn.click(stream_generator, inputs=[text_box, voice_dd, speed_slider], outputs=[audio_player])
419
+
420
  # --- API INTEGRATION ---
421
  api = FastAPI()
422
+
423
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 
424
  INFERENCE_QUEUE = asyncio.Queue()
425
+
 
 
426
async def audio_engine_loop():
    """Background worker: drains INFERENCE_QUEUE and streams PCM to clients.

    Each job is a (text, voice, speed, ws) tuple. Synthesis runs on the
    single-thread inference executor so the event loop stays responsive;
    per-job errors are logged and the loop keeps running.
    """
    print("⚑ API AUDIO PIPELINE STARTED")
    loop = asyncio.get_running_loop()

    while True:
        text, voice, speed, ws = await INFERENCE_QUEUE.get()
        try:
            # Skip jobs whose socket is already closing/closed.
            if ws.client_state.value > 1:
                continue

            # Off-load the blocking KPipeline call to the worker thread.
            audio = await loop.run_in_executor(
                INFERENCE_EXECUTOR, generate_audio, text, voice, speed
            )

            if audio is None or len(audio) == 0:
                continue

            audio = trim_silence(audio)
            # float [-1, 1] samples -> 16-bit PCM bytes.
            pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
            try:
                await ws.send_bytes(pcm)
            except Exception:
                # Client vanished mid-send; drop the chunk.
                pass
        except Exception as e:
            print(f"⚠️ API Engine Error: {e}")
456
+
457
@api.on_event("startup")
async def startup():
    """Start the background audio worker when the API boots.

    The task handle is stored on ``api.state`` so the worker is not
    garbage-collected mid-run: asyncio keeps only a weak reference to
    tasks created via ``create_task``, so a discarded handle can let the
    task disappear.
    """
    # NOTE(review): @on_event is deprecated in newer FastAPI (lifespan
    # handlers are preferred) — kept to avoid changing the app wiring.
    api.state.audio_engine_task = asyncio.create_task(audio_engine_loop())
460
+
461
  @api.websocket("/ws/audio")
462
  async def websocket_endpoint(ws: WebSocket):
463
  await ws.accept()
464
 
465
  voice_key = "af_bella"
466
  speed = 1.0
 
467
 
468
  print(f"βœ… Client connected: {ws.client}")
469
+
470
  async def keep_alive():
471
  while True:
472
  try:
 
476
  break
477
 
478
  heartbeat_task = asyncio.create_task(keep_alive())
479
+
480
  try:
481
  while True:
482
  try:
 
487
  except Exception as e:
488
  print(f"⚠️ Connection lost: {e}")
489
  break
490
+
491
  if "config" in data:
492
  voice_name = data.get("voice", "πŸ‡ΊπŸ‡Έ 🚺 Bella")
493
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
 
494
  voice_key = voice_code
495
  speed = float(data.get("speed", speed))
496
 
 
498
  text = data["text"]
499
  for chunk in tuned_splitter(text):
500
  if chunk.strip():
501
+ await INFERENCE_QUEUE.put((chunk, voice_key, speed, ws))
 
 
 
 
 
 
 
 
 
 
502
 
503
  if "flush" in data:
504
  pass
505
+
506
  except Exception as e:
507
  print(f"πŸ”₯ Critical WS Error: {e}")
508
  import traceback
509
  traceback.print_exc()
510
  finally:
511
  heartbeat_task.cancel()
512
+
513
  # --- FINAL MOUNT ---
514
  final_app = gr.mount_gradio_app(api, app, path="/")
515
+
516
  if __name__ == "__main__":
517
  uvicorn.run(final_app, host="0.0.0.0", port=7860)