Spaces:

auralodyssey
/

api

Sleeping

App Files Community

auralodyssey committed on Jan 5

Commit

b8af37a

verified ·

1 Parent(s): be1838a

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -31

app.py CHANGED Viewed

@@ -329,10 +329,17 @@ VOICE_CHOICES = {
 }
 # --- ENGINE ---
 print("🚀 BOOTING HIGH-RAM ENGINE...")
-# Enable fast networking immediately
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-# 1. Phonemizer - 🔥 FIXED: Added espeak fallback for proper nouns!
-G2P = en.G2P(trf=False, british=False, fallback='espeak')  # ← THIS IS THE KEY FIX!
 # 2. Tokenizer
 vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
 with open(vocab_path, "r", encoding="utf-8") as f:
@@ -363,17 +370,30 @@ sess_options.inter_op_num_threads = 0
 SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
 print("✅ ENGINE READY")
 # --- CORE LOGIC (Shared by UI and API) ---
 @lru_cache(maxsize=5000)
 def get_tokens(text):
-    if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
-    phonemes, _ = G2P(text)
-    # 🔥 FIXED: Filter out invalid tokens (prevents audio gaps)
-    tokens = []
-    for p in phonemes:
-        token = TOKENIZER.get(p)
-        if token is not None and token > 0:
-            tokens.append(token)
-    return tokens
 def trim_silence(audio, threshold=0.01):
     if audio.size == 0: return audio
     mask = np.abs(audio) > threshold
@@ -393,7 +413,9 @@ def infer(text, voice_name, speed):
             "speed": np.array([speed], dtype=np.float32)
         })[0]
         return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
-    except: return None
 def tuned_splitter(text):
     chunks = re.split(r'([.,!?;:\n]+)', text)
     buffer = ""
@@ -427,7 +449,7 @@ with gr.Blocks(title="Kokoro TTS") as app:
     gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
     with gr.Row():
         with gr.Column():
-            text_in = gr.Textbox(label="Input Text", lines=3, value="The system is live. Use the Gradio UI for testing, or connect to /ws/audio for the API.")
             voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
             btn = gr.Button("Generate", variant="primary")
@@ -440,15 +462,8 @@ INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 INFERENCE_QUEUE = asyncio.Queue()
 def g2p_task(text):
-    if "Kokoro" in text: text = text.replace("Kokoro", "kˈOkəɹO")
-    phonemes, _ = G2P(text)
-    # 🔥 FIXED: Filter out invalid tokens
-    tokens = []
-    for p in phonemes:
-        token = TOKENIZER.get(p)
-        if token is not None and token > 0:
-            tokens.append(token)
-    return tokens
 async def audio_engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
@@ -480,7 +495,7 @@ async def audio_engine_loop():
                 pass
         except Exception as e:
-            print(f"API Engine Error: {e}")
 @api.on_event("startup")
 async def startup():
     asyncio.create_task(audio_engine_loop())
@@ -523,19 +538,24 @@ async def websocket_endpoint(ws: WebSocket):
                 text = data["text"]
                 for chunk in tuned_splitter(text):
                     if chunk.strip():
-                        tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
-                        if tokens:
-                            style = VOICE_CACHE.get(voice_key)
-                            if style is None:
-                                get_voice(voice_key)
                                 style = VOICE_CACHE.get(voice_key)
-                            await INFERENCE_QUEUE.put((tokens, style, speed, ws))
             if "flush" in data:
                 pass
     except Exception as e:
         print(f"🔥 Critical WS Error: {e}")
     finally:
         heartbeat_task.cancel()
 # --- FINAL MOUNT ---

 }
 # --- ENGINE ---
 print("🚀 BOOTING HIGH-RAM ENGINE...")
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+# 1. Phonemizer - Try with espeak fallback, fall back to None if it fails
+try:
+    from misaki.espeak import EspeakFallback
+    espeak_fallback = EspeakFallback()
+    G2P = en.G2P(trf=False, british=False, fallback=espeak_fallback)
+    print("✅ G2P initialized with espeak fallback")
+except Exception as e:
+    print(f"⚠️ Could not load espeak fallback: {e}")
+    G2P = en.G2P(trf=False, british=False, fallback=None)
+    print("✅ G2P initialized without fallback")
 # 2. Tokenizer
 vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=TOKENIZER_FILE)
 with open(vocab_path, "r", encoding="utf-8") as f:
 SESSION = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
 print("✅ ENGINE READY")
 # --- CORE LOGIC (Shared by UI and API) ---
+def safe_g2p(text):
+    """Safely convert text to phonemes, handling errors gracefully"""
+    if not text or not text.strip():
+        return []
+    # Special replacements
+    if "Kokoro" in text:
+        text = text.replace("Kokoro", "kˈOkəɹO")
+    try:
+        phonemes, _ = G2P(text)
+        # Filter out invalid tokens
+        tokens = []
+        for p in phonemes:
+            token = TOKENIZER.get(p)
+            if token is not None and token > 0:
+                tokens.append(token)
+        return tokens
+    except Exception as e:
+        print(f"⚠️ G2P error for '{text[:30]}...': {e}")
+        return []
 @lru_cache(maxsize=5000)
 def get_tokens(text):
+    return safe_g2p(text)
 def trim_silence(audio, threshold=0.01):
     if audio.size == 0: return audio
     mask = np.abs(audio) > threshold
             "speed": np.array([speed], dtype=np.float32)
         })[0]
         return 24000, (np.clip(trim_silence(audio[0]), -1.0, 1.0) * 32767).astype(np.int16)
+    except Exception as e:
+        print(f"⚠️ Inference error: {e}")
+        return None
 def tuned_splitter(text):
     chunks = re.split(r'([.,!?;:\n]+)', text)
     buffer = ""
     gr.Markdown("## ⚡ Kokoro-82M (High-RAM Tuned)")
     with gr.Row():
         with gr.Column():
+            text_in = gr.Textbox(label="Input Text", lines=3, value="Hello! This is a test of the Kokoro TTS system.")
             voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
             btn = gr.Button("Generate", variant="primary")
 G2P_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 INFERENCE_QUEUE = asyncio.Queue()
 def g2p_task(text):
+    """Thread-safe G2P task"""
+    return safe_g2p(text)
 async def audio_engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
                 pass
         except Exception as e:
+            print(f"⚠️ API Engine Error: {e}")
 @api.on_event("startup")
 async def startup():
     asyncio.create_task(audio_engine_loop())
                 text = data["text"]
                 for chunk in tuned_splitter(text):
                     if chunk.strip():
+                        try:
+                            tokens = await loop.run_in_executor(G2P_EXECUTOR, g2p_task, chunk)
+                            if tokens:
                                 style = VOICE_CACHE.get(voice_key)
+                                if style is None:
+                                    get_voice(voice_key)
+                                    style = VOICE_CACHE.get(voice_key)
+                                await INFERENCE_QUEUE.put((tokens, style, speed, ws))
+                        except Exception as e:
+                            print(f"⚠️ G2P task error: {e}")
             if "flush" in data:
                 pass
     except Exception as e:
         print(f"🔥 Critical WS Error: {e}")
+        import traceback
+        traceback.print_exc()
     finally:
         heartbeat_task.cancel()
 # --- FINAL MOUNT ---