auralodyssey committed on
Commit
20fe4d6
·
verified ·
1 Parent(s): 83977c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -150
app.py CHANGED
@@ -295,8 +295,8 @@
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
298
- import time
299
  import re
 
300
  import asyncio
301
  from concurrent.futures import ThreadPoolExecutor
302
 
@@ -308,30 +308,31 @@ import uvicorn
308
  import torch
309
  from kokoro import KPipeline
310
 
311
- # Optional speed boost on HF Linux
 
 
 
 
 
 
312
  try:
313
- import uvloop # type: ignore
314
- asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
315
  except Exception:
316
  pass
317
 
318
- print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
319
-
320
- # Keep CPU threads predictable
321
  try:
322
- torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
323
- torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
324
  except Exception:
325
  pass
326
 
327
- # ------------------------------------------------------------
328
- # OFFICIAL PIPELINES (per your pasted docs)
329
- # ------------------------------------------------------------
330
- PIPELINES = {
331
- "a": KPipeline(lang_code="a"), # ๐Ÿ‡บ๐Ÿ‡ธ American English
332
- "b": KPipeline(lang_code="b"), # ๐Ÿ‡ฌ๐Ÿ‡ง British English
333
- }
334
 
 
 
 
335
  VOICE_CHOICES = {
336
  "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
337
  "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
@@ -347,88 +348,61 @@ VOICE_CHOICES = {
347
 
348
  def voice_to_lang_code(voice_code: str) -> str:
349
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
350
- return "b"
351
- return "a"
 
 
 
 
 
 
 
 
352
 
353
- # ------------------------------------------------------------
354
- # TEXT NORMALIZATION (stays within the docs you pasted)
355
- # Docs show: [Kokoro](/kหˆOkษ™ษนO/)
356
- # ------------------------------------------------------------
357
  def normalize_text(text: str) -> str:
358
  if not text:
359
- return text
360
  return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
361
 
362
- # ------------------------------------------------------------
363
- # FAST-FIRST-AUDIO SPLITTER (your old technique)
364
- # Progressive thresholds so first chunk is quick.
365
- # Also includes a fallback to cut long text even without punctuation.
366
- # ------------------------------------------------------------
367
- _PUNCT_END = re.compile(r"[.,!?;:\n]$")
 
368
 
369
- def tuned_splitter(text: str):
370
- text = (text or "").strip()
371
  if not text:
372
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
- parts = re.split(r"([.,!?;:\n]+)", text)
375
- buffer = ""
376
- chunk_count = 0
377
-
378
- def threshold_for(n: int) -> int:
379
- if n == 0:
380
- return 60 # fast first audio
381
- if n == 1:
382
- return 120
383
- if n == 2:
384
- return 180
385
- return 260
386
-
387
- for part in parts:
388
- buffer += part
389
-
390
- threshold = threshold_for(chunk_count)
391
-
392
- # Emit when punctuation boundary is hit and buffer is big enough
393
- if _PUNCT_END.search(buffer) and len(buffer) >= threshold:
394
- out = buffer.strip()
395
- if out:
396
- yield out
397
- chunk_count += 1
398
- buffer = ""
399
- continue
400
-
401
- # Fallback: if no punctuation for too long, cut at last space
402
- hard_max = 320 if chunk_count == 0 else 520
403
- if len(buffer) >= hard_max:
404
- cut = buffer.rfind(" ")
405
- if cut > 40:
406
- out = buffer[:cut].strip()
407
- rest = buffer[cut:].strip()
408
- if out:
409
- yield out
410
- chunk_count += 1
411
- buffer = rest
412
- else:
413
- out = buffer.strip()
414
- if out:
415
- yield out
416
- chunk_count += 1
417
- buffer = ""
418
-
419
- if buffer.strip():
420
- yield buffer.strip()
421
-
422
- # ------------------------------------------------------------
423
- # AUDIO CONVERSION FIX
424
- # Fixes: "'Tensor' object has no attribute 'astype'"
425
- # ------------------------------------------------------------
426
  def audio_to_int16_np(audio):
427
  if isinstance(audio, torch.Tensor):
428
  audio = audio.detach().cpu()
429
  audio = torch.clamp(audio, -1.0, 1.0)
430
- audio_i16 = (audio * 32767.0).to(torch.int16)
431
- return audio_i16.numpy()
432
 
433
  audio = np.asarray(audio)
434
  audio = np.clip(audio, -1.0, 1.0)
@@ -437,88 +411,121 @@ def audio_to_int16_np(audio):
437
  def audio_to_pcm_bytes(audio) -> bytes:
438
  return audio_to_int16_np(audio).tobytes()
439
 
440
- # ------------------------------------------------------------
441
- # OFFICIAL GENERATION (exact pattern from your docs)
442
  # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
443
- # ------------------------------------------------------------
444
- def kokoro_generate(chunk: str, voice_code: str, speed: float):
445
  lang_code = voice_to_lang_code(voice_code)
446
  pipeline = PIPELINES[lang_code]
 
447
 
448
- generator = pipeline(
449
- chunk,
450
- voice=voice_code,
451
- speed=float(speed),
452
- split_pattern=r"\n+",
453
- )
454
- for _, _, audio in generator:
455
- yield audio
456
-
457
- # ------------------------------------------------------------
458
- # WARMUP
459
- # Moves the first-call latency to startup instead of first user request.
460
- # ------------------------------------------------------------
 
 
 
461
  def warmup():
462
  try:
463
- for _ in kokoro_generate("Hello.", "af_bella", 1.0):
 
464
  break
465
- print("✅ WARMUP DONE")
466
  except Exception as e:
467
  print(f"โš ๏ธ WARMUP FAILED: {e}")
468
 
469
- # ------------------------------------------------------------
470
- # GRADIO STREAM
471
- # ------------------------------------------------------------
472
- def gradio_stream_generator(text, voice_name, speed):
473
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
474
  text = normalize_text(text)
475
 
476
- print("--- START UI STREAM ---")
477
- for i, chunk in enumerate(tuned_splitter(text)):
478
- t0 = time.time()
479
- for audio in kokoro_generate(chunk, voice_code, speed):
480
- dur = time.time() - t0
481
- print(f"โšก UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
482
- yield 24000, audio_to_int16_np(audio)
483
- print("--- END UI STREAM ---")
484
-
485
- # ------------------------------------------------------------
486
- # FASTAPI + WEBSOCKET QUEUE
487
- # ------------------------------------------------------------
 
 
488
  api = FastAPI()
 
489
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
490
- INFERENCE_QUEUE = asyncio.Queue()
491
 
492
  async def audio_engine_loop():
493
  print("⚡ API AUDIO PIPELINE STARTED")
494
  loop = asyncio.get_running_loop()
495
 
496
  while True:
497
- ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
498
- try:
499
- if ws.client_state.value > 1:
500
- continue
501
 
502
- def _run_and_pack():
503
- frames = []
504
- for audio in kokoro_generate(chunk, voice_code, speed):
505
- frames.append(audio_to_pcm_bytes(audio))
506
- return frames
507
 
508
- frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run_and_pack)
509
 
510
- for frame in frames:
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  try:
512
- await ws.send_bytes(frame)
513
  except Exception:
514
- break
515
 
516
- except Exception as e:
517
- print(f"API Engine Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  @api.on_event("startup")
520
  async def startup():
521
- # Warmup in executor so startup does not block event loop
522
  loop = asyncio.get_running_loop()
523
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
524
  asyncio.create_task(audio_engine_loop())
@@ -559,25 +566,21 @@ async def websocket_endpoint(ws: WebSocket):
559
  speed = float(data.get("speed", speed))
560
 
561
  if "text" in data:
562
- text = normalize_text(data["text"])
563
- # Enqueue fast first chunk first
564
- for chunk in tuned_splitter(text):
565
- if chunk.strip():
566
- await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
567
 
568
  if "flush" in data:
569
  pass
570
 
571
- except Exception as e:
572
- print(f"🔥 Critical WS Error: {e}")
573
  finally:
574
  heartbeat_task.cancel()
575
 
576
- # ------------------------------------------------------------
577
- # GRADIO UI
578
- # ------------------------------------------------------------
579
  with gr.Blocks(title="Kokoro TTS") as app:
580
- gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Fast First Audio)")
581
  with gr.Row():
582
  with gr.Column():
583
  text_in = gr.Textbox(
@@ -595,7 +598,7 @@ with gr.Blocks(title="Kokoro TTS") as app:
595
  with gr.Column():
596
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
597
 
598
- btn.click(gradio_stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
599
 
600
  final_app = gr.mount_gradio_app(api, app, path="/")
601
 
 
295
  # if __name__ == "__main__":
296
  # uvicorn.run(final_app, host="0.0.0.0", port=7860)
297
  import os
 
298
  import re
299
+ import time
300
  import asyncio
301
  from concurrent.futures import ThreadPoolExecutor
302
 
 
308
  import torch
309
  from kokoro import KPipeline
310
 
311
+ # ----------------------------
312
+ # HARD LIMIT CPU THREADS (2 vCPU box)
313
+ # ----------------------------
314
+ os.environ.setdefault("OMP_NUM_THREADS", "2")
315
+ os.environ.setdefault("MKL_NUM_THREADS", "2")
316
+ os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
317
+
318
  try:
319
+ torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
320
+ torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
321
  except Exception:
322
  pass
323
 
324
+ # Optional: uvloop for faster event loop on HF Linux
 
 
325
  try:
326
+ import uvloop # type: ignore
327
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
328
  except Exception:
329
  pass
330
 
331
+ print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
 
 
 
 
 
 
332
 
333
+ # ----------------------------
334
+ # VOICES
335
+ # ----------------------------
336
  VOICE_CHOICES = {
337
  "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
338
  "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
 
348
 
349
  def voice_to_lang_code(voice_code: str) -> str:
350
  if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
351
+ return "b" # British
352
+ return "a" # American
353
+
354
+ # ----------------------------
355
+ # PIPELINES (keep hot in RAM)
356
+ # ----------------------------
357
+ PIPELINES = {
358
+ "a": KPipeline(lang_code="a"),
359
+ "b": KPipeline(lang_code="b"),
360
+ }
361
 
362
+ # ----------------------------
363
+ # TEXT NORMALIZATION (matches your pasted official docs)
364
+ # ----------------------------
 
365
  def normalize_text(text: str) -> str:
366
  if not text:
367
+ return ""
368
  return text.replace("Kokoro", "[Kokoro](/kหˆOkษ™ษนO/)")
369
 
370
+ # ----------------------------
371
+ # LOW LATENCY SEGMENTATION
372
+ # One pipeline call per request.
373
+ # We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
374
+ # We also force a small first segment for fast first audio.
375
+ # ----------------------------
376
+ _SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
377
 
378
+ def inject_newlines_for_fast_stream(text: str) -> str:
379
+ text = normalize_text(text).strip()
380
  if not text:
381
+ return ""
382
+
383
+ # Sentence boundaries -> newline so official split_pattern can segment
384
+ text = _SENT_BOUNDARY.sub(r"\1\n", text)
385
+
386
+ # Also split on existing multi-newlines
387
+ text = re.sub(r"\n{3,}", "\n\n", text)
388
+
389
+ # Guarantee a small first segment for low time-to-first-audio
390
+ if "\n" not in text and len(text) > 90:
391
+ cut = text.rfind(" ", 0, 70)
392
+ if cut < 35:
393
+ cut = 70
394
+ text = text[:cut].strip() + "\n" + text[cut:].strip()
395
+
396
+ return text
397
 
398
+ # ----------------------------
399
+ # AUDIO CONVERSION (fast, safe)
400
+ # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  def audio_to_int16_np(audio):
402
  if isinstance(audio, torch.Tensor):
403
  audio = audio.detach().cpu()
404
  audio = torch.clamp(audio, -1.0, 1.0)
405
+ return (audio * 32767.0).to(torch.int16).numpy()
 
406
 
407
  audio = np.asarray(audio)
408
  audio = np.clip(audio, -1.0, 1.0)
 
411
  def audio_to_pcm_bytes(audio) -> bytes:
412
  return audio_to_int16_np(audio).tobytes()
413
 
414
+ # ----------------------------
415
+ # OFFICIAL GENERATION PATH (single pipeline call)
416
  # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
417
+ # ----------------------------
418
+ def kokoro_generator_full(text: str, voice_code: str, speed: float):
419
  lang_code = voice_to_lang_code(voice_code)
420
  pipeline = PIPELINES[lang_code]
421
+ text = inject_newlines_for_fast_stream(text)
422
 
423
+ if not text:
424
+ return
425
+
426
+ with torch.inference_mode():
427
+ generator = pipeline(
428
+ text,
429
+ voice=voice_code,
430
+ speed=float(speed),
431
+ split_pattern=r"\n+",
432
+ )
433
+ for _, _, audio in generator:
434
+ yield audio
435
+
436
+ # ----------------------------
437
+ # WARMUP (pay cold-start cost at boot)
438
+ # ----------------------------
439
  def warmup():
440
  try:
441
+ t0 = time.time()
442
+ for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
443
  break
444
+ print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
445
  except Exception as e:
446
  print(f"โš ๏ธ WARMUP FAILED: {e}")
447
 
448
+ # ----------------------------
449
+ # GRADIO UI STREAM
450
+ # ----------------------------
451
+ def gradio_stream(text, voice_name, speed):
452
  voice_code = VOICE_CHOICES.get(voice_name, voice_name)
453
  text = normalize_text(text)
454
 
455
+ i = 0
456
+ t0 = time.time()
457
+ for audio in kokoro_generator_full(text, voice_code, speed):
458
+ if i == 0:
459
+ print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
460
+ i += 1
461
+ yield 24000, audio_to_int16_np(audio)
462
+
463
+ # ----------------------------
464
+ # FASTAPI WS ENGINE
465
+ # Single worker thread for actual generation.
466
+ # Stream frames to client as soon as they exist.
467
+ # No buffering a full list before sending.
468
+ # ----------------------------
469
  api = FastAPI()
470
+
471
  INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
472
+ INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
473
 
474
  async def audio_engine_loop():
475
  print("⚡ API AUDIO PIPELINE STARTED")
476
  loop = asyncio.get_running_loop()
477
 
478
  while True:
479
+ ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
 
 
 
480
 
481
+ # Skip dead clients early
482
+ if ws.client_state.value > 1:
483
+ continue
 
 
484
 
485
+ frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
486
 
487
+ def _worker():
488
+ try:
489
+ for audio in kokoro_generator_full(text, voice_code, speed):
490
+ b = audio_to_pcm_bytes(audio)
491
+ # backpressure aware
492
+ while True:
493
+ try:
494
+ loop.call_soon_threadsafe(frame_q.put_nowait, b)
495
+ break
496
+ except Exception:
497
+ time.sleep(0.001)
498
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
499
+ except Exception as e:
500
+ print(f"API Worker Error: {e}")
501
  try:
502
+ loop.call_soon_threadsafe(frame_q.put_nowait, None)
503
  except Exception:
504
+ pass
505
 
506
+ INFERENCE_EXECUTOR.submit(_worker)
507
+
508
+ first_sent = False
509
+ started = time.time()
510
+
511
+ while True:
512
+ frame = await frame_q.get()
513
+ if frame is None:
514
+ break
515
+
516
+ if ws.client_state.value > 1:
517
+ break
518
+
519
+ try:
520
+ await ws.send_bytes(frame)
521
+ if not first_sent:
522
+ print(f"⚡ API first audio in {time.time() - started:.2f}s")
523
+ first_sent = True
524
+ except Exception:
525
+ break
526
 
527
  @api.on_event("startup")
528
  async def startup():
 
529
  loop = asyncio.get_running_loop()
530
  await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
531
  asyncio.create_task(audio_engine_loop())
 
566
  speed = float(data.get("speed", speed))
567
 
568
  if "text" in data:
569
+ text = normalize_text(data.get("text", ""))
570
+ if text.strip():
571
+ await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
 
 
572
 
573
  if "flush" in data:
574
  pass
575
 
 
 
576
  finally:
577
  heartbeat_task.cancel()
578
 
579
+ # ----------------------------
580
+ # GRADIO APP
581
+ # ----------------------------
582
  with gr.Blocks(title="Kokoro TTS") as app:
583
+ gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
584
  with gr.Row():
585
  with gr.Column():
586
  text_in = gr.Textbox(
 
598
  with gr.Column():
599
  audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
600
 
601
+ btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
602
 
603
  final_app = gr.mount_gradio_app(api, app, path="/")
604