Spaces:

auralodyssey
/

api

Sleeping

App Files Files Community

auralodyssey committed on Jan 6

Commit

83977c6

verified ·

1 Parent(s): 7576e85

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -37

app.py CHANGED Viewed

@@ -317,13 +317,19 @@ except Exception:
 print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
 # ------------------------------------------------------------
-# OFFICIAL PIPELINES (per docs you pasted)
-# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
 # ------------------------------------------------------------
 PIPELINES = {
-    "a": KPipeline(lang_code="a"),
-    "b": KPipeline(lang_code="b"),
 }
 VOICE_CHOICES = {
@@ -340,57 +346,84 @@ VOICE_CHOICES = {
 }
 def voice_to_lang_code(voice_code: str) -> str:
-    # bf_ / bm_ are British
     if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
         return "b"
     return "a"
 # ------------------------------------------------------------
-# TEXT HELPERS (sticking to your pasted docs format)
-# Use IPA markup like: [Kokoro](/kˈOkəɹO/)
 # ------------------------------------------------------------
 def normalize_text(text: str) -> str:
     if not text:
         return text
-    # Your docs show this exact IPA form for Kokoro
-    text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-    return text
 # ------------------------------------------------------------
-# CHUNKING
-# Main goal: avoid tiny chunks that cause audible discontinuity.
 # ------------------------------------------------------------
-_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
 def tuned_splitter(text: str):
     text = (text or "").strip()
     if not text:
         return
-    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p and p.strip()]
-    buf = ""
-    for p in parts:
-        if not buf:
-            buf = p
-            continue
-        # Grow chunks to reduce boundary artifacts
-        if len(buf) < 220:
-            buf = f"{buf} {p}"
-            continue
-        yield buf
-        buf = p
-    if buf:
-        yield buf
 # ------------------------------------------------------------
 # AUDIO CONVERSION FIX
 # Fixes: "'Tensor' object has no attribute 'astype'"
 # ------------------------------------------------------------
 def audio_to_int16_np(audio):
-    # audio can be torch.Tensor or np.ndarray
     if isinstance(audio, torch.Tensor):
         audio = audio.detach().cpu()
         audio = torch.clamp(audio, -1.0, 1.0)
@@ -405,25 +438,34 @@ def audio_to_pcm_bytes(audio) -> bytes:
     return audio_to_int16_np(audio).tobytes()
 # ------------------------------------------------------------
-# OFFICIAL GENERATION (per your docs)
 # generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
 # ------------------------------------------------------------
 def kokoro_generate(chunk: str, voice_code: str, speed: float):
     lang_code = voice_to_lang_code(voice_code)
     pipeline = PIPELINES[lang_code]
-    # Keep split_pattern exactly in the spirit of your docs
-    # Our own splitter already splits on sentence/newlines, so this stays light.
     generator = pipeline(
         chunk,
         voice=voice_code,
         speed=float(speed),
         split_pattern=r"\n+",
     )
     for _, _, audio in generator:
         yield audio
 # ------------------------------------------------------------
 # GRADIO STREAM
 # ------------------------------------------------------------
@@ -442,7 +484,6 @@ def gradio_stream_generator(text, voice_name, speed):
 # ------------------------------------------------------------
 # FASTAPI + WEBSOCKET QUEUE
-# Keep it single-file on CPU to stay stable under load.
 # ------------------------------------------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
@@ -454,7 +495,6 @@ async def audio_engine_loop():
     while True:
         ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
         try:
             if ws.client_state.value > 1:
                 continue
@@ -478,6 +518,9 @@ async def audio_engine_loop():
 @api.on_event("startup")
 async def startup():
     asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
@@ -517,6 +560,7 @@ async def websocket_endpoint(ws: WebSocket):
             if "text" in data:
                 text = normalize_text(data["text"])
                 for chunk in tuned_splitter(text):
                     if chunk.strip():
                         await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
@@ -533,7 +577,7 @@ async def websocket_endpoint(ws: WebSocket):
 # GRADIO UI
 # ------------------------------------------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
-    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(

 print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
# Keep CPU threads predictable.
# Each limit is applied on its own so that a bad value (or torch refusing one
# setting) does not silently skip the other, and failures are reported instead
# of being swallowed. Startup still continues on failure (best effort).
def apply_thread_limit(env_var, default, setter):
    """Read an integer from ``env_var`` (falling back to ``default``) and pass it to ``setter``.

    Any exception raised while parsing the value or applying it is printed
    and suppressed; the function always returns ``None``.
    """
    try:
        setter(int(os.environ.get(env_var, default)))
    except Exception as exc:  # startup boundary: report and carry on
        print(f"⚠️ {env_var} not applied: {exc}")


apply_thread_limit("TORCH_NUM_THREADS", "2", torch.set_num_threads)
apply_thread_limit("TORCH_NUM_INTEROP_THREADS", "1", torch.set_num_interop_threads)
 # ------------------------------------------------------------
+# OFFICIAL PIPELINES (per your pasted docs)
 # ------------------------------------------------------------
 # One KPipeline per language code, keyed by that code:
 #   "a" -> 🇺🇸 American English
 #   "b" -> 🇬🇧 British English
 PIPELINES = {code: KPipeline(lang_code=code) for code in ("a", "b")}
 VOICE_CHOICES = {
 }
 def voice_to_lang_code(voice_code: str) -> str:
     """Return the pipeline language code for a voice identifier.

     Voices whose code starts with ``bf_`` or ``bm_`` map to ``"b"``;
     every other voice maps to ``"a"``.
     """
     b_prefixes = ("bf_", "bm_")
     return "b" if voice_code.startswith(b_prefixes) else "a"
 # ------------------------------------------------------------
# TEXT NORMALIZATION
# Wraps the word "Kokoro" in pronunciation markup: [Kokoro](/kˈOkəɹO/)
# ------------------------------------------------------------
_KOKORO_MARKUP = "[Kokoro](/kˈOkəɹO/)"

# Matches "Kokoro" unless it is immediately followed by "](", i.e. unless it
# is already the label of a [label](/phonemes/) markup span.
_BARE_KOKORO = re.compile(r"Kokoro(?!\]\()")


def normalize_text(text: str) -> str:
    """Replace each bare ``Kokoro`` in ``text`` with its pronunciation markup.

    Occurrences that are already the label of a markup span are left alone,
    so running the function on its own output does not nest the markup.
    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not text:
        return text
    return _BARE_KOKORO.sub(_KOKORO_MARKUP, text)
 # ------------------------------------------------------------
+# FAST-FIRST-AUDIO SPLITTER (your old technique)
+# Progressive thresholds so first chunk is quick.
+# Also includes a fallback to cut long text even without punctuation.
 # ------------------------------------------------------------
# True when the accumulated buffer currently ends on a punctuation boundary.
_PUNCT_END = re.compile(r"[.,!?;:\n]$")


def tuned_splitter(text: str):
    """Yield chunks of ``text``, keeping the earliest chunks short.

    The text is split on runs of ``. , ! ? ; :`` and newlines. Pieces are
    accumulated and emitted once the buffer ends on punctuation and has
    reached a size threshold that grows with the number of chunks already
    emitted (60, 120, 180, then 260 characters).

    Fallback for text with little or no punctuation: whenever the buffer
    reaches the hard limit (320 characters before the first chunk, 520
    afterwards) it is cut at the last space inside that limit, repeatedly,
    until the remainder is below the limit. If there is no space past
    position 40 inside the limit, the cut moves to the next space after it;
    if there is none at all, the whole buffer is emitted as one chunk.

    Empty or ``None`` input yields nothing. Emitted chunks are stripped of
    surrounding whitespace and are never empty.
    """
    text = (text or "").strip()
    if not text:
        return

    def threshold_for(n: int) -> int:
        if n == 0:
            return 60   # fast first audio
        if n == 1:
            return 120
        if n == 2:
            return 180
        return 260

    parts = re.split(r"([.,!?;:\n]+)", text)
    buffer = ""
    chunk_count = 0

    for part in parts:
        buffer += part

        # Emit when a punctuation boundary is hit and the buffer is big enough.
        if _PUNCT_END.search(buffer) and len(buffer) >= threshold_for(chunk_count):
            out = buffer.strip()
            if out:
                yield out
                chunk_count += 1
                buffer = ""
                continue

        # Fallback: keep cutting until the remainder is under the hard limit.
        # The limit is re-evaluated each pass because it depends on how many
        # chunks have been emitted so far.
        while True:
            hard_max = 320 if chunk_count == 0 else 520
            if len(buffer) < hard_max:
                break

            # Prefer the last space inside the limit so the chunk stays bounded.
            cut = buffer.rfind(" ", 0, hard_max)
            if cut <= 40:
                # No usable space inside the limit: use the next one after it
                # rather than splitting in the middle of a word.
                cut = buffer.find(" ", hard_max)

            if cut == -1:
                out, buffer = buffer.strip(), ""
            else:
                out, buffer = buffer[:cut].strip(), buffer[cut:].strip()

            if out:
                yield out
                chunk_count += 1

    if buffer.strip():
        yield buffer.strip()
 # ------------------------------------------------------------
 # AUDIO CONVERSION FIX
 # Fixes: "'Tensor' object has no attribute 'astype'"
 # ------------------------------------------------------------
 def audio_to_int16_np(audio):
     if isinstance(audio, torch.Tensor):
         audio = audio.detach().cpu()
         audio = torch.clamp(audio, -1.0, 1.0)
     return audio_to_int16_np(audio).tobytes()
 # ------------------------------------------------------------
# OFFICIAL GENERATION
# Call shape used below:
# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
# ------------------------------------------------------------
def kokoro_generate(chunk: str, voice_code: str, speed: float):
    """Yield the audio of every segment the pipeline produces for ``chunk``.

    The pipeline is looked up in ``PIPELINES`` by the language code derived
    from ``voice_code``; ``speed`` is coerced to ``float`` before the call.
    Each pipeline result is unpacked as a 3-item sequence and only its last
    item (the audio) is yielded.
    """
    engine = PIPELINES[voice_to_lang_code(voice_code)]
    results = engine(
        chunk,
        voice=voice_code,
        speed=float(speed),
        split_pattern=r"\n+",
    )
    for result in results:
        _, _, samples = result
        yield samples
+# ------------------------------------------------------------
+# WARMUP
+# Moves the first-call latency to startup instead of first user request.
+# ------------------------------------------------------------
def warmup():
    """Run one tiny synthesis so first-call latency is paid up front.

    Only the first audio item from ``kokoro_generate`` is pulled. Any
    exception is caught and reported with a printed message, so this
    function never raises.
    """
    try:
        segments = kokoro_generate("Hello.", "af_bella", 1.0)
        # Pull a single item only; the rest of the generator is not consumed.
        next(iter(segments), None)
        print("✅ WARMUP DONE")
    except Exception as err:
        print(f"⚠️ WARMUP FAILED: {err}")
 # ------------------------------------------------------------
 # GRADIO STREAM
 # ------------------------------------------------------------
 # ------------------------------------------------------------
 # FASTAPI + WEBSOCKET QUEUE
 # ------------------------------------------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
     while True:
         ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
         try:
             if ws.client_state.value > 1:
                 continue
 @api.on_event("startup")
 async def startup():
     """Warm the model up, then start the background audio engine loop.

     The warmup runs on ``INFERENCE_EXECUTOR`` so the event loop is not
     blocked while it executes; startup completes only after it finishes.
     """
     loop = asyncio.get_running_loop()
     await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
     # Keep a strong reference to the task: the event loop itself only holds
     # a weak one, so an unreferenced task may be garbage-collected mid-run.
     api.state.engine_task = asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
             if "text" in data:
                 text = normalize_text(data["text"])
+                # Enqueue fast first chunk first
                 for chunk in tuned_splitter(text):
                     if chunk.strip():
                         await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
 # GRADIO UI
 # ------------------------------------------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Fast First Audio)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(