Spaces:

auralodyssey
/

api

Sleeping

App Files Files Community

auralodyssey committed on Jan 6

Commit

7576e85

verified ·

1 Parent(s): 4daf7c6

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -181

app.py CHANGED Viewed

@@ -295,12 +295,9 @@
 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
-import re
-import json
 import time
 import asyncio
-import uvloop
-from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
@@ -309,217 +306,145 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
 import torch
-import soundfile as sf
-from huggingface_hub import hf_hub_download
 from kokoro import KPipeline
-# -----------------------------
-# HF SPACE REALITY SETTINGS
-# -----------------------------
-# Free CPU Basic is small, so keep concurrency controlled.
-torch.set_num_threads(max(1, int(os.environ.get("TORCH_NUM_THREADS", "2"))))
-asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
-# -----------------------------
-# VOICES
-# -----------------------------
-VOICE_CHOICES = {
-    "🇺🇸 🚺 Heart": "af_heart",
-    "🇺🇸 🚺 Bella": "af_bella",
-    "🇺🇸 🚺 Nicole": "af_nicole",
-    "🇺🇸 🚺 Aoede": "af_aoede",
-    "🇺🇸 🚺 Kore": "af_kore",
-    "🇺🇸 🚺 Sarah": "af_sarah",
-    "🇺🇸 🚺 Nova": "af_nova",
-    "🇺🇸 🚺 Sky": "af_sky",
-    "🇺🇸 🚺 Alloy": "af_alloy",
-    "🇺🇸 🚺 Jessica": "af_jessica",
-    "🇺🇸 🚺 River": "af_river",
-    "🇺🇸 🚹 Michael": "am_michael",
-    "🇺🇸 🚹 Fenrir": "am_fenrir",
-    "🇺🇸 🚹 Puck": "am_puck",
-    "🇺🇸 🚹 Echo": "am_echo",
-    "🇺🇸 🚹 Eric": "am_eric",
-    "🇺🇸 🚹 Liam": "am_liam",
-    "🇺🇸 🚹 Onyx": "am_onyx",
-    "🇺🇸 🚹 Santa": "am_santa",
-    "🇺🇸 🚹 Adam": "am_adam",
-    "🇬🇧 🚺 Emma": "bf_emma",
-    "🇬🇧 🚺 Isabella": "bf_isabella",
-    "🇬🇧 🚺 Alice": "bf_alice",
-    "🇬🇧 🚺 Lily": "bf_lily",
-    "🇬🇧 🚹 George": "bm_george",
-    "🇬🇧 🚹 Fable": "bm_fable",
-    "🇬🇧 🚹 Lewis": "bm_lewis",
-    "🇬🇧 🚹 Daniel": "bm_daniel",
-}
-# Kokoro official repo for weights + voices
-KOKORO_REPO = "hexgrad/Kokoro-82M"
-# -----------------------------
-# PIPELINES
-# lang_code must match voice family. :contentReference[oaicite:7]{index=7}
-# -----------------------------
 PIPELINES = {
-    "a": KPipeline(lang_code="a"),  # American English
-    "b": KPipeline(lang_code="b"),  # British English
 }
-# -----------------------------
-# OPTIONAL: preload spacy model if present
-# prevents runtime download surprises
-# -----------------------------
-try:
-    import spacy
-    spacy.load("en_core_web_sm")
-except Exception:
-    pass
-# -----------------------------
-# VOICE CACHE (torch tensors)
-# -----------------------------
-VOICE_TENSOR_CACHE = {}
 def voice_to_lang_code(voice_code: str) -> str:
-    # af_ / am_ => 'a', bf_ / bm_ => 'b'
-    if voice_code.startswith("b"):
         return "b"
     return "a"
-def get_voice_tensor(voice_code: str):
-    if voice_code in VOICE_TENSOR_CACHE:
-        return VOICE_TENSOR_CACHE[voice_code]
-    path = hf_hub_download(
-        repo_id=KOKORO_REPO,
-        filename=f"voices/{voice_code}.pt",
-    )
-    # weights_only True is recommended by torch warning text in your logs
-    vt = torch.load(path, map_location="cpu", weights_only=True)
-    VOICE_TENSOR_CACHE[voice_code] = vt
-    return vt
-# -----------------------------
-# TEXT NORMALIZATION
-# Stops “skipping” for many brand names by avoiding OOD token collapse.
-# Also makes acronyms pronounceable.
-# -----------------------------
-_ACRONYM_RE = re.compile(r"\b([A-Z]{2,})\b")
-_CAMEL_RE = re.compile(r"([a-z])([A-Z])")
-_DIGIT_WORD_RE = re.compile(r"\b(\d+)([A-Za-z]+)\b")
-def normalize_text_for_kokoro(text: str) -> str:
     if not text:
         return text
-    # Keep your special Kokoro pronunciation trick
     text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-    # Split CamelCase: OpenAI -> Open AI
-    text = _CAMEL_RE.sub(r"\1 \2", text)
-    # Handle 2FA -> "2 F A" (first split digits+letters)
-    text = _DIGIT_WORD_RE.sub(r"\1 \2", text)
-    # Acronyms: API -> "A P I"
-    def _spell(m):
-        s = m.group(1)
-        return " ".join(list(s))
-    text = _ACRONYM_RE.sub(_spell, text)
     return text
-# -----------------------------
 # CHUNKING
-# Fewer micro-chunks reduces stalls under load.
-# -----------------------------
 _SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
-def chunk_text(text: str, min_chars: int = 240, max_chars: int = 520):
-    text = text.strip()
     if not text:
         return
-    parts = _SENT_SPLIT.split(text)
     buf = ""
     for p in parts:
-        if not p:
             continue
-        if len(buf) + len(p) + 1 <= max_chars:
-            buf = (buf + " " + p).strip()
-            if len(buf) < min_chars:
-                continue
-            yield buf
-            buf = ""
-        else:
-            if buf:
-                yield buf
-            buf = p.strip()
-            if len(buf) >= min_chars:
-                yield buf
-                buf = ""
     if buf:
         yield buf
-# -----------------------------
-# AUDIO UTILS
-# Avoid trimming per-chunk to prevent audible “missing” regions.
-# Do optional gentle trim only on final concatenated output if needed.
-# -----------------------------
-def float_to_int16(audio_f32: np.ndarray) -> np.ndarray:
-    audio_f32 = np.clip(audio_f32, -1.0, 1.0)
-    return (audio_f32 * 32767.0).astype(np.int16)
-# -----------------------------
-# CORE SYNTH
-# Uses official generator API. :contentReference[oaicite:8]{index=8}
-# -----------------------------
-def kokoro_generate_stream(text: str, voice_code: str, speed: float):
     lang_code = voice_to_lang_code(voice_code)
     pipeline = PIPELINES[lang_code]
-    voice_tensor = get_voice_tensor(voice_code)
-    # We already chunk ourselves, so keep split_pattern simple.
-    # If you pass a strong splitter here, you will double-split and create micro audio pieces.
     generator = pipeline(
-        text,
-        voice=voice_tensor,
         speed=float(speed),
-        split_pattern=r"$^",  # split nothing
     )
     for _, _, audio in generator:
-        # audio is float array at 24kHz
         yield audio
-# -----------------------------
 # GRADIO STREAM
-# -----------------------------
 def gradio_stream_generator(text, voice_name, speed):
     voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-    text = normalize_text_for_kokoro(text)
-    # warm voice cache
-    get_voice_tensor(voice_code)
-    for i, chunk in enumerate(chunk_text(text)):
         t0 = time.time()
-        # generator yields 1 item because split_pattern disables splitting
-        for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
             dur = time.time() - t0
             print(f"⚡ UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
-            yield 24000, float_to_int16(audio_f32)
-# -----------------------------
-# FASTAPI WS
-# -----------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 INFERENCE_QUEUE = asyncio.Queue()
@@ -534,13 +459,13 @@ async def audio_engine_loop():
             if ws.client_state.value > 1:
                 continue
-            def _run():
-                out = []
-                for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
-                    out.append(float_to_int16(audio_f32).tobytes())
-                return out
-            frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run)
             for frame in frames:
                 try:
@@ -589,13 +514,10 @@ async def websocket_endpoint(ws: WebSocket):
                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
-                get_voice_tensor(voice_code)
             if "text" in data:
-                raw = data["text"]
-                text = normalize_text_for_kokoro(raw)
-                # Bigger chunks reduces stalls under load
-                for chunk in chunk_text(text):
                     if chunk.strip():
                         await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
@@ -607,17 +529,17 @@ async def websocket_endpoint(ws: WebSocket):
     finally:
         heartbeat_task.cancel()
-# -----------------------------
 # GRADIO UI
-# -----------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
-    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, HF CPU-friendly)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
                 lines=3,
-                value="The system is live. Use the UI or connect to /ws/audio.",
             )
             voice_in = gr.Dropdown(
                 list(VOICE_CHOICES.keys()),
@@ -634,4 +556,4 @@ with gr.Blocks(title="Kokoro TTS") as app:
 final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
-    uvicorn.run(final_app, host="0.0.0.0", port=7860)

 # if __name__ == "__main__":
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
 import time
+import re
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import uvicorn
 import torch
 from kokoro import KPipeline
+# Optional speed boost on HF Linux
+try:
+    import uvloop  # type: ignore
+    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+except Exception:
+    pass
 print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
+# ------------------------------------------------------------
+# OFFICIAL PIPELINES (one KPipeline per language code, built once at import)
+# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
+# ------------------------------------------------------------
 PIPELINES = {
+    "a": KPipeline(lang_code="a"),
+    "b": KPipeline(lang_code="b"),
 }
+VOICE_CHOICES = {
+    "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
+    "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
+    "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
+    "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
+    "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
+    "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
+    "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
+    "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
+    "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
+    "🇬🇧 🚹 Daniel": "bm_daniel",
+}
 def voice_to_lang_code(voice_code: str) -> str:
     """Return the pipeline language code that matches a voice code.

     Voice codes beginning with ``bf_`` or ``bm_`` map to ``"b"``; every
     other voice code maps to ``"a"``.
     """
     is_british = voice_code.startswith(("bf_", "bm_"))
     return "b" if is_british else "a"
+# ------------------------------------------------------------
+# TEXT HELPERS
+# Pronunciation overrides use inline IPA markup, e.g. [Kokoro](/kˈOkəɹO/)
+# ------------------------------------------------------------
# Inline IPA markup substituted for the bare word "Kokoro".
_KOKORO_MARKUP = "[Kokoro](/kˈOkəɹO/)"

# Matches "Kokoro" unless it is already the label of an inline link, i.e.
# unless it is BOTH preceded by "[" AND followed by "](". Without this guard,
# text that already carries the markup would be wrapped a second time.
_KOKORO_UNMARKED = re.compile(r"(?<!\[)Kokoro|Kokoro(?!\]\()")


def normalize_text(text: str) -> str:
    """Replace each bare ``Kokoro`` with its inline IPA pronunciation markup.

    The substitution is idempotent: occurrences that are already written as
    ``[Kokoro](...)`` are left alone, so normalizing twice (or normalizing
    text that a caller marked up by hand) does not nest the markup.

    Args:
        text: Raw input text. Falsy values (``""`` or ``None``) are returned
            unchanged.

    Returns:
        The text with every unmarked ``Kokoro`` replaced by
        ``[Kokoro](/kˈOkəɹO/)``.
    """
    if not text:
        return text
    # A callable replacement keeps the markup literal (no escape processing).
    return _KOKORO_UNMARKED.sub(lambda _match: _KOKORO_MARKUP, text)
# ------------------------------------------------------------
# CHUNKING
# Main goal: avoid tiny chunks that cause audible discontinuity.
# ------------------------------------------------------------
# Split after sentence-ending punctuation followed by whitespace, or on
# runs of newlines.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")


def tuned_splitter(text: str, min_chars: int = 220):
    """Yield chunks of `text` built from whole sentences.

    Sentences are appended (joined by a single space) to the current chunk
    for as long as that chunk is shorter than `min_chars`. Once the chunk
    has reached `min_chars`, it is yielded and the next sentence starts a
    new chunk. There is no upper bound on chunk length, and the final chunk
    may be shorter than `min_chars`.

    Args:
        text: Input text. ``None`` or whitespace-only input yields nothing.
        min_chars: Length a chunk must reach before it is emitted.
            Defaults to 220.

    Yields:
        Non-empty, stripped chunk strings in input order.
    """
    text = (text or "").strip()
    if not text:
        return

    parts = [p.strip() for p in _SENT_SPLIT.split(text) if p and p.strip()]

    buf = ""
    for p in parts:
        if not buf:
            buf = p
        elif len(buf) < min_chars:
            # Grow chunks to reduce boundary artifacts.
            buf = f"{buf} {p}"
        else:
            yield buf
            buf = p

    if buf:
        yield buf
+# ------------------------------------------------------------
+# AUDIO CONVERSION FIX
+# Fixes: "'Tensor' object has no attribute 'astype'"
+# ------------------------------------------------------------
def audio_to_int16_np(audio):
    """Convert floating-point audio samples to a NumPy ``int16`` array.

    Accepts either a ``torch.Tensor`` or anything ``np.asarray`` understands.
    Both kinds of input go through the same clip/scale/cast path, so they
    cannot drift apart.

    Samples are clipped to ``[-1.0, 1.0]``, scaled by 32767 and truncated
    toward zero. Non-finite samples are sanitized first (NaN -> 0,
    +inf -> 1, -inf -> -1) because casting NaN/inf to an integer type has
    no defined result.

    Args:
        audio: ``torch.Tensor`` or array-like of float samples.

    Returns:
        ``np.ndarray`` with dtype ``int16`` and the same shape as the input.
    """
    if isinstance(audio, torch.Tensor):
        # Detach from autograd, move to host memory, and widen to float32 so
        # reduced-precision tensors can be handed to NumPy.
        audio = audio.detach().cpu().float().numpy()

    samples = np.nan_to_num(np.asarray(audio), nan=0.0, posinf=1.0, neginf=-1.0)
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767.0).astype(np.int16)
def audio_to_pcm_bytes(audio) -> bytes:
    """Return `audio` as raw 16-bit little-endian PCM bytes.

    The samples are converted with ``audio_to_int16_np`` and then serialized.
    The byte order is pinned to little-endian so the emitted bytes do not
    depend on the host's native byte order. On little-endian hosts this is a
    no-op and the output is the same as a plain ``tobytes()``.

    Args:
        audio: ``torch.Tensor`` or array-like accepted by
            ``audio_to_int16_np``.

    Returns:
        The samples as a ``bytes`` object, two bytes per sample.
    """
    return audio_to_int16_np(audio).astype("<i2", copy=False).tobytes()
+# ------------------------------------------------------------
+# OFFICIAL GENERATION (standard KPipeline call, of the form shown below)
+# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
+# ------------------------------------------------------------
def kokoro_generate(chunk: str, voice_code: str, speed: float):
    """Run one text chunk through the matching pipeline and yield its audio.

    The pipeline is looked up in ``PIPELINES`` by the language code derived
    from `voice_code`. It is called with the voice code itself, `speed`
    coerced to ``float``, and a newline-only ``split_pattern``.

    Args:
        chunk: Text to synthesize.
        voice_code: Voice identifier, also used to pick the pipeline.
        speed: Speed value forwarded to the pipeline.

    Yields:
        The third element of each item the pipeline produces (the audio).
    """
    selected = PIPELINES[voice_to_lang_code(voice_code)]
    results = selected(
        chunk,
        voice=voice_code,
        speed=float(speed),
        split_pattern=r"\n+",
    )
    # Each item is a 3-tuple; only the audio part is forwarded.
    for item in results:
        _, _, audio = item
        yield audio
+# ------------------------------------------------------------
 # GRADIO STREAM
+# ------------------------------------------------------------
 def gradio_stream_generator(text, voice_name, speed):
     """Stream synthesized audio for the UI, one pipeline result at a time.

     The text is normalized, split into chunks, and each chunk is synthesized
     in turn. A timing line is printed for every audio result, and start/end
     markers are printed around the whole stream.

     Args:
         text: Text to speak.
         voice_name: Key of ``VOICE_CHOICES``; a value that is not a key is
             used as the voice code as-is.
         speed: Speed value forwarded to ``kokoro_generate``.

     Yields:
         ``(24000, samples)`` tuples, where ``samples`` is the int16 array
         from ``audio_to_int16_np``. NOTE(review): 24000 is presumably the
         sample rate in Hz expected by the audio output - confirm.
     """
     resolved_voice = VOICE_CHOICES.get(voice_name, voice_name)
     prepared = normalize_text(text)
     print("--- START UI STREAM ---")
     for index, piece in enumerate(tuned_splitter(prepared)):
         # The timer starts once per chunk, so the printed duration is
         # measured from the start of the chunk for every result it produces.
         started = time.time()
         for segment in kokoro_generate(piece, resolved_voice, speed):
             elapsed = time.time() - started
             print(f"⚡ UI chunk {index}: {len(piece)} chars in {elapsed:.2f}s")
             yield 24000, audio_to_int16_np(segment)
     print("--- END UI STREAM ---")
+# ------------------------------------------------------------
+# FASTAPI + WEBSOCKET QUEUE
+# A single-worker executor serializes inference on CPU to stay stable under load.
+# ------------------------------------------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
 INFERENCE_QUEUE = asyncio.Queue()
             if ws.client_state.value > 1:
                 continue
+            def _run_and_pack():
+                frames = []
+                for audio in kokoro_generate(chunk, voice_code, speed):
+                    frames.append(audio_to_pcm_bytes(audio))
+                return frames
+            frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run_and_pack)
             for frame in frames:
                 try:
                 voice_name = data.get("voice", "🇺🇸 🚺 Bella")
                 voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
             if "text" in data:
+                text = normalize_text(data["text"])
+                for chunk in tuned_splitter(text):
                     if chunk.strip():
                         await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
     finally:
         heartbeat_task.cancel()
+# ------------------------------------------------------------
 # GRADIO UI
+# ------------------------------------------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
                 lines=3,
+                value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
             )
             voice_in = gr.Dropdown(
                 list(VOICE_CHOICES.keys()),
 final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
+    uvicorn.run(final_app, host="0.0.0.0", port=7860)