Spaces:

auralodyssey
/

api

Running

App Files Files Community

auralodyssey committed on Jan 6

Commit

a649960

verified ·

1 Parent(s): 20fe4d6

Update app.py

Browse files

Files changed (1) hide show

app.py +240 -203

app.py CHANGED Viewed

@@ -298,244 +298,280 @@ import os
 import re
 import time
 import asyncio
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import gradio as gr
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
-import torch
 from kokoro import KPipeline
-# ----------------------------
-# HARD LIMIT CPU THREADS (2 vCPU box)
-# ----------------------------
 os.environ.setdefault("OMP_NUM_THREADS", "2")
 os.environ.setdefault("MKL_NUM_THREADS", "2")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
-try:
-    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "2")))
-    torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))
-except Exception:
-    pass
-# Optional: uvloop for faster event loop on HF Linux
-try:
-    import uvloop  # type: ignore
-    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-except Exception:
-    pass
-print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE, LOW LATENCY)")
-# ----------------------------
-# VOICES
-# ----------------------------
 VOICE_CHOICES = {
-    "🇺🇸 🚺 Heart": "af_heart", "🇺🇸 🚺 Bella": "af_bella", "🇺🇸 🚺 Nicole": "af_nicole",
-    "🇺🇸 🚺 Aoede": "af_aoede", "🇺🇸 🚺 Kore": "af_kore", "🇺🇸 🚺 Sarah": "af_sarah",
-    "🇺🇸 🚺 Nova": "af_nova", "🇺🇸 🚺 Sky": "af_sky", "🇺🇸 🚺 Alloy": "af_alloy",
-    "🇺🇸 🚺 Jessica": "af_jessica", "🇺🇸 🚺 River": "af_river", "🇺🇸 🚹 Michael": "am_michael",
-    "🇺🇸 🚹 Fenrir": "am_fenrir", "🇺🇸 🚹 Puck": "am_puck", "🇺🇸 🚹 Echo": "am_echo",
-    "🇺🇸 🚹 Eric": "am_eric", "🇺🇸 🚹 Liam": "am_liam", "🇺🇸 🚹 Onyx": "am_onyx",
-    "🇺🇸 🚹 Santa": "am_santa", "🇺🇸 🚹 Adam": "am_adam", "🇬🇧 🚺 Emma": "bf_emma",
-    "🇬🇧 🚺 Isabella": "bf_isabella", "🇬🇧 🚺 Alice": "bf_alice", "🇬🇧 🚺 Lily": "bf_lily",
-    "🇬🇧 🚹 George": "bm_george", "🇬🇧 🚹 Fable": "bm_fable", "🇬🇧 🚹 Lewis": "bm_lewis",
     "🇬🇧 🚹 Daniel": "bm_daniel",
 }
-def voice_to_lang_code(voice_code: str) -> str:
-    if voice_code.startswith("bf_") or voice_code.startswith("bm_"):
-        return "b"  # British
-    return "a"      # American
-# ----------------------------
-# PIPELINES (keep hot in RAM)
-# ----------------------------
-PIPELINES = {
-    "a": KPipeline(lang_code="a"),
-    "b": KPipeline(lang_code="b"),
-}
-# ----------------------------
-# TEXT NORMALIZATION (matches your pasted official docs)
-# ----------------------------
-def normalize_text(text: str) -> str:
-    if not text:
-        return ""
-    return text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-# ----------------------------
-# LOW LATENCY SEGMENTATION
-# One pipeline call per request.
-# We inject newlines to let split_pattern=r"\n+" split inside Kokoro.
-# We also force a small first segment for fast first audio.
-# ----------------------------
-_SENT_BOUNDARY = re.compile(r"([.!?;:])\s+")
-def inject_newlines_for_fast_stream(text: str) -> str:
-    text = normalize_text(text).strip()
-    if not text:
-        return ""
-    # Sentence boundaries -> newline so official split_pattern can segment
-    text = _SENT_BOUNDARY.sub(r"\1\n", text)
-    # Also split on existing multi-newlines
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    # Guarantee a small first segment for low time-to-first-audio
-    if "\n" not in text and len(text) > 90:
-        cut = text.rfind(" ", 0, 70)
-        if cut < 35:
-            cut = 70
-        text = text[:cut].strip() + "\n" + text[cut:].strip()
-    return text
-# ----------------------------
-# AUDIO CONVERSION (fast, safe)
-# ----------------------------
-def audio_to_int16_np(audio):
     if isinstance(audio, torch.Tensor):
-        audio = audio.detach().cpu()
-        audio = torch.clamp(audio, -1.0, 1.0)
-        return (audio * 32767.0).to(torch.int16).numpy()
-    audio = np.asarray(audio)
-    audio = np.clip(audio, -1.0, 1.0)
-    return (audio * 32767.0).astype(np.int16)
-def audio_to_pcm_bytes(audio) -> bytes:
-    return audio_to_int16_np(audio).tobytes()
-# ----------------------------
-# OFFICIAL GENERATION PATH (single pipeline call)
-# generator = pipeline(text, voice='af_heart', speed=1, split_pattern=r'\n+')
-# ----------------------------
-def kokoro_generator_full(text: str, voice_code: str, speed: float):
-    lang_code = voice_to_lang_code(voice_code)
-    pipeline = PIPELINES[lang_code]
-    text = inject_newlines_for_fast_stream(text)
-    if not text:
-        return
     with torch.inference_mode():
-        generator = pipeline(
-            text,
-            voice=voice_code,
             speed=float(speed),
-            split_pattern=r"\n+",
         )
-        for _, _, audio in generator:
-            yield audio
-# ----------------------------
-# WARMUP (pay cold-start cost at boot)
-# ----------------------------
 def warmup():
     try:
         t0 = time.time()
-        for _ in kokoro_generator_full("Hello.", "af_bella", 1.0):
-            break
-        print(f"✅ WARMUP DONE in {time.time() - t0:.2f}s")
     except Exception as e:
-        print(f"⚠️ WARMUP FAILED: {e}")
-# ----------------------------
-# GRADIO UI STREAM
-# ----------------------------
-def gradio_stream(text, voice_name, speed):
-    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
-    text = normalize_text(text)
-    i = 0
-    t0 = time.time()
-    for audio in kokoro_generator_full(text, voice_code, speed):
-        if i == 0:
-            print(f"⚡ UI first audio in {time.time() - t0:.2f}s")
-        i += 1
-        yield 24000, audio_to_int16_np(audio)
-# ----------------------------
-# FASTAPI WS ENGINE
-# Single worker thread for actual generation.
-# Stream frames to client as soon as they exist.
-# No buffering a full list before sending.
-# ----------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
-INFERENCE_QUEUE: asyncio.Queue = asyncio.Queue()
 async def audio_engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
     while True:
-        ws, voice_code, speed, text = await INFERENCE_QUEUE.get()
-        # Skip dead clients early
-        if ws.client_state.value > 1:
-            continue
-        frame_q: asyncio.Queue = asyncio.Queue(maxsize=6)
-        def _worker():
-            try:
-                for audio in kokoro_generator_full(text, voice_code, speed):
-                    b = audio_to_pcm_bytes(audio)
-                    # backpressure aware
-                    while True:
-                        try:
-                            loop.call_soon_threadsafe(frame_q.put_nowait, b)
-                            break
-                        except Exception:
-                            time.sleep(0.001)
-                loop.call_soon_threadsafe(frame_q.put_nowait, None)
-            except Exception as e:
-                print(f"API Worker Error: {e}")
-                try:
-                    loop.call_soon_threadsafe(frame_q.put_nowait, None)
-                except Exception:
-                    pass
-        INFERENCE_EXECUTOR.submit(_worker)
-        first_sent = False
-        started = time.time()
-        while True:
-            frame = await frame_q.get()
-            if frame is None:
-                break
-            if ws.client_state.value > 1:
-                break
             try:
-                await ws.send_bytes(frame)
-                if not first_sent:
-                    print(f"⚡ API first audio in {time.time() - started:.2f}s")
-                    first_sent = True
             except Exception:
-                break
 @api.on_event("startup")
 async def startup():
-    loop = asyncio.get_running_loop()
-    await loop.run_in_executor(INFERENCE_EXECUTOR, warmup)
     asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
-    voice_code = "af_bella"
     speed = 1.0
     print(f"✅ Client connected: {ws.client}")
@@ -554,51 +590,52 @@ async def websocket_endpoint(ws: WebSocket):
             try:
                 data = await ws.receive_json()
             except WebSocketDisconnect:
-                print("❌ Client disconnected cleanly")
                 break
-            except Exception as e:
-                print(f"⚠️ Connection lost: {e}")
                 break
             if "config" in data:
-                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
-                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
             if "text" in data:
-                text = normalize_text(data.get("text", ""))
-                if text.strip():
-                    await INFERENCE_QUEUE.put((ws, voice_code, speed, text))
             if "flush" in data:
                 pass
     finally:
         heartbeat_task.cancel()
-# ----------------------------
-# GRADIO APP
-# ----------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
     gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
-                lines=3,
-                value="The system is live. Use the Gradio UI, or connect to /ws/audio.",
-            )
-            voice_in = gr.Dropdown(
-                list(VOICE_CHOICES.keys()),
-                value="🇺🇸 🚺 Bella",
-                label="Voice",
             )
             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
             btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
-    btn.click(gradio_stream, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
 final_app = gr.mount_gradio_app(api, app, path="/")

 import re
 import time
 import asyncio
+import uvloop
 import numpy as np
 import gradio as gr
+import torch
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from concurrent.futures import ThreadPoolExecutor
 import uvicorn
+# Official pipeline
 from kokoro import KPipeline
+# -------------------------
+# CPU + runtime tuning
+# -------------------------
+# Keep these conservative. HF CPU is usually 2 vCPU.
 os.environ.setdefault("OMP_NUM_THREADS", "2")
 os.environ.setdefault("MKL_NUM_THREADS", "2")
 os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
+torch.set_num_threads(2)
+torch.set_num_interop_threads(1)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+SAMPLE_RATE = 24000
+# -------------------------
+# Voices (use Kokoro voice ids)
+# -------------------------
 VOICE_CHOICES = {
+    "🇺🇸 🚺 Heart": "af_heart",
+    "🇺🇸 🚺 Bella": "af_bella",
+    "🇺🇸 🚺 Nicole": "af_nicole",
+    "🇺🇸 🚺 Aoede": "af_aoede",
+    "🇺🇸 🚺 Kore": "af_kore",
+    "🇺🇸 🚺 Sarah": "af_sarah",
+    "🇺🇸 🚺 Nova": "af_nova",
+    "🇺🇸 🚺 Sky": "af_sky",
+    "🇺🇸 🚺 Alloy": "af_alloy",
+    "🇺🇸 🚺 Jessica": "af_jessica",
+    "🇺🇸 🚺 River": "af_river",
+    "🇺🇸 🚹 Michael": "am_michael",
+    "🇺🇸 🚹 Fenrir": "am_fenrir",
+    "🇺🇸 🚹 Puck": "am_puck",
+    "🇺🇸 🚹 Echo": "am_echo",
+    "🇺🇸 🚹 Eric": "am_eric",
+    "🇺🇸 🚹 Liam": "am_liam",
+    "🇺🇸 🚹 Onyx": "am_onyx",
+    "🇺🇸 🚹 Santa": "am_santa",
+    "🇺🇸 🚹 Adam": "am_adam",
+    "🇬🇧 🚺 Emma": "bf_emma",
+    "🇬🇧 🚺 Isabella": "bf_isabella",
+    "🇬🇧 🚺 Alice": "bf_alice",
+    "🇬🇧 🚺 Lily": "bf_lily",
+    "🇬🇧 🚹 George": "bm_george",
+    "🇬🇧 🚹 Fable": "bm_fable",
+    "🇬🇧 🚹 Lewis": "bm_lewis",
     "🇬🇧 🚹 Daniel": "bm_daniel",
 }
+DEFAULT_VOICE_UI = "🇺🇸 🚺 Bella"
+DEFAULT_VOICE = VOICE_CHOICES[DEFAULT_VOICE_UI]
+# -------------------------
+# Kokoro pipeline (global)
+# -------------------------
+print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
+PIPELINE = KPipeline(lang_code="a")
+# -------------------------
+# Helpers
+# -------------------------
+def _to_numpy_audio(audio):
+    # Kokoro may return a torch.Tensor or numpy array
     if isinstance(audio, torch.Tensor):
+        return audio.detach().cpu().numpy()
+    return np.asarray(audio)
+def _float_to_int16(audio_f32):
+    audio_f32 = np.clip(audio_f32, -1.0, 1.0).astype(np.float32)
+    return (audio_f32 * 32767.0).astype(np.int16)
def trim_silence(audio_f32, threshold=0.01, pad=240):
    """Strip leading and trailing quiet samples from a 1-D float signal.

    Args:
        audio_f32: 1-D float array of samples.
        threshold: Absolute amplitude above which a sample counts as sound.
        pad: Number of samples kept on each side of the detected sound.

    Returns:
        A slice of ``audio_f32``. The input is returned unchanged when it is
        empty or when no sample exceeds ``threshold``.
    """
    if audio_f32.size == 0:
        return audio_f32

    loud_idx = np.flatnonzero(np.abs(audio_f32) > threshold)
    if loud_idx.size == 0:
        return audio_f32

    first_loud = int(loud_idx[0])
    after_last_loud = int(loud_idx[-1]) + 1
    lo = max(0, first_loud - pad)
    hi = min(len(audio_f32), after_last_loud + pad)
    return audio_f32[lo:hi]
def crossfade_concat(a, b, overlap=1200):
    """Join two audio arrays, blending the seam with a linear crossfade.

    Args:
        a: First 1-D sample array, or ``None``.
        b: Second 1-D sample array, or ``None``.
        overlap: Number of samples over which ``a`` fades out while ``b``
            fades in (1200 samples is 50 ms at 24 kHz).

    Returns:
        The other argument when one of them is ``None``. Otherwise a new
        array of length ``len(a) + len(b) - overlap``, or a plain
        concatenation when no crossfade is possible.
    """
    if a is None:
        return b
    if b is None:
        return a
    # A non-positive overlap must not reach the slicing below: a[-0:] selects
    # the whole array, not an empty tail, and the fade multiply then fails on
    # mismatched shapes. Inputs shorter than the overlap also fall back to a
    # hard join.
    if overlap <= 0 or len(a) < overlap or len(b) < overlap:
        return np.concatenate([a, b])

    fade_out = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
    fade_in = 1.0 - fade_out
    mixed = a[-overlap:] * fade_out + b[:overlap] * fade_in
    return np.concatenate([a[:-overlap], mixed, b[overlap:]])
# A chunk may end at punctuation only when whitespace or end-of-text follows,
# so "3.14" and "1,000" are never cut in half. A run of newlines is always a
# boundary.
_CHUNK_BOUNDARY = re.compile(r"([.,!?;:]+(?=\s|$)|\n+)")
# Minimum chunk length (characters) for the first, second and third chunk.
_CHUNK_THRESHOLDS = (80, 140, 220)
# Minimum chunk length for every later chunk.
_CHUNK_THRESHOLD_REST = 320


def tuned_splitter(text):
    """Yield ``text`` in chunks that grow in size.

    The first chunk is kept small so the first audio packet arrives quickly;
    later chunks are larger for efficiency. A chunk is emitted at the first
    boundary where the accumulated text has reached the current threshold.

    Args:
        text: Text to split. ``None`` or an empty string yields nothing.

    Yields:
        Non-empty chunks with surrounding whitespace stripped. A chunk may
        still contain newlines when a boundary fell below the threshold.
    """
    if not text:
        return

    buf = ""
    chunk_idx = 0
    for idx, piece in enumerate(_CHUNK_BOUNDARY.split(text)):
        buf += piece
        # re.split with one capturing group alternates text (even index) and
        # delimiter (odd index); a chunk can only close on a delimiter.
        if idx % 2 == 0:
            continue
        if chunk_idx < len(_CHUNK_THRESHOLDS):
            threshold = _CHUNK_THRESHOLDS[chunk_idx]
        else:
            threshold = _CHUNK_THRESHOLD_REST
        if len(buf) < threshold:
            continue
        chunk = buf.strip()
        if chunk:
            yield chunk
            chunk_idx += 1
        buf = ""

    tail = buf.strip()
    if tail:
        yield tail
_ALLCAPS_WORD = re.compile(r"\b([A-Z]{2,})\b")
_CAMEL_BOUNDARY = re.compile(r"([a-z])([A-Z])")


def normalize_names_minimally(text):
    """Apply cheap rewrites intended to reduce skipped acronyms and CamelCase.

    Two rewrites run in order:
      1. Stand-alone all-caps words are spelled out: "AI" -> "A I".
      2. A space is inserted at lower-to-upper boundaries:
         "OpenAI" -> "Open AI".

    Args:
        text: Input text. ``None`` or an empty string returns ``""``.

    Returns:
        The rewritten text.
    """
    if not text:
        return ""
    # " ".join over the matched string spaces out its letters.
    text = _ALLCAPS_WORD.sub(lambda m: " ".join(m.group(1)), text)
    return _CAMEL_BOUNDARY.sub(r"\1 \2", text)
# Pipelines keyed by Kokoro language code. Seeded with the pipeline built at
# import time; other languages are created on first use.
_PIPELINES_BY_LANG = {"a": PIPELINE}


def _pipeline_for_voice(voice_id):
    """Return the pipeline whose language code matches ``voice_id``."""
    # Voice ids starting with "bf_"/"bm_" are the British voices listed in
    # VOICE_CHOICES; everything else uses the American pipeline.
    lang_code = "b" if str(voice_id).startswith(("bf_", "bm_")) else "a"
    pipeline = _PIPELINES_BY_LANG.get(lang_code)
    if pipeline is None:
        # The first request for a language pays the construction cost.
        # setdefault keeps a single winner if two threads race here.
        pipeline = _PIPELINES_BY_LANG.setdefault(
            lang_code, KPipeline(lang_code=lang_code)
        )
    return pipeline


def synthesize_one_chunk(chunk, voice_id, speed):
    """Synthesize one text chunk into a single float32 audio array.

    Args:
        chunk: Text to speak. Embedded newlines make the pipeline yield
            several segments, which are joined here.
        voice_id: Kokoro voice id (e.g. "af_bella"); also selects the
            language pipeline.
        speed: Speech speed multiplier, coerced to ``float``.

    Returns:
        A float32 NumPy array with each segment silence-trimmed and
        crossfaded into the next, or ``None`` if the pipeline produced no
        audio.
    """
    pipeline = _pipeline_for_voice(voice_id)
    with torch.inference_mode():
        segments = pipeline(
            chunk,
            voice=voice_id,
            speed=float(speed),
            split_pattern=r"\n+",
        )
        out_audio = None
        # Each item is a 3-tuple whose last element is the audio.
        for _, _, audio in segments:
            audio_np = trim_silence(_to_numpy_audio(audio).astype(np.float32))
            out_audio = crossfade_concat(out_audio, audio_np, overlap=1200)
        return out_audio
+# -------------------------
+# Warmup to remove cold start latency
+# -------------------------
 def warmup():
     try:
         t0 = time.time()
+        _ = synthesize_one_chunk("Warmup.", DEFAULT_VOICE, 1.0)
+        dt = time.time() - t0
+        print(f"✅ Warmup done in {dt:.2f}s")
     except Exception as e:
+        print(f"⚠️ Warmup failed: {e}")
+# Run warmup in background thread once
+WARMUP_EXECUTOR = ThreadPoolExecutor(max_workers=1)
+WARMUP_EXECUTOR.submit(warmup)
+# -------------------------
+# Streaming strategy
+# -------------------------
def stream_generator(text, voice_ui, speed):
    """Stream synthesized audio for the Gradio UI.

    The first chunk is yielded as soon as it is ready. Later chunks are
    crossfaded into a buffer that is flushed once it holds at least 0.9
    seconds of audio, and once more at the end for any remainder.

    Args:
        text: Text from the UI textbox.
        voice_ui: Display name of the voice; unknown names fall back to the
            default voice.
        speed: Speech speed multiplier.

    Yields:
        ``(SAMPLE_RATE, int16_samples)`` tuples.
    """
    voice_id = VOICE_CHOICES.get(voice_ui, DEFAULT_VOICE)
    text = normalize_names_minimally(text)
    print("--- START UI STREAM ---")

    buffer_min_seconds = 0.9
    sent_first = False
    pending = None

    for chunk_idx, chunk in enumerate(tuned_splitter(text)):
        t0 = time.time()
        audio_f32 = synthesize_one_chunk(chunk, voice_id, speed)
        if audio_f32 is None or len(audio_f32) == 0:
            continue
        dt = time.time() - t0
        print(f"⚡ UI chunk {chunk_idx}: {len(chunk)} chars in {dt:.2f}s")

        if not sent_first:
            # Send the first packet straight away for low perceived latency.
            sent_first = True
            yield (SAMPLE_RATE, _float_to_int16(audio_f32))
            continue

        pending = crossfade_concat(pending, audio_f32, overlap=1200)
        enough = pending is not None and len(pending) >= int(
            buffer_min_seconds * SAMPLE_RATE
        )
        if enough:
            yield (SAMPLE_RATE, _float_to_int16(pending))
            pending = None

    if pending is not None and len(pending) > 0:
        yield (SAMPLE_RATE, _float_to_int16(pending))
    print("--- END UI STREAM ---")
+# -------------------------
+# API (FastAPI + WS)
+# -------------------------
 api = FastAPI()
+# One inference worker is the right call on 2 vCPU
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
+INFERENCE_QUEUE = asyncio.Queue()
 async def audio_engine_loop():
     print("⚡ API AUDIO PIPELINE STARTED")
     loop = asyncio.get_running_loop()
     while True:
+        job = await INFERENCE_QUEUE.get()
+        text, voice_id, speed, ws = job
+        try:
+            if ws.client_state.value > 1:
+                continue
+            # Run synthesis in the single worker thread
+            audio_f32 = await loop.run_in_executor(
+                INFERENCE_EXECUTOR,
+                lambda: synthesize_one_chunk(text, voice_id, speed),
+            )
+            if audio_f32 is None or len(audio_f32) == 0:
+                continue
+            pcm = _float_to_int16(audio_f32).tobytes()
             try:
+                await ws.send_bytes(pcm)
             except Exception:
+                pass
+        except Exception as e:
+            print(f"API Engine Error: {e}")
 @api.on_event("startup")
 async def startup():
     asyncio.create_task(audio_engine_loop())
 @api.websocket("/ws/audio")
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
+    voice_id = DEFAULT_VOICE
     speed = 1.0
+    loop = asyncio.get_running_loop()
     print(f"✅ Client connected: {ws.client}")
             try:
                 data = await ws.receive_json()
             except WebSocketDisconnect:
                 break
+            except Exception:
                 break
             if "config" in data:
+                voice_ui = data.get("voice", DEFAULT_VOICE_UI)
+                voice_id = VOICE_CHOICES.get(voice_ui, DEFAULT_VOICE)
                 speed = float(data.get("speed", speed))
             if "text" in data:
+                raw = data["text"]
+                raw = normalize_names_minimally(raw)
+                # First chunk tiny, rest larger, same as UI
+                for chunk in tuned_splitter(raw):
+                    if not chunk.strip():
+                        continue
+                    await INFERENCE_QUEUE.put((chunk, voice_id, speed, ws))
             if "flush" in data:
                 pass
+    except Exception as e:
+        print(f"🔥 Critical WS Error: {e}")
     finally:
         heartbeat_task.cancel()
+# -------------------------
+# Gradio UI
+# -------------------------
 with gr.Blocks(title="Kokoro TTS") as app:
     gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Low Latency)")
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
                 label="Input Text",
+                lines=4,
+                value="The system is live. Use the UI or connect to /ws/audio.",
             )
+            voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value=DEFAULT_VOICE_UI, label="Voice")
             speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
             btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+    btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
 final_app = gr.mount_gradio_app(api, app, path="/")