Spaces:

auralodyssey
/

api

Sleeping

App Files Files Community

auralodyssey committed on Jan 6

Commit

4daf7c6

verified ·

1 Parent(s): 38881c9

Update app.py

Browse files

Files changed (1) hide show

app.py +235 -196

app.py CHANGED Viewed

@@ -296,219 +296,228 @@
 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
 import re
 import time
 import asyncio
 import uvloop
 import numpy as np
 import gradio as gr
-import torch
-from functools import lru_cache
-from huggingface_hub import hf_hub_download
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
-from concurrent.futures import ThreadPoolExecutor
-# Force Gradio SSR off to avoid Node proxy issues on Spaces
-# You can also set GRADIO_SSR_MODE=0 in Space Variables
-os.environ.setdefault("GRADIO_SSR_MODE", "0")
-# -----------------------
-# HF free tier constraints
-# -----------------------
-# CPU Basic = 2 vCPU, 16GB RAM. Expect queuing under load. :contentReference[oaicite:5]{index=5}
-# -----------------------
-# Kokoro official pipeline
-# -----------------------
-from kokoro import KPipeline
-# -----------------------
-# Voice UI (same mapping)
-# -----------------------
 VOICE_CHOICES = {
-    '🇺🇸 🚺 Heart': 'af_heart', '🇺🇸 🚺 Bella': 'af_bella', '🇺🇸 🚺 Nicole': 'af_nicole',
-    '🇺🇸 🚺 Aoede': 'af_aoede', '🇺🇸 🚺 Kore': 'af_kore', '🇺🇸 🚺 Sarah': 'af_sarah',
-    '🇺🇸 🚺 Nova': 'af_nova', '🇺🇸 🚺 Sky': 'af_sky', '🇺🇸 🚺 Alloy': 'af_alloy',
-    '🇺🇸 🚺 Jessica': 'af_jessica', '🇺🇸 🚺 River': 'af_river', '🇺🇸 🚹 Michael': 'am_michael',
-    '🇺🇸 🚹 Fenrir': 'am_fenrir', '🇺🇸 🚹 Puck': 'am_puck', '🇺🇸 🚹 Echo': 'am_echo',
-    '🇺🇸 🚹 Eric': 'am_eric', '🇺🇸 🚹 Liam': 'am_liam', '🇺🇸 🚹 Onyx': 'am_onyx',
-    '🇺🇸 🚹 Santa': 'am_santa', '🇺🇸 🚹 Adam': 'am_adam', '🇬🇧 🚺 Emma': 'bf_emma',
-    '🇬🇧 🚺 Isabella': 'bf_isabella', '🇬🇧 🚺 Alice': 'bf_alice', '🇬🇧 🚺 Lily': 'bf_lily',
-    '🇬🇧 🚹 George': 'bm_george', '🇬🇧 🚹 Fable': 'bm_fable', '🇬🇧 🚹 Lewis': 'bm_lewis',
-    '🇬🇧 🚹 Daniel': 'bm_daniel',
 }
-VOICE_REPO = "hexgrad/Kokoro-82M"  # voices/*.pt live here :contentReference[oaicite:6]{index=6}
-print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
-asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-# Torch CPU tuning for HF 2 vCPU
-torch.set_num_threads(int(os.getenv("TORCH_NUM_THREADS", "2")))
-torch.set_num_interop_threads(int(os.getenv("TORCH_NUM_INTEROP_THREADS", "1")))
-torch.backends.mkldnn.enabled = True
-# Use American English by default to match your US voices
-PIPELINE = KPipeline(lang_code="a")
-# Voice cache (load once, reuse)
-VOICE_TENSOR_CACHE = {}
-def get_voice_tensor(voice_name: str):
-    code = VOICE_CHOICES.get(voice_name, voice_name)
-    if code not in VOICE_TENSOR_CACHE:
-        path = hf_hub_download(repo_id=VOICE_REPO, filename=f"voices/{code}.pt")
-        VOICE_TENSOR_CACHE[code] = torch.load(path, map_location="cpu")
-    return VOICE_TENSOR_CACHE[code]
-# -----------------------
-# Text normalization to stop name skipping
-# -----------------------
-_ACRONYM = re.compile(r"\b[A-Z]{2,}\b")
-_CAMEL = re.compile(r"\b([A-Z][a-z]+)([A-Z][A-Za-z]+)\b")
-_VER = re.compile(r"\b(v|V)(\d+(\.\d+)*)\b")
-_GPT = re.compile(r"\bGPT[- ]?(\d+)\b", re.IGNORECASE)
-def normalize_for_tts(text: str) -> str:
     if not text:
         return text
-    # Preserve Kokoro IPA hint pattern shown in model card :contentReference[oaicite:7]{index=7}
     text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
-    # Turn GPT-5 into "G P T 5"
-    text = _GPT.sub(lambda m: "G P T " + m.group(1), text)
-    # Turn v1.0 into "version 1.0"
-    text = _VER.sub(lambda m: "version " + m.group(2), text)
-    # Split CamelCase: OpenAI -> Open AI, DeepInfra -> Deep Infra
-    # Repeat a few times to handle longer chains
-    for _ in range(3):
-        text2 = _CAMEL.sub(r"\1 \2", text)
-        if text2 == text:
-            break
-        text = text2
-    # Spell acronyms: YC -> Y C, EF -> E F
-    text = _ACRONYM.sub(lambda m: " ".join(list(m.group(0))), text)
     return text
-# -----------------------
-# Safer trimming and anti-gap padding
-# -----------------------
-def trim_silence(audio: np.ndarray, threshold=0.003):
-    if audio.size == 0:
-        return audio
-    mask = np.abs(audio) > threshold
-    if not np.any(mask):
-        return audio
-    start = int(np.argmax(mask))
-    end = int(len(mask) - np.argmax(mask[::-1]))
-    # Keep a little context so words do not get clipped
-    pad = 120
-    return audio[max(0, start - pad): min(len(audio), end + pad)]
-SAMPLE_RATE = 24000
-INTER_CHUNK_SIL_MS = 40  # reduces “teleport” effect between chunks
-def wav_chunk_from_text(text: str, voice_name: str, speed: float):
-    text = normalize_for_tts(text).strip()
-    if not text:
-        return None
-    voice_tensor = get_voice_tensor(voice_name)
-    # Do not let kokoro split again, you already split upstream
-    gen = PIPELINE(
         text,
         voice=voice_tensor,
         speed=float(speed),
-        split_pattern=r"$^"
     )
-    # pipeline yields (gs, ps, audio) :contentReference[oaicite:8]{index=8}
-    try:
-        _, _, audio = next(iter(gen))
-    except StopIteration:
-        return None
-    # audio is float32 [-1,1]
-    audio = np.asarray(audio, dtype=np.float32)
-    # For very short chunks, trimming can remove quiet consonants
-    if len(text) >= 40:
-        audio = trim_silence(audio, threshold=0.003)
-    # Add a tiny silence buffer to hide boundary artifacts
-    if INTER_CHUNK_SIL_MS > 0:
-        pad = np.zeros(int(SAMPLE_RATE * (INTER_CHUNK_SIL_MS / 1000.0)), dtype=np.float32)
-        audio = np.concatenate([audio, pad], axis=0)
-    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
-    return SAMPLE_RATE, pcm
-# -----------------------
-# Your tuned splitter, kept
-# -----------------------
-def tuned_splitter(text):
-    chunks = re.split(r'([.,!?;:\n]+)', text)
-    buffer = ""
-    chunk_count = 0
-    for part in chunks:
-        buffer += part
-        if chunk_count == 0:
-            threshold = 50
-        elif chunk_count == 1:
-            threshold = 100
-        elif chunk_count == 2:
-            threshold = 150
-        else:
-            threshold = 250
-        if re.search(r'[.,!?;:\n]$', buffer) and len(buffer) >= threshold:
-            if buffer.strip():
-                yield buffer
-                chunk_count += 1
-                buffer = ""
-    if buffer.strip():
-        yield buffer.strip()
-# -----------------------
-# Streaming generator (Gradio UI)
-# -----------------------
-def stream_generator(text, voice_name, speed):
-    get_voice_tensor(voice_name)
-    for i, chunk in enumerate(tuned_splitter(text)):
         t0 = time.time()
-        out = wav_chunk_from_text(chunk, voice_name, speed)
-        if out:
             dur = time.time() - t0
-            print(f"⚡ Chunk {i}: {len(chunk)} chars in {dur:.2f}s")
-            yield out
-# -----------------------
-# Gradio UI
-# -----------------------
-with gr.Blocks(title="Kokoro TTS", ssr_mode=False) as app:
-    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, Streamed)")
-    with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(
-                label="Input Text",
-                lines=3,
-                value="The system is live. Use the UI or connect to /ws/audio."
-            )
-            voice_in = gr.Dropdown(list(VOICE_CHOICES.keys()), value='🇺🇸 🚺 Bella', label="Voice")
-            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
-            btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
-    btn.click(stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
-# -----------------------
-# FastAPI + WebSocket (kept)
-# -----------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
@@ -519,24 +528,25 @@ async def audio_engine_loop():
     loop = asyncio.get_running_loop()
     while True:
-        voice_name, speed, chunk, ws = await INFERENCE_QUEUE.get()
         try:
             if ws.client_state.value > 1:
                 continue
-            # Run CPU-heavy synth in the executor so WS stays responsive
-            out = await loop.run_in_executor(
-                INFERENCE_EXECUTOR,
-                lambda: wav_chunk_from_text(chunk, voice_name, speed)
-            )
-            if out is None:
-                continue
-            sr, pcm = out
-            # Send metadata once per chunk so client can validate format
-            await ws.send_json({"type": "chunk", "sr": sr, "format": "pcm_s16le"})
-            await ws.send_bytes(pcm.tobytes())
         except Exception as e:
             print(f"API Engine Error: {e}")
@@ -549,7 +559,7 @@ async def startup():
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
-    voice_name = '🇺🇸 🚺 Bella'
     speed = 1.0
     print(f"✅ Client connected: {ws.client}")
@@ -576,23 +586,52 @@ async def websocket_endpoint(ws: WebSocket):
                 break
             if "config" in data:
-                voice_name = data.get("voice", voice_name)
                 speed = float(data.get("speed", speed))
-                get_voice_tensor(voice_name)
             if "text" in data:
-                text = data["text"]
-                for chunk in tuned_splitter(text):
                     if chunk.strip():
-                        await INFERENCE_QUEUE.put((voice_name, speed, chunk, ws))
     except Exception as e:
         print(f"🔥 Critical WS Error: {e}")
     finally:
         heartbeat_task.cancel()
-# Mount gradio onto FastAPI, SSR off to avoid Node proxy issues :contentReference[oaicite:9]{index=9}
-final_app = gr.mount_gradio_app(api, app, path="/", ssr_mode=False)
 if __name__ == "__main__":
-    uvicorn.run(final_app, host="0.0.0.0", port=7860)

 #     uvicorn.run(final_app, host="0.0.0.0", port=7860)
 import os
 import re
+import json
 import time
 import asyncio
 import uvloop
+from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import gradio as gr
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uvicorn
+import torch
+import soundfile as sf
+from huggingface_hub import hf_hub_download
+from kokoro import KPipeline
+# -----------------------------
+# HF SPACE REALITY SETTINGS
+# -----------------------------
+# Free CPU Basic is small, so keep concurrency controlled.
+torch.set_num_threads(max(1, int(os.environ.get("TORCH_NUM_THREADS", "2"))))
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+print("🚀 BOOTING KOKORO (OFFICIAL PIPELINE)")
+# -----------------------------
+# VOICES
+# -----------------------------
 VOICE_CHOICES = {
+    "🇺🇸 🚺 Heart": "af_heart",
+    "🇺🇸 🚺 Bella": "af_bella",
+    "🇺🇸 🚺 Nicole": "af_nicole",
+    "🇺🇸 🚺 Aoede": "af_aoede",
+    "🇺🇸 🚺 Kore": "af_kore",
+    "🇺🇸 🚺 Sarah": "af_sarah",
+    "🇺🇸 🚺 Nova": "af_nova",
+    "🇺🇸 🚺 Sky": "af_sky",
+    "🇺🇸 🚺 Alloy": "af_alloy",
+    "🇺🇸 🚺 Jessica": "af_jessica",
+    "🇺🇸 🚺 River": "af_river",
+    "🇺🇸 🚹 Michael": "am_michael",
+    "🇺🇸 🚹 Fenrir": "am_fenrir",
+    "🇺🇸 🚹 Puck": "am_puck",
+    "🇺🇸 🚹 Echo": "am_echo",
+    "🇺🇸 🚹 Eric": "am_eric",
+    "🇺🇸 🚹 Liam": "am_liam",
+    "🇺🇸 🚹 Onyx": "am_onyx",
+    "🇺🇸 🚹 Santa": "am_santa",
+    "🇺🇸 🚹 Adam": "am_adam",
+    "🇬🇧 🚺 Emma": "bf_emma",
+    "🇬🇧 🚺 Isabella": "bf_isabella",
+    "🇬🇧 🚺 Alice": "bf_alice",
+    "🇬🇧 🚺 Lily": "bf_lily",
+    "🇬🇧 🚹 George": "bm_george",
+    "🇬🇧 🚹 Fable": "bm_fable",
+    "🇬🇧 🚹 Lewis": "bm_lewis",
+    "🇬🇧 🚹 Daniel": "bm_daniel",
 }
+# Kokoro official repo for weights + voices
+KOKORO_REPO = "hexgrad/Kokoro-82M"
+# -----------------------------
+# PIPELINES
+# lang_code must match voice family.
+# -----------------------------
+PIPELINES = {
+    "a": KPipeline(lang_code="a"),  # American English
+    "b": KPipeline(lang_code="b"),  # British English
+}
+# -----------------------------
+# OPTIONAL: preload spacy model if present
+# prevents runtime download surprises
+# -----------------------------
+try:
+    import spacy
+    spacy.load("en_core_web_sm")
+except Exception:
+    pass
+# -----------------------------
+# VOICE CACHE (torch tensors)
+# -----------------------------
+VOICE_TENSOR_CACHE = {}
+def voice_to_lang_code(voice_code: str) -> str:
+    # af_ / am_ => 'a', bf_ / bm_ => 'b'
+    if voice_code.startswith("b"):
+        return "b"
+    return "a"
+def get_voice_tensor(voice_code: str):
+    if voice_code in VOICE_TENSOR_CACHE:
+        return VOICE_TENSOR_CACHE[voice_code]
+    path = hf_hub_download(
+        repo_id=KOKORO_REPO,
+        filename=f"voices/{voice_code}.pt",
+    )
+    # weights_only True is recommended by torch warning text in your logs
+    vt = torch.load(path, map_location="cpu", weights_only=True)
+    VOICE_TENSOR_CACHE[voice_code] = vt
+    return vt
+# -----------------------------
+# TEXT NORMALIZATION
+# Stops “skipping” for many brand names by avoiding OOD token collapse.
+# Also makes acronyms pronounceable.
+# -----------------------------
+_ACRONYM_RE = re.compile(r"\b([A-Z]{2,})\b")
+_CAMEL_RE = re.compile(r"([a-z])([A-Z])")
+_DIGIT_WORD_RE = re.compile(r"\b(\d+)([A-Za-z]+)\b")
+def normalize_text_for_kokoro(text: str) -> str:
     if not text:
         return text
+    # Keep your special Kokoro pronunciation trick
     text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")
+    # Split CamelCase: OpenAI -> Open AI
+    text = _CAMEL_RE.sub(r"\1 \2", text)
+    # Handle 2FA -> "2 F A" (first split digits+letters)
+    text = _DIGIT_WORD_RE.sub(r"\1 \2", text)
+    # Acronyms: API -> "A P I"
+    def _spell(m):
+        s = m.group(1)
+        return " ".join(list(s))
+    text = _ACRONYM_RE.sub(_spell, text)
     return text
+# -----------------------------
+# CHUNKING
+# Fewer micro-chunks reduce stalls under load.
+# -----------------------------
+_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
+def chunk_text(text: str, min_chars: int = 240, max_chars: int = 520):
+    text = text.strip()
+    if not text:
+        return
+    parts = _SENT_SPLIT.split(text)
+    buf = ""
+    for p in parts:
+        if not p:
+            continue
+        if len(buf) + len(p) + 1 <= max_chars:
+            buf = (buf + " " + p).strip()
+            if len(buf) < min_chars:
+                continue
+            yield buf
+            buf = ""
+        else:
+            if buf:
+                yield buf
+            buf = p.strip()
+            if len(buf) >= min_chars:
+                yield buf
+                buf = ""
+    if buf:
+        yield buf
+# -----------------------------
+# AUDIO UTILS
+# Avoid trimming per-chunk to prevent audible “missing” regions.
+# Do optional gentle trim only on final concatenated output if needed.
+# -----------------------------
+def float_to_int16(audio_f32: np.ndarray) -> np.ndarray:
+    audio_f32 = np.clip(audio_f32, -1.0, 1.0)
+    return (audio_f32 * 32767.0).astype(np.int16)
+# -----------------------------
+# CORE SYNTH
+# Uses official generator API.
+# -----------------------------
+def kokoro_generate_stream(text: str, voice_code: str, speed: float):
+    lang_code = voice_to_lang_code(voice_code)
+    pipeline = PIPELINES[lang_code]
+    voice_tensor = get_voice_tensor(voice_code)
+    # We already chunk ourselves, so keep split_pattern simple.
+    # If you pass a strong splitter here, you will double-split and create micro audio pieces.
+    generator = pipeline(
         text,
         voice=voice_tensor,
         speed=float(speed),
+        split_pattern=r"$^",  # split nothing
     )
+    for _, _, audio in generator:
+        # audio is float array at 24kHz
+        yield audio
+# -----------------------------
+# GRADIO STREAM
+# -----------------------------
+def gradio_stream_generator(text, voice_name, speed):
+    voice_code = VOICE_CHOICES.get(voice_name, voice_name)
+    text = normalize_text_for_kokoro(text)
+    # warm voice cache
+    get_voice_tensor(voice_code)
+    for i, chunk in enumerate(chunk_text(text)):
         t0 = time.time()
+        # generator yields 1 item because split_pattern disables splitting
+        for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
             dur = time.time() - t0
+            print(f"⚡ UI chunk {i}: {len(chunk)} chars in {dur:.2f}s")
+            yield 24000, float_to_int16(audio_f32)
+# -----------------------------
+# FASTAPI WS
+# -----------------------------
 api = FastAPI()
 INFERENCE_EXECUTOR = ThreadPoolExecutor(max_workers=1)
     loop = asyncio.get_running_loop()
     while True:
+        ws, voice_code, speed, chunk = await INFERENCE_QUEUE.get()
         try:
             if ws.client_state.value > 1:
                 continue
+            def _run():
+                out = []
+                for audio_f32 in kokoro_generate_stream(chunk, voice_code, speed):
+                    out.append(float_to_int16(audio_f32).tobytes())
+                return out
+            frames = await loop.run_in_executor(INFERENCE_EXECUTOR, _run)
+            for frame in frames:
+                try:
+                    await ws.send_bytes(frame)
+                except Exception:
+                    break
         except Exception as e:
             print(f"API Engine Error: {e}")
 async def websocket_endpoint(ws: WebSocket):
     await ws.accept()
+    voice_code = "af_bella"
     speed = 1.0
     print(f"✅ Client connected: {ws.client}")
                 break
             if "config" in data:
+                voice_name = data.get("voice", "🇺🇸 🚺 Bella")
+                voice_code = VOICE_CHOICES.get(voice_name, voice_name)
                 speed = float(data.get("speed", speed))
+                get_voice_tensor(voice_code)
             if "text" in data:
+                raw = data["text"]
+                text = normalize_text_for_kokoro(raw)
+                # Bigger chunks reduces stalls under load
+                for chunk in chunk_text(text):
                     if chunk.strip():
+                        await INFERENCE_QUEUE.put((ws, voice_code, speed, chunk))
+            if "flush" in data:
+                pass
     except Exception as e:
         print(f"🔥 Critical WS Error: {e}")
     finally:
         heartbeat_task.cancel()
+# -----------------------------
+# GRADIO UI
+# -----------------------------
+with gr.Blocks(title="Kokoro TTS") as app:
+    gr.Markdown("## ⚡ Kokoro-82M (Official Pipeline, HF CPU-friendly)")
+    with gr.Row():
+        with gr.Column():
+            text_in = gr.Textbox(
+                label="Input Text",
+                lines=3,
+                value="The system is live. Use the UI or connect to /ws/audio.",
+            )
+            voice_in = gr.Dropdown(
+                list(VOICE_CHOICES.keys()),
+                value="🇺🇸 🚺 Bella",
+                label="Voice",
+            )
+            speed_in = gr.Slider(0.5, 2.0, value=1.0, label="Speed")
+            btn = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            audio_out = gr.Audio(streaming=True, autoplay=True, label="Audio Stream")
+    btn.click(gradio_stream_generator, inputs=[text_in, voice_in, speed_in], outputs=[audio_out])
+final_app = gr.mount_gradio_app(api, app, path="/")
 if __name__ == "__main__":
+    uvicorn.run(final_app, host="0.0.0.0", port=7860)