Percy3822 committed on
Commit
a5fe67c
·
verified ·
1 Parent(s): 06dd823

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -203
app.py CHANGED
@@ -1,239 +1,232 @@
1
- import os
2
- import json
3
- import time
4
- import wave
5
- from pathlib import Path
6
- from typing import Optional, Dict, Any
7
-
8
- import uvicorn
9
- import requests
10
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
11
- from fastapi.responses import JSONResponse
12
- from fastapi.staticfiles import StaticFiles
13
-
14
- # Piper (CPU TTS)
15
- from piper.voice import PiperVoice
16
-
17
- # ------------ Config ------------
18
- BASE_DIR = Path(os.getenv("BASE_DIR", "/tmp/brain_app")).resolve()
19
- FILES_DIR = (BASE_DIR / "files").resolve()
20
- FILES_DIR.mkdir(parents=True, exist_ok=True)
21
 
22
- VOICE_DIR = Path(os.getenv("VOICE_DIR", "/home/user/voices")).resolve()
 
 
 
 
 
 
 
 
 
23
  VOICE_DIR.mkdir(parents=True, exist_ok=True)
24
 
25
- DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-lessac-high")
26
- DEFAULT_SR = 22050
27
- DEFAULT_CHANNELS = 1
28
-
29
- # Hugging Face Piper voice (Lessac high quality)
30
- PIPER_HF_BASE = (
31
- "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/"
32
- "en/en_US/lessac/high"
 
 
 
 
 
33
  )
34
 
35
- app = FastAPI(title="ActualTTS (CPU Piper, streaming)")
36
- app.mount("/file", StaticFiles(directory=str(FILES_DIR)), name="file")
37
-
38
- # ------------ Voice Loader ------------
39
-
40
- _loaded_voices: Dict[str, PiperVoice] = {}
41
-
42
- def list_voices() -> list[str]:
43
- return sorted([p.stem for p in VOICE_DIR.glob("*.onnx")])
44
-
45
- def ensure_voice(voice: str) -> tuple[Path, Path]:
46
  """
47
- Ensure .onnx and .onnx.json for 'voice' exist. If not and voice is
48
- en_US-lessac-high, download them automatically.
 
 
49
  """
50
- onnx = VOICE_DIR / f"{voice}.onnx"
51
- cfg = VOICE_DIR / f"{voice}.onnx.json"
52
-
53
  if onnx.exists() and cfg.exists():
54
  return onnx, cfg
55
 
56
- if voice == "en_US-lessac-high":
57
- files = {
58
- onnx: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx",
59
- cfg: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx.json",
60
- }
61
- for path, url in files.items():
62
- r = requests.get(url, timeout=180)
 
63
  r.raise_for_status()
64
- path.write_bytes(r.content)
65
- return onnx, cfg
66
-
67
- raise FileNotFoundError(f"Voice '{voice}' not found in {VOICE_DIR}")
68
-
69
- def get_voice(voice: str) -> PiperVoice:
70
- if voice in _loaded_voices:
71
- return _loaded_voices[voice]
72
- onnx, cfg = ensure_voice(voice)
73
- v = PiperVoice.load(str(onnx), config_path=str(cfg))
74
- _loaded_voices[voice] = v
75
- return v
76
 
77
- # ------------ Helpers ------------
78
-
79
- def prosody(body: Dict[str, Any]) -> dict:
 
80
  """
81
- Extract prosody params with natural defaults.
 
82
  """
83
- return {
84
- "length_scale": float(body.get("length_scale", 1.12)), # a bit slower
85
- "noise_scale": float(body.get("noise_scale", 0.33)), # reduce buzz
86
- "noise_w": float(body.get("noise_w", 0.8)), # stabilize
87
- "sentence_silence": float(body.get("sentence_pause", 0.18)),
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- def write_wav_int16(path: Path, sr: int, pcm: bytes, channels: int = 1):
91
- with wave.open(str(path), "wb") as wf:
92
- wf.setnchannels(channels)
93
- wf.setsampwidth(2) # int16
94
- wf.setframerate(sr)
95
- wf.writeframes(pcm)
96
 
97
- # ------------ Routes ------------
 
 
 
 
 
98
 
99
  @app.get("/health")
100
  def health():
 
 
 
 
 
 
101
  return {
102
  "ok": True,
103
- "engine": "piper-tts (CPU)",
104
- "default_voice": DEFAULT_VOICE,
105
  "voice_dir": str(VOICE_DIR),
106
- "available_voices": list_voices(),
107
- "files_dir": str(FILES_DIR),
108
- "tip": "Use WS /ws/tts for streaming or POST /speak for one-shot",
109
  }
110
 
111
  @app.post("/speak")
112
- async def speak(request: Request):
 
 
 
 
 
 
113
  """
114
- JSON body:
115
- {
116
- "text": "Hello world",
117
- "voice": "en_US-lessac-high",
118
- "rate_wpm": 165,
119
- "length_scale": 1.12, "noise_scale": 0.33, "noise_w": 0.8, "sentence_pause": 0.18
120
- }
121
- Returns: { ok, audio_url, sr, channels }
122
  """
123
- try:
124
- body = await request.json()
125
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  return JSONResponse(
127
- status_code=400,
128
- content={"ok": False, "error": "Invalid JSON body"}
129
  )
130
 
131
- text: str = str(body.get("text", "")).strip()
132
- if not text:
133
- return JSONResponse(status_code=400, content={"ok": False, "error": "Missing text"})
134
-
135
- voice = body.get("voice", DEFAULT_VOICE)
136
- try:
137
- voice_obj = get_voice(voice)
138
- except Exception as e:
139
- return JSONResponse(status_code=400, content={
140
- "ok": False,
141
- "error": f"Voice '{voice}' not found. Available: {list_voices()}"
142
- })
143
-
144
- # Piper ignores WPM internally; we simulate tempo via length_scale default.
145
- p = prosody(body)
146
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
147
- ch = DEFAULT_CHANNELS
148
-
149
- # Synthesize to PCM buffer
150
- pcm_chunks: list[bytes] = []
151
- for chunk in voice_obj.synthesize_stream(
152
- text,
153
- length_scale=p["length_scale"],
154
- noise_scale=p["noise_scale"],
155
- noise_w=p["noise_w"],
156
- sentence_silence=p["sentence_silence"],
157
- ):
158
- if isinstance(chunk, (bytes, bytearray)):
159
- pcm_chunks.append(bytes(chunk))
160
-
161
- # small tail-silence to avoid cut offs
162
- tail_ms = int(0.22 * 1000)
163
- tail_frames = int(sr * (tail_ms / 1000.0))
164
- pcm_chunks.append(b"\x00" * (tail_frames * ch * 2))
165
-
166
- pcm = b"".join(pcm_chunks)
167
- fname = f"tts-{int(time.time()*1000)}.wav"
168
- fpath = FILES_DIR / fname
169
- write_wav_int16(fpath, sr, pcm, channels=ch)
170
-
171
- return {
172
- "ok": True,
173
- "audio_url": f"/file/{fname}",
174
- "sr": sr,
175
- "channels": ch,
176
- }
177
 
178
  @app.websocket("/ws/tts")
179
  async def ws_tts(ws: WebSocket):
180
- """
181
- Protocol:
182
- <- {"event":"init","voice":"en_US-lessac-high"}
183
- <- {"event":"speak","text":"Hello there...","length_scale":1.12,...}
184
- -> {"event":"ready","sr":22050,"channels":1}
185
- -> <binary PCM16> ... many frames ...
186
- -> {"event":"done"}
187
- """
188
  await ws.accept()
189
- voice_name = DEFAULT_VOICE
190
- voice_obj: Optional[PiperVoice] = None
191
- sr = DEFAULT_SR
192
- channels = DEFAULT_CHANNELS
193
 
194
  try:
195
- while True:
196
- raw = await ws.receive_text()
197
- msg = json.loads(raw)
198
-
199
- if msg.get("event") == "init":
200
- voice_name = msg.get("voice", DEFAULT_VOICE)
201
- voice_obj = get_voice(voice_name)
202
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
203
- await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
204
-
205
- elif msg.get("event") == "speak":
206
- if not voice_obj:
207
- voice_obj = get_voice(voice_name)
208
- sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
209
- await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
210
-
211
- text = str(msg.get("text", "")).strip()
212
- if not text:
213
- await ws.send_text(json.dumps({"event": "error", "detail": "Missing text"}))
214
- continue
215
-
216
- p = prosody(msg)
217
-
218
- # Stream PCM16 chunks as binary frames
219
- for chunk in voice_obj.synthesize_stream(
220
- text,
221
- length_scale=p["length_scale"],
222
- noise_scale=p["noise_scale"],
223
- noise_w=p["noise_w"],
224
- sentence_silence=p["sentence_silence"],
225
- ):
226
- if isinstance(chunk, (bytes, bytearray)):
227
- await ws.send_bytes(bytes(chunk))
228
-
229
- # tail-silence 220 ms
230
- tail_frames = int(sr * 0.22)
231
- await ws.send_bytes(b"\x00" * (tail_frames * channels * 2))
232
-
233
- await ws.send_text(json.dumps({"event": "done"}))
234
-
235
- else:
236
- await ws.send_text(json.dumps({"event": "error", "detail": "Unknown event"}))
237
 
238
  except WebSocketDisconnect:
239
  pass
@@ -242,6 +235,7 @@ async def ws_tts(ws: WebSocket):
242
  await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
243
  except Exception:
244
  pass
245
-
246
- if __name__ == "__main-_":
247
- uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
 
1
import asyncio
import json
import os
import pathlib
import re
import shutil
import subprocess
import tempfile
import time
import uuid
from typing import Optional

from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Body
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
7
+
8
# ---------- CONFIG ----------
BASE_DIR = pathlib.Path(os.environ.get("FILES_DIR", "/tmp/tts_app/files"))
VOICE_DIR = pathlib.Path(os.environ.get("VOICE_DIR", "/home/user/voices"))
DEFAULT_VOICE = os.environ.get("DEFAULT_VOICE", "en_US-libritts-high")
PIPER_BIN = os.environ.get("PIPER_BIN", "piper")  # ships with the piper-tts wheel
BASE_DIR.mkdir(parents=True, exist_ok=True)
VOICE_DIR.mkdir(parents=True, exist_ok=True)

# Rhasspy Piper voice hub (static URLs; no API key needed).
# Exactly two files are lazy-downloaded per voice: <stem>.onnx + <stem>.onnx.json
VOICE_INDEX = {
    "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/libritts/high/en_US-libritts-high",
    "en_US-lessac-high": "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/lessac/high/en_US-lessac-high",
}

# Safe defaults for naturalness (a bit slower/clearer).
DEFAULT_OPTS = dict(
    sentence_pause=0.18,  # seconds between sentences
    length_scale=1.12,    # pacing
    noise_scale=0.33,     # prosody randomness
    noise_w=0.8,          # breathiness
)
30
 
31
# ---------- UTIL ----------
async def ensure_voice(voice_name: str) -> tuple[pathlib.Path, pathlib.Path]:
    """
    Ensure the required voice files exist locally:
        VOICE_DIR/<name>.onnx
        VOICE_DIR/<name>.onnx.json

    Unknown voice names fall back to DEFAULT_VOICE's download URLs
    (the files are still saved under the requested name).
    Returns (onnx_path, cfg_path).
    Raises requests.HTTPError if a download fails.
    """
    onnx = VOICE_DIR / f"{voice_name}.onnx"
    cfg = VOICE_DIR / f"{voice_name}.onnx.json"
    if onnx.exists() and cfg.exists():
        return onnx, cfg

    # Pick the right base URL from VOICE_INDEX; fall back to the default voice.
    base = VOICE_INDEX.get(voice_name) or VOICE_INDEX[DEFAULT_VOICE]
    import requests
    for ext in (".onnx", ".onnx.json"):
        url = base + ext
        dst = VOICE_DIR / f"{voice_name}{ext}"
        if not dst.exists():
            # requests is blocking; run it in a worker thread so this
            # coroutine does not stall the event loop during the download.
            r = await asyncio.to_thread(requests.get, url, timeout=120)
            r.raise_for_status()
            dst.write_bytes(r.content)
    return onnx, cfg
 
 
 
 
 
 
 
 
 
 
56
 
57
def _piper_cmd(onnx_path: pathlib.Path, cfg_path: pathlib.Path,
               out_wav: Optional[pathlib.Path] = None,
               raw: bool = False,
               opts: Optional[dict] = None) -> list[str]:
    """
    Build the Piper CLI command.

    If raw=True, Piper writes 16-bit PCM to stdout (--output-raw);
    otherwise it writes a WAV file to --output_file (out_wav required).
    Recognized opts keys: length_scale, noise_scale, noise_w — forwarded
    only when explicitly present.

    Raises ValueError if raw=False and out_wav is None.
    """
    cmd = [PIPER_BIN, "--model", str(onnx_path), "--config", str(cfg_path)]
    opts = opts or {}
    # Pacing / quality knobs; only pass what the caller provided.
    if "length_scale" in opts:
        cmd += ["--length_scale", str(opts["length_scale"])]
    if "noise_scale" in opts:
        cmd += ["--noise_scale", str(opts["noise_scale"])]
    if "noise_w" in opts:
        cmd += ["--noise_w", str(opts["noise_w"])]

    if raw:
        cmd += ["--output-raw"]
    else:
        # An assert would be stripped under `python -O`; fail loudly instead.
        if out_wav is None:
            raise ValueError("out_wav is required when raw=False")
        cmd += ["--output_file", str(out_wav)]

    # Piper auto-splits sentences; extra pauses are handled client-side.
    return cmd
80
+
81
async def run_piper_stream(text: str, onnx: pathlib.Path, cfg: pathlib.Path, ws: WebSocket, opts: dict) -> None:
    """
    Stream raw PCM16 from Piper's stdout to the websocket as binary frames.

    Sends a JSON {"event":"ready","sr":...,"channels":1} first, then binary
    PCM chunks, then {"event":"done"}.
    """
    # Read the actual sample rate from the voice's .onnx.json config instead
    # of hard-coding 22050; fall back to 22050 (the common rate for US
    # voices) if the config is missing or malformed.
    try:
        sr = int(json.loads(cfg.read_text())["audio"]["sample_rate"])
    except Exception:
        sr = 22050
    await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": 1}))

    # Start Piper; it reads the text from stdin and emits raw PCM on stdout.
    cmd = _piper_cmd(onnx, cfg, raw=True, opts=opts)
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    # Write the text then close stdin so Piper starts synthesizing.
    proc.stdin.write(text.encode("utf-8"))
    await proc.stdin.drain()
    proc.stdin.close()

    # Forward stdout chunks to the client as binary frames.
    try:
        while True:
            chunk = await proc.stdout.read(8192)
            if not chunk:
                break
            await ws.send_bytes(chunk)
    finally:
        # Drain any remaining stderr (optional debugging aid).
        try:
            _ = await asyncio.wait_for(proc.stderr.read(), timeout=0.1)
        except asyncio.TimeoutError:
            pass

    await proc.wait()
    await ws.send_text(json.dumps({"event": "done"}))
 
 
 
 
121
 
122
# ---------- APP ----------
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    # Wildcards must be "*" — an empty string matches no origin/method at
    # all, which would reject every cross-origin request.
    allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
)
# /speak returns URLs of the form /file/<name>.wav; serve those files here.
app.mount("/file", StaticFiles(directory=str(BASE_DIR)), name="file")
128
 
129
@app.get("/health")
def health():
    """Report engine status and which known voices are fully downloaded."""
    downloaded = [
        name
        for name in VOICE_INDEX
        if (VOICE_DIR / f"{name}.onnx").exists()
        and (VOICE_DIR / f"{name}.onnx.json").exists()
    ]
    return {
        "ok": True,
        "engine": "piper-tts (CLI, CPU)",
        "default_voice": DEFAULT_VOICE if (VOICE_DIR / f"{DEFAULT_VOICE}.onnx").exists() else None,
        "voice_dir": str(VOICE_DIR),
        "available_voices": downloaded,
        "files_dir": str(BASE_DIR),
    }
145
 
146
@app.post("/speak")
async def speak(
    text: str = Body(...),
    voice: Optional[str] = Body(None),
    length_scale: Optional[float] = Body(None),
    noise_scale: Optional[float] = Body(None),
    noise_w: Optional[float] = Body(None),
):
    """
    One-shot synthesis to a WAV file.

    Returns {"ok": True, "audio_url": "/file/<name>.wav"} on success, or a
    500 JSON error carrying Piper's stderr (truncated) on failure.
    """
    vname = voice or DEFAULT_VOICE
    # BUG FIX: ensure_voice is a coroutine function, so the previous
    # `await asyncio.to_thread(ensure_voice, vname)` produced an un-awaited
    # coroutine object and the tuple unpack raised TypeError.
    onnx, cfg = await ensure_voice(vname)

    # Merge user overrides onto the naturalness defaults.
    opts = dict(DEFAULT_OPTS)
    if length_scale is not None:
        opts["length_scale"] = length_scale
    if noise_scale is not None:
        opts["noise_scale"] = noise_scale
    if noise_w is not None:
        opts["noise_w"] = noise_w

    # Millisecond timestamp keeps file names unique enough for this service.
    BASE_DIR.mkdir(parents=True, exist_ok=True)
    out_wav = BASE_DIR / f"tts-{int(time.time()*1000)}.wav"

    # Run Piper to a file (non-streaming), feeding the text via stdin.
    cmd = _piper_cmd(onnx, cfg, out_wav=out_wav, raw=False, opts=opts)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.PIPE
    )
    proc.stdin.write(text.encode("utf-8"))
    await proc.stdin.drain()
    proc.stdin.close()

    stderr_txt = (await proc.stderr.read()).decode("utf-8", "ignore")
    rc = await proc.wait()
    if rc != 0:
        return JSONResponse(
            status_code=500,
            content={"ok": False, "error": "Piper synthesis failed", "detail": stderr_txt[:4000]},
        )

    return {"ok": True, "audio_url": f"/file/{out_wav.name}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
@app.websocket("/ws/tts")
async def ws_tts(ws: WebSocket):
    """
    Streaming synthesis over a websocket.

    Protocol (init and speak may be sent back-to-back):
        <- {"event":"init","voice":..., "length_scale":..., "noise_scale":..., "noise_w":...}
        <- {"event":"speak","text":"..."}
        -> {"event":"ready","sr":...,"channels":1}
        -> <binary PCM16 frames>
        -> {"event":"done"}
    Protocol violations close the socket with code 1002; internal errors
    report an "error" event and close with 1011.
    """
    await ws.accept()
    voice = DEFAULT_VOICE
    opts = dict(DEFAULT_OPTS)

    try:
        # 1) First message must be init.
        init = json.loads(await ws.receive_text())
        if init.get("event") != "init":
            await ws.send_text(json.dumps({"event": "error", "detail": "First message must be {'event':'init',...}"}))
            await ws.close(code=1002)
            return

        voice = init.get("voice") or DEFAULT_VOICE
        for k in ("length_scale", "noise_scale", "noise_w"):
            if k in init and init[k] is not None:
                opts[k] = init[k]

        # 2) Second message must be speak.
        sp = json.loads(await ws.receive_text())
        if sp.get("event") != "speak":
            await ws.send_text(json.dumps({"event": "error", "detail": "Expected {'event':'speak','text':...}"}))
            await ws.close(code=1002)
            return

        text = sp.get("text") or ""
        if not text.strip():
            await ws.send_text(json.dumps({"event": "error", "detail": "Empty text"}))
            await ws.close(code=1002)
            return

        # 3) Ensure voice files exist, then stream Piper stdout.
        # BUG FIX: ensure_voice is a coroutine function, so the previous
        # `await asyncio.to_thread(ensure_voice, voice)` produced an
        # un-awaited coroutine object instead of (onnx, cfg).
        onnx, cfg = await ensure_voice(voice)
        await run_piper_stream(text, onnx, cfg, ws, opts)

    except WebSocketDisconnect:
        pass
    except Exception as e:
        # Best-effort error report; the peer may already be gone.
        try:
            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
        except Exception:
            pass
        try:
            await ws.close(code=1011)
        except Exception:
            pass