piper

Sleeping

App Files Files Community

Percy3822 committed on Sep 5, 2025

Commit

9d33e4a

verified ·

1 Parent(s): 90ba8a3

Update app.py

Browse files

Files changed (1) hide show

app.py +226 -131

app.py CHANGED Viewed

@@ -1,152 +1,247 @@
-import os, io, time, json, asyncio, tempfile, wave, uuid, shlex
-from typing import Optional
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Body
-from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
-from fastapi.middleware.cors import CORSMiddleware
-import anyio
-APP_NAME = "ActualTTS (espeak-ng)"
-BASE_DIR = "/tmp/actual_tts"
-FILES_DIR = os.path.join(BASE_DIR, "files")
-os.makedirs(FILES_DIR, exist_ok=True)
-DEFAULT_VOICE = "en-us"  # espeak voice id
-DEFAULT_RATE_WPM = 170   # speaking speed
-app = FastAPI(title=APP_NAME)
-# CORS: allow local client
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
 )
-@app.get("/", response_class=PlainTextResponse)
-def root():
-    return "OK"
 @app.get("/health")
 def health():
     return {
         "ok": True,
-        "engine": "espeak-ng",
         "default_voice": DEFAULT_VOICE,
-        "files_dir": FILES_DIR,
-        "tip": "WebSocket /ws/tts (init, then speak), or POST /speak",
     }
-def espeak_cmd(text: str, voice: str = DEFAULT_VOICE, rate_wpm: int = DEFAULT_RATE_WPM):
-    # --stdout makes espeak-ng write a valid WAV to stdout
-    # -v voice, -s speed(WPM)
-    # We quote text via shell=False + pass list args
-    return ["espeak-ng", "--stdout", "-v", voice, "-s", str(rate_wpm), text]
-async def synth_to_file(text: str, voice: str, rate_wpm: int) -> str:
-    """Run espeak-ng once, capture its WAV, write to a file, return path."""
-    tmp_path = os.path.join(FILES_DIR, f"tts-{int(time.time()*1000)}.wav")
-    proc = await asyncio.create_subprocess_exec(
-        *espeak_cmd(text, voice, rate_wpm),
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE,
-    )
-    wav_bytes, err = await proc.communicate()
-    if proc.returncode != 0 or not wav_bytes:
-        raise RuntimeError(f"espeak-ng failed rc={proc.returncode}, err={err.decode('utf-8','ignore')}")
-    with open(tmp_path, "wb") as f:
-        f.write(wav_bytes)
-    return tmp_path
 @app.post("/speak")
-async def speak_post(payload: dict = Body(...)):
-    text: str = payload.get("text", "")
-    voice: str = payload.get("voice") or DEFAULT_VOICE
-    rate_wpm: int = int(payload.get("rate_wpm", DEFAULT_RATE_WPM))
-    if not text.strip():
-        return JSONResponse({"ok": False, "error": "no text"}, status_code=400)
     try:
-        path = await synth_to_file(text, voice, rate_wpm)
     except Exception as e:
-        return JSONResponse({"ok": False, "error": "Synthesis failed", "detail": str(e)}, status_code=500)
-    rel = f"/file/{os.path.basename(path)}"
-    return {"ok": True, "audio_url": rel}
-@app.get("/file/{fname}")
-async def get_file(fname: str):
-    path = os.path.join(FILES_DIR, fname)
-    if not os.path.isfile(path):
-        return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
-    return FileResponse(path, media_type="audio/wav")
-# --------- WebSocket Streaming TTS ----------
-# Protocol:
-#   Client sends: {"event":"init","voice":"en-us","rate_wpm":170}
-#   Server replies: {"event":"ready","sr":22050}
-#   Client sends: {"event":"speak","text":"..."}
-#   Server streams: binary frames with WAV bytes as they are produced
-#                   then {"event":"done"}
 @app.websocket("/ws/tts")
 async def ws_tts(ws: WebSocket):
     await ws.accept()
-    voice = DEFAULT_VOICE
-    rate_wpm = DEFAULT_RATE_WPM
     try:
-        # Expect init first
-        first = await ws.receive_text()
-        msg = json.loads(first)
-        if msg.get("event") != "init":
-            await ws.send_text(json.dumps({"event":"error","detail":"first message must be {'event':'init',...}"}))
-            await ws.close(code=1002)
-            return
-        if "voice" in msg and msg["voice"]:
-            voice = msg["voice"]
-        if "rate_wpm" in msg:
-            try:
-                rate_wpm = int(msg["rate_wpm"])
-            except:
-                pass
-        # Tell client our sample-rate. espeak-ng emits 22050 Hz PCM.
-        await ws.send_text(json.dumps({"event":"ready","sr":22050}))
-        # Wait for speak
-        nxt = await ws.receive_text()
-        data = json.loads(nxt)
-        if data.get("event") != "speak" or not data.get("text"):
-            await ws.send_text(json.dumps({"event":"error","detail":"need {'event':'speak','text':...}"}))
-            await ws.close()
-            return
-        text = data["text"]
-        # Spawn espeak-ng and stream its stdout as binary chunks
-        proc = await asyncio.create_subprocess_exec(
-            *espeak_cmd(text, voice, rate_wpm),
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE,
-        )
-        # espeak-ng writes a full WAV header then data. We just forward in chunks.
-        try:
-            while True:
-                chunk = await proc.stdout.read(4096)
-                if not chunk:
-                    break
-                await ws.send_bytes(chunk)
-            rc = await proc.wait()
-            if rc != 0:
-                err = (await proc.stderr.read()).decode("utf-8","ignore")
-                await ws.send_text(json.dumps({"event":"error","detail":f"espeak-ng failed rc={rc}: {err[:200]}" }))
-            await ws.send_text(json.dumps({"event":"done"}))
-        finally:
-            with anyio.move_on_after(0.1):
-                proc.kill()
     except WebSocketDisconnect:
         pass
     except Exception as e:
-        with anyio.move_on_after(0.1):
-            await ws.send_text(json.dumps({"event":"error","detail":str(e)}))
-        with anyio.move_on_after(0.1):
-            await ws.close()

+import os
+import json
+import time
+import wave
+from pathlib import Path
+from typing import Optional, Dict, Any
+import uvicorn
+import requests
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
+from fastapi.responses import JSONResponse
+from fastapi.staticfiles import StaticFiles
+# Piper (CPU TTS)
+from piper.voice import PiperVoice
+# ------------ Config ------------
+# Working directories are taken from the environment, with the defaults
+# below, and are created at import time.
+BASE_DIR = Path(os.getenv("BASE_DIR", "/tmp/brain_app")).resolve()
+# Generated WAV files are written here and served back under /file.
+FILES_DIR = (BASE_DIR / "files").resolve()
+FILES_DIR.mkdir(parents=True, exist_ok=True)
+# Directory scanned for <voice>.onnx / <voice>.onnx.json model pairs.
+VOICE_DIR = Path(os.getenv("VOICE_DIR", "/home/user/voices")).resolve()
+VOICE_DIR.mkdir(parents=True, exist_ok=True)
+DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-lessac-high")
+# Fallback sample rate, used when a loaded voice has no 'sample_rate_hz' attribute.
+DEFAULT_SR = 22050
+DEFAULT_CHANNELS = 1
+# Hugging Face Piper voice (Lessac high quality)
+PIPER_HF_BASE = (
+    "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/"
+    "en/en_US/lessac/high"
 )
+app = FastAPI(title="ActualTTS (CPU Piper, streaming)")
+# Expose FILES_DIR as static files so the audio_url returned by /speak resolves.
+app.mount("/file", StaticFiles(directory=str(FILES_DIR)), name="file")
+# ------------ Voice Loader ------------
+# Cache of loaded voices keyed by voice name; filled by get_voice().
+_loaded_voices: Dict[str, PiperVoice] = {}
+def list_voices() -> list[str]:
+    """Return the sorted stems of the ``*.onnx`` files found directly in VOICE_DIR."""
+    stems = [model.stem for model in VOICE_DIR.glob("*.onnx")]
+    stems.sort()
+    return stems
+def ensure_voice(voice: str) -> tuple[Path, Path]:
+    """
+    Ensure .onnx and .onnx.json for 'voice' exist in VOICE_DIR and return
+    their paths. If they are missing and voice is en_US-lessac-high, download
+    them automatically from PIPER_HF_BASE.
+
+    Raises FileNotFoundError for an unknown voice or for a name containing a
+    path separator; download errors propagate from requests.
+    """
+    # Voice names arrive from request bodies, so refuse anything that could
+    # resolve to a path outside VOICE_DIR.
+    name = str(voice)
+    if not name or "/" in name or "\\" in name:
+        raise FileNotFoundError(f"Voice '{voice}' not found in {VOICE_DIR}")
+    onnx = VOICE_DIR / f"{name}.onnx"
+    cfg  = VOICE_DIR / f"{name}.onnx.json"
+    if onnx.exists() and cfg.exists():
+        return onnx, cfg
+    if name == "en_US-lessac-high":
+        files = {
+            onnx: f"{PIPER_HF_BASE}/en_US-lessac-high.onnx",
+            cfg:  f"{PIPER_HF_BASE}/en_US-lessac-high.onnx.json",
+        }
+        for path, url in files.items():
+            # Stream into a per-process temp file and rename on success, so an
+            # interrupted download never leaves a truncated model in place and
+            # the whole file is never held in memory at once.
+            tmp = path.with_name(f"{path.name}.{os.getpid()}.part")
+            try:
+                with requests.get(url, timeout=180, stream=True) as r:
+                    r.raise_for_status()
+                    with open(tmp, "wb") as fh:
+                        for block in r.iter_content(chunk_size=1 << 20):
+                            fh.write(block)
+                tmp.replace(path)
+            finally:
+                # No-op after a successful rename; removes leftovers on failure.
+                tmp.unlink(missing_ok=True)
+        return onnx, cfg
+    raise FileNotFoundError(f"Voice '{voice}' not found in {VOICE_DIR}")
+def get_voice(voice: str) -> PiperVoice:
+    """
+    Return the PiperVoice for 'voice', loading it on first use and keeping it
+    in _loaded_voices for later calls. Errors from ensure_voice propagate.
+    """
+    try:
+        return _loaded_voices[voice]
+    except KeyError:
+        pass
+    model_path, config_path = ensure_voice(voice)
+    loaded = PiperVoice.load(str(model_path), config_path=str(config_path))
+    _loaded_voices[voice] = loaded
+    return loaded
+# ------------ Helpers ------------
+def prosody(body: Dict[str, Any]) -> dict:
+    """
+    Build the synthesis keyword values from a request mapping, applying the
+    defaults below for missing keys. Every value is passed through float().
+    Note the request key "sentence_pause" is returned as "sentence_silence".
+    """
+    # (returned key, request key, default)
+    fields = (
+        ("length_scale", "length_scale", 1.12),        # a bit slower
+        ("noise_scale", "noise_scale", 0.33),          # reduce buzz
+        ("noise_w", "noise_w", 0.8),                   # stabilize
+        ("sentence_silence", "sentence_pause", 0.18),
+    )
+    return {
+        out_key: float(body.get(in_key, fallback))
+        for out_key, in_key, fallback in fields
+    }
+def write_wav_int16(path: Path, sr: int, pcm: bytes, channels: int = 1):
+    """Write the raw PCM bytes in 'pcm' to 'path' as a WAV file with 2-byte samples."""
+    bytes_per_sample = 2  # int16
+    with wave.open(str(path), "wb") as out:
+        out.setnchannels(channels)
+        out.setsampwidth(bytes_per_sample)
+        out.setframerate(sr)
+        out.writeframes(pcm)
+# ------------ Routes ------------
 @app.get("/health")
 def health():
+    """
+    Report service status: engine label, default voice, the voice and file
+    directories, and the voices currently present in VOICE_DIR.
+    """
     return {
         "ok": True,
+        "engine": "piper-tts (CPU)",
         "default_voice": DEFAULT_VOICE,
+        "voice_dir": str(VOICE_DIR),
+        "available_voices": list_voices(),  # re-scanned from disk on every call
+        "files_dir": str(FILES_DIR),
+        "tip": "Use WS /ws/tts for streaming or POST /speak for one-shot",
     }
 @app.post("/speak")
+async def speak(request: Request):
+    """
+    Synthesize 'text' once, store it as a WAV under FILES_DIR and return its URL.
+
+    JSON body:
+    {
+      "text": "Hello world",
+      "voice": "en_US-lessac-high",
+      "rate_wpm": 165,
+      "length_scale": 1.12, "noise_scale": 0.33, "noise_w": 0.8, "sentence_pause": 0.18
+    }
+    Returns: { ok, audio_url, sr, channels }
+
+    "rate_wpm" is accepted but never read; tempo is controlled by length_scale.
+    Responds 400 for a body that is not a JSON object, missing text, invalid
+    prosody numbers or a voice that cannot be loaded, and 500 when synthesis
+    or writing the file fails.
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse(
+            status_code=400,
+            content={"ok": False, "error": "Invalid JSON body"}
+        )
+    # Valid JSON may still be a list, string or number; .get() needs an object.
+    if not isinstance(body, dict):
+        return JSONResponse(
+            status_code=400,
+            content={"ok": False, "error": "Invalid JSON body"}
+        )
+    text: str = str(body.get("text", "")).strip()
+    if not text:
+        return JSONResponse(status_code=400, content={"ok": False, "error": "Missing text"})
+    # A null or empty "voice" falls back to the default instead of failing.
+    voice = body.get("voice") or DEFAULT_VOICE
+    # Piper ignores WPM internally; we simulate tempo via length_scale default.
+    # Parsed before loading the voice so bad numbers become a 400, not a 500.
+    try:
+        p = prosody(body)
+    except (TypeError, ValueError):
+        return JSONResponse(status_code=400, content={
+            "ok": False,
+            "error": "Invalid prosody parameter",
+        })
     try:
+        voice_obj = get_voice(voice)
     except Exception as e:
+        return JSONResponse(status_code=400, content={
+            "ok": False,
+            "error": f"Voice '{voice}' not found. Available: {list_voices()}",
+            "detail": str(e),
+        })
+    # NOTE(review): confirm the installed piper release exposes 'sample_rate_hz';
+    # if it does not, DEFAULT_SR is silently used for the WAV header.
+    sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
+    ch = DEFAULT_CHANNELS
+    # NOTE(review): synthesis runs synchronously inside this async handler, so
+    # other requests wait until it finishes. Also confirm that PiperVoice
+    # provides synthesize_stream() with these keywords and yields bytes;
+    # chunks of any other type are dropped by the isinstance filter.
+    try:
+        # Synthesize to PCM buffer
+        pcm_chunks: list[bytes] = []
+        for chunk in voice_obj.synthesize_stream(
+            text,
+            length_scale=p["length_scale"],
+            noise_scale=p["noise_scale"],
+            noise_w=p["noise_w"],
+            sentence_silence=p["sentence_silence"],
+        ):
+            if isinstance(chunk, (bytes, bytearray)):
+                pcm_chunks.append(bytes(chunk))
+        # small tail-silence (220 ms) to avoid cut offs
+        tail_frames = int(sr * 0.22)
+        pcm_chunks.append(b"\x00" * (tail_frames * ch * 2))
+        pcm = b"".join(pcm_chunks)
+        # Random suffix keeps two requests in the same millisecond from
+        # overwriting each other's file.
+        fname = f"tts-{int(time.time()*1000)}-{os.urandom(4).hex()}.wav"
+        fpath = FILES_DIR / fname
+        write_wav_int16(fpath, sr, pcm, channels=ch)
+    except Exception as e:
+        return JSONResponse(status_code=500, content={
+            "ok": False,
+            "error": "Synthesis failed",
+            "detail": str(e),
+        })
+    return {
+        "ok": True,
+        "audio_url": f"/file/{fname}",
+        "sr": sr,
+        "channels": ch,
+    }
 @app.websocket("/ws/tts")
 async def ws_tts(ws: WebSocket):
+    """
+    Streaming TTS over WebSocket.
+
+    Protocol (<- client to server, -> server to client):
+      <- {"event":"init","voice":"en_US-lessac-high"}
+      -> {"event":"ready","sr":22050,"channels":1}
+      <- {"event":"speak","text":"Hello there...","length_scale":1.12,...}
+      -> <binary PCM16> ... many frames ...
+      -> {"event":"done"}
+
+    "init" is optional: a "speak" without it loads the default voice and
+    sends "ready" first. Several "speak" messages may be sent per connection.
+    Malformed messages, missing text and invalid prosody values are answered
+    with {"event":"error",...} and the connection stays open; any other
+    failure is reported once and ends the handler.
+    """
     await ws.accept()
+    voice_name = DEFAULT_VOICE
+    voice_obj: Optional[PiperVoice] = None
+    sr = DEFAULT_SR
+    channels = DEFAULT_CHANNELS
     try:
+        while True:
+            raw = await ws.receive_text()
+            # One bad frame should not tear down the whole session.
+            try:
+                msg = json.loads(raw)
+            except ValueError:
+                msg = None
+            if not isinstance(msg, dict):
+                await ws.send_text(json.dumps({"event": "error", "detail": "Invalid JSON message"}))
+                continue
+            event = msg.get("event")
+            if event == "init":
+                # A null or empty "voice" falls back to the default.
+                voice_name = msg.get("voice") or DEFAULT_VOICE
+                voice_obj = get_voice(voice_name)
+                sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
+                await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
+            elif event == "speak":
+                if not voice_obj:
+                    # No prior init: load lazily and announce the format first.
+                    voice_obj = get_voice(voice_name)
+                    sr = int(getattr(voice_obj, "sample_rate_hz", DEFAULT_SR))
+                    await ws.send_text(json.dumps({"event": "ready", "sr": sr, "channels": channels}))
+                text = str(msg.get("text", "")).strip()
+                if not text:
+                    await ws.send_text(json.dumps({"event": "error", "detail": "Missing text"}))
+                    continue
+                try:
+                    p = prosody(msg)
+                except (TypeError, ValueError):
+                    await ws.send_text(json.dumps({"event": "error", "detail": "Invalid prosody parameter"}))
+                    continue
+                # Stream PCM16 chunks as binary frames
+                # NOTE(review): the generator is synchronous, so each synthesis
+                # step blocks the event loop; confirm synthesize_stream() exists
+                # with these keywords in the installed piper release.
+                for chunk in voice_obj.synthesize_stream(
+                    text,
+                    length_scale=p["length_scale"],
+                    noise_scale=p["noise_scale"],
+                    noise_w=p["noise_w"],
+                    sentence_silence=p["sentence_silence"],
+                ):
+                    if isinstance(chunk, (bytes, bytearray)):
+                        await ws.send_bytes(bytes(chunk))
+                # tail-silence 220 ms
+                tail_frames = int(sr * 0.22)
+                await ws.send_bytes(b"\x00" * (tail_frames * channels * 2))
+                await ws.send_text(json.dumps({"event": "done"}))
+            else:
+                await ws.send_text(json.dumps({"event": "error", "detail": "Unknown event"}))
     except WebSocketDisconnect:
         pass
     except Exception as e:
+        # Best effort: the socket may already be unusable at this point.
+        try:
+            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
+        except Exception:
+            pass
+# Script entry point: start the server when the file is run directly.
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)