piper

Sleeping

App Files Files Community

Percy3822 committed on Sep 5, 2025

Commit

fa99ff3

verified ·

1 Parent(s): 976e3b5

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -170

app.py CHANGED Viewed

@@ -1,13 +1,10 @@
 import asyncio
 import json
 import os
-import re
-import shlex
-import subprocess
 import tarfile
 import time
 from pathlib import Path
-from typing import Optional, Dict, Any
 import uvicorn
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
@@ -17,7 +14,7 @@ from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
 # Writable directory picker
 # -------------------------
-def pick_writable_dir(*candidates: Path) -> Path:
     errs = []
     for p in candidates:
         if not p:
@@ -25,171 +22,129 @@ def pick_writable_dir(*candidates: Path) -> Path:
         try:
             p.mkdir(parents=True, exist_ok=True)
             probe = p / ".probe"
-            with open(probe, "wb") as f:
-                f.write(b"ok")
             probe.unlink(missing_ok=True)
             return p
         except Exception as e:
             errs.append(f"{p}: {type(e)._name_}({e})")
-    raise RuntimeError("No writable dir. Tried:\n" + "\n".join(errs))
-# Honors env overrides; then tries common writable places on HF Spaces
 ENV_DIR = os.getenv("TTS_DATA_DIR")
-VOICES_DIR = None
-FILES_DIR = None
-def init_dirs():
-    global VOICES_DIR, FILES_DIR
-    cand_voices = []
-    if ENV_DIR:
-        cand_voices.append(Path(ENV_DIR) / "voices")
-    cand_voices += [
-        Path("/home/user/.cache/actualtts/voices"),
-        Path("/home/user/voices"),
-        Path("/tmp/actualtts/voices"),
-        Path("/dev/shm/actualtts_voices"),
-    ]
-    VOICES_DIR = pick_writable_dir(*cand_voices)
-    cand_files = []
-    if ENV_DIR:
-        cand_files.append(Path(ENV_DIR) / "files")
-    cand_files += [
-        Path("/home/user/.cache/actualtts/files"),
-        Path("/tmp/actualtts/files"),
-        Path("/dev/shm/actualtts_files"),
-    ]
-    FILES_DIR = pick_writable_dir(*cand_files)
-init_dirs()
 # -------------------------
 # Piper CLI integration
 # -------------------------
-# Piper binary is preinstalled in CPU Spaces images that have Piper CLI.
-# If your image differs, set PIPER_BIN env to the correct path.
 PIPER_BIN = os.getenv("PIPER_BIN", "piper")
-# A small catalog of good CPU voices (VITS-based) hosted on HF.
 HF_VOICES: Dict[str, str] = {
     "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-libritts-high.onnx.tar.gz",
     "en_US-lessac-high":   "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-lessac-high.onnx.tar.gz",
     "en_US-amy-medium":    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-amy-medium.onnx.tar.gz",
 }
 DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
-DEFAULT_SR = 22050  # Piper typically outputs 22050 Hz
-DEFAULT_CHANNELS = 1
-_HTTP_CLIENT = None  # lazy import only if needed
-def _http():
-    global _HTTP_CLIENT
-    if _HTTP_CLIENT is None:
         import requests
-        _HTTP_CLIENT = requests.Session()
-        _HTTP_CLIENT.headers.update({"User-Agent": "ActualTTS/CPU"})
-    return _HTTP_CLIENT
 def ensure_voice(voice_id: str) -> Dict[str, Path]:
-    """
-    Ensure the given voice is present locally. If missing, download and extract.
-    Returns {"model": Path, "config": Path}
-    """
     vdir = VOICES_DIR / voice_id
     model = vdir / f"{voice_id}.onnx"
-    config = vdir / f"{voice_id}.onnx.json"
-    if model.exists() and config.exists():
-        return {"model": model, "config": config}
     url = HF_VOICES.get(voice_id)
     if not url:
-        # Try a heuristic: accept 'en-us' as a generic alias
-        if voice_id.lower() in ("en-us", "en_us", "english"):
-            voice_id = "en_US-libritts-high"
-            url = HF_VOICES[voice_id]
-            vdir = VOICES_DIR / voice_id
-            model = vdir / f"{voice_id}.onnx"
-            config = vdir / f"{voice_id}.onnx.json"
-        else:
-            raise RuntimeError(f"Unknown/unsupported voice '{voice_id}'. Known: {list(HF_VOICES)}")
     vdir.mkdir(parents=True, exist_ok=True)
     tar_path = vdir / f"{voice_id}.onnx.tar.gz"
-    # Download
-    r = _http().get(url, timeout=120, stream=True)
     r.raise_for_status()
     with open(tar_path, "wb") as f:
         for chunk in r.iter_content(1 << 16):
             if chunk:
                 f.write(chunk)
-    # Extract
     with tarfile.open(tar_path, "r:gz") as tf:
         tf.extractall(vdir)
     tar_path.unlink(missing_ok=True)
-    if not model.exists() or not config.exists():
-        raise RuntimeError(f"Voice files not found after extraction: {voice_id}")
-    return {"model": model, "config": config}
 def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optional[Path] = None,
-                    length_scale: float = 1.10, noise_scale: float = 0.35, noise_w: float = 0.90) -> list:
     vc = ensure_voice(voice_id)
-    args = [
         PIPER_BIN,
         "-m", str(vc["model"]),
         "-c", str(vc["config"]),
-        "-q",  # quieter logs
         "--length_scale", str(length_scale),
-        "--noise_scale", str(noise_scale),
-        "--noise_w", str(noise_w),
     ]
     if to_stdout:
-        args += ["-f", "-"]
     else:
         if out_path is None:
-            raise ValueError("out_path is required when to_stdout=False")
-        args += ["-f", str(out_path)]
-    return args
-async def run_piper_to_file(text: str, voice_id: str, out_path: Path,
-                            length_scale: float, noise_scale: float, noise_w: float) -> None:
-    cmd = build_piper_cmd(text, voice_id, to_stdout=False, out_path=out_path,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
-    # write the text
     await proc.stdin.write(text.encode("utf-8"))
     await proc.stdin.drain()
     proc.stdin.close()
     await proc.wait()
     if proc.returncode != 0:
         stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
-        raise RuntimeError(f"Piper failed (code {proc.returncode}). Stderr:\n{stderr}")
-async def run_piper_stream(text: str, voice_id: str, websocket: WebSocket,
-                           length_scale: float, noise_scale: float, noise_w: float) -> None:
-    """
-    Stream binary PCM16 via WS while Piper renders to stdout.
-    """
-    cmd = build_piper_cmd(text, voice_id, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
-    # Tell client we're ready (so it can open its audio device early)
-    await websocket.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
@@ -197,39 +152,44 @@ async def run_piper_stream(text: str, voice_id: str, websocket: WebSocket,
     await proc.stdin.drain()
     proc.stdin.close()
     try:
         while True:
             chunk = await proc.stdout.read(4096)
             if not chunk:
                 break
-            # Piper emits WAV unless given raw flag; CLI doesn't expose raw PCM easily.
-            # However, Piper's default stdout with -f - is WAV. Many clients can accept WAV frames incrementally.
-            # Your client plays raw PCM; so we keep WAV to avoid truncation and let the client handle it,
-            # or we can strip the first 44 bytes (WAV header) once and then stream the rest as PCM16.
-            # We'll do header-strip-once below:
-            # Detect/strip WAV header (44 bytes) exactly once per stream:
-            if len(chunk) >= 44 and chunk[0:4] == b"RIFF" and chunk[8:12] == b"WAVE":
-                # Skip the 44-byte header (simple WAV)
-                chunk = chunk[44:]
-                if not chunk:
-                    continue
-            await websocket.send_bytes(chunk)
         await proc.wait()
         if proc.returncode != 0:
             stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
-            await websocket.send_text(json.dumps({"event": "error", "detail": stderr}))
         else:
-            await websocket.send_text(json.dumps({"event": "done"}))
     except WebSocketDisconnect:
         try:
             proc.kill()
         except Exception:
             pass
 # ---------------
 # FastAPI wiring
 # ---------------
@@ -238,31 +198,26 @@ app = FastAPI(title="ActualTTS (CPU)")
 @app.get("/health")
 def health():
-    def list_voices():
-        out = []
-        for child in VOICES_DIR.iterdir():
-            if not child.is_dir():
-                continue
-            name = child.name
-            if (child / f"{name}.onnx").exists() and (child / f"{name}.onnx.json").exists():
-                out.append(name)
-        return out
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
         "default_voice": DEFAULT_VOICE,
         "voice_dir": str(VOICES_DIR),
-        "available_voices": list_voices(),
         "files_dir": str(FILES_DIR),
     }
 @app.get("/")
 def root():
     return PlainTextResponse("ActualTTS (CPU) — use POST /speak or WS /ws/tts")
 @app.get("/file/{name}")
 def get_file(name: str):
     path = FILES_DIR / name
@@ -270,20 +225,18 @@ def get_file(name: str):
         return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
     return FileResponse(path)
 @app.post("/speak")
 async def speak(request: Request):
     """
-    JSON body:
-      {
-        "text": "Hello world",
-        "voice": "en_US-libritts-high",
-        "length_scale": 1.10,
-        "noise_scale": 0.35,
-        "noise_w": 0.90
-      }
-    Returns:
-      {"ok": true, "audio_url": "/file/tts-<ts>.wav"}
     """
     try:
         body = await request.json()
@@ -295,33 +248,26 @@ async def speak(request: Request):
         return JSONResponse({"detail": "Missing text"}, status_code=400)
     voice = (body.get("voice") or DEFAULT_VOICE).strip()
-    length_scale = float(body.get("length_scale", 1.10))
-    noise_scale = float(body.get("noise_scale", 0.35))
-    noise_w = float(body.get("noise_w", 0.90))
-    # prepare output
     ts = int(time.time() * 1000)
     out_path = FILES_DIR / f"tts-{ts}.wav"
     try:
-        # ensure voice (download if needed)
         ensure_voice(voice)
-        # run piper CLI to a wav file
-        await run_piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
     except Exception as e:
         return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
     return {"ok": True, "audio_url": f"/file/{out_path.name}"}
 @app.websocket("/ws/tts")
 async def ws_tts(ws: WebSocket):
     await ws.accept()
     voice = DEFAULT_VOICE
-    length_scale = 1.10
-    noise_scale = 0.35
-    noise_w = 0.90
-    text_to_speak: Optional[str] = None
     try:
         while True:
@@ -330,34 +276,27 @@ async def ws_tts(ws: WebSocket):
                 data = json.loads(msg)
             except Exception:
                 continue
             ev = data.get("event")
             if ev == "init":
                 voice = (data.get("voice") or voice).strip()
-                # allow optional tuning via WS
                 if "length_scale" in data: length_scale = float(data["length_scale"])
                 if "noise_scale"  in data: noise_scale  = float(data["noise_scale"])
                 if "noise_w"      in data: noise_w      = float(data["noise_w"])
-                # ensure voice now so we can send ready immediately
                 try:
                     ensure_voice(voice)
                 except Exception as e:
                     await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
                     await ws.close()
                     return
-                await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))
             elif ev == "speak":
-                text_to_speak = (data.get("text") or "").strip()
-                if not text_to_speak:
                     await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
                     continue
-                # stream via stdout
-                await run_piper_stream(text_to_speak, voice, ws, length_scale, noise_scale, noise_w)
-            else:
-                # ignore unknown events
-                pass
     except WebSocketDisconnect:
         return
     except Exception as e:
@@ -370,7 +309,5 @@ async def ws_tts(ws: WebSocket):
         except Exception:
             pass
 if __name__ == "__main__":
-    # For local debug; Spaces uses Entrypoint/Cmd
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

 import asyncio
 import json
 import os
 import tarfile
 import time
 from pathlib import Path
+from typing import Optional, Dict
 import uvicorn
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
 # Writable directory picker
 # -------------------------
def pick_writable_dir(candidates):
    """Return the first candidate directory that can be created and written to.

    Each candidate is created if needed (including parents) and probed by
    writing and then deleting a small ``.probe`` file inside it.

    Args:
        candidates: Iterable of ``pathlib.Path`` objects, tried in order.
            Falsy entries (e.g. ``None``) are skipped.

    Returns:
        The first candidate that passed the write probe.

    Raises:
        RuntimeError: If no candidate is writable. The message lists every
            candidate tried together with the error it produced.
    """
    errs = []
    for p in candidates:
        if not p:
            continue
        try:
            p.mkdir(parents=True, exist_ok=True)
            probe = p / ".probe"
            probe.write_bytes(b"ok")
            probe.unlink(missing_ok=True)
            return p
        except Exception as e:
            # Must be the dunder ``__name__``: a single-underscore spelling
            # raises AttributeError inside this handler and hides the real
            # failure instead of recording it.
            errs.append(f"{p}: {type(e).__name__}({e})")
    raise RuntimeError("No writable dir. Tried:\n  " + "\n  ".join(errs))
# Storage roots: tmpfs-style locations are tried first.
# NOTE(review): the TTS_DATA_DIR override is listed last in both candidate
# lists, so it is only chosen when neither /tmp nor /dev/shm is writable.
ENV_DIR = os.environ.get("TTS_DATA_DIR")

VOICE_CANDIDATES = [
    Path("/tmp/actualtts/voices"),
    Path("/dev/shm/actualtts_voices"),
    (Path(ENV_DIR) / "voices") if ENV_DIR else None,
]
FILE_CANDIDATES = [
    Path("/tmp/actualtts/files"),
    Path("/dev/shm/actualtts_files"),
    (Path(ENV_DIR) / "files") if ENV_DIR else None,
]

# Drop the None placeholder (present when TTS_DATA_DIR is unset) before probing.
VOICES_DIR = pick_writable_dir(list(filter(None, VOICE_CANDIDATES)))
FILES_DIR = pick_writable_dir(list(filter(None, FILE_CANDIDATES)))
 # -------------------------
 # Piper CLI integration
 # -------------------------
 PIPER_BIN = os.getenv("PIPER_BIN", "piper")
+# Good human-y CPU voices hosted by Rhasspy (VITS-based)
 HF_VOICES: Dict[str, str] = {
     "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-libritts-high.onnx.tar.gz",
     "en_US-lessac-high":   "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-lessac-high.onnx.tar.gz",
     "en_US-amy-medium":    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-amy-medium.onnx.tar.gz",
 }
 DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
+DEFAULT_SR = 22050
+DEFAULT_CH = 1
+_http = None
def http():
    """Return the shared ``requests`` session, creating it on the first call.

    ``requests`` is imported inside the function, so it is only loaded when
    the session is first needed. The session carries a fixed User-Agent
    header.
    """
    global _http
    if _http is not None:
        return _http

    import requests

    session = requests.Session()
    _http = session
    session.headers.update({"User-Agent": "ActualTTS/CPU"})
    return session
 def ensure_voice(voice_id: str) -> Dict[str, Path]:
     """Make sure a Piper voice is available locally and return its file paths.

     Generic tags ("en-us", "en_us", "english", case-insensitive) map to
     "en_US-libritts-high". If the model and config already exist under
     ``VOICES_DIR/<voice_id>`` they are returned without any network access;
     otherwise the archive listed in ``HF_VOICES`` is downloaded and unpacked
     into that directory.

     Args:
         voice_id: A key of ``HF_VOICES`` or one of the generic aliases.

     Returns:
         ``{"model": Path, "config": Path}`` pointing at the ``.onnx`` model
         and its ``.onnx.json`` config.

     Raises:
         RuntimeError: Unknown voice, an archive member that would land
             outside the voice directory, or expected files missing after
             extraction.
         requests.HTTPError: The download returned an error status.
         tarfile.TarError: The archive is corrupt or rejected by the
             extraction filter.
     """
     # Alias common generic tags
     if voice_id.lower() in ("en-us", "en_us", "english"):
         voice_id = "en_US-libritts-high"

     vdir = VOICES_DIR / voice_id
     model = vdir / f"{voice_id}.onnx"
     cfg = vdir / f"{voice_id}.onnx.json"
     if model.exists() and cfg.exists():
         return {"model": model, "config": cfg}

     url = HF_VOICES.get(voice_id)
     if not url:
         raise RuntimeError(f"Unknown voice '{voice_id}'. Known: {list(HF_VOICES)}")

     vdir.mkdir(parents=True, exist_ok=True)
     tar_path = vdir / f"{voice_id}.onnx.tar.gz"
     try:
         # Using the response as a context manager releases the streamed
         # connection even when the download fails part-way.
         with http().get(url, timeout=180, stream=True) as r:
             r.raise_for_status()
             with open(tar_path, "wb") as f:
                 for chunk in r.iter_content(1 << 16):
                     if chunk:
                         f.write(chunk)

         with tarfile.open(tar_path, "r:gz") as tf:
             # The archive comes from the network: refuse any member whose
             # resolved destination escapes the voice directory.
             dest_root = vdir.resolve()
             for member in tf.getmembers():
                 dest = (dest_root / member.name).resolve()
                 if dest != dest_root and dest_root not in dest.parents:
                     raise RuntimeError(
                         f"Unsafe path in voice archive for '{voice_id}': {member.name}"
                     )
             if hasattr(tarfile, "data_filter"):
                 # Interpreters with extraction filters also reject unsafe
                 # links and special files.
                 tf.extractall(vdir, filter="data")
             else:
                 tf.extractall(vdir)
     finally:
         # Never leave a partial or already-extracted archive behind.
         tar_path.unlink(missing_ok=True)

     if not model.exists() or not cfg.exists():
         raise RuntimeError(f"Voice files missing after extraction for '{voice_id}'")
     return {"model": model, "config": cfg}
 def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optional[Path] = None,
                     length_scale: float = 1.08, noise_scale: float = 0.35, noise_w: float = 0.90) -> list:
     """Assemble the Piper CLI argument list for one synthesis run.

     The voice is resolved through ``ensure_voice`` first. ``text`` is not
     placed on the command line by this function.

     Args:
         text: Text to synthesize (unused when building the command).
         voice_id: Voice passed to ``ensure_voice``.
         to_stdout: When true the output target is ``-``; otherwise
             ``out_path`` is used.
         out_path: Output file; required when ``to_stdout`` is false.
         length_scale: Value for ``--length_scale``.
         noise_scale: Value for ``--noise_scale``.
         noise_w: Value for ``--noise_w``.

     Returns:
         The argument list, starting with ``PIPER_BIN``.

     Raises:
         ValueError: ``to_stdout`` is false and ``out_path`` is ``None``.
     """
     voice_files = ensure_voice(voice_id)

     if to_stdout:
         destination = "-"
     elif out_path is None:
         raise ValueError("out_path required when to_stdout=False")
     else:
         destination = str(out_path)

     return [
         PIPER_BIN,
         "-m", str(voice_files["model"]),
         "-c", str(voice_files["config"]),
         "-q",
         "--length_scale", str(length_scale),
         "--noise_scale", str(noise_scale),
         "--noise_w", str(noise_w),
         "-f", destination,
     ]
async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w):
    """Run the Piper CLI on ``text`` and have it write audio to ``out_path``.

    The text is sent to the process on stdin, UTF-8 encoded.

    Args:
        text: Text to synthesize.
        voice: Voice id forwarded to ``build_piper_cmd``.
        out_path: Destination file handed to Piper via ``-f``.
        length_scale: Forwarded to ``build_piper_cmd``.
        noise_scale: Forwarded to ``build_piper_cmd``.
        noise_w: Forwarded to ``build_piper_cmd``.

    Raises:
        RuntimeError: Piper exited with a non-zero status; the message carries
            the exit code and the captured stderr.
    """
    cmd = build_piper_cmd(text, voice, to_stdout=False, out_path=out_path,
                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # StreamWriter.write() is a plain method returning None, so awaiting it
    # raises TypeError. communicate() sends the input, closes stdin and drains
    # stdout/stderr while waiting, which also avoids a deadlock when the child
    # fills a pipe that nobody is reading.
    _, stderr_bytes = await proc.communicate(text.encode("utf-8"))
    if proc.returncode != 0:
        stderr = (stderr_bytes or b"").decode("utf-8", "ignore")
        raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
+async def piper_stream_stdout(text, voice, ws: WebSocket, length_scale, noise_scale, noise_w):
+    """Stream WAV from Piper stdout over WS, stripping the WAV header once even if split."""
+    # Notify client early so it can open audio device
+    await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CH}))
+    cmd = build_piper_cmd(text, voice, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
     await proc.stdin.drain()
     proc.stdin.close()
+    header_needed = True
+    header_buf = bytearray()
     try:
         while True:
             chunk = await proc.stdout.read(4096)
             if not chunk:
                 break
+            if header_needed:
+                header_buf.extend(chunk)
+                if len(header_buf) < 44:
+                    # keep reading until we have at least a WAV header
+                    continue
+                # If it really is a WAV stream, strip the 44-byte header
+                if header_buf[:4] == b"RIFF" and header_buf[8:12] == b"WAVE":
+                    payload = header_buf[44:]
+                else:
+                    payload = bytes(header_buf)  # not a WAV? just pass through
+                header_buf.clear()
+                header_needed = False
+                if payload:
+                    await ws.send_bytes(payload)
+            else:
+                await ws.send_bytes(chunk)
         await proc.wait()
         if proc.returncode != 0:
             stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
+            await ws.send_text(json.dumps({"event": "error", "detail": stderr}))
         else:
+            await ws.send_text(json.dumps({"event": "done"}))
     except WebSocketDisconnect:
         try:
             proc.kill()
         except Exception:
             pass
 # ---------------
 # FastAPI wiring
 # ---------------
 @app.get("/health")
 def health():
     """Report service status and the voices currently present on disk.

     A sub-directory of ``VOICES_DIR`` counts as an available voice when it
     contains both ``<name>.onnx`` and ``<name>.onnx.json``.
     """
     def has_voice_files(entry):
         # Only directories holding both the model and its config qualify.
         if not entry.is_dir():
             return False
         stem = entry.name
         return (entry / f"{stem}.onnx").exists() and (entry / f"{stem}.onnx.json").exists()

     installed = [entry.name for entry in VOICES_DIR.iterdir() if has_voice_files(entry)]
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
         "default_voice": DEFAULT_VOICE,
         "voice_dir": str(VOICES_DIR),
         "available_voices": installed,
         "files_dir": str(FILES_DIR),
     }
 @app.get("/")
 def root():
     """Serve a plain-text usage hint at the site root."""
     banner = "ActualTTS (CPU) — use POST /speak or WS /ws/tts"
     return PlainTextResponse(banner)
 @app.get("/file/{name}")
 def get_file(name: str):
     path = FILES_DIR / name
         return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
     return FileResponse(path)
 @app.post("/speak")
 async def speak(request: Request):
     """
+    Body (JSON):
+    {
+      "text": "Hello",
+      "voice": "en_US-libritts-high",
+      "length_scale": 1.08,
+      "noise_scale": 0.35,
+      "noise_w": 0.90
+    }
+    => {"ok": true, "audio_url": "/file/tts-XXXX.wav"}
     """
     try:
         body = await request.json()
         return JSONResponse({"detail": "Missing text"}, status_code=400)
     voice = (body.get("voice") or DEFAULT_VOICE).strip()
+    length_scale = float(body.get("length_scale", 1.08))
+    noise_scale  = float(body.get("noise_scale", 0.35))
+    noise_w      = float(body.get("noise_w", 0.90))
     ts = int(time.time() * 1000)
     out_path = FILES_DIR / f"tts-{ts}.wav"
     try:
         ensure_voice(voice)
+        await piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
     except Exception as e:
         return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
     return {"ok": True, "audio_url": f"/file/{out_path.name}"}
 @app.websocket("/ws/tts")
 async def ws_tts(ws: WebSocket):
     await ws.accept()
     voice = DEFAULT_VOICE
+    length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
     try:
         while True:
                 data = json.loads(msg)
             except Exception:
                 continue
             ev = data.get("event")
             if ev == "init":
                 voice = (data.get("voice") or voice).strip()
                 if "length_scale" in data: length_scale = float(data["length_scale"])
                 if "noise_scale"  in data: noise_scale  = float(data["noise_scale"])
                 if "noise_w"      in data: noise_w      = float(data["noise_w"])
                 try:
                     ensure_voice(voice)
                 except Exception as e:
                     await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
                     await ws.close()
                     return
+                # 'ready' is sent inside piper_stream_stdout too, but send an early ping-less notice:
+                await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CH}))
             elif ev == "speak":
+                text = (data.get("text") or "").strip()
+                if not text:
                     await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
                     continue
+                await piper_stream_stdout(text, voice, ws, length_scale, noise_scale, noise_w)
+            # ignore other events
     except WebSocketDisconnect:
         return
     except Exception as e:
         except Exception:
             pass
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)