piper

Sleeping

App Files Files Community

Percy3822 committed on Sep 5, 2025

Commit

976e3b5

verified ·

1 Parent(s): ff0f504

Update app.py

Browse files

Files changed (1) hide show

app.py +352 -93

app.py CHANGED Viewed

@@ -1,117 +1,376 @@
-# app.py
-import os, io, time, uuid, shutil, tempfile
-from pathlib import Path
-from fastapi import FastAPI, Request
-from fastapi.responses import FileResponse, StreamingResponse, JSONResponse
-from fastapi.websockets import WebSocket
-from pydantic import BaseModel
 import subprocess
-import wave
-app = FastAPI()
-# ========== CONFIG ==========
-ROOT_DIR = Path("/tmp/tts_app")
-VOICES_DIR = ROOT_DIR / "voices"
-FILES_DIR = ROOT_DIR / "files"
-VOICES_DIR.mkdir(parents=True, exist_ok=True)
-FILES_DIR.mkdir(parents=True, exist_ok=True)
-DEFAULT_VOICE = "en_US-libritts-high"
-DEFAULT_SR = 22050
-# ========== HEALTH ==========
 @app.get("/health")
-async def health():
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
         "default_voice": DEFAULT_VOICE,
         "voice_dir": str(VOICES_DIR),
-        "available_voices": [f.stem for f in VOICES_DIR.glob("*.onnx")],
         "files_dir": str(FILES_DIR),
     }
-# ========== SPEAK (HTTP) ==========
-class SpeakRequest(BaseModel):
-    text: str
-    voice: str = DEFAULT_VOICE
-    rate_wpm: int = 170
-    length_scale: float = 1.0
-    noise_scale: float = 0.33
-    noise_w: float = 0.5
 @app.post("/speak")
-async def speak(req: SpeakRequest):
-    out_path = FILES_DIR / f"{uuid.uuid4().hex}.wav"
-    voice_path = VOICES_DIR / f"{req.voice}.onnx"
-    if not voice_path.exists():
-        return JSONResponse({"error": "Voice not found."}, status_code=404)
-    cmd = [
-        "piper",
-        "--model", str(voice_path),
-        "--output_file", str(out_path),
-        "--text", req.text,
-        "--length_scale", str(req.length_scale),
-        "--noise_scale", str(req.noise_scale),
-        "--noise_w", str(req.noise_w),
-    ]
-    subprocess.run(cmd, check=True)
-    return FileResponse(out_path, media_type="audio/wav")
-# ========== STREAM (WebSocket) ==========
 @app.websocket("/ws/tts")
-async def tts_stream(websocket: WebSocket):
-    await websocket.accept()
     voice = DEFAULT_VOICE
-    settings = {
-        "length_scale": 1.0,
-        "noise_scale": 0.33,
-        "noise_w": 0.5
-    }
-    temp_file = FILES_DIR / f"{uuid.uuid4().hex}.wav"
-    wave_writer = wave.open(str(temp_file), 'wb')
-    wave_writer.setnchannels(1)
-    wave_writer.setsampwidth(2)
-    wave_writer.setframerate(DEFAULT_SR)
     try:
         while True:
-            data = await websocket.receive_text()
-            if data.startswith("{") and "text" in data:
-                import json
-                payload = json.loads(data)
-                text = payload.get("text", "")
-                voice = payload.get("voice", DEFAULT_VOICE)
-                settings["length_scale"] = float(payload.get("length_scale", 1.0))
-                settings["noise_scale"] = float(payload.get("noise_scale", 0.33))
-                settings["noise_w"] = float(payload.get("noise_w", 0.5))
-                tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                cmd = [
-                    "piper",
-                    "--model", str(VOICES_DIR / f"{voice}.onnx"),
-                    "--output_file", tmp.name,
-                    "--text", text,
-                    "--length_scale", str(settings["length_scale"]),
-                    "--noise_scale", str(settings["noise_scale"]),
-                    "--noise_w", str(settings["noise_w"]),
-                ]
-                subprocess.run(cmd, check=True)
-                with open(tmp.name, "rb") as f:
-                    audio = f.read()
-                    await websocket.send_bytes(audio)
-                tmp.close()
-                os.unlink(tmp.name)
     except Exception as e:
-        print(f"[TTS WS Error] {e}")
-    finally:
-        wave_writer.close()
-        await websocket.close()

+import asyncio
+import json
+import os
+import re
+import shlex
 import subprocess
+import tarfile
+import time
+from pathlib import Path
+from typing import Optional, Dict, Any
+import uvicorn
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
+from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
+# -------------------------
+# Writable directory picker
+# -------------------------
+def pick_writable_dir(*candidates: Path) -> Path:
+    """Return the first candidate directory that can be created and written to.
+
+    Each candidate is created if needed (including parents) and probed by
+    writing and then deleting a small ``.probe`` file. Falsy candidates
+    (e.g. ``None``) are skipped.
+
+    Raises:
+        RuntimeError: if no candidate is writable; the message lists every
+            candidate tried together with the error it produced.
+    """
+    errs = []
+    for p in candidates:
+        if not p:
+            continue
+        try:
+            p.mkdir(parents=True, exist_ok=True)
+            probe = p / ".probe"
+            with open(probe, "wb") as f:
+                f.write(b"ok")
+            probe.unlink(missing_ok=True)
+            return p
+        except Exception as e:
+            # Best-effort probe: any failure only disqualifies this candidate.
+            # (Was `_name_`, which raised AttributeError and aborted the search.)
+            errs.append(f"{p}: {type(e).__name__}({e})")
+    raise RuntimeError("No writable dir. Tried:\n" + "\n".join(errs))
+# Honors env overrides; then tries common writable places on HF Spaces
+ENV_DIR = os.getenv("TTS_DATA_DIR")
+VOICES_DIR = None
+FILES_DIR = None
+def init_dirs():
+    """Point VOICES_DIR and FILES_DIR at the first writable candidate for each.
+
+    When TTS_DATA_DIR is set, its ``voices`` / ``files`` subdirectories are
+    tried first; the fixed fallback locations follow in order.
+    """
+    global VOICES_DIR, FILES_DIR
+    # Zero-or-one element list so the override slots in ahead of the fallbacks.
+    override_roots = [Path(ENV_DIR)] if ENV_DIR else []
+    voice_candidates = [root / "voices" for root in override_roots] + [
+        Path("/home/user/.cache/actualtts/voices"),
+        Path("/home/user/voices"),
+        Path("/tmp/actualtts/voices"),
+        Path("/dev/shm/actualtts_voices"),
+    ]
+    file_candidates = [root / "files" for root in override_roots] + [
+        Path("/home/user/.cache/actualtts/files"),
+        Path("/tmp/actualtts/files"),
+        Path("/dev/shm/actualtts_files"),
+    ]
+    VOICES_DIR = pick_writable_dir(*voice_candidates)
+    FILES_DIR = pick_writable_dir(*file_candidates)
+init_dirs()
+# -------------------------
+# Piper CLI integration
+# -------------------------
+# Piper binary is preinstalled in CPU Spaces images that have Piper CLI.
+# If your image differs, set PIPER_BIN env to the correct path.
+PIPER_BIN = os.getenv("PIPER_BIN", "piper")
+# A small catalog of good CPU voices (VITS-based) hosted on HF.
+# Maps voice id -> URL of a .onnx.tar.gz archive that ensure_voice() downloads
+# and extracts into VOICES_DIR/<voice id>/.
+# NOTE(review): these archive URLs are not verified here — confirm they resolve
+# and that each archive unpacks <voice id>.onnx and <voice id>.onnx.json at its root.
+HF_VOICES: Dict[str, str] = {
+    "en_US-libritts-high": "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-libritts-high.onnx.tar.gz",
+    "en_US-lessac-high":   "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-lessac-high.onnx.tar.gz",
+    "en_US-amy-medium":    "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US-amy-medium.onnx.tar.gz",
+}
+# Voice used when a request does not name one; overridable via the DEFAULT_VOICE env var.
+DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
+# Audio format advertised to WebSocket clients in the "ready" event.
+DEFAULT_SR = 22050  # Piper typically outputs 22050 Hz
+DEFAULT_CHANNELS = 1
+_HTTP_CLIENT = None  # lazy import only if needed
+def _http():
+    """Return the shared HTTP session, building it on the first call."""
+    global _HTTP_CLIENT
+    if _HTTP_CLIENT is not None:
+        return _HTTP_CLIENT
+    # Imported here so `requests` is only needed once a download is attempted.
+    import requests
+    _HTTP_CLIENT = requests.Session()
+    _HTTP_CLIENT.headers.update({"User-Agent": "ActualTTS/CPU"})
+    return _HTTP_CLIENT
+def ensure_voice(voice_id: str) -> Dict[str, Path]:
+    """
+    Ensure the given voice is present locally. If missing, download and extract.
+
+    Lookup order: files already present for ``voice_id``; then, for the generic
+    aliases ``en-us`` / ``en_us`` / ``english``, the ``en_US-libritts-high``
+    voice; then a download from ``HF_VOICES``.
+
+    Returns {"model": Path, "config": Path}
+
+    Raises:
+        RuntimeError: unknown voice id, an archive entry that would land
+            outside the voice directory, or missing files after extraction.
+        Exception: HTTP / tar errors from the download are propagated.
+    """
+    def _locate(vid: str):
+        d = VOICES_DIR / vid
+        return d, d / f"{vid}.onnx", d / f"{vid}.onnx.json"
+
+    # The id comes from clients; it must be a single path component so it can
+    # never address anything outside VOICES_DIR.
+    if not voice_id or voice_id in (".", "..") or Path(voice_id).name != voice_id:
+        raise RuntimeError(f"Unknown/unsupported voice '{voice_id}'. Known: {list(HF_VOICES)}")
+    vdir, model, config = _locate(voice_id)
+    if model.exists() and config.exists():
+        return {"model": model, "config": config}
+    if voice_id not in HF_VOICES and voice_id.lower() in ("en-us", "en_us", "english"):
+        # Resolve the alias BEFORE deciding to download, so an already
+        # installed default voice is reused instead of fetched again.
+        voice_id = "en_US-libritts-high"
+        vdir, model, config = _locate(voice_id)
+        if model.exists() and config.exists():
+            return {"model": model, "config": config}
+    url = HF_VOICES.get(voice_id)
+    if not url:
+        raise RuntimeError(f"Unknown/unsupported voice '{voice_id}'. Known: {list(HF_VOICES)}")
+    vdir.mkdir(parents=True, exist_ok=True)
+    tar_path = vdir / f"{voice_id}.onnx.tar.gz"
+    try:
+        # Download
+        with _http().get(url, timeout=120, stream=True) as r:
+            r.raise_for_status()
+            with open(tar_path, "wb") as f:
+                for chunk in r.iter_content(1 << 16):
+                    if chunk:
+                        f.write(chunk)
+        # Extract, refusing links/devices and any entry that escapes vdir
+        root = vdir.resolve()
+        with tarfile.open(tar_path, "r:gz") as tf:
+            for member in tf.getmembers():
+                target = (root / member.name).resolve()
+                inside = target == root or root in target.parents
+                if not inside or not (member.isfile() or member.isdir()):
+                    raise RuntimeError(f"Unsafe entry in voice archive: {member.name}")
+            tf.extractall(vdir)
+    finally:
+        # Never leave a partial or already-consumed archive behind.
+        tar_path.unlink(missing_ok=True)
+    if not model.exists() or not config.exists():
+        raise RuntimeError(f"Voice files not found after extraction: {voice_id}")
+    return {"model": model, "config": config}
+def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optional[Path] = None,
+                    length_scale: float = 1.10, noise_scale: float = 0.35, noise_w: float = 0.90) -> list:
+    """Build the Piper CLI argument list for one synthesis run.
+
+    ``text`` is not placed on the command line; the callers in this file feed
+    it to the process through stdin. The voice is resolved (and downloaded if
+    necessary) via ``ensure_voice``. Output goes to stdout (``-f -``) when
+    ``to_stdout`` is true, otherwise to ``out_path``.
+
+    Raises:
+        ValueError: ``to_stdout`` is false and no ``out_path`` was given.
+    """
+    voice_files = ensure_voice(voice_id)
+    if to_stdout:
+        destination = "-"
+    elif out_path is None:
+        raise ValueError("out_path is required when to_stdout=False")
+    else:
+        destination = str(out_path)
+    return [
+        PIPER_BIN,
+        "-m", str(voice_files["model"]),
+        "-c", str(voice_files["config"]),
+        "-q",  # quieter logs
+        "--length_scale", str(length_scale),
+        "--noise_scale", str(noise_scale),
+        "--noise_w", str(noise_w),
+        "-f", destination,
+    ]
+async def run_piper_to_file(text: str, voice_id: str, out_path: Path,
+                            length_scale: float, noise_scale: float, noise_w: float) -> None:
+    """Run Piper once, feeding ``text`` on stdin and writing a WAV to ``out_path``.
+
+    Raises:
+        RuntimeError: Piper exited with a non-zero status; the message carries
+            the exit code and captured stderr.
+    """
+    cmd = build_piper_cmd(text, voice_id, to_stdout=False, out_path=out_path,
+                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
+    proc = await asyncio.create_subprocess_exec(
+        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+    )
+    # communicate() writes the text, closes stdin and drains stdout/stderr
+    # concurrently. The previous `await proc.stdin.write(...)` awaited None
+    # (StreamWriter.write is not a coroutine), and wait() with undrained pipes
+    # can deadlock once a pipe buffer fills.
+    _, stderr_bytes = await proc.communicate(text.encode("utf-8"))
+    if proc.returncode != 0:
+        stderr = stderr_bytes.decode("utf-8", "ignore")
+        raise RuntimeError(f"Piper failed (code {proc.returncode}). Stderr:\n{stderr}")
+async def run_piper_stream(text: str, voice_id: str, websocket: WebSocket,
+                           length_scale: float, noise_scale: float, noise_w: float) -> None:
+    """
+    Stream binary PCM16 via WS while Piper renders to stdout.
+
+    Sends a JSON ``ready`` event, then binary audio frames, then a JSON
+    ``done`` event (or ``error`` with Piper's stderr on a non-zero exit).
+    A client disconnect is swallowed; the Piper process is always stopped
+    before returning.
+    """
+    cmd = build_piper_cmd(text, voice_id, to_stdout=True,
+                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
+    # Tell client we're ready (so it can open its audio device early)
+    await websocket.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))
+    proc = await asyncio.create_subprocess_exec(
+        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+    )
+    # Drain stderr in the background so Piper can never block on a full pipe
+    # while we are busy forwarding stdout.
+    stderr_task = asyncio.ensure_future(proc.stderr.read())
+    try:
+        try:
+            # StreamWriter.write() is synchronous; only drain() is awaited.
+            proc.stdin.write(text.encode("utf-8"))
+            await proc.stdin.drain()
+        except (BrokenPipeError, ConnectionResetError):
+            # Piper exited early; the return-code check below reports it.
+            pass
+        proc.stdin.close()
+        # With `-f -` Piper writes a WAV to stdout, while the client plays raw
+        # PCM. Buffer the start of the stream until a 44-byte simple WAV header
+        # can be recognised, drop it exactly once, and forward everything else
+        # untouched.
+        pending = b""
+        header_checked = False
+        while True:
+            chunk = await proc.stdout.read(4096)
+            eof = not chunk
+            if not header_checked:
+                pending += chunk
+                if len(pending) < 44 and not eof:
+                    continue
+                header_checked = True
+                if len(pending) >= 44 and pending[0:4] == b"RIFF" and pending[8:12] == b"WAVE":
+                    pending = pending[44:]
+                chunk, pending = pending, b""
+            if chunk:
+                await websocket.send_bytes(chunk)
+            if eof:
+                break
+        await proc.wait()
+        stderr = (await stderr_task).decode("utf-8", "ignore")
+        if proc.returncode != 0:
+            await websocket.send_text(json.dumps({"event": "error", "detail": stderr}))
+        else:
+            await websocket.send_text(json.dumps({"event": "done"}))
+    except WebSocketDisconnect:
+        # Client went away mid-stream; cleanup happens in `finally`.
+        pass
+    finally:
+        # Stop Piper on every exit path, not just on disconnect.
+        if proc.returncode is None:
+            try:
+                proc.kill()
+            except ProcessLookupError:
+                pass
+        if not stderr_task.done():
+            stderr_task.cancel()
+# ---------------
+# FastAPI wiring
+# ---------------
+app = FastAPI(title="ActualTTS (CPU)")
 @app.get("/health")
+def health():
+    """Report service status, the configured directories and the installed voices."""
+    def list_voices():
+        # A voice counts as installed only when VOICES_DIR/<name>/ holds both
+        # <name>.onnx and <name>.onnx.json.
+        out = []
+        for child in VOICES_DIR.iterdir():
+            if not child.is_dir():
+                continue
+            name = child.name
+            if (child / f"{name}.onnx").exists() and (child / f"{name}.onnx.json").exists():
+                out.append(name)
+        return out
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
         "default_voice": DEFAULT_VOICE,
         "voice_dir": str(VOICES_DIR),
+        "available_voices": list_voices(),
         "files_dir": str(FILES_DIR),
     }
+@app.get("/")
+def root():
+    """Landing route: a one-line plain-text pointer to the real endpoints."""
+    usage = "ActualTTS (CPU) — use POST /speak or WS /ws/tts"
+    return PlainTextResponse(usage)
+@app.get("/file/{name}")
+def get_file(name: str):
+    """Serve a previously generated file from FILES_DIR by its bare file name.
+
+    Responds 404 unless ``name`` resolves to a regular file located directly
+    inside FILES_DIR; directories, ``..`` and anything resolving elsewhere
+    (e.g. through a symlink) are rejected instead of reaching FileResponse.
+    """
+    base = FILES_DIR.resolve()
+    path = (base / name).resolve()
+    if path.parent != base or not path.is_file():
+        return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
+    return FileResponse(path)
 @app.post("/speak")
+async def speak(request: Request):
+    """
+    JSON body:
+      {
+        "text": "Hello world",
+        "voice": "en_US-libritts-high",
+        "length_scale": 1.10,
+        "noise_scale": 0.35,
+        "noise_w": 0.90
+      }
+    Returns:
+      {"ok": true, "audio_url": "/file/tts-<ts>.wav"}
+    """
+    try:
+        body = await request.json()
+    except Exception:
+        return JSONResponse({"detail": "Invalid JSON"}, status_code=400)
+    text = (body.get("text") or "").strip()
+    if not text:
+        return JSONResponse({"detail": "Missing text"}, status_code=400)
+    voice = (body.get("voice") or DEFAULT_VOICE).strip()
+    length_scale = float(body.get("length_scale", 1.10))
+    noise_scale = float(body.get("noise_scale", 0.35))
+    noise_w = float(body.get("noise_w", 0.90))
+    # prepare output
+    ts = int(time.time() * 1000)
+    out_path = FILES_DIR / f"tts-{ts}.wav"
+    try:
+        # ensure voice (download if needed)
+        ensure_voice(voice)
+        # run piper CLI to a wav file
+        await run_piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
+    except Exception as e:
+        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
+    return {"ok": True, "audio_url": f"/file/{out_path.name}"}
 @app.websocket("/ws/tts")
+async def ws_tts(ws: WebSocket):
+    """WebSocket TTS endpoint driven by JSON text messages with an "event" field.
+
+    - {"event": "init", "voice"?, "length_scale"?, "noise_scale"?, "noise_w"?}
+      updates the session settings, ensures the voice is available and replies
+      with a "ready" event (or sends "error" and closes the socket).
+    - {"event": "speak", "text": ...} streams the synthesized audio through
+      run_piper_stream using the current session settings.
+    Messages that are not valid JSON and unknown events are ignored.
+    """
+    await ws.accept()
     voice = DEFAULT_VOICE
+    # Per-connection synthesis settings; "init" messages may override them.
+    length_scale = 1.10
+    noise_scale = 0.35
+    noise_w = 0.90
+    text_to_speak: Optional[str] = None
     try:
         while True:
+            msg = await ws.receive_text()
+            try:
+                data = json.loads(msg)
+            except Exception:
+                # Not JSON: skip the message and keep the connection open.
+                continue
+            ev = data.get("event")
+            if ev == "init":
+                voice = (data.get("voice") or voice).strip()
+                # allow optional tuning via WS
+                if "length_scale" in data: length_scale = float(data["length_scale"])
+                if "noise_scale"  in data: noise_scale  = float(data["noise_scale"])
+                if "noise_w"      in data: noise_w      = float(data["noise_w"])
+                # ensure voice now so we can send ready immediately
+                try:
+                    ensure_voice(voice)
+                except Exception as e:
+                    await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
+                    await ws.close()
+                    return
+                await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CHANNELS}))
+            elif ev == "speak":
+                text_to_speak = (data.get("text") or "").strip()
+                if not text_to_speak:
+                    await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
+                    continue
+                # stream via stdout
+                await run_piper_stream(text_to_speak, voice, ws, length_scale, noise_scale, noise_w)
+            else:
+                # ignore unknown events
+                pass
+    except WebSocketDisconnect:
+        # Client closed the connection; nothing left to do.
+        return
     except Exception as e:
+        # Any other failure: best-effort error report, then best-effort close.
+        try:
+            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
+        except Exception:
+            pass
+        try:
+            await ws.close()
+        except Exception:
+            pass
+if __name__ == "__main__":
+    # For local debug; Spaces uses Entrypoint/Cmd
+    # Serves the app on all interfaces at port 7860 with auto-reload disabled.
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)