piper

Sleeping

App Files Files Community

Percy3822 committed on Sep 5, 2025

Commit

5d2e096

verified ·

1 Parent(s): 1042c9d

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -55

app.py CHANGED Viewed

@@ -4,14 +4,14 @@ import json
 import os
 import time
 from pathlib import Path
-from typing import Optional, Dict, Tuple
 import uvicorn
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, BackgroundTasks, Query
 from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
 # -------------------------
-# Writable directory picker
 # -------------------------
 def pick_writable_dir(candidates):
@@ -29,7 +29,6 @@ def pick_writable_dir(candidates):
             errs.append(f"{p}: {type(e).__name__}({e})")
     raise RuntimeError("No writable dir. Tried:\n  " + "\n  ".join(errs))
-# Only pick truly safe/writable locations on HF Spaces
 ENV_DIR = os.getenv("TTS_DATA_DIR")
 VOICE_CANDIDATES = [
     Path("/tmp/actualtts/voices"),
@@ -57,24 +56,22 @@ def _safe_unlink(path: Path):
 PIPER_BIN = os.getenv("PIPER_BIN", "piper")
-# Hugging Face layout (no tarballs). You can pin a commit hash via env for stability.
 HF_REPO_BASE = "https://huggingface.co/rhasspy/piper-voices/resolve"
-HF_REV = os.getenv("PIPER_VOICES_REV", "main")
-# Sanity thresholds to detect corrupt downloads (bytes)
-MIN_ONNX_BYTES = int(os.getenv("MIN_ONNX_BYTES", "5000000"))  # >= ~5MB
 MIN_JSON_BYTES = int(os.getenv("MIN_JSON_BYTES", "1000"))     # >= 1KB
-# Map short ids to their nested path in the repo: (lang, country, family, quality, basename)
 VOICE_MAP: Dict[str, Tuple[str, str, str, str, str]] = {
-    "en_US-libritts-high": ("en", "en_US", "libritts", "high", "en_US-libritts-high"),
-    "en_US-lessac-high":   ("en", "en_US", "lessac",   "high", "en_US-lessac-high"),
     "en_US-amy-medium":    ("en", "en_US", "amy",      "medium", "en_US-amy-medium"),
 }
 DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
-DEFAULT_SR = 22050
-DEFAULT_CH = 1
 _http = None
 def http():
@@ -105,34 +102,40 @@ def _file_ok(p: Path, min_bytes: int) -> bool:
     except Exception:
         return False
-def ensure_voice(voice_id: str) -> Dict[str, Path]:
-    """Ensure voice .onnx and .onnx.json exist locally with sane sizes. Returns paths."""
     # Aliases
     if voice_id.lower() in ("en-us", "en_us", "english"):
         voice_id = "en_US-libritts-high"
     if voice_id not in VOICE_MAP:
         raise RuntimeError(f"Unknown voice '{voice_id}'. Known: {list(VOICE_MAP)}")
     lang, country, family, quality, base = VOICE_MAP[voice_id]
-    vdir = VOICES_DIR / voice_id
     model = vdir / f"{base}.onnx"
     cfg   = vdir / f"{base}.onnx.json"
     vdir.mkdir(parents=True, exist_ok=True)
-    # Build URLs at pinned/current revision
-    prefix = f"{HF_REPO_BASE}/{HF_REV}/{lang}/{country}/{family}/{quality}/{base}"
     model_url = f"{prefix}.onnx"
     cfg_url   = f"{prefix}.onnx.json"
-    # Download/verify
     if not _file_ok(model, MIN_ONNX_BYTES):
         _download(model_url, model)
     if not _file_ok(cfg, MIN_JSON_BYTES):
         _download(cfg_url, cfg)
-    # Final sanity
     if not _file_ok(model, MIN_ONNX_BYTES):
         sz = model.stat().st_size if model.exists() else 0
         raise RuntimeError(f"Downloaded .onnx too small ({sz} bytes) for '{voice_id}'")
@@ -140,10 +143,14 @@ def ensure_voice(voice_id: str) -> Dict[str, Path]:
         sz = cfg.stat().st_size if cfg.exists() else 0
         raise RuntimeError(f"Downloaded .onnx.json too small ({sz} bytes) for '{voice_id}'")
-    return {"model": model, "config": cfg}
-def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optional[Path] = None,
-                    length_scale: float = 1.08, noise_scale: float = 0.35, noise_w: float = 0.90) -> list:
     vc = ensure_voice(voice_id)
     cmd = [
         PIPER_BIN,
@@ -155,12 +162,12 @@ def build_piper_cmd(text: str, voice_id: str, to_stdout: bool, out_path: Optiona
         "--noise_w",      str(noise_w),
     ]
     if to_stdout:
-        # Stream RAW PCM (no WAV header) → simpler, no header parsing bugs.
         cmd += ["--raw", "-f", "-"]
     else:
         if out_path is None:
             raise ValueError("out_path required when to_stdout=False")
-        # When writing to file, Piper writes WAV by default.
         cmd += ["-f", str(out_path)]
     return cmd
@@ -170,8 +177,7 @@ async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
-    # Send a final newline so Piper treats it as a complete utterance.
-    proc.stdin.write((text + "\n").encode("utf-8"))
     await proc.stdin.drain()
     proc.stdin.close()
     await proc.wait()
@@ -179,22 +185,20 @@ async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_
         stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
         raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
-async def piper_stream_stdout(text, voice, ws: WebSocket, length_scale, noise_scale, noise_w):
-    """Stream RAW PCM from Piper stdout over WS (no WAV header), with stderr logs."""
-    await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CH}))
     cmd = build_piper_cmd(text, voice, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
-    # Feed text + newline then close stdin
     proc.stdin.write((text + "\n").encode("utf-8"))
     await proc.stdin.drain()
     proc.stdin.close()
-    # Forward stderr lines to client (debug visibility)
     async def pump_stderr():
         try:
             while True:
@@ -209,15 +213,14 @@ async def piper_stream_stdout(text, voice, ws: WebSocket, length_scale, noise_sc
             pass
     stderr_task = asyncio.create_task(pump_stderr())
-    total_bytes = 0
     try:
-        # RAW PCM passthrough
         while True:
             chunk = await proc.stdout.read(4096)
             if not chunk:
                 break
-            total_bytes += len(chunk)
             await ws.send_bytes(chunk)
         await proc.wait()
@@ -226,9 +229,9 @@ async def piper_stream_stdout(text, voice, ws: WebSocket, length_scale, noise_sc
         if proc.returncode != 0:
             rem = await proc.stderr.read()
             detail = rem.decode("utf-8", "ignore").strip()
-            await ws.send_text(json.dumps({"event": "error", "detail": detail or f'piper exited {proc.returncode}'}))
         else:
-            if total_bytes == 0:
                 await ws.send_text(json.dumps({"event": "error", "detail": "No audio produced"}))
             else:
                 await ws.send_text(json.dumps({"event": "done"}))
@@ -242,9 +245,9 @@ async def piper_stream_stdout(text, voice, ws: WebSocket, length_scale, noise_sc
         except Exception:
             pass
-# ---------------
-# FastAPI wiring
-# ---------------
 app = FastAPI(title="ActualTTS (CPU)")
@@ -256,11 +259,10 @@ def health():
             if not child.is_dir():
                 continue
             name = child.name
-            # Only list voices that have sane-sized files
             base = child / f"{name}.onnx"
             cfg  = child / f"{name}.onnx.json"
             if _file_ok(base, MIN_ONNX_BYTES) and _file_ok(cfg, MIN_JSON_BYTES):
-                voices.append(name)
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
@@ -284,9 +286,10 @@ def get_file(name: str):
 @app.post("/speak")
 async def speak(request: Request):
     """
-    Body (JSON):
-    {
-      "text": "Hello",
     """
     try:
         body = await request.json()
@@ -297,7 +300,7 @@ async def speak(request: Request):
     if not text:
         return JSONResponse({"detail": "Missing text"}, status_code=400)
-    voice = (body.get("voice") or DEFAULT_VOICE).strip()
     length_scale = float(body.get("length_scale", 1.08))
     noise_scale  = float(body.get("noise_scale", 0.35))
     noise_w      = float(body.get("noise_w", 0.90))
@@ -313,7 +316,6 @@ async def speak(request: Request):
     return {"ok": True, "audio_url": f"/file/{out_path.name}"}
-# --- Direct-file endpoints (audio/wav response) ---
 @app.post("/speak.wav")
 async def speak_wav_post(request: Request, background_tasks: BackgroundTasks):
     """POST JSON -> returns audio/wav directly"""
@@ -350,7 +352,7 @@ async def speak_wav_get(
     length_scale: float = 1.08,
     noise_scale: float = 0.35,
     noise_w: float = 0.90,
-    background_tasks: BackgroundTasks = ...,
 ):
     """GET query -> returns audio/wav directly"""
     text = (text or "").strip()
@@ -366,10 +368,10 @@ async def speak_wav_get(
     except Exception as e:
         return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
-    background_tasks.add_task(_safe_unlink, out_path)
-    return FileResponse(out_path, media_type="audio/wav", filename=out_path.name, background=background_tasks)
-# --- Diagnostics: inspect/refresh downloaded voices ---
 @app.get("/debug/voices")
 def debug_voices(redownload: bool = Query(False, description="Force re-download bad/missing files")):
     out = {"dir": str(VOICES_DIR), "voices": []}
@@ -383,6 +385,7 @@ def debug_voices(redownload: bool = Query(False, description="Force re-download
             "model_exists": model.exists(), "cfg_exists": cfg.exists(),
             "model_size": (model.stat().st_size if model.exists() else 0),
             "cfg_size": (cfg.stat().st_size if cfg.exists() else 0),
         }
         out["voices"].append(info)
@@ -395,6 +398,7 @@ def debug_voices(redownload: bool = Query(False, description="Force re-download
                 info["redownloaded"] = True
                 info["model_size"] = (model.stat().st_size if model.exists() else 0)
                 info["cfg_size"] = (cfg.stat().st_size if cfg.exists() else 0)
             except Exception as e:
                 info["redownload_error"] = str(e)
     return out
@@ -404,6 +408,7 @@ async def ws_tts(ws: WebSocket):
     await ws.accept()
     voice = DEFAULT_VOICE
     length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
     try:
         while True:
@@ -419,18 +424,20 @@ async def ws_tts(ws: WebSocket):
                 if "noise_scale"  in data: noise_scale  = float(data["noise_scale"])
                 if "noise_w"      in data: noise_w      = float(data["noise_w"])
                 try:
-                    ensure_voice(voice)
                 except Exception as e:
                     await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
                     await ws.close()
                     return
-                await ws.send_text(json.dumps({"event": "ready", "sr": DEFAULT_SR, "channels": DEFAULT_CH}))
             elif ev == "speak":
                 text = (data.get("text") or "").strip()
                 if not text:
                     await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
                     continue
-                await piper_stream_stdout(text, voice, ws, length_scale, noise_scale, noise_w)
             # ignore other events
     except WebSocketDisconnect:
         return

 import os
 import time
 from pathlib import Path
+from typing import Dict, Optional, Tuple
 import uvicorn
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, BackgroundTasks, Query
 from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse
 # -------------------------
+# Writable directories
 # -------------------------
 def pick_writable_dir(candidates):
             errs.append(f"{p}: {type(e).__name__}({e})")
     raise RuntimeError("No writable dir. Tried:\n  " + "\n  ".join(errs))
 ENV_DIR = os.getenv("TTS_DATA_DIR")
 VOICE_CANDIDATES = [
     Path("/tmp/actualtts/voices"),
 PIPER_BIN = os.getenv("PIPER_BIN", "piper")
 HF_REPO_BASE = "https://huggingface.co/rhasspy/piper-voices/resolve"
+HF_REV       = os.getenv("PIPER_VOICES_REV", "main")  # set a commit hash here if you want pinning
+# sanity thresholds (bytes) to detect corrupt downloads
+MIN_ONNX_BYTES = int(os.getenv("MIN_ONNX_BYTES", "5000000"))  # >= ~5MB (real models are 10s–100s MB)
 MIN_JSON_BYTES = int(os.getenv("MIN_JSON_BYTES", "1000"))     # >= 1KB
+# (lang, country, family, quality, basename)
 VOICE_MAP: Dict[str, Tuple[str, str, str, str, str]] = {
+    "en_US-libritts-high": ("en", "en_US", "libritts", "high",   "en_US-libritts-high"),
+    "en_US-lessac-high":   ("en", "en_US", "lessac",   "high",   "en_US-lessac-high"),
     "en_US-amy-medium":    ("en", "en_US", "amy",      "medium", "en_US-amy-medium"),
 }
 DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
+DEFAULT_CH    = 1  # mono
 _http = None
 def http():
     except Exception:
         return False
def _read_sr_from_cfg(cfg_path: Path) -> int:
    """Return the sample rate (Hz) declared in a Piper voice config file.

    Piper voice configs (``*.onnx.json``) keep the rate under the nested
    ``audio.sample_rate`` key, so that location is consulted first. A
    top-level ``sample_rate`` key is still honoured as a fallback so configs
    written in that flatter shape keep working.

    Args:
        cfg_path: Path to the voice's ``.onnx.json`` config file.

    Returns:
        The declared sample rate when it is one of the supported values
        (16000, 22050, 24000, 44100, 48000); otherwise 22050. This function
        never raises: an unreadable, malformed or unexpected config also
        yields 22050.
    """
    fallback = 22050
    supported = (16000, 22050, 24000, 44100, 48000)
    try:
        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        audio = cfg.get("audio")
        raw = audio.get("sample_rate") if isinstance(audio, dict) else None
        if raw is None:
            # Flat layout fallback: {"sample_rate": ...} at the top level.
            raw = cfg.get("sample_rate", fallback)
        sr = int(raw)
    except Exception:
        # Deliberately best-effort: this helper feeds status/diagnostic
        # output, so a bad config must degrade to the default rate rather
        # than propagate an error.
        return fallback
    return sr if sr in supported else fallback
+def ensure_voice(voice_id: str) -> Dict[str, Path | int]:
+    """Ensure voice .onnx and .onnx.json exist locally with sane sizes. Returns paths and SR."""
     # Aliases
     if voice_id.lower() in ("en-us", "en_us", "english"):
         voice_id = "en_US-libritts-high"
     if voice_id not in VOICE_MAP:
         raise RuntimeError(f"Unknown voice '{voice_id}'. Known: {list(VOICE_MAP)}")
     lang, country, family, quality, base = VOICE_MAP[voice_id]
+    vdir  = VOICES_DIR / voice_id
     model = vdir / f"{base}.onnx"
     cfg   = vdir / f"{base}.onnx.json"
     vdir.mkdir(parents=True, exist_ok=True)
+    prefix    = f"{HF_REPO_BASE}/{HF_REV}/{lang}/{country}/{family}/{quality}/{base}"
     model_url = f"{prefix}.onnx"
     cfg_url   = f"{prefix}.onnx.json"
     if not _file_ok(model, MIN_ONNX_BYTES):
         _download(model_url, model)
     if not _file_ok(cfg, MIN_JSON_BYTES):
         _download(cfg_url, cfg)
     if not _file_ok(model, MIN_ONNX_BYTES):
         sz = model.stat().st_size if model.exists() else 0
         raise RuntimeError(f"Downloaded .onnx too small ({sz} bytes) for '{voice_id}'")
         sz = cfg.stat().st_size if cfg.exists() else 0
         raise RuntimeError(f"Downloaded .onnx.json too small ({sz} bytes) for '{voice_id}'")
+    sr = _read_sr_from_cfg(cfg)
+    return {"model": model, "config": cfg, "sr": sr}
+def build_piper_cmd(
+    text: str, voice_id: str, to_stdout: bool,
+    out_path: Optional[Path] = None,
+    length_scale: float = 1.08, noise_scale: float = 0.35, noise_w: float = 0.90
+) -> list:
     vc = ensure_voice(voice_id)
     cmd = [
         PIPER_BIN,
         "--noise_w",      str(noise_w),
     ]
     if to_stdout:
+        # Stream RAW PCM (16-bit little-endian). Simpler to play on clients.
         cmd += ["--raw", "-f", "-"]
     else:
         if out_path is None:
             raise ValueError("out_path required when to_stdout=False")
+        # File output: Piper writes WAV by default.
         cmd += ["-f", str(out_path)]
     return cmd
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
+    proc.stdin.write((text + "\n").encode("utf-8"))  # newline to terminate
     await proc.stdin.drain()
     proc.stdin.close()
     await proc.wait()
         stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
         raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
+async def piper_stream_raw(text, voice, ws: WebSocket, length_scale, noise_scale, noise_w, sr: int):
+    """Stream RAW PCM frames over WS; send stderr as 'log' events; signal 'done'."""
+    # We already announced 'ready' with the correct sr in the init step.
     cmd = build_piper_cmd(text, voice, to_stdout=True,
                           length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
     proc = await asyncio.create_subprocess_exec(
         *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
+    # Send text and close stdin
     proc.stdin.write((text + "\n").encode("utf-8"))
     await proc.stdin.drain()
     proc.stdin.close()
     async def pump_stderr():
         try:
             while True:
             pass
     stderr_task = asyncio.create_task(pump_stderr())
+    total = 0
     try:
         while True:
             chunk = await proc.stdout.read(4096)
             if not chunk:
                 break
+            total += len(chunk)
             await ws.send_bytes(chunk)
         await proc.wait()
         if proc.returncode != 0:
             rem = await proc.stderr.read()
             detail = rem.decode("utf-8", "ignore").strip()
+            await ws.send_text(json.dumps({"event": "error", "detail": detail or f"piper exited {proc.returncode}"}))
         else:
+            if total == 0:
                 await ws.send_text(json.dumps({"event": "error", "detail": "No audio produced"}))
             else:
                 await ws.send_text(json.dumps({"event": "done"}))
         except Exception:
             pass
+# -------------------------
+# FastAPI app & routes
+# -------------------------
 app = FastAPI(title="ActualTTS (CPU)")
             if not child.is_dir():
                 continue
             name = child.name
             base = child / f"{name}.onnx"
             cfg  = child / f"{name}.onnx.json"
             if _file_ok(base, MIN_ONNX_BYTES) and _file_ok(cfg, MIN_JSON_BYTES):
+                voices.append({"id": name, "sr": _read_sr_from_cfg(cfg)})
     return {
         "ok": True,
         "engine": "piper-tts (CLI, CPU)",
 @app.post("/speak")
 async def speak(request: Request):
     """
+    POST JSON:
+      { "text": "Hello", "voice": "en_US-libritts-high",
+        "length_scale": 1.08, "noise_scale": 0.35, "noise_w": 0.90 }
+    Returns: { "ok": true, "audio_url": "/file/tts-XXXX.wav" }
     """
     try:
         body = await request.json()
     if not text:
         return JSONResponse({"detail": "Missing text"}, status_code=400)
+    voice        = (body.get("voice") or DEFAULT_VOICE).strip()
     length_scale = float(body.get("length_scale", 1.08))
     noise_scale  = float(body.get("noise_scale", 0.35))
     noise_w      = float(body.get("noise_w", 0.90))
     return {"ok": True, "audio_url": f"/file/{out_path.name}"}
 @app.post("/speak.wav")
 async def speak_wav_post(request: Request, background_tasks: BackgroundTasks):
     """POST JSON -> returns audio/wav directly"""
     length_scale: float = 1.08,
     noise_scale: float = 0.35,
     noise_w: float = 0.90,
+    background_tasks: BackgroundTasks = None,
 ):
     """GET query -> returns audio/wav directly"""
     text = (text or "").strip()
     except Exception as e:
         return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
+    if background_tasks:
+        background_tasks.add_task(_safe_unlink, out_path)
+    return FileResponse(out_path, media_type="audio/wav", filename=out_path.name)
 @app.get("/debug/voices")
 def debug_voices(redownload: bool = Query(False, description="Force re-download bad/missing files")):
     out = {"dir": str(VOICES_DIR), "voices": []}
             "model_exists": model.exists(), "cfg_exists": cfg.exists(),
             "model_size": (model.stat().st_size if model.exists() else 0),
             "cfg_size": (cfg.stat().st_size if cfg.exists() else 0),
+            "sr": _read_sr_from_cfg(cfg) if cfg.exists() else None,
         }
         out["voices"].append(info)
                 info["redownloaded"] = True
                 info["model_size"] = (model.stat().st_size if model.exists() else 0)
                 info["cfg_size"] = (cfg.stat().st_size if cfg.exists() else 0)
+                info["sr"] = _read_sr_from_cfg(cfg) if cfg.exists() else None
             except Exception as e:
                 info["redownload_error"] = str(e)
     return out
     await ws.accept()
     voice = DEFAULT_VOICE
     length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
+    voice_sr = 22050  # will be overwritten by ensure_voice
     try:
         while True:
                 if "noise_scale"  in data: noise_scale  = float(data["noise_scale"])
                 if "noise_w"      in data: noise_w      = float(data["noise_w"])
                 try:
+                    info = ensure_voice(voice)
+                    voice_sr = int(info.get("sr", 22050))
                 except Exception as e:
                     await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
                     await ws.close()
                     return
+                # announce the correct SR so the client opens the audio device properly
+                await ws.send_text(json.dumps({"event": "ready", "sr": voice_sr, "channels": DEFAULT_CH}))
             elif ev == "speak":
                 text = (data.get("text") or "").strip()
                 if not text:
                     await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
                     continue
+                await piper_stream_raw(text, voice, ws, length_scale, noise_scale, noise_w, sr=voice_sr)
             # ignore other events
     except WebSocketDisconnect:
         return