Update app.py
Browse files
app.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
| 2 |
import asyncio
|
| 3 |
import json
|
| 4 |
import os
|
|
|
|
|
|
|
| 5 |
import time
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Dict, Optional, Tuple
|
|
@@ -51,16 +53,30 @@ def _safe_unlink(path: Path):
|
|
| 51 |
pass
|
| 52 |
|
| 53 |
# -------------------------
|
| 54 |
-
# Piper
|
| 55 |
# -------------------------
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
HF_REPO_BASE = "https://huggingface.co/rhasspy/piper-voices/resolve"
|
| 60 |
-
HF_REV = os.getenv("PIPER_VOICES_REV", "main") #
|
| 61 |
|
| 62 |
-
# sanity thresholds (bytes)
|
| 63 |
-
MIN_ONNX_BYTES = int(os.getenv("MIN_ONNX_BYTES", "5000000")) # >= ~5MB (real models are
|
| 64 |
MIN_JSON_BYTES = int(os.getenv("MIN_JSON_BYTES", "1000")) # >= 1KB
|
| 65 |
|
| 66 |
# (lang, country, family, quality, basename)
|
|
@@ -103,16 +119,16 @@ def _file_ok(p: Path, min_bytes: int) -> bool:
|
|
| 103 |
return False
|
| 104 |
|
| 105 |
def _read_sr_from_cfg(cfg_path: Path) -> int:
|
| 106 |
-
import json as _json
|
| 107 |
try:
|
| 108 |
with open(cfg_path, "r", encoding="utf-8") as f:
|
| 109 |
-
j =
|
| 110 |
sr = int(j.get("sample_rate", 22050))
|
|
|
|
| 111 |
return sr if sr in (16000, 22050, 24000, 44100, 48000) else 22050
|
| 112 |
except Exception:
|
| 113 |
return 22050
|
| 114 |
|
| 115 |
-
def ensure_voice(voice_id: str) -> Dict[str,
|
| 116 |
"""Ensure voice .onnx and .onnx.json exist locally with sane sizes. Returns paths and SR."""
|
| 117 |
# Aliases
|
| 118 |
if voice_id.lower() in ("en-us", "en_us", "english"):
|
|
@@ -146,6 +162,10 @@ def ensure_voice(voice_id: str) -> Dict[str, Path | int]:
|
|
| 146 |
sr = _read_sr_from_cfg(cfg)
|
| 147 |
return {"model": model, "config": cfg, "sr": sr}
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
def build_piper_cmd(
|
| 150 |
text: str, voice_id: str, to_stdout: bool,
|
| 151 |
out_path: Optional[Path] = None,
|
|
@@ -153,7 +173,7 @@ def build_piper_cmd(
|
|
| 153 |
) -> list:
|
| 154 |
vc = ensure_voice(voice_id)
|
| 155 |
cmd = [
|
| 156 |
-
|
| 157 |
"-m", str(vc["model"]),
|
| 158 |
"-c", str(vc["config"]),
|
| 159 |
"-q",
|
|
@@ -162,12 +182,12 @@ def build_piper_cmd(
|
|
| 162 |
"--noise_w", str(noise_w),
|
| 163 |
]
|
| 164 |
if to_stdout:
|
| 165 |
-
# Stream RAW PCM (16-bit little-endian)
|
| 166 |
cmd += ["--raw", "-f", "-"]
|
| 167 |
else:
|
| 168 |
if out_path is None:
|
| 169 |
raise ValueError("out_path required when to_stdout=False")
|
| 170 |
-
# File output:
|
| 171 |
cmd += ["-f", str(out_path)]
|
| 172 |
return cmd
|
| 173 |
|
|
@@ -177,7 +197,8 @@ async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_
|
|
| 177 |
proc = await asyncio.create_subprocess_exec(
|
| 178 |
*cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
| 179 |
)
|
| 180 |
-
|
|
|
|
| 181 |
await proc.stdin.drain()
|
| 182 |
proc.stdin.close()
|
| 183 |
await proc.wait()
|
|
@@ -185,9 +206,8 @@ async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_
|
|
| 185 |
stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
|
| 186 |
raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
|
| 187 |
|
| 188 |
-
async def piper_stream_raw(text, voice, ws: WebSocket, length_scale, noise_scale, noise_w
|
| 189 |
"""Stream RAW PCM frames over WS; send stderr as 'log' events; signal 'done'."""
|
| 190 |
-
# We already announced 'ready' with the correct sr in the init step.
|
| 191 |
cmd = build_piper_cmd(text, voice, to_stdout=True,
|
| 192 |
length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
|
| 193 |
proc = await asyncio.create_subprocess_exec(
|
|
@@ -372,6 +392,7 @@ async def speak_wav_get(
|
|
| 372 |
background_tasks.add_task(_safe_unlink, out_path)
|
| 373 |
return FileResponse(out_path, media_type="audio/wav", filename=out_path.name)
|
| 374 |
|
|
|
|
| 375 |
@app.get("/debug/voices")
|
| 376 |
def debug_voices(redownload: bool = Query(False, description="Force re-download bad/missing files")):
|
| 377 |
out = {"dir": str(VOICES_DIR), "voices": []}
|
|
@@ -403,12 +424,13 @@ def debug_voices(redownload: bool = Query(False, description="Force re-download
|
|
| 403 |
info["redownload_error"] = str(e)
|
| 404 |
return out
|
| 405 |
|
|
|
|
| 406 |
@app.websocket("/ws/tts")
|
| 407 |
async def ws_tts(ws: WebSocket):
|
| 408 |
await ws.accept()
|
| 409 |
voice = DEFAULT_VOICE
|
| 410 |
length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
|
| 411 |
-
voice_sr = 22050 # will be
|
| 412 |
|
| 413 |
try:
|
| 414 |
while True:
|
|
@@ -430,15 +452,15 @@ async def ws_tts(ws: WebSocket):
|
|
| 430 |
await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
|
| 431 |
await ws.close()
|
| 432 |
return
|
| 433 |
-
#
|
| 434 |
await ws.send_text(json.dumps({"event": "ready", "sr": voice_sr, "channels": DEFAULT_CH}))
|
| 435 |
elif ev == "speak":
|
| 436 |
text = (data.get("text") or "").strip()
|
| 437 |
if not text:
|
| 438 |
await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
|
| 439 |
continue
|
| 440 |
-
await piper_stream_raw(text, voice, ws, length_scale, noise_scale, noise_w
|
| 441 |
-
# ignore
|
| 442 |
except WebSocketDisconnect:
|
| 443 |
return
|
| 444 |
except Exception as e:
|
|
|
|
| 2 |
import asyncio
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
+
import sys
|
| 6 |
+
import shutil
|
| 7 |
import time
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Dict, Optional, Tuple
|
|
|
|
| 53 |
pass
|
| 54 |
|
| 55 |
# -------------------------
|
| 56 |
+
# Piper command resolution
|
| 57 |
# -------------------------
|
| 58 |
|
| 59 |
+
def resolve_piper_cmd():
    """Return the argv prefix (a list of strings) used to launch Piper.

    Resolution order:
      1. The ``PIPER_BIN`` environment variable, parsed with shell-style
         quoting so a path containing spaces can be given in quotes,
         e.g. ``PIPER_BIN='"/opt/my tools/piper" --cuda'``.
      2. A ``piper`` executable found on PATH.
      3. ``<current python> -m piper`` as a last resort.

    A blank or whitespace-only ``PIPER_BIN`` is treated as unset so the
    returned list is never empty.
    """
    import shlex  # local import: only needed here, keeps the block self-contained

    env = (os.getenv("PIPER_BIN") or "").strip()
    if env:
        posix = os.name != "nt"
        try:
            # POSIX mode honours quotes/escapes; on Windows keep backslashes
            # literal so paths like C:\piper\piper.exe survive intact.
            parts = shlex.split(env, posix=posix)
        except ValueError:
            # Unbalanced quotes: fall back to plain whitespace splitting
            # rather than failing at import time.
            parts = env.split()
        if not posix:
            # Non-POSIX shlex keeps the surrounding quotes on each token.
            parts = [
                p[1:-1] if len(p) >= 2 and p[0] == p[-1] and p[0] in "\"'" else p
                for p in parts
            ]
        parts = [p for p in parts if p]
        if parts:
            return parts

    path = shutil.which("piper")
    if path:
        return [path]

    # Fallback to the module runner if the console script isn't on PATH.
    return [sys.executable, "-m", "piper"]
|
| 68 |
+
|
| 69 |
+
PIPER_CMD = resolve_piper_cmd()
|
| 70 |
+
|
| 71 |
+
# -------------------------
|
| 72 |
+
# Voice download & checks
|
| 73 |
+
# -------------------------
|
| 74 |
|
| 75 |
HF_REPO_BASE = "https://huggingface.co/rhasspy/piper-voices/resolve"
|
| 76 |
+
HF_REV = os.getenv("PIPER_VOICES_REV", "main") # optionally pin a commit hash
|
| 77 |
|
| 78 |
+
# sanity thresholds (bytes)
|
| 79 |
+
MIN_ONNX_BYTES = int(os.getenv("MIN_ONNX_BYTES", "5000000")) # >= ~5MB (real models are much larger)
|
| 80 |
MIN_JSON_BYTES = int(os.getenv("MIN_JSON_BYTES", "1000")) # >= 1KB
|
| 81 |
|
| 82 |
# (lang, country, family, quality, basename)
|
|
|
|
| 119 |
return False
|
| 120 |
|
| 121 |
def _read_sr_from_cfg(cfg_path: Path) -> int:
|
|
|
|
| 122 |
try:
|
| 123 |
with open(cfg_path, "r", encoding="utf-8") as f:
|
| 124 |
+
j = json.load(f)
|
| 125 |
sr = int(j.get("sample_rate", 22050))
|
| 126 |
+
# keep it reasonable
|
| 127 |
return sr if sr in (16000, 22050, 24000, 44100, 48000) else 22050
|
| 128 |
except Exception:
|
| 129 |
return 22050
|
| 130 |
|
| 131 |
+
def ensure_voice(voice_id: str) -> Dict[str, object]:
|
| 132 |
"""Ensure voice .onnx and .onnx.json exist locally with sane sizes. Returns paths and SR."""
|
| 133 |
# Aliases
|
| 134 |
if voice_id.lower() in ("en-us", "en_us", "english"):
|
|
|
|
| 162 |
sr = _read_sr_from_cfg(cfg)
|
| 163 |
return {"model": model, "config": cfg, "sr": sr}
|
| 164 |
|
| 165 |
+
# -------------------------
|
| 166 |
+
# Piper exec helpers
|
| 167 |
+
# -------------------------
|
| 168 |
+
|
| 169 |
def build_piper_cmd(
|
| 170 |
text: str, voice_id: str, to_stdout: bool,
|
| 171 |
out_path: Optional[Path] = None,
|
|
|
|
| 173 |
) -> list:
|
| 174 |
vc = ensure_voice(voice_id)
|
| 175 |
cmd = [
|
| 176 |
+
*PIPER_CMD,
|
| 177 |
"-m", str(vc["model"]),
|
| 178 |
"-c", str(vc["config"]),
|
| 179 |
"-q",
|
|
|
|
| 182 |
"--noise_w", str(noise_w),
|
| 183 |
]
|
| 184 |
if to_stdout:
|
| 185 |
+
# Stream RAW PCM (16-bit little-endian)
|
| 186 |
cmd += ["--raw", "-f", "-"]
|
| 187 |
else:
|
| 188 |
if out_path is None:
|
| 189 |
raise ValueError("out_path required when to_stdout=False")
|
| 190 |
+
# File output: piper writes a WAV
|
| 191 |
cmd += ["-f", str(out_path)]
|
| 192 |
return cmd
|
| 193 |
|
|
|
|
| 197 |
proc = await asyncio.create_subprocess_exec(
|
| 198 |
*cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
| 199 |
)
|
| 200 |
+
# terminate the utterance with newline
|
| 201 |
+
proc.stdin.write((text + "\n").encode("utf-8"))
|
| 202 |
await proc.stdin.drain()
|
| 203 |
proc.stdin.close()
|
| 204 |
await proc.wait()
|
|
|
|
| 206 |
stderr = (await proc.stderr.read()).decode("utf-8", "ignore")
|
| 207 |
raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
|
| 208 |
|
| 209 |
+
async def piper_stream_raw(text, voice, ws: WebSocket, length_scale, noise_scale, noise_w):
|
| 210 |
"""Stream RAW PCM frames over WS; send stderr as 'log' events; signal 'done'."""
|
|
|
|
| 211 |
cmd = build_piper_cmd(text, voice, to_stdout=True,
|
| 212 |
length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
|
| 213 |
proc = await asyncio.create_subprocess_exec(
|
|
|
|
| 392 |
background_tasks.add_task(_safe_unlink, out_path)
|
| 393 |
return FileResponse(out_path, media_type="audio/wav", filename=out_path.name)
|
| 394 |
|
| 395 |
+
# --- Diagnostics: inspect/refresh downloaded voices ---
|
| 396 |
@app.get("/debug/voices")
|
| 397 |
def debug_voices(redownload: bool = Query(False, description="Force re-download bad/missing files")):
|
| 398 |
out = {"dir": str(VOICES_DIR), "voices": []}
|
|
|
|
| 424 |
info["redownload_error"] = str(e)
|
| 425 |
return out
|
| 426 |
|
| 427 |
+
# --- Live streaming WS ---
|
| 428 |
@app.websocket("/ws/tts")
|
| 429 |
async def ws_tts(ws: WebSocket):
|
| 430 |
await ws.accept()
|
| 431 |
voice = DEFAULT_VOICE
|
| 432 |
length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
|
| 433 |
+
voice_sr = 22050 # will be set from config on init
|
| 434 |
|
| 435 |
try:
|
| 436 |
while True:
|
|
|
|
| 452 |
await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
|
| 453 |
await ws.close()
|
| 454 |
return
|
| 455 |
+
# Announce the correct SR so the client opens the device properly
|
| 456 |
await ws.send_text(json.dumps({"event": "ready", "sr": voice_sr, "channels": DEFAULT_CH}))
|
| 457 |
elif ev == "speak":
|
| 458 |
text = (data.get("text") or "").strip()
|
| 459 |
if not text:
|
| 460 |
await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
|
| 461 |
continue
|
| 462 |
+
await piper_stream_raw(text, voice, ws, length_scale, noise_scale, noise_w)
|
| 463 |
+
# ignore others
|
| 464 |
except WebSocketDisconnect:
|
| 465 |
return
|
| 466 |
except Exception as e:
|