|
|
import asyncio |
|
|
import json |
|
|
import os |
|
|
import sys |
|
|
import shutil |
|
|
import time |
|
|
from pathlib import Path |
|
|
from typing import Dict, Optional, Tuple |
|
|
|
|
|
import uvicorn |
|
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, BackgroundTasks, Query, Header |
|
|
from fastapi.responses import JSONResponse, FileResponse, PlainTextResponse |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pick_writable_dir(candidates):
    """Return the first candidate directory that can be created and written.

    Falsy candidates are skipped. Writability is proven by creating the
    directory and round-tripping a tiny probe file. Raises RuntimeError
    listing every failure when no candidate is usable.
    """
    failures = []
    for candidate in candidates:
        if not candidate:
            continue
        try:
            candidate.mkdir(parents=True, exist_ok=True)
            probe = candidate / ".probe"
            probe.write_bytes(b"ok")
            probe.unlink(missing_ok=True)
        except Exception as exc:
            failures.append(f"{candidate}: {type(exc).__name__}({exc})")
        else:
            return candidate
    raise RuntimeError("No writable dir. Tried:\n " + "\n ".join(failures))
|
|
|
|
|
# Data directories. Ephemeral locations (/tmp, /dev/shm) are tried first;
# an operator-supplied TTS_DATA_DIR is the last fallback.
ENV_DIR = os.getenv("TTS_DATA_DIR")
VOICE_CANDIDATES = [
    Path("/tmp/actualtts/voices"),
    Path("/dev/shm/actualtts_voices"),
    Path(ENV_DIR) / "voices" if ENV_DIR else None,
]
FILE_CANDIDATES = [
    Path("/tmp/actualtts/files"),
    Path("/dev/shm/actualtts_files"),
    Path(ENV_DIR) / "files" if ENV_DIR else None,
]

# First writable candidate wins; import fails loudly if none is usable.
VOICES_DIR = pick_writable_dir([p for p in VOICE_CANDIDATES if p])
FILES_DIR = pick_writable_dir([p for p in FILE_CANDIDATES if p])
|
|
|
|
|
def _safe_unlink(path: Path): |
|
|
try: |
|
|
path.unlink(missing_ok=True) |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resolve_piper_cmd():
    """Locate the piper CLI.

    Resolution order: the PIPER_BIN env override (may carry extra args,
    e.g. "piper --flag"), then a `piper` executable on PATH, and finally
    the module form `python -m piper`.
    """
    override = os.getenv("PIPER_BIN")
    if override:
        return override.split()
    found = shutil.which("piper")
    return [found] if found else [sys.executable, "-m", "piper"]
|
|
|
|
|
# Resolved once at import; every synthesis uses the same argv prefix.
PIPER_CMD = resolve_piper_cmd()

# Pause inserted between sentences, in seconds (piper --sentence-silence).
SENTENCE_SILENCE = float(os.getenv("PIPER_SENTENCE_SILENCE", "0.05"))

# Duration of each paced websocket audio batch, in milliseconds of audio.
STREAM_BATCH_MS = int(os.getenv("STREAM_BATCH_MS", "100"))

# Channel count for streamed PCM (piper output is mono).
DEFAULT_CH = 1

# Audio accumulated before paced streaming starts, plus a cap on how long
# to wait for that buffer to fill once the first audio byte arrives.
PREBUFFER_MS = int(os.getenv("PREBUFFER_MS", "6000"))
PREBUFFER_MAX_WAIT_MS = int(os.getenv("PREBUFFER_MAX_WAIT_MS", "15000"))

# Upper bound on request text length, in characters.
MAX_TEXT_CHARS = int(os.getenv("MAX_TEXT_CHARS", "800"))
|
|
|
|
|
|
|
|
AUTH_SHARED_SECRET = (os.getenv("AUTH_SHARED_SECRET") or "").strip() |
|
|
|
|
|
def _auth_ok(x_auth: Optional[str]) -> bool: |
|
|
return (not AUTH_SHARED_SECRET) or (x_auth == AUTH_SHARED_SECRET) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Voice assets are fetched from the rhasspy/piper-voices Hugging Face repo.
HF_REPO_BASE = "https://huggingface.co/rhasspy/piper-voices/resolve"
HF_REV = os.getenv("PIPER_VOICES_REV", "main")

# Size sanity thresholds: a downloaded .onnx smaller than ~5 MB (or a config
# under 1 KB) is almost certainly an error page, not a real asset.
MIN_ONNX_BYTES = int(os.getenv("MIN_ONNX_BYTES", "5000000"))
MIN_JSON_BYTES = int(os.getenv("MIN_JSON_BYTES", "1000"))

# voice id -> (lang, locale, family, quality, file basename) path segments
# under HF_REPO_BASE/HF_REV on the voices repo.
VOICE_MAP: Dict[str, Tuple[str, str, str, str, str]] = {
    "en_US-libritts-high": ("en", "en_US", "libritts", "high", "en_US-libritts-high"),
    "en_US-lessac-high": ("en", "en_US", "lessac", "high", "en_US-lessac-high"),
    "en_US-amy-medium": ("en", "en_US", "amy", "medium", "en_US-amy-medium"),
}

DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "en_US-libritts-high")
|
|
|
|
|
_http = None  # process-wide requests.Session, created lazily


def http():
    """Return the shared HTTP session, constructing it on first use.

    The `requests` import is deferred so the server can start even when
    the package is not yet needed.
    """
    global _http
    if _http is not None:
        return _http
    import requests
    session = requests.Session()
    session.headers.update({"User-Agent": "ActualTTS/CPU"})
    _http = session
    return _http
|
|
|
|
|
def _download(url: str, dest: Path, timeout: int = 300):
    """Stream *url* into *dest* atomically (write to .part, then rename).

    A text/html response is rejected up front — it indicates an error page
    rather than the requested binary asset.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    headers = {"Accept": "application/octet-stream"}
    with http().get(url, timeout=timeout, stream=True, headers=headers) as resp:
        resp.raise_for_status()
        ct = (resp.headers.get("content-type") or "").lower()
        if "text/html" in ct:
            raise RuntimeError(f"Bad content-type for {url}: {ct}")
        partial = dest.with_suffix(dest.suffix + ".part")
        with open(partial, "wb") as out:
            for block in resp.iter_content(1 << 16):
                if block:
                    out.write(block)
        # Rename only after a complete write, so dest is never truncated.
        partial.replace(dest)
|
|
|
|
|
def _file_ok(p: Path, min_bytes: int) -> bool: |
|
|
try: |
|
|
return p.exists() and p.stat().st_size >= min_bytes |
|
|
except Exception: |
|
|
return False |
|
|
|
|
|
def _read_sr_from_cfg(cfg_path: Path) -> int: |
|
|
try: |
|
|
with open(cfg_path, "r", encoding="utf-8") as f: |
|
|
j = json.load(f) |
|
|
sr = int(j.get("sample_rate", 22050)) |
|
|
|
|
|
return sr if sr in (16000, 22050, 24000, 44100, 48000) else 22050 |
|
|
except Exception: |
|
|
return 22050 |
|
|
|
|
|
def ensure_voice(voice_id: str) -> Dict[str, object]:
    """Ensure voice .onnx and .onnx.json exist locally with sane sizes. Returns paths and SR."""
    # Map friendly aliases onto the canonical default voice id.
    if voice_id.lower() in ("en-us", "en_us", "english"):
        voice_id = "en_US-libritts-high"
    if voice_id not in VOICE_MAP:
        raise RuntimeError(f"Unknown voice '{voice_id}'. Known: {list(VOICE_MAP)}")

    lang, country, family, quality, base = VOICE_MAP[voice_id]
    voice_dir = VOICES_DIR / voice_id
    voice_dir.mkdir(parents=True, exist_ok=True)
    model_path = voice_dir / f"{base}.onnx"
    cfg_path = voice_dir / f"{base}.onnx.json"

    url_prefix = f"{HF_REPO_BASE}/{HF_REV}/{lang}/{country}/{family}/{quality}/{base}"

    # Fetch whichever asset is missing or suspiciously small.
    if not _file_ok(model_path, MIN_ONNX_BYTES):
        _download(f"{url_prefix}.onnx", model_path)
    if not _file_ok(cfg_path, MIN_JSON_BYTES):
        _download(f"{url_prefix}.onnx.json", cfg_path)

    # Re-validate after download: a tiny file means we received an error
    # page instead of the real asset.
    if not _file_ok(model_path, MIN_ONNX_BYTES):
        sz = model_path.stat().st_size if model_path.exists() else 0
        raise RuntimeError(f"Downloaded .onnx too small ({sz} bytes) for '{voice_id}'")
    if not _file_ok(cfg_path, MIN_JSON_BYTES):
        sz = cfg_path.stat().st_size if cfg_path.exists() else 0
        raise RuntimeError(f"Downloaded .onnx.json too small ({sz} bytes) for '{voice_id}'")

    return {"model": model_path, "config": cfg_path, "sr": _read_sr_from_cfg(cfg_path)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_piper_cmd(
    text: str, voice_id: str, to_stdout: bool,
    out_path: Optional[Path] = None,
    length_scale: float = 1.08, noise_scale: float = 0.35, noise_w: float = 0.90
) -> list:
    """Assemble the piper argv for either raw-stdout or WAV-file output.

    Provisions the voice (download if needed) as a side effect. Raises
    ValueError when to_stdout is False and no out_path was supplied.
    """
    voice = ensure_voice(voice_id)
    argv = list(PIPER_CMD)
    argv += ["-m", str(voice["model"]), "-c", str(voice["config"])]
    argv += ["--length-scale", str(length_scale)]
    argv += ["--noise-scale", str(noise_scale)]
    argv += ["--noise-w", str(noise_w)]
    argv += ["--sentence-silence", str(SENTENCE_SILENCE)]
    if to_stdout:
        # "-f -" with --output-raw makes piper emit raw PCM on stdout.
        argv += ["-f", "-", "--output-raw"]
        return argv
    if out_path is None:
        raise ValueError("out_path required when to_stdout=False")
    argv += ["-f", str(out_path)]
    return argv
|
|
|
|
|
async def piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w):
    """Run piper once, synthesizing *text* into the WAV file *out_path*.

    Raises RuntimeError (including piper's stderr) on a non-zero exit.
    """
    cmd = build_piper_cmd(text, voice, to_stdout=False, out_path=out_path,
                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # communicate() feeds stdin and drains stdout/stderr concurrently.
    # The previous write/drain/close/wait sequence never read either pipe,
    # so a chatty piper could fill the OS pipe buffer, block, and deadlock
    # proc.wait() forever.
    _stdout_data, stderr_data = await proc.communicate((text + "\n").encode("utf-8"))
    if proc.returncode != 0:
        stderr = stderr_data.decode("utf-8", "ignore")
        raise RuntimeError(f"Piper failed (code {proc.returncode}).\n{stderr}")
|
|
|
|
|
async def piper_stream_raw(
    text: str,
    voice: str,
    ws: WebSocket,
    sr: int,
    channels: int,
    length_scale: float,
    noise_scale: float,
    noise_w: float,
    prebuffer_ms: int,
    prebuffer_max_wait_ms: int,
):
    """
    Synthesize immediately; stream in *batched, clock-paced* frames:
    - Accumulate audio until `prebuffer_ms` (or `prebuffer_max_wait_ms` elapses).
    - Then send fixed batches of STREAM_BATCH_MS at a steady cadence.

    Audio goes out as binary websocket frames of 16-bit PCM; control and
    log messages go out as JSON text frames. On client disconnect the
    piper process is killed.
    """
    cmd = build_piper_cmd(text, voice, to_stdout=True,
                          length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    # Feed the text and close stdin so piper knows input is complete.
    proc.stdin.write((text + "\n").encode("utf-8"))
    await proc.stdin.drain()
    proc.stdin.close()

    async def pump_stderr():
        # Forward piper's stderr lines to the client as "log" events;
        # stop quietly on EOF or once the socket is gone. Draining stderr
        # also prevents piper from blocking on a full pipe buffer.
        try:
            while True:
                line = await proc.stderr.readline()
                if not line:
                    break
                try:
                    await ws.send_text(json.dumps({"event": "log", "stderr": line.decode("utf-8", "ignore").rstrip()}))
                except Exception:
                    break
        except Exception:
            pass

    stderr_task = asyncio.create_task(pump_stderr())
    total = 0  # bytes of PCM delivered to the client

    # 16-bit PCM => 2 bytes per sample per channel.
    bytes_per_ms = max(1, int(sr * channels * 2 / 1000))
    batch_bytes = max(bytes_per_ms, int(STREAM_BATCH_MS * bytes_per_ms))
    target_prebuffer_bytes = max(0, int(prebuffer_ms) * bytes_per_ms)

    buf = bytearray()
    started_streaming = False  # paced sending has begun
    first_audio_ts = None      # wall-clock time of the first PCM byte
    pace_start_t = None        # reference time for the send cadence
    batches_sent = 0

    try:
        while True:
            chunk = await proc.stdout.read(8192)
            if chunk:
                if first_audio_ts is None:
                    first_audio_ts = time.time()
                buf.extend(chunk)

                # Hold back until the prebuffer fills, or the max-wait cap
                # (measured from the first audio byte) has elapsed.
                if not started_streaming:
                    enough = (len(buf) >= target_prebuffer_bytes) if target_prebuffer_bytes > 0 else True
                    waited = False
                    if first_audio_ts is not None and prebuffer_max_wait_ms > 0:
                        waited = ((time.time() - first_audio_ts) * 1000.0) >= prebuffer_max_wait_ms
                    if enough or waited:
                        started_streaming = True
                        pace_start_t = time.time()
                        batches_sent = 0

                # Drain whole batches on a fixed clock: batch N is due at
                # pace_start_t + N * STREAM_BATCH_MS.
                if started_streaming:
                    while len(buf) >= batch_bytes:
                        due_t = pace_start_t + (batches_sent * STREAM_BATCH_MS) / 1000.0
                        sleep_s = due_t - time.time()
                        if sleep_s > 0:
                            await asyncio.sleep(sleep_s)
                        await ws.send_bytes(buf[:batch_bytes])
                        del buf[:batch_bytes]
                        total += batch_bytes
                        batches_sent += 1
                continue

            # EOF from piper. If pacing never started (short utterance that
            # never reached the prebuffer target), start it now.
            if not started_streaming and len(buf) > 0:
                started_streaming = True
                pace_start_t = time.time()
                batches_sent = 0

            # Flush remaining full batches at the same cadence...
            while len(buf) >= batch_bytes:
                due_t = pace_start_t + (batches_sent * STREAM_BATCH_MS) / 1000.0
                sleep_s = due_t - time.time()
                if sleep_s > 0:
                    await asyncio.sleep(sleep_s)
                await ws.send_bytes(buf[:batch_bytes])
                del buf[:batch_bytes]
                total += batch_bytes
                batches_sent += 1

            # ...then the final partial batch goes out unpaced.
            if len(buf) > 0:
                await ws.send_bytes(bytes(buf))
                total += len(buf)
                buf.clear()
            break

        await proc.wait()
        await stderr_task

        if proc.returncode != 0:
            rem = await proc.stderr.read()
            detail = rem.decode("utf-8", "ignore").strip()
            await ws.send_text(json.dumps({"event": "error", "detail": detail or f"piper exited {proc.returncode}"}))
        else:
            if total == 0:
                await ws.send_text(json.dumps({"event": "error", "detail": "No audio produced"}))
            else:
                await ws.send_text(json.dumps({"event": "done"}))
    except WebSocketDisconnect:
        # Client vanished mid-stream: kill piper and reap the log task.
        try:
            proc.kill()
        except Exception:
            pass
        try:
            await stderr_task
        except Exception:
            pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app = FastAPI(title="ActualTTS (CPU)") |
|
|
|
|
|
@app.get("/health")
def health():
    """Report service status, locally available voices, and (best-effort)
    numpy/onnxruntime versions."""
    voices = []
    if VOICES_DIR.exists():
        for entry in VOICES_DIR.iterdir():
            if not entry.is_dir():
                continue
            vid = entry.name
            model = entry / f"{vid}.onnx"
            config = entry / f"{vid}.onnx.json"
            # Only advertise voices whose assets pass the size sanity checks.
            if _file_ok(model, MIN_ONNX_BYTES) and _file_ok(config, MIN_JSON_BYTES):
                voices.append({"id": vid, "sr": _read_sr_from_cfg(config)})

    # Version info is optional; these packages may be absent entirely.
    try:
        import numpy, onnxruntime as ort
        numpy_version = numpy.__version__
        onnxruntime_version = ort.__version__
    except Exception:
        numpy_version = onnxruntime_version = None

    return {
        "ok": True,
        "engine": "piper-tts (CLI, CPU)",
        "default_voice": DEFAULT_VOICE,
        "voice_dir": str(VOICES_DIR),
        "available_voices": voices,
        "files_dir": str(FILES_DIR),
        "numpy": numpy_version,
        "onnxruntime": onnxruntime_version,
    }
|
|
|
|
|
@app.get("/")
def root():
    """Plain-text landing page pointing at the real endpoints."""
    usage = "ActualTTS (CPU) — use POST /speak, GET/POST /speak.wav, or WS /ws/tts"
    return PlainTextResponse(usage)
|
|
|
|
|
@app.post("/provision")
async def provision(request: Request, x_auth: Optional[str] = Header(None)):
    """
    POST JSON: { "voice": "en_US-amy-medium" }
    Downloads voice assets if missing. Returns {ok, voice, sr}.
    """
    if not _auth_ok(x_auth):
        return JSONResponse({"ok": False, "error": "unauthorized"}, status_code=401)

    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"ok": False, "error": "invalid json"}, status_code=400)

    voice = (body.get("voice") or DEFAULT_VOICE).strip()
    try:
        info = ensure_voice(voice)
    except Exception as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)
    return {"ok": True, "voice": voice, "sr": int(info.get("sr", 22050))}
|
|
|
|
|
@app.get("/file/{name}")
def get_file(name: str):
    """Serve a previously generated audio file from FILES_DIR.

    Rejects any name containing path separators or dot components, so a
    crafted request (e.g. encoded "../") cannot read files outside
    FILES_DIR — the original joined the raw name directly, a traversal
    risk. Rejected names get the same 404 as missing files.
    """
    if "/" in name or "\\" in name or name in (".", "..") or Path(name).name != name:
        return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
    path = FILES_DIR / name
    if not path.exists():
        return JSONResponse({"ok": False, "error": "not found"}, status_code=404)
    return FileResponse(path)
|
|
|
|
|
def _validate_text(text: str) -> Optional[str]: |
|
|
if not text: |
|
|
return "Missing text" |
|
|
if len(text) > MAX_TEXT_CHARS: |
|
|
return f"text too long (>{MAX_TEXT_CHARS} chars)" |
|
|
return None |
|
|
|
|
|
@app.post("/speak")
async def speak(request: Request, x_auth: Optional[str] = Header(None)):
    """
    POST JSON:
      { "text": "Hello", "voice": "en_US-libritts-high",
        "length_scale": 1.08, "noise_scale": 0.35, "noise_w": 0.90 }
    Returns: { "ok": true, "audio_url": "/file/tts-XXXX.wav" }
    """
    if not _auth_ok(x_auth):
        return JSONResponse({"ok": False, "error": "unauthorized"}, status_code=401)
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"detail": "Invalid JSON"}, status_code=400)

    text = (body.get("text") or "").strip()
    err = _validate_text(text)
    if err:
        return JSONResponse({"detail": err}, status_code=400)

    voice = (body.get("voice") or DEFAULT_VOICE).strip()
    length_scale = float(body.get("length_scale", 1.08))
    noise_scale = float(body.get("noise_scale", 0.35))
    noise_w = float(body.get("noise_w", 0.90))

    # Nanosecond timestamp: the previous millisecond resolution let two
    # concurrent requests land on the same filename and overwrite each other.
    out_path = FILES_DIR / f"tts-{time.time_ns()}.wav"

    try:
        ensure_voice(voice)
        await piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
    except Exception as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)

    return {"ok": True, "audio_url": f"/file/{out_path.name}"}
|
|
|
|
|
@app.post("/speak.wav")
async def speak_wav_post(request: Request, background_tasks: BackgroundTasks, x_auth: Optional[str] = Header(None)):
    """POST JSON -> returns audio/wav directly; the temp WAV is deleted in a
    background task after the response body has been sent."""
    if not _auth_ok(x_auth):
        return JSONResponse({"ok": False, "error": "unauthorized"}, status_code=401)
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"detail": "Invalid JSON"}, status_code=400)

    text = (body.get("text") or "").strip()
    err = _validate_text(text)
    if err:
        return JSONResponse({"detail": err}, status_code=400)

    voice = (body.get("voice") or DEFAULT_VOICE).strip()
    length_scale = float(body.get("length_scale", 1.08))
    noise_scale = float(body.get("noise_scale", 0.35))
    noise_w = float(body.get("noise_w", 0.90))

    # Nanosecond timestamp: the previous millisecond resolution let two
    # concurrent requests land on the same filename and overwrite each other.
    out_path = FILES_DIR / f"tts-{time.time_ns()}.wav"

    try:
        ensure_voice(voice)
        await piper_to_file(text, voice, out_path, length_scale, noise_scale, noise_w)
    except Exception as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)

    # Clean up the temp file once the response has been streamed out.
    background_tasks.add_task(_safe_unlink, out_path)
    return FileResponse(out_path, media_type="audio/wav", filename=out_path.name, background=background_tasks)
|
|
|
|
|
@app.get("/speak.wav")
async def speak_wav_get(
    text: str,
    voice: str = DEFAULT_VOICE,
    length_scale: float = 1.08,
    noise_scale: float = 0.35,
    noise_w: float = 0.90,
    background_tasks: BackgroundTasks = None,
    x_auth: Optional[str] = Header(None),
):
    """GET query -> returns audio/wav directly; the temp WAV is removed by a
    background task after the response is sent."""
    if not _auth_ok(x_auth):
        return JSONResponse({"ok": False, "error": "unauthorized"}, status_code=401)

    text = (text or "").strip()
    err = _validate_text(text)
    if err:
        return JSONResponse({"detail": err}, status_code=400)

    # Nanosecond timestamp: the previous millisecond resolution let two
    # concurrent requests land on the same filename and overwrite each other.
    out_path = FILES_DIR / f"tts-{time.time_ns()}.wav"

    try:
        ensure_voice(voice.strip())
        await piper_to_file(text, voice.strip(), out_path, float(length_scale), float(noise_scale), float(noise_w))
    except Exception as e:
        return JSONResponse({"ok": False, "error": str(e)}, status_code=500)

    # FastAPI runs injected background tasks after the response completes.
    if background_tasks:
        background_tasks.add_task(_safe_unlink, out_path)
    return FileResponse(out_path, media_type="audio/wav", filename=out_path.name)
|
|
|
|
|
|
|
|
@app.get("/debug/voices")
def debug_voices(redownload: bool = Query(False, description="Force re-download bad/missing files")):
    """Inspect every configured voice's local assets; with ?redownload=true,
    wipe and re-fetch any voice whose files are missing or undersized."""
    out = {"dir": str(VOICES_DIR), "voices": []}
    for vid, (_lang, _country, _family, _quality, base) in VOICE_MAP.items():
        vdir = VOICES_DIR / vid
        model = vdir / f"{base}.onnx"
        cfg = vdir / f"{base}.onnx.json"
        info = {
            "id": vid,
            "model": str(model), "cfg": str(cfg),
            "model_exists": model.exists(), "cfg_exists": cfg.exists(),
            "model_size": (model.stat().st_size if model.exists() else 0),
            "cfg_size": (cfg.stat().st_size if cfg.exists() else 0),
            "sr": _read_sr_from_cfg(cfg) if cfg.exists() else None,
        }
        out["voices"].append(info)

        # Repair path: clear the voice dir, re-provision, then refresh the
        # already-appended info dict in place (dicts are shared by reference).
        if redownload and (not _file_ok(model, MIN_ONNX_BYTES) or not _file_ok(cfg, MIN_JSON_BYTES)):
            try:
                vdir.mkdir(parents=True, exist_ok=True)
                for p in vdir.glob("*"):
                    p.unlink(missing_ok=True)
                ensure_voice(vid)
                info["redownloaded"] = True
                info["model_size"] = (model.stat().st_size if model.exists() else 0)
                info["cfg_size"] = (cfg.stat().st_size if cfg.exists() else 0)
                info["sr"] = _read_sr_from_cfg(cfg) if cfg.exists() else None
            except Exception as e:
                info["redownload_error"] = str(e)
    return out
|
|
|
|
|
|
|
|
@app.websocket("/ws/tts")
async def ws_tts(ws: WebSocket):
    """Websocket TTS protocol.

    The client sends JSON text frames: an "init" event (token, voice,
    prosody/prebuffer settings) followed by one or more "speak" events.
    The server answers with JSON control events ("ready", "log", "error",
    "done") and raw PCM binary frames produced by piper_stream_raw.
    """
    await ws.accept()
    # Per-connection synthesis settings; "init" may override any of them.
    voice = DEFAULT_VOICE
    length_scale, noise_scale, noise_w = 1.08, 0.35, 0.90
    voice_sr = 22050

    prebuffer_ms = PREBUFFER_MS
    prebuffer_max_wait_ms = PREBUFFER_MAX_WAIT_MS

    try:
        while True:
            msg = await ws.receive_text()
            try:
                data = json.loads(msg)
            except Exception:
                # Silently skip frames that are not valid JSON.
                continue
            ev = data.get("event")
            if ev == "init":
                # Optional shared-secret check before honoring any settings.
                token = (data.get("token") or "")
                if AUTH_SHARED_SECRET and token != AUTH_SHARED_SECRET:
                    await ws.send_text(json.dumps({"event": "error", "detail": "unauthorized"}))
                    await ws.close(); return

                voice = (data.get("voice") or voice).strip()

                if "length_scale" in data: length_scale = float(data["length_scale"])
                if "noise_scale" in data: noise_scale = float(data["noise_scale"])
                if "noise_w" in data: noise_w = float(data["noise_w"])

                # Convenience: derive length_scale from a words-per-minute
                # rate when no explicit length_scale was supplied
                # (165 wpm ~= scale 1.0, clamped to [0.70, 1.40]).
                if "length_scale" not in data and "rate_wpm" in data:
                    try:
                        rate_wpm = int(data.get("rate_wpm", 165))
                        length_scale = max(0.70, min(1.40, 165.0 / max(100, rate_wpm)))
                    except Exception:
                        pass

                if "prebuffer_ms" in data:
                    try: prebuffer_ms = max(0, int(data["prebuffer_ms"]))
                    except Exception: pass
                if "prebuffer_max_wait_ms" in data:
                    try: prebuffer_max_wait_ms = max(0, int(data["prebuffer_max_wait_ms"]))
                    except Exception: pass

                # Provision the voice up front so "speak" can stream at once;
                # a provisioning failure terminates the connection.
                try:
                    info = ensure_voice(voice)
                    voice_sr = int(info.get("sr", 22050))
                except Exception as e:
                    await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
                    await ws.close()
                    return

                await ws.send_text(json.dumps({"event": "ready", "sr": voice_sr, "channels": DEFAULT_CH}))
            elif ev == "speak":
                text = (data.get("text") or "").strip()
                if not text:
                    await ws.send_text(json.dumps({"event": "error", "detail": "empty text"}))
                    continue
                if len(text) > MAX_TEXT_CHARS:
                    await ws.send_text(json.dumps({"event":"error","detail": f"text too long (>{MAX_TEXT_CHARS})"}))
                    continue
                await piper_stream_raw(
                    text, voice, ws, voice_sr, DEFAULT_CH, length_scale, noise_scale, noise_w,
                    prebuffer_ms, prebuffer_max_wait_ms
                )

    except WebSocketDisconnect:
        return
    except Exception as e:
        # Best-effort error report and close; both may fail if the socket
        # is already gone.
        try:
            await ws.send_text(json.dumps({"event": "error", "detail": str(e)}))
        except Exception:
            pass
        try:
            await ws.close()
        except Exception:
            pass
|
|
|
|
|
if __name__ == "__main__":
    # Direct-run entry point; PORT defaults to 7860 (Hugging Face Spaces
    # convention). Reload disabled for production-style serving.
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")), reload=False)
|
|
|