# tts/app.py — Piper TTS FastAPI service (upstream revision 2eeb709, author: triflix)
import asyncio
import io
import os
import struct
import wave

from fastapi import FastAPI, HTTPException, Body
from fastapi.responses import StreamingResponse, Response
from piper import PiperVoice, SynthesisConfig
import onnxruntime as ort
# FastAPI application object; endpoints below register themselves via decorators.
app = FastAPI(title="Piper TTS API")
# Path to the Piper ONNX voice model; override via the PIPER_MODEL env var.
MODEL_ONNX = os.environ.get("PIPER_MODEL", "models/en_US-ljspeech-medium.onnx")
# Module-level engine state, populated by init_engine() at startup.
engine_ready = False  # True once the model has loaded successfully
engine_provider = "CPU"  # "CUDAExecutionProvider" when CUDA is available
voice = None  # PiperVoice instance after init_engine(); None until then
sample_rate = 22050  # output rate in Hz; default until the model reports its own
synth_lock = asyncio.Lock()  # guards against concurrent synthesis (used by /tts)
async def init_engine():
    """Load the Piper voice model and select the ONNX execution provider.

    Populates the module-level ``engine_ready``, ``engine_provider``,
    ``voice`` and ``sample_rate`` globals.  Re-raises any load failure so
    application startup fails loudly when the model is missing or corrupt.
    """
    global engine_ready, engine_provider, voice, sample_rate
    try:
        try:
            providers = ort.get_available_providers()
        except Exception:
            # Provider enumeration can fail on unusual onnxruntime builds;
            # degrade to CPU rather than aborting startup here.
            providers = []
        use_cuda = "CUDAExecutionProvider" in providers
        engine_provider = "CUDAExecutionProvider" if use_cuda else "CPU"
        voice = PiperVoice.load(MODEL_ONNX, use_cuda=use_cuda)
        # Piper keeps the rate on the voice's config object; the previous
        # getattr(voice, "sample_rate", 22050) always fell through to the
        # default because PiperVoice has no top-level sample_rate attribute.
        cfg = getattr(voice, "config", None)
        sample_rate = (
            getattr(cfg, "sample_rate", None)
            or getattr(voice, "sample_rate", None)
            or 22050
        )
        engine_ready = True
    except Exception:
        engine_ready = False
        raise  # bare re-raise preserves the original traceback (was `raise e`)
@app.on_event("startup")
async def on_startup():
    """Eagerly load the voice model so the first request is not slow."""
    await init_engine()
@app.get("/health")
async def health():
    """Readiness probe: 503 until the model has loaded, else engine info."""
    if engine_ready:
        return {"status": "ok", "provider": engine_provider, "sample_rate": sample_rate}
    raise HTTPException(503, "model not ready")
def map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio):
    """Coerce raw request parameters into a piper ``SynthesisConfig``."""
    options = {
        "volume": float(volume),
        "length_scale": float(length_scale),
        "noise_scale": float(noise_scale),
        "noise_w_scale": float(noise_w_scale),
        "normalize_audio": bool(normalize_audio),
    }
    return SynthesisConfig(**options)
def _streaming_wav_header(rate: int, channels: int = 1, sampwidth: int = 2) -> bytes:
    """Build a PCM WAV header with 'unknown' chunk sizes for live streaming.

    The ``wave`` module writes the actual data length into the header, which
    is 0 when no frames have been written yet — strict decoders then treat
    the stream as empty and stop.  The conventional streaming workaround is
    to declare 0xFFFFFFFF for both the RIFF and data chunk sizes.
    """
    byte_rate = rate * channels * sampwidth
    block_align = channels * sampwidth
    return b"".join((
        b"RIFF", struct.pack("<I", 0xFFFFFFFF), b"WAVE",
        b"fmt ", struct.pack("<IHHIIHH", 16, 1, channels, rate,
                             byte_rate, block_align, sampwidth * 8),
        b"data", struct.pack("<I", 0xFFFFFFFF),
    ))


@app.post("/tts")
async def tts_stream(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Stream synthesized speech as a chunked ``audio/wav`` response.

    Only one synthesis may run at a time: a request arriving while another
    synthesis holds ``synth_lock`` is rejected with 429.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 on empty text,
            429 when a synthesis is already in progress.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)
    # Check-then-acquire is race-free here: acquiring an uncontended
    # asyncio.Lock completes without yielding to the event loop.
    if synth_lock.locked():
        raise HTTPException(429, "synthesis in progress, try later")
    await synth_lock.acquire()

    async def generator():
        try:
            # Header with streaming-safe (max) sizes instead of a 0-byte
            # data chunk from the wave module.
            yield _streaming_wav_header(sample_rate)
            # Piper's synthesize() is a blocking generator; pull each chunk
            # in a worker thread so the event loop stays responsive.
            chunks = voice.synthesize(text, syn_config=cfg)
            sentinel = object()
            while True:
                chunk = await asyncio.to_thread(next, chunks, sentinel)
                if chunk is sentinel:
                    break
                yield chunk.audio_int16_bytes
        finally:
            # Runs when the generator is exhausted or closed (e.g. on client
            # disconnect), so the lock cannot leak once streaming started.
            synth_lock.release()

    headers = {
        "Content-Disposition": 'inline; filename="speech.wav"',
        "Cache-Control": "no-store",
    }
    return StreamingResponse(generator(), media_type="audio/wav", headers=headers)
@app.post("/tts-file")
async def tts_file(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Synthesize speech and return the complete WAV file as an attachment.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 on empty text.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    def _render() -> bytes:
        # Blocking synthesis; executed in a worker thread below so the
        # event loop is not stalled for the whole utterance.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            voice.synthesize_wav(text, wf, syn_config=cfg)
        return buf.getvalue()

    # Serialize with /tts so two syntheses never hit the shared ONNX
    # session concurrently (the streaming endpoint already enforces this);
    # unlike /tts this waits for its turn instead of returning 429.
    async with synth_lock:
        audio = await asyncio.to_thread(_render)
    headers = {"Content-Disposition": 'attachment; filename="speech.wav"'}
    return Response(content=audio, media_type="audio/wav", headers=headers)