|
|
import io |
|
|
import os |
|
|
import wave |
|
|
import asyncio |
|
|
from fastapi import FastAPI, HTTPException, Body |
|
|
from fastapi.responses import StreamingResponse, Response |
|
|
|
|
|
|
|
|
from piper import PiperVoice, SynthesisConfig |
|
|
import onnxruntime as ort |
|
|
|
|
|
app = FastAPI(title="Piper TTS API")




# Path to the Piper ONNX voice model; overridable via the PIPER_MODEL env var.
MODEL_ONNX = os.environ.get("PIPER_MODEL", "models/en_US-ljspeech-medium.onnx")




# Mutable module state, populated by init_engine() at application startup.
engine_ready = False  # True once the voice model has loaded successfully


engine_provider = "CPU"  # active ONNX execution provider (CUDA or CPU)


voice = None  # PiperVoice instance once loaded; None until init_engine() runs


sample_rate = 22050  # output sample rate in Hz; updated from the loaded model


# Serializes synthesis: /tts rejects concurrent requests while this is held.
synth_lock = asyncio.Lock()
|
|
|
|
|
async def init_engine():
    """Load the Piper voice model and record which ONNX provider is in use.

    Populates the module-level ``engine_ready``, ``engine_provider``,
    ``voice`` and ``sample_rate`` globals.  On failure, ``engine_ready``
    stays False and the original exception propagates to the caller.
    """
    global engine_ready, engine_provider, voice, sample_rate
    try:
        try:
            providers = ort.get_available_providers()
        except Exception:
            # Provider probing is best-effort; fall back to CPU-only.
            providers = []
        use_cuda = "CUDAExecutionProvider" in providers
        engine_provider = "CUDAExecutionProvider" if use_cuda else "CPU"
        voice = PiperVoice.load(MODEL_ONNX, use_cuda=use_cuda)

        # Current piper exposes the model rate on voice.config.sample_rate;
        # check there first, then a direct attribute (older API), and only
        # then fall back to 22050 so the WAV header matches the model.
        sample_rate = (
            getattr(getattr(voice, "config", None), "sample_rate", None)
            or getattr(voice, "sample_rate", None)
            or 22050
        )
        engine_ready = True
    except Exception:
        engine_ready = False
        raise  # bare raise preserves the original traceback
|
|
|
|
|
@app.on_event("startup")
async def on_startup():
    """Load the voice model before the app begins serving requests."""
    await init_engine()
|
|
|
|
|
@app.get("/health")
async def health():
    """Readiness probe: 200 with engine details once the model is loaded.

    Raises:
        HTTPException: 503 while the model has not finished loading.
    """
    if engine_ready:
        return {
            "status": "ok",
            "provider": engine_provider,
            "sample_rate": sample_rate,
        }
    raise HTTPException(503, "model not ready")
|
|
|
|
|
def map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio):
    """Coerce raw request parameters into a piper ``SynthesisConfig``."""
    options = {
        "volume": float(volume),
        "length_scale": float(length_scale),
        "noise_scale": float(noise_scale),
        "noise_w_scale": float(noise_w_scale),
        "normalize_audio": bool(normalize_audio),
    }
    return SynthesisConfig(**options)
|
|
|
|
|
def _streaming_wav_header(rate):
    """Return a 44-byte mono/16-bit WAV header for a stream of unknown length.

    The header's RIFF and ``data`` chunk sizes are set to 0xFFFFFFFF
    ("unknown"), the streaming-WAV convention, so decoders keep reading
    the PCM that follows instead of stopping at a declared length of 0.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)   # mono
        wf.setsampwidth(2)   # 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(b"")
    header = bytearray(buf.getvalue())
    # wave stamped both sizes for zero frames; strict players would stop
    # right after the header.  Patch them to "unknown".
    unknown = (0xFFFFFFFF).to_bytes(4, "little")
    header[4:8] = unknown     # RIFF chunk size
    header[40:44] = unknown   # data sub-chunk size
    return bytes(header)


@app.post("/tts")
async def tts_stream(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Stream synthesized speech for ``text`` as a WAV response.

    Synthesis is single-flight: a request arriving while another one is
    streaming is rejected immediately instead of queueing.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 for empty
            text, 429 if a synthesis is already in progress.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")

    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    # Fail fast instead of queueing.  The locked()/acquire() pair is safe
    # on a single event loop: acquire() on an uncontended asyncio.Lock
    # completes without yielding control.
    if synth_lock.locked():
        raise HTTPException(429, "synthesis in progress, try later")
    await synth_lock.acquire()

    async def generator():
        try:
            yield _streaming_wav_header(sample_rate)
            # NOTE(review): synthesize() is synchronous and CPU-bound, so
            # the event loop blocks while each chunk is produced.
            for chunk in voice.synthesize(text, syn_config=cfg):
                yield chunk.audio_int16_bytes
        finally:
            # Always release — covers synthesis errors and client disconnects.
            synth_lock.release()

    headers = {"Content-Disposition": 'inline; filename="speech.wav"', "Cache-Control": "no-store"}
    return StreamingResponse(generator(), media_type="audio/wav", headers=headers)
|
|
|
|
|
@app.post("/tts-file")
async def tts_file(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Synthesize ``text`` fully in memory and return one complete WAV file.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 for empty text.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")

    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    def _render() -> bytes:
        # Let piper write a complete, correctly-sized WAV into memory.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            voice.synthesize_wav(text, wf, syn_config=cfg)
        return buf.getvalue()

    # Synthesis is synchronous and CPU-bound: run it in a worker thread so
    # the event loop stays responsive, and hold synth_lock so only one
    # synthesis runs at a time (consistent with /tts).
    async with synth_lock:
        audio = await asyncio.to_thread(_render)

    headers = {"Content-Disposition": 'attachment; filename="speech.wav"'}
    return Response(content=audio, media_type="audio/wav", headers=headers)
|
|
|