import asyncio
import io
import os
import wave

from fastapi import Body, FastAPI, HTTPException
from fastapi.responses import Response, StreamingResponse
from piper import PiperVoice, SynthesisConfig
import onnxruntime as ort

app = FastAPI(title="Piper TTS API")

# Path to the Piper ONNX voice model; overridable via environment variable.
MODEL_ONNX = os.environ.get("PIPER_MODEL", "models/en_US-ljspeech-medium.onnx")

# Engine state shared across requests; written once by init_engine() at startup.
engine_ready = False
engine_provider = "CPU"
voice = None
sample_rate = 22050

# Serializes access to the single shared Piper voice instance: both endpoints
# must hold this lock while synthesizing.
synth_lock = asyncio.Lock()


async def init_engine():
    """Load the Piper voice model, preferring CUDA when onnxruntime offers it.

    Sets the module-level engine state (provider, voice, sample_rate,
    engine_ready). Re-raises any load failure so application startup aborts
    loudly instead of serving a half-initialized engine.
    """
    global engine_ready, engine_provider, voice, sample_rate
    try:
        try:
            providers = ort.get_available_providers()
        except Exception:
            # Provider probing failed; fall back to CPU-only operation.
            providers = []
        use_cuda = "CUDAExecutionProvider" in providers
        engine_provider = "CUDAExecutionProvider" if use_cuda else "CPU"
        voice = PiperVoice.load(MODEL_ONNX, use_cuda=use_cuda)
        # NOTE(review): recent Piper releases expose the rate via
        # voice.config.sample_rate; fall back to a flat attribute, then to the
        # common 22050 Hz default — confirm against the installed piper version.
        sample_rate = (
            getattr(getattr(voice, "config", None), "sample_rate", None)
            or getattr(voice, "sample_rate", 22050)
        )
        engine_ready = True
    except Exception:
        engine_ready = False
        raise  # bare raise preserves the original traceback


# TODO(review): @app.on_event is deprecated in current FastAPI; migrate to a
# lifespan handler when convenient. Kept as-is for compatibility.
@app.on_event("startup")
async def on_startup():
    await init_engine()


@app.get("/health")
async def health():
    """Readiness probe: 503 until the model has loaded, then engine metadata."""
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    return {"status": "ok", "provider": engine_provider, "sample_rate": sample_rate}


def map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio):
    """Build a Piper SynthesisConfig from loosely-typed request parameters."""
    return SynthesisConfig(
        volume=float(volume),
        length_scale=float(length_scale),
        noise_scale=float(noise_scale),
        noise_w_scale=float(noise_w_scale),
        normalize_audio=bool(normalize_audio),
    )


def _wav_header(rate: int) -> bytes:
    """Return a minimal PCM 16-bit mono WAV header with a zero-length data chunk.

    Used for streaming: the length fields are wrong (zero), but players that
    support streamed WAV ignore them and decode the raw PCM frames that follow.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(rate)
        wf.writeframes(b"")
    return buf.getvalue()


@app.post("/tts")
async def tts_stream(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Stream synthesized speech as WAV: header first, then PCM chunks.

    Returns 429 when a synthesis is already in progress — the engine handles
    one request at a time.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    # Guard clause: reject while busy. asyncio is single-threaded and
    # Lock.acquire() returns without suspending when the lock is free, so no
    # other coroutine can slip in between the locked() check and acquire().
    if synth_lock.locked():
        raise HTTPException(429, "synthesis in progress, try later")
    await synth_lock.acquire()

    def pcm_chunks():
        # Synchronous Piper generator; executed from a worker thread below so
        # the blocking synthesis never stalls the event loop.
        for chunk in voice.synthesize(text, syn_config=cfg):
            yield chunk.audio_int16_bytes

    async def generator():
        try:
            yield _wav_header(sample_rate)
            it = pcm_chunks()
            while True:
                # Pull each chunk on a worker thread; None sentinel marks end.
                piece = await asyncio.to_thread(next, it, None)
                if piece is None:
                    break
                yield piece
        finally:
            # Released even on client disconnect (the ASGI server closes the
            # generator, which runs this finally).
            synth_lock.release()

    headers = {
        "Content-Disposition": 'inline; filename="speech.wav"',
        "Cache-Control": "no-store",
    }
    return StreamingResponse(generator(), media_type="audio/wav", headers=headers)


@app.post("/tts-file")
async def tts_file(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Synthesize the full utterance and return it as a complete WAV file."""
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    def synthesize_to_bytes() -> bytes:
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            voice.synthesize_wav(text, wf, syn_config=cfg)
        return buf.getvalue()

    # Hold the shared-engine lock (consistent with /tts) and run the blocking
    # synthesis on a worker thread so the event loop stays responsive.
    async with synth_lock:
        audio = await asyncio.to_thread(synthesize_to_bytes)

    headers = {"Content-Disposition": 'attachment; filename="speech.wav"'}
    return Response(content=audio, media_type="audio/wav", headers=headers)