import asyncio
import io
import os
import wave

from fastapi import Body, FastAPI, HTTPException
from fastapi.responses import Response, StreamingResponse
from piper import PiperVoice, SynthesisConfig
import onnxruntime as ort

app = FastAPI(title="Piper TTS API")

# Path to the Piper ONNX voice model; overridable via environment variable.
MODEL_ONNX = os.environ.get("PIPER_MODEL", "models/en_US-ljspeech-medium.onnx")

# Engine state shared across requests; written once by init_engine() at startup.
engine_ready = False
engine_provider = "CPU"
voice = None
sample_rate = 22050

# Serializes access to the single shared Piper voice instance: both endpoints
# must hold this lock while synthesizing.
synth_lock = asyncio.Lock()


async def init_engine():
    """Load the Piper voice model, preferring CUDA when onnxruntime offers it.

    Sets the module-level engine state (provider, voice, sample_rate,
    engine_ready). Re-raises any load failure so application startup aborts
    loudly instead of serving a half-initialized engine.
    """
    global engine_ready, engine_provider, voice, sample_rate
    try:
        try:
            providers = ort.get_available_providers()
        except Exception:
            # Provider probing failed; fall back to CPU-only operation.
            providers = []
        use_cuda = "CUDAExecutionProvider" in providers
        engine_provider = "CUDAExecutionProvider" if use_cuda else "CPU"
        voice = PiperVoice.load(MODEL_ONNX, use_cuda=use_cuda)
        # NOTE(review): recent Piper releases expose the rate via
        # voice.config.sample_rate; fall back to a flat attribute, then to the
        # common 22050 Hz default — confirm against the installed piper version.
        sample_rate = (
            getattr(getattr(voice, "config", None), "sample_rate", None)
            or getattr(voice, "sample_rate", 22050)
        )
        engine_ready = True
    except Exception:
        engine_ready = False
        raise  # bare raise preserves the original traceback


# TODO(review): @app.on_event is deprecated in current FastAPI; migrate to a
# lifespan handler when convenient. Kept as-is for compatibility.
@app.on_event("startup")
async def on_startup():
    await init_engine()


@app.get("/health")
async def health():
    """Readiness probe: 503 until the model has loaded, then engine metadata."""
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    return {"status": "ok", "provider": engine_provider, "sample_rate": sample_rate}


def map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio):
    """Build a Piper SynthesisConfig from loosely-typed request parameters."""
    return SynthesisConfig(
        volume=float(volume),
        length_scale=float(length_scale),
        noise_scale=float(noise_scale),
        noise_w_scale=float(noise_w_scale),
        normalize_audio=bool(normalize_audio),
    )


def _wav_header(rate: int) -> bytes:
    """Return a minimal PCM 16-bit mono WAV header with a zero-length data chunk.

    Used for streaming: the length fields are wrong (zero), but players that
    support streamed WAV ignore them and decode the raw PCM frames that follow.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(rate)
        wf.writeframes(b"")
    return buf.getvalue()


@app.post("/tts")
async def tts_stream(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Stream synthesized speech as WAV: header first, then PCM chunks.

    Returns 429 when a synthesis is already in progress — the engine handles
    one request at a time.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    # Guard clause: reject while busy. asyncio is single-threaded and
    # Lock.acquire() returns without suspending when the lock is free, so no
    # other coroutine can slip in between the locked() check and acquire().
    if synth_lock.locked():
        raise HTTPException(429, "synthesis in progress, try later")
    await synth_lock.acquire()

    def pcm_chunks():
        # Synchronous Piper generator; executed from a worker thread below so
        # the blocking synthesis never stalls the event loop.
        for chunk in voice.synthesize(text, syn_config=cfg):
            yield chunk.audio_int16_bytes

    async def generator():
        try:
            yield _wav_header(sample_rate)
            it = pcm_chunks()
            while True:
                # Pull each chunk on a worker thread; None sentinel marks end.
                piece = await asyncio.to_thread(next, it, None)
                if piece is None:
                    break
                yield piece
        finally:
            # Released even on client disconnect (the ASGI server closes the
            # generator, which runs this finally).
            synth_lock.release()

    headers = {
        "Content-Disposition": 'inline; filename="speech.wav"',
        "Cache-Control": "no-store",
    }
    return StreamingResponse(generator(), media_type="audio/wav", headers=headers)


@app.post("/tts-file")
async def tts_file(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Synthesize the full utterance and return it as a complete WAV file."""
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    def synthesize_to_bytes() -> bytes:
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            voice.synthesize_wav(text, wf, syn_config=cfg)
        return buf.getvalue()

    # Hold the shared-engine lock (consistent with /tts) and run the blocking
    # synthesis on a worker thread so the event loop stays responsive.
    async with synth_lock:
        audio = await asyncio.to_thread(synthesize_to_bytes)

    headers = {"Content-Disposition": 'attachment; filename="speech.wav"'}
    return Response(content=audio, media_type="audio/wav", headers=headers)