"""Real-time speech-to-speech WebSocket server for LFM2.5-Audio.

A client streams 16-bit mono PCM frames over /ws/s2s; an energy-based
VAD segments utterances, each utterance is run through the model, and
the generated speech is streamed back as a WAV header followed by raw
PCM chunks.
"""

import asyncio
import json

import numpy as np
import torch
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse  # noqa: F401  (kept: may serve a demo page)

from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor

HF_REPO = "LiquidAI/LFM2.5-Audio-1.5B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 24000  # model's native audio sample rate (Hz)
CHUNK_SIZE = 20      # audio tokens buffered before each incremental decode
DTYPE = (
    torch.bfloat16
    if DEVICE == "cuda" and torch.cuda.is_bf16_supported()
    else torch.float32
)
torch.backends.cuda.matmul.allow_tf32 = True  # harmless no-op on CPU-only hosts

# Energy-based VAD tuning (frame size is whatever the client sends per message).
VAD_SILENCE_THRESHOLD = 0.01  # normalized RMS below this counts as silence
VAD_SILENCE_FRAMES = 30       # trailing silent frames that close an utterance
VAD_MIN_SPEECH_FRAMES = 10    # minimum speech frames for a valid utterance

print(f"[BOOT] Loading model on {DEVICE}...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
model = LFM2AudioModel.from_pretrained(HF_REPO).to(device=DEVICE, dtype=DTYPE).eval()
print("[BOOT] Model loaded")

app = FastAPI(title="LFM2.5 Real-Time S2S", version="4.0")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def wav_header(sr: int = SAMPLE_RATE, ch: int = 1, bits: int = 16) -> bytes:
    """Return a WAV header with 0xFFFFFFFF chunk sizes for open-ended streaming."""
    byte_rate = sr * ch * bits // 8
    block_align = ch * bits // 8
    return (
        b"RIFF"
        + b"\xff\xff\xff\xff"          # RIFF size unknown: stream until close
        + b"WAVEfmt "
        + (16).to_bytes(4, "little")   # fmt chunk length
        + (1).to_bytes(2, "little")    # audio format 1 = integer PCM
        + ch.to_bytes(2, "little")
        + sr.to_bytes(4, "little")
        + byte_rate.to_bytes(4, "little")
        + block_align.to_bytes(2, "little")
        + bits.to_bytes(2, "little")
        + b"data"
        + b"\xff\xff\xff\xff"          # data size unknown
    )


def decode_chunk(buf: list) -> bytes | None:
    """Decode buffered audio tokens to 16-bit little-endian PCM bytes.

    The last token in ``buf`` is excluded from the stack (presumably it is
    incomplete lookahead from the interleaved generator — TODO confirm
    against ``generate_interleaved``'s output layout). Returns None when
    decoding fails so callers can skip the chunk.
    """
    try:
        codes = torch.stack(buf[:-1], dim=1).unsqueeze(0).to(DEVICE)
        wf = processor.decode(codes).squeeze().cpu().numpy()
        wf = np.clip(wf, -1.0, 1.0)
        return (wf * 32767).astype(np.int16).tobytes()
    except Exception as e:
        print(f"[WARN] decode: {e}")
        return None


def is_speech(pcm_int16: np.ndarray) -> bool:
    """True when the frame's normalized RMS energy exceeds the VAD threshold."""
    if len(pcm_int16) == 0:
        return False
    rms = np.sqrt(np.mean(pcm_int16.astype(np.float32) ** 2)) / 32767.0
    return rms > VAD_SILENCE_THRESHOLD


def run_generation(audio_np: np.ndarray) -> list[bytes]:
    """Run one user utterance through the model; return a list of PCM chunks.

    Synchronous and compute-heavy — always call via ``run_in_executor`` so
    the event loop keeps servicing the WebSocket while generating.
    """
    chat = ChatState(processor)
    chat.new_turn("system")
    chat.add_text(
        "You are a helpful real-time voice assistant called chioma. "
        "Respond naturally and concisely with audio. "
        "When asked who built you, say Kelvin Jackson, an AI Engineer."
    )
    chat.end_turn()

    chat.new_turn("user")
    audio_tensor = torch.from_numpy(audio_np[np.newaxis, :]).to(dtype=torch.float32)
    chat.add_audio(audio_tensor, sampling_rate=SAMPLE_RATE)
    chat.end_turn()

    chat.new_turn("assistant")
    chunks: list[bytes] = []
    buf: list = []
    with torch.inference_mode():
        for token in model.generate_interleaved(
            **chat,
            max_new_tokens=2048,
            audio_temperature=0.8,
            audio_top_k=4,
        ):
            if token.numel() == 1:
                continue  # text token — only audio tokens are decoded
            buf.append(token)
            if len(buf) >= CHUNK_SIZE:
                pcm = decode_chunk(buf)
                if pcm:
                    chunks.append(pcm)
                buf.clear()
    # Flush the tail; decode_chunk drops its final token, so need > 1.
    if len(buf) > 1:
        pcm = decode_chunk(buf)
        if pcm:
            chunks.append(pcm)
    return chunks


# ---------------------------------------------------------------------------
# WebSocket endpoint
# ---------------------------------------------------------------------------

@app.websocket("/ws/s2s")
async def websocket_s2s(websocket: WebSocket):
    """Full-duplex S2S session: receive PCM frames, VAD-segment, generate, stream back."""
    await websocket.accept()
    print("[WS] client connected")
    # get_running_loop(): get_event_loop() is deprecated inside coroutines.
    loop = asyncio.get_running_loop()
    audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()
    generating = False

    async def receiver():
        """Pump incoming frames into the queue; a None sentinel ends the session."""
        try:
            while True:
                try:
                    msg = await websocket.receive()
                except RuntimeError:  # receive() called after disconnect
                    break
                if msg.get("type") == "websocket.disconnect":
                    break
                if "bytes" in msg:
                    await audio_queue.put(msg["bytes"])
                elif "text" in msg:
                    # A malformed control message must not tear down the session.
                    try:
                        payload = json.loads(msg["text"])
                    except json.JSONDecodeError:
                        continue
                    if payload.get("type") == "stop":
                        break
        finally:
            await audio_queue.put(None)

    async def vad_and_generate():
        """Segment speech with the energy VAD and answer each finished utterance."""
        nonlocal generating
        speech_frames: list[np.ndarray] = []
        silence_count = 0
        speech_count = 0
        in_speech = False
        await websocket.send_text(json.dumps({"type": "ready"}))
        while True:
            frame_bytes = await audio_queue.get()
            if frame_bytes is None:
                break
            frame = np.frombuffer(frame_bytes, dtype=np.int16)
            active = is_speech(frame)
            if active:
                silence_count = 0
                speech_count += 1
                in_speech = True
                speech_frames.append(frame)
            elif in_speech:
                silence_count += 1
                speech_frames.append(frame)
                # Utterance ends after enough trailing silence, provided it
                # contained enough speech to be worth answering.
                if (
                    silence_count >= VAD_SILENCE_FRAMES
                    and speech_count >= VAD_MIN_SPEECH_FRAMES
                ):
                    # NOTE(review): this coroutine awaits its own generation,
                    # so `generating` is always False here; the guard only
                    # matters if generation is ever made concurrent.
                    if not generating:
                        generating = True
                        utterance = (
                            np.concatenate(speech_frames).astype(np.float32) / 32767.0
                        )
                        speech_frames = []
                        silence_count = 0
                        speech_count = 0
                        in_speech = False
                        try:
                            await websocket.send_text(
                                json.dumps({"type": "generating"})
                            )
                            await websocket.send_bytes(wav_header())
                            chunks = await loop.run_in_executor(
                                None, run_generation, utterance
                            )
                            for chunk in chunks:
                                await websocket.send_bytes(chunk)
                            await websocket.send_text(json.dumps({"type": "done"}))
                        except Exception as e:
                            print(f"[WS] send error: {e}")
                        finally:
                            generating = False

    try:
        await asyncio.gather(receiver(), vad_and_generate())
    except WebSocketDisconnect:
        pass
    except Exception as e:
        print(f"[WS] error: {e}")
    finally:
        print("[WS] client disconnected")


@app.get("/health")
async def health():
    """Liveness probe: reports the inference device."""
    return {"status": "ok", "device": DEVICE}