"""CPU and llama.cpp serving helpers."""

from __future__ import annotations

import logging
import shutil

from fastapi import FastAPI
from pydantic import BaseModel

from serve.control_plane import build_control_router, get_runtime_access_info


# ASGI application object; the route decorators further down register their
# handlers on it.
app = FastAPI(title="SAGE CPU Server")
# Messages are emitted through the logger named "uvicorn.error" rather than a
# module-named logger.
# NOTE(review): presumably chosen so output shows up alongside uvicorn's own
# console logs — confirm.
_LOGGER = logging.getLogger("uvicorn.error")


def _print_startup_banner() -> None:
    """Log the access details for the browser control UI.

    Looks up the ``local_url``, ``public_url`` and ``password`` entries of
    the mapping returned by ``get_runtime_access_info()`` and writes them to
    ``_LOGGER`` at INFO level. The public URL line is skipped when that entry
    is empty. URLs are logged with exactly one trailing slash.
    """
    info = get_runtime_access_info()

    local = info["local_url"]
    if not local:
        # No local URL reported: fall back to the built-in loopback default.
        local = "http://127.0.0.1:8001"
    local = local.rstrip("/")

    public = info["public_url"]

    _LOGGER.info("SAGE local URL: %s/", local)
    if public:
        _LOGGER.info("SAGE public URL: %s/", public.rstrip("/"))
    _LOGGER.info("SAGE login password: %s", info["password"])


class ChatRequest(BaseModel):
    """Request schema for the browser chat surface.

    Used as the body type of the ``POST /chat`` handler in this module, which
    accepts it but does not read any of its fields.
    """

    # Required text field.
    prompt: str
    # Optional integer field; defaults to 64 when the client omits it.
    max_new_tokens: int = 64


@app.get("/health")
def health() -> dict[str, object]:
    """Report server health, including llama.cpp availability.

    Returns:
        A dict with a constant ``"status"`` of ``"ok"``, a boolean
        ``"llama_cpp_available"`` that is true when a ``llama-server``
        executable is found on ``PATH``, and the current ``chat_status()``
        under ``"chat"``.
    """
    # shutil.which returns None when the executable is not on PATH.
    llama_server_path = shutil.which("llama-server")
    return {
        "status": "ok",
        "llama_cpp_available": llama_server_path is not None,
        "chat": chat_status(),
    }


def chat_status() -> dict[str, object]:
    """Describe whether browser chat can be used on the CPU server.

    Returns:
        A freshly built dict whose ``"available"`` entry is always ``False``
        and whose ``"warning"`` entry is a fixed message pointing at
        ``serve.server:app``.
    """
    warning = (
        "Browser chat is only wired to the PyTorch GPU server in this repo. "
        "Use serve.server:app for direct interaction."
    )
    return dict(available=False, warning=warning)


@app.get("/chat/status")
def get_chat_status() -> dict[str, object]:
    """Expose browser-chat readiness.

    Handles ``GET /chat/status`` by returning the result of
    ``chat_status()`` unchanged.
    """
    return chat_status()


@app.post("/chat")
def chat(_: ChatRequest) -> dict[str, object]:
    """Answer every chat request with an explanatory failure payload.

    The request body is accepted but never read.

    Returns:
        A dict with ``"success"`` set to ``False``, ``"detail"`` carrying the
        warning text from ``chat_status()``, followed by all entries of
        ``chat_status()`` itself.
    """
    status = chat_status()
    payload: dict[str, object] = {"success": False, "detail": status["warning"]}
    # Merge the status fields after the fixed keys, as a ** spread would.
    payload.update(status)
    return payload


def _health_action(_: dict[str, object]) -> dict[str, object]:
    """Adapt ``health()`` to a one-argument callable.

    The single dict argument is ignored; the return value is whatever
    ``health()`` returns. This function is what gets registered under the
    ``"health_check"`` key passed to ``build_control_router``.
    """
    return health()


# Mount the control-plane router on the app, handing it a mapping from action
# name to callable. Only "health_check" is supplied here.
# NOTE(review): how the router dispatches these actions is defined in
# serve.control_plane — confirm there.
app.include_router(build_control_router({"health_check": _health_action}))


# NOTE(review): FastAPI's documentation marks on_event as deprecated in favor
# of lifespan handlers — confirm against the pinned FastAPI version before
# migrating.
@app.on_event("startup")
def _startup_banner() -> None:
    """Log the access banner when the application starts.

    Registered as a ``"startup"`` event handler on ``app``; delegates to
    ``_print_startup_banner()``.
    """
    _print_startup_banner()