# sage/serve/server_cpu.py
# sage002's picture
# feat: add authenticated remote control UI and ngrok launcher
# a21a30f verified
"""CPU and llama.cpp serving helpers."""
from __future__ import annotations
import logging
import shutil
from fastapi import FastAPI
from pydantic import BaseModel
from serve.control_plane import build_control_router, get_runtime_access_info
# ASGI application object; the route decorators and the router registration
# further down in this module all attach to it.
app = FastAPI(title="SAGE CPU Server")
# Fetches the logger named "uvicorn.error" rather than a module-specific one;
# presumably so banner lines appear alongside uvicorn's own output — TODO confirm.
_LOGGER = logging.getLogger("uvicorn.error")
def _print_startup_banner() -> None:
    """Log the URLs and password needed to reach the browser control UI.

    The values come from ``get_runtime_access_info()``. When the reported
    local URL is empty, ``http://127.0.0.1:8001`` is used instead. The public
    URL line is emitted only when a public URL is reported. Trailing slashes
    are stripped from both URLs and a single ``/`` is appended by the log
    format.
    """
    info = get_runtime_access_info()

    local = info["local_url"]
    if not local:
        local = "http://127.0.0.1:8001"
    local = local.rstrip("/")

    public = info["public_url"]

    _LOGGER.info("SAGE local URL: %s/", local)
    if public:
        _LOGGER.info("SAGE public URL: %s/", public.rstrip("/"))
    # NOTE(review): the password is written to the log in clear text; confirm
    # that every destination of this logger is trusted.
    _LOGGER.info("SAGE login password: %s", info["password"])
class ChatRequest(BaseModel):
    """Request body accepted by the ``POST /chat`` endpoint."""

    # Prompt text sent by the client; required (no default).
    prompt: str
    # Defaults to 64 when omitted; presumably the cap on generated tokens.
    # The CPU ``chat`` handler in this module does not read it.
    max_new_tokens: int = 64
@app.get("/health")
def health() -> dict[str, object]:
    """Return the health payload for the CPU server.

    Returns:
        A dict with three keys: ``status`` (always ``"ok"``),
        ``llama_cpp_available`` (true when a ``llama-server`` executable is
        found on ``PATH``) and ``chat`` (the result of ``chat_status()``).
    """
    llama_server_path = shutil.which("llama-server")

    report: dict[str, object] = {"status": "ok"}
    report["llama_cpp_available"] = llama_server_path is not None
    report["chat"] = chat_status()
    return report
def chat_status() -> dict[str, object]:
    """Describe browser-chat readiness on the CPU server.

    Returns:
        A fresh dict on every call with ``available`` set to ``False`` and a
        ``warning`` string pointing users at ``serve.server:app``.
    """
    warning = (
        "Browser chat is only wired to the PyTorch GPU server in this repo. "
        "Use serve.server:app for direct interaction."
    )
    return dict(available=False, warning=warning)
@app.get("/chat/status")
def get_chat_status() -> dict[str, object]:
    """Serve the ``chat_status()`` payload at ``GET /chat/status``."""
    readiness = chat_status()
    return readiness
@app.post("/chat")
def chat(_: ChatRequest) -> dict[str, object]:
    """Reject chat requests with an explanatory payload.

    The request body is validated against ``ChatRequest`` but otherwise
    ignored.

    Returns:
        A dict with ``success`` set to ``False``, ``detail`` set to the
        warning text, followed by every key of ``chat_status()``.
    """
    status = chat_status()
    reply: dict[str, object] = {"success": False, "detail": status["warning"]}
    # Merge last so the key order stays: success, detail, available, warning.
    reply.update(status)
    return reply
def _health_action(_: dict[str, object]) -> dict[str, object]:
    """Adapt ``health()`` to a one-argument action callable.

    The single positional argument is accepted and ignored; the return value
    is whatever ``health()`` returns.
    """
    report = health()
    return report
# Attach the router built by ``build_control_router`` to the app, handing it
# ``_health_action`` under the "health_check" key (presumably the action name
# the control plane dispatches on — verify in serve.control_plane).
app.include_router(build_control_router({"health_check": _health_action}))
# NOTE(review): newer FastAPI releases deprecate ``on_event`` in favour of
# lifespan handlers — confirm against the pinned FastAPI version.
@app.on_event("startup")
def _startup_banner() -> None:
    """Log the access banner when the application starts."""
    _print_startup_banner()