""" openclaw-api — OpenAI-compatible LLM API running locally on CPU Uses llama-cpp-python with Qwen3-0.6B GGUF model """ import time import uuid import os import json from fastapi import FastAPI, HTTPException, Depends, Header from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse from pydantic import BaseModel, ConfigDict from typing import List, Optional, Any from llama_cpp import Llama # ─── CONFIG ──────────────────────────────────────────────────────────────── MODEL_PATH = "/app/model.gguf" API_KEY = os.environ.get("API_KEY", "") N_CTX = 8192 # increased from 2048 — fits OpenClaw's system prompt N_THREADS = 4 MAX_TOKENS = 512 # max tokens to generate per response # ─────────────────────────────────────────────────────────────────────────── app = FastAPI(title="openclaw-api", version="1.0.0") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"], ) print("Loading model...") llm = Llama( model_path=MODEL_PATH, n_ctx=N_CTX, n_threads=N_THREADS, verbose=False, ) print("Model loaded!") # ─── Auth ─────────────────────────────────────────────────────────────────── def verify_key(authorization: Optional[str] = Header(None)): if not API_KEY: return if authorization != f"Bearer {API_KEY}": raise HTTPException(status_code=401, detail="Unauthorized") # ─── Schemas ──────────────────────────────────────────────────────────────── class Message(BaseModel): model_config = ConfigDict(extra="allow") role: str content: Any class ChatRequest(BaseModel): model_config = ConfigDict(extra="allow") model: Optional[str] = "qwen3-0.6b" messages: List[Message] max_tokens: Optional[int] = MAX_TOKENS temperature: Optional[float] = 0.7 stream: Optional[bool] = False top_p: Optional[float] = None frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None stop: Optional[Any] = None class CompletionRequest(BaseModel): model_config = ConfigDict(extra="allow") model: Optional[str] = "qwen3-0.6b" prompt: str max_tokens: Optional[int] = MAX_TOKENS temperature: Optional[float] = 0.7 stream: Optional[bool] = False # ─── Helpers ──────────────────────────────────────────────────────────────── def normalize_messages(messages: List[Message]) -> List[dict]: """Convert messages to plain dicts, normalize content to string.""" result = [] for m in messages: content = m.content if isinstance(content, list): content = " ".join( part.get("text", "") for part in content if isinstance(part, dict) and part.get("type") == "text" ) result.append({"role": m.role, "content": str(content)}) return result def truncate_messages(messages: List[dict], max_ctx: int = N_CTX, max_out: int = MAX_TOKENS) -> List[dict]: """ Truncate messages to fit within the context window. Always keeps the system message + last N user/assistant turns. 
def truncate_messages(messages: List[dict], max_ctx: int = N_CTX,
                      max_out: int = MAX_TOKENS) -> List[dict]:
    """
    Truncate messages to fit within the context window. Keeps the system
    message(s) plus as many of the most recent user/assistant turns as fit.
    Budget (in tokens) = max_ctx - max_out - 256 (safety margin).
    """
    budget = max_ctx - max_out - 256
    char_budget = budget * 3  # rough chars-per-token estimate
    system_msgs = [m for m in messages if m["role"] == "system"]
    other_msgs = [m for m in messages if m["role"] != "system"]
    # Truncate overly long system messages to half the budget.
    for m in system_msgs:
        if len(m["content"]) > char_budget // 2:
            m["content"] = m["content"][: char_budget // 2] + "\n[truncated]"
    # Walk backwards from the newest message, keeping as many as fit.
    kept = []
    used = sum(len(m["content"]) for m in system_msgs)
    for m in reversed(other_msgs):
        used += len(m["content"])
        if used > char_budget:
            break
        kept.insert(0, m)
    return system_msgs + kept

# ─── Routes ──────────────────────────────────────────────────────────────────
@app.get("/")
def root():
    return {
        "status": "openclaw-api is running",
        "model": "qwen3-0.6b",
        "backend": "llama-cpp-python (CPU)",
        "n_ctx": N_CTX,
    }

@app.get("/v1/models", dependencies=[Depends(verify_key)])
def list_models():
    return {
        "object": "list",
        "data": [{
            "id": "qwen3-0.6b",
            "object": "model",
            "created": int(time.time()),
            "owned_by": "local",
        }],
    }

@app.post("/v1/chat/completions", dependencies=[Depends(verify_key)])
def chat_completions(req: ChatRequest):
    # Cap max_tokens first so the truncation budget matches what we generate.
    max_tokens = min(req.max_tokens or MAX_TOKENS, MAX_TOKENS)
    # `is not None` rather than `or`: a requested temperature of 0.0 is valid
    # and must not fall back to the default.
    temperature = req.temperature if req.temperature is not None else 0.7
    messages = normalize_messages(req.messages)
    messages = truncate_messages(messages, max_out=max_tokens)

    if req.stream:
        # One id shared by every chunk of the stream, as OpenAI clients expect.
        completion_id = f"chatcmpl-{uuid.uuid4().hex}"

        def generate():
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )
            for chunk in stream:
                choice = chunk["choices"][0]
                data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": req.model,
                    "choices": [{
                        "delta": choice.get("delta", {}),
                        "index": 0,
                        # Pass through finish_reason so clients see the final chunk.
                        "finish_reason": choice.get("finish_reason"),
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": result["choices"][0]["message"]["content"],
            },
            "finish_reason": result["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }

@app.post("/v1/completions", dependencies=[Depends(verify_key)])
def completions(req: CompletionRequest):
    result = llm(
        req.prompt,
        max_tokens=min(req.max_tokens or MAX_TOKENS, MAX_TOKENS),
        temperature=req.temperature if req.temperature is not None else 0.7,
    )
    return {
        "id": f"cmpl-{uuid.uuid4().hex}",
        "object": "text_completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "text": result["choices"][0]["text"],
            "index": 0,
            "finish_reason": result["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }
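# ─── Usage (sketch) ──────────────────────────────────────────────────────────
# A minimal sketch of how a client could point the official `openai` Python
# package at this server. The port (8000) and base URL are assumptions, not
# something this file pins down:
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1",
#                   api_key=os.environ.get("API_KEY") or "unused")
#   resp = client.chat.completions.create(
#       model="qwen3-0.6b",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(resp.choices[0].message.content)

# Run directly with uvicorn (the Docker image may use its own entrypoint
# instead; host and port here are assumptions):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)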