| """ | |
| openclaw-api β OpenAI-compatible LLM API running locally on CPU | |
| Uses llama-cpp-python with Qwen3-0.6B GGUF model | |
| """ | |
import time
import uuid
import os
import json
from fastapi import FastAPI, HTTPException, Depends, Header
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, ConfigDict
from typing import List, Optional, Any
from llama_cpp import Llama

# ─── CONFIG ─────────────────────────────────────────────────────────────────
MODEL_PATH = "/app/model.gguf"
API_KEY = os.environ.get("API_KEY", "")
N_CTX = 8192       # increased from 2048 so OpenClaw's system prompt fits
N_THREADS = 4
MAX_TOKENS = 512   # max tokens to generate per response
# ────────────────────────────────────────────────────────────────────────────

app = FastAPI(title="openclaw-api", version="1.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

| print("Loading model...") | |
| llm = Llama( | |
| model_path=MODEL_PATH, | |
| n_ctx=N_CTX, | |
| n_threads=N_THREADS, | |
| verbose=False, | |
| ) | |
| print("Model loaded!") | |
# ─── Auth ────────────────────────────────────────────────────────────────────
def verify_key(authorization: Optional[str] = Header(None)):
    if not API_KEY:
        return
    if authorization != f"Bearer {API_KEY}":
        raise HTTPException(status_code=401, detail="Unauthorized")
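# When API_KEY is unset the check above is skipped entirely; otherwise callers
# must send an "Authorization: Bearer <API_KEY>" header. The POST routes below
# attach this check via dependencies=[Depends(verify_key)].
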
# ─── Schemas ─────────────────────────────────────────────────────────────────
class Message(BaseModel):
    model_config = ConfigDict(extra="allow")
    role: str
    content: Any


class ChatRequest(BaseModel):
    model_config = ConfigDict(extra="allow")
    model: Optional[str] = "qwen3-0.6b"
    messages: List[Message]
    max_tokens: Optional[int] = MAX_TOKENS
    temperature: Optional[float] = 0.7
    stream: Optional[bool] = False
    top_p: Optional[float] = None
    frequency_penalty: Optional[float] = None
    presence_penalty: Optional[float] = None
    stop: Optional[Any] = None


class CompletionRequest(BaseModel):
    model_config = ConfigDict(extra="allow")
    model: Optional[str] = "qwen3-0.6b"
    prompt: str
    max_tokens: Optional[int] = MAX_TOKENS
    temperature: Optional[float] = 0.7

# ─── Helpers ─────────────────────────────────────────────────────────────────
def normalize_messages(messages: List[Message]) -> List[dict]:
    """Convert messages to plain dicts, normalize content to string."""
    result = []
    for m in messages:
        content = m.content
        if isinstance(content, list):
            content = " ".join(
                part.get("text", "")
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
        result.append({"role": m.role, "content": str(content)})
    return result

def truncate_messages(messages: List[dict], max_ctx: int = N_CTX, max_out: int = MAX_TOKENS) -> List[dict]:
    """
    Truncate messages to fit within the context window.
    Always keeps the system message(s) plus the most recent user/assistant turns.
    Token budget = max_ctx - max_out - 256 (safety margin).
    """
    budget = max_ctx - max_out - 256
    char_budget = budget * 3  # rough chars-per-token estimate
    system_msgs = [m for m in messages if m["role"] == "system"]
    other_msgs = [m for m in messages if m["role"] != "system"]
    # Truncate overly long system messages
    for m in system_msgs:
        if len(m["content"]) > char_budget // 2:
            m["content"] = m["content"][: char_budget // 2] + "\n[truncated]"
    # Keep as many recent messages as fit
    kept = []
    used = sum(len(m["content"]) for m in system_msgs)
    for m in reversed(other_msgs):
        used += len(m["content"])
        if used > char_budget:
            break
        kept.insert(0, m)
    return system_msgs + kept
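# With the defaults above the budget works out to 8192 - 512 - 256 = 7424
# tokens, i.e. roughly 22,272 characters under the ~3 chars-per-token
# heuristic, so system messages may claim at most ~11,136 characters.
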
# ─── Routes ──────────────────────────────────────────────────────────────────
@app.get("/")
def root():
    return {
        "status": "openclaw-api is running",
        "model": "qwen3-0.6b",
        "backend": "llama-cpp-python (CPU)",
        "n_ctx": N_CTX,
    }

@app.get("/v1/models")
def list_models():
    return {
        "object": "list",
        "data": [{
            "id": "qwen3-0.6b",
            "object": "model",
            "created": int(time.time()),
            "owned_by": "local",
        }],
    }

@app.post("/v1/chat/completions", dependencies=[Depends(verify_key)])
def chat_completions(req: ChatRequest):
    messages = normalize_messages(req.messages)
    messages = truncate_messages(messages, max_out=req.max_tokens or MAX_TOKENS)
    max_tokens = min(req.max_tokens or MAX_TOKENS, MAX_TOKENS)
    if req.stream:
        def generate():
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=req.temperature or 0.7,
                stream=True,
            )
            for chunk in stream:
                delta = chunk["choices"][0].get("delta", {})
                data = {
                    "id": f"chatcmpl-{uuid.uuid4().hex}",
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": req.model,
                    "choices": [{"delta": delta, "index": 0, "finish_reason": None}],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(generate(), media_type="text/event-stream")
    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=req.temperature or 0.7,
    )
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": result["choices"][0]["message"]["content"],
            },
            "finish_reason": result["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }
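# A hypothetical request against a local run (the port and key value are
# assumptions, not part of this file):
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -H "Authorization: Bearer $API_KEY" \
#     -d '{"model": "qwen3-0.6b", "messages": [{"role": "user", "content": "Hello"}]}'
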
@app.post("/v1/completions", dependencies=[Depends(verify_key)])
def completions(req: CompletionRequest):
    result = llm(
        req.prompt,
        max_tokens=min(req.max_tokens or MAX_TOKENS, MAX_TOKENS),
        temperature=req.temperature or 0.7,
    )
    return {
        "id": f"cmpl-{uuid.uuid4().hex}",
        "object": "text_completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "text": result["choices"][0]["text"],
            "index": 0,
            "finish_reason": result["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }
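
# Minimal local entry point, a sketch rather than part of the original file:
# it assumes uvicorn is installed and port 8000 is free; the deployment may
# launch the app differently (e.g. via a container CMD).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)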