Spaces:
Sleeping
Sleeping
File size: 7,688 Bytes
"""
openclaw-api — OpenAI-compatible LLM API running locally on CPU
Uses llama-cpp-python with Qwen3-0.6B GGUF model
"""
import time
import uuid
import os
import json
from fastapi import FastAPI, HTTPException, Depends, Header
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, ConfigDict
from typing import List, Optional, Any
from llama_cpp import Llama
# ─── CONFIG ────────────────────────────────────────────────────────────────
# Location of the GGUF weights file given to Llama() at start-up.
MODEL_PATH = "/app/model.gguf"
# Bearer token clients must present; an empty value switches auth off.
API_KEY = os.getenv("API_KEY", "")
# Context window in tokens (raised from 2048 so OpenClaw's system prompt fits).
N_CTX = 8192
# Thread count passed to Llama as n_threads.
N_THREADS = 4
# Upper bound on tokens generated for a single response.
MAX_TOKENS = 512
# ───────────────────────────────────────────────────────────────────────────
# ASGI application object that the route decorators below attach to.
app = FastAPI(version="1.0.0", title="openclaw-api")

# CORS is left fully open: any origin, any method, any request header.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# The model is loaded once, at import time, and shared by every request handler.
print("Loading model...")
llm = Llama(verbose=False, n_threads=N_THREADS, n_ctx=N_CTX, model_path=MODEL_PATH)
print("Model loaded!")
# ─── Auth ──────────────────────────────────────────────────────────────────
def verify_key(authorization: Optional[str] = Header(None)):
    """Reject the request unless it carries the configured bearer token.

    Intended for use as a FastAPI dependency (``Depends(verify_key)``).

    Args:
        authorization: Value of the ``Authorization`` request header, or
            ``None`` when the client did not send one.

    Raises:
        HTTPException: 401 when ``API_KEY`` is non-empty and the header is
            missing or is not exactly ``"Bearer <API_KEY>"``.
    """
    import hmac  # function-scope import keeps this block self-contained

    # An empty API_KEY means authentication is disabled.
    if not API_KEY:
        return

    expected = f"Bearer {API_KEY}".encode("utf-8")
    supplied = (authorization or "").encode("utf-8")

    # Constant-time comparison: a plain != can return earlier the sooner the
    # strings differ, which leaks information about the secret via timing.
    # Bytes are compared because compare_digest rejects non-ASCII str.
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(
            status_code=401,
            detail="Unauthorized",
            headers={"WWW-Authenticate": "Bearer"},
        )
# ─── Schemas ───────────────────────────────────────────────────────────────
class Message(BaseModel):
    """A single chat message from the request body.

    Fields not declared here are accepted rather than rejected
    (``extra="allow"``).
    """
    model_config = ConfigDict(extra="allow")
    # Speaker role; "system" is the only value this file treats specially.
    role: str
    # Either plain text or a list of content-part dicts; normalize_messages
    # flattens it to a string before it reaches the model.
    content: Any
class ChatRequest(BaseModel):
    """Request body for ``POST /v1/chat/completions``.

    Fields not declared here are accepted rather than rejected
    (``extra="allow"``).
    """
    model_config = ConfigDict(extra="allow")
    # Echoed back in the response "model" field.
    model: Optional[str] = "qwen3-0.6b"
    # Conversation so far, oldest first.
    messages: List[Message]
    # Requested generation length; the handler never generates more than
    # MAX_TOKENS regardless of this value.
    max_tokens: Optional[int] = MAX_TOKENS
    temperature: Optional[float] = 0.7
    # When true the handler answers with a text/event-stream response.
    stream: Optional[bool] = False
    # Optional OpenAI-style sampling parameters; None means "not supplied".
    top_p: Optional[float] = None
    frequency_penalty: Optional[float] = None
    presence_penalty: Optional[float] = None
    stop: Optional[Any] = None
class CompletionRequest(BaseModel):
    """Request body for ``POST /v1/completions``.

    Fields not declared here are accepted rather than rejected
    (``extra="allow"``).
    """
    model_config = ConfigDict(extra="allow")
    # Echoed back in the response "model" field.
    model: Optional[str] = "qwen3-0.6b"
    # Raw text handed to the model as-is.
    prompt: str
    # Requested generation length; the handler never generates more than
    # MAX_TOKENS regardless of this value.
    max_tokens: Optional[int] = MAX_TOKENS
    temperature: Optional[float] = 0.7
    stream: Optional[bool] = False
# ─── Helpers ───────────────────────────────────────────────────────────────
def normalize_messages(messages: List[Message]) -> List[dict]:
    """Convert request messages to plain ``{"role", "content"}`` dicts.

    ``content`` is always flattened to a string:

    * ``None`` becomes ``""`` (not the literal text ``"None"``).
    * A list is treated as content parts: the ``text`` of every dict part
      whose ``type`` is ``"text"`` is joined with single spaces; all other
      parts are dropped. A missing or null ``text`` contributes ``""``.
    * Anything else is passed through ``str()``.

    Args:
        messages: Parsed request messages.

    Returns:
        One new dict per input message, in the same order.
    """
    normalized = []
    for message in messages:
        content = message.content
        if content is None:
            text = ""
        elif isinstance(content, list):
            # str() guards the join against non-string "text" values, which
            # would otherwise raise TypeError.
            text = " ".join(
                "" if part.get("text") is None else str(part["text"])
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
        else:
            text = str(content)
        normalized.append({"role": message.role, "content": text})
    return normalized
def truncate_messages(messages: List[dict], max_ctx: int = N_CTX, max_out: int = MAX_TOKENS) -> List[dict]:
    """
    Trim a conversation so that it roughly fits the context window.

    Token usage is approximated as 3 characters per token. The prompt budget
    is ``max_ctx - max_out - 256`` tokens (256 is a safety margin) and is
    never allowed to go below zero.

    System messages are always kept and are returned first. Each one longer
    than half the character budget is cut to that length and gets a
    ``[truncated]`` marker line appended. The remaining messages are kept
    newest-first until adding the next one would exceed the budget; every
    older message is dropped.

    Args:
        messages: Dicts with string ``"role"`` and ``"content"`` values.
        max_ctx: Context window size in tokens.
        max_out: Tokens reserved for the generated reply.

    Returns:
        A new list: system messages followed by the retained messages in
        their original order. Neither the input list nor its dicts are
        modified.
    """
    # Clamp at zero: a negative budget would become a negative slice bound
    # below and cut characters from the wrong end of the system prompt.
    budget = max(max_ctx - max_out - 256, 0)
    char_budget = budget * 3  # rough chars-per-token estimate
    system_cap = char_budget // 2

    system_msgs = []
    other_msgs = []
    for m in messages:
        if m["role"] != "system":
            other_msgs.append(m)
            continue
        if len(m["content"]) > system_cap:
            # Copy instead of editing in place so the caller's dict is untouched.
            m = {**m, "content": m["content"][:system_cap] + "\n[truncated]"}
        system_msgs.append(m)

    # Walk from newest to oldest, stopping at the first message that does
    # not fit; append + reverse avoids the O(n^2) cost of insert(0, ...).
    used = sum(len(m["content"]) for m in system_msgs)
    kept = []
    for m in reversed(other_msgs):
        used += len(m["content"])
        if used > char_budget:
            break
        kept.append(m)
    kept.reverse()
    return system_msgs + kept
# ─── Routes ────────────────────────────────────────────────────────────────
@app.get("/")
def root():
    """Status endpoint (no auth dependency): service, model, backend, n_ctx."""
    info = {
        "status": "openclaw-api is running",
        "model": "qwen3-0.6b",
        "backend": "llama-cpp-python (CPU)",
        "n_ctx": N_CTX,
    }
    return info
@app.get("/v1/models", dependencies=[Depends(verify_key)])
def list_models():
    """Return an OpenAI-style model list containing the single local model.

    ``created`` is the time of the call, not of the model.
    """
    now = int(time.time())
    entry = {
        "id": "qwen3-0.6b",
        "object": "model",
        "created": now,
        "owned_by": "local",
    }
    listing = {"object": "list", "data": [entry]}
    return listing
@app.post("/v1/chat/completions", dependencies=[Depends(verify_key)])
def chat_completions(req: ChatRequest):
    """Handle an OpenAI-style chat completion request.

    Messages are normalized to plain strings and trimmed to the context
    budget, then passed to ``llm.create_chat_completion``.

    Args:
        req: Parsed request body.

    Returns:
        A ``chat.completion`` dict, or, when ``req.stream`` is true, a
        ``text/event-stream`` response of ``chat.completion.chunk`` events
        terminated by ``data: [DONE]``.
    """
    # Resolve the output length first so truncation budgets for what will
    # actually be generated. Missing, zero or negative values fall back to
    # the default; a negative value must never reach the model uncapped.
    requested = req.max_tokens if (req.max_tokens or 0) > 0 else MAX_TOKENS
    max_tokens = min(requested, MAX_TOKENS)

    messages = normalize_messages(req.messages)
    messages = truncate_messages(messages, max_out=max_tokens)

    params = {
        "messages": messages,
        "max_tokens": max_tokens,
        # "is None", not "or": temperature 0 is a valid (greedy) setting.
        "temperature": 0.7 if req.temperature is None else req.temperature,
    }
    # Forward optional sampling controls only when the client supplied them,
    # so the library's own defaults apply otherwise.
    for name in ("top_p", "frequency_penalty", "presence_penalty"):
        value = getattr(req, name)
        if value is not None:
            params[name] = value
    # "stop" is typed Any in the schema; pass it on only in the shapes the
    # OpenAI API defines (a string or a list of strings).
    stop = req.stop
    if isinstance(stop, str) or (
        isinstance(stop, list) and all(isinstance(s, str) for s in stop)
    ):
        params["stop"] = stop

    # NOTE(review): the shared `llm` object is used without a lock; confirm
    # that concurrent requests are safe with llama-cpp-python.

    if req.stream:
        # Every chunk of one completion shares the same id and timestamp.
        completion_id = f"chatcmpl-{uuid.uuid4().hex}"
        created = int(time.time())

        def generate():
            for chunk in llm.create_chat_completion(stream=True, **params):
                choice = chunk["choices"][0]
                data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": req.model,
                    "choices": [{
                        "delta": choice.get("delta", {}),
                        "index": 0,
                        # Pass the backend's value through so the final
                        # chunk tells the client why generation ended.
                        "finish_reason": choice.get("finish_reason"),
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    result = llm.create_chat_completion(**params)
    choice = result["choices"][0]
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": choice["message"]["content"],
            },
            "finish_reason": choice.get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }
@app.post("/v1/completions", dependencies=[Depends(verify_key)])
def completions(req: CompletionRequest):
    """Handle an OpenAI-style text completion request.

    The prompt is passed to the model unchanged.

    Args:
        req: Parsed request body.

    Returns:
        A ``text_completion`` dict, or, when ``req.stream`` is true, a
        ``text/event-stream`` response of ``text_completion`` chunks
        terminated by ``data: [DONE]``.
    """
    # Missing, zero or negative values fall back to the default so a
    # negative max_tokens cannot bypass the MAX_TOKENS cap.
    requested = req.max_tokens if (req.max_tokens or 0) > 0 else MAX_TOKENS
    max_tokens = min(requested, MAX_TOKENS)
    # "is None", not "or": temperature 0 is a valid (greedy) setting.
    temperature = 0.7 if req.temperature is None else req.temperature

    if req.stream:
        # Every chunk of one completion shares the same id and timestamp.
        completion_id = f"cmpl-{uuid.uuid4().hex}"
        created = int(time.time())

        def generate():
            chunks = llm(
                req.prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )
            for chunk in chunks:
                choice = chunk["choices"][0]
                data = {
                    "id": completion_id,
                    "object": "text_completion",
                    "created": created,
                    "model": req.model,
                    "choices": [{
                        "text": choice.get("text", ""),
                        "index": 0,
                        "finish_reason": choice.get("finish_reason"),
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    result = llm(
        req.prompt,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    choice = result["choices"][0]
    return {
        "id": f"cmpl-{uuid.uuid4().hex}",
        "object": "text_completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "text": choice["text"],
            "index": 0,
            "finish_reason": choice.get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }