theutkarshjaiswal committed on
Commit
77cc00a
·
verified ·
1 Parent(s): b399677

Upload 3 files

Browse files
Files changed (3) hide show
  1. download_model.py +25 -0
  2. requirements.txt +5 -2
  3. server.py +152 -0
download_model.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Downloads the GGUF model at Docker build time.
Model: Qwen3-0.6B (Q4_K_M quantized) — ~400MB, runs well on CPU
"""
from huggingface_hub import hf_hub_download
import os

MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
MODEL_FILE = "qwen3-0.6b-q4_k_m.gguf"
SAVE_PATH = "/app/model.gguf"

print(f"Downloading {MODEL_FILE} from {MODEL_REPO}...")

# `local_dir_use_symlinks` is intentionally not passed: the huggingface-hub
# release pinned in requirements.txt treats it as deprecated and ignores it
# (local_dir downloads are always real files), so passing it only produces a
# deprecation warning.
path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    local_dir="/app",
)

# Move the download to the fixed path that server.py loads from.
# os.replace overwrites an existing target on every platform, so re-running
# the script over a previous download does not fail.
if path != SAVE_PATH:
    os.replace(path, SAVE_PATH)

print(f"Model saved to {SAVE_PATH}")
requirements.txt CHANGED
@@ -1,2 +1,5 @@
1
- huggingface_hub
2
- llama-cpp-python[server]
 
 
 
 
1
+ llama-cpp-python==0.3.4
2
+ fastapi==0.115.0
3
+ uvicorn==0.30.6
4
+ huggingface-hub==0.24.6
5
+ pydantic==2.8.2
server.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ openclaw-api — OpenAI-compatible LLM API running locally on CPU
3
+ Uses llama-cpp-python with Qwen3-0.6B GGUF model
4
+ """
5
+ import time
6
+ import uuid
7
+ import os
8
+ from fastapi import FastAPI, HTTPException, Depends, Header
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.responses import StreamingResponse
11
+ from pydantic import BaseModel
12
+ from typing import List, Optional, AsyncGenerator
13
+ from llama_cpp import Llama
14
+ import json
15
+
# ─── CONFIG ────────────────────────────────────────────────────────────────
# Location of the GGUF weights inside the container.
MODEL_PATH = "/app/model.gguf"
# Bearer token required by the protected routes; an empty value disables auth.
API_KEY = os.environ.get("API_KEY", "")  # optional: set in HF Secrets
# Context window handed to llama-cpp.
N_CTX = 2048
# Number of CPU threads handed to llama-cpp.
N_THREADS = 4
# ───────────────────────────────────────────────────────────────────────────

app = FastAPI(title="openclaw-api", version="1.0.0")

# CORS is fully open: every origin, method and header is allowed.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# The model is loaded a single time, when this module is imported, and the
# resulting `llm` object is shared by all route handlers.
print("Loading model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
print("Model loaded!")
41
+
# ─── Auth ───────────────────────────────────────────────────────────────────
def verify_key(authorization: Optional[str] = Header(None)):
    """Reject the request unless it carries the configured bearer token.

    When ``API_KEY`` is empty the API is open and every request is accepted.
    Otherwise the ``Authorization`` header must equal ``Bearer <API_KEY>``.

    Raises:
        HTTPException: 401 when a key is configured and the header is missing
            or does not match.
    """
    # Function-scope import keeps this block self-contained.
    import secrets

    if not API_KEY:
        return  # no key set = open

    # Compare as bytes in constant time so response timing does not leak how
    # much of the token matched. A missing header is treated as empty.
    # surrogateescape keeps encoding from raising on unusual environment values.
    expected = f"Bearer {API_KEY}".encode("utf-8", "surrogateescape")
    supplied = (authorization or "").encode("utf-8", "surrogateescape")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
48
+
# ─── Schemas ────────────────────────────────────────────────────────────────
# A single chat turn supplied by the client.
class Message(BaseModel):
    # Speaker of the turn; forwarded to the model unchanged.
    role: str
    # Text of the turn.
    content: str
53
+
# Request body accepted by POST /v1/chat/completions.
class ChatRequest(BaseModel):
    # Model name; only echoed back in the response.
    model: Optional[str] = "qwen3-0.6b"
    # Conversation so far, in order.
    messages: List[Message]
    # Generation limit passed through to llama-cpp.
    max_tokens: Optional[int] = 512
    # Sampling temperature passed through to llama-cpp.
    temperature: Optional[float] = 0.7
    # When true the reply is sent as a text/event-stream response.
    stream: Optional[bool] = False
60
+
# Request body accepted by POST /v1/completions.
class CompletionRequest(BaseModel):
    # Model name; only echoed back in the response.
    model: Optional[str] = "qwen3-0.6b"
    # Raw prompt text handed to the model.
    prompt: str
    # Generation limit passed through to llama-cpp.
    max_tokens: Optional[int] = 512
    # Sampling temperature passed through to llama-cpp.
    temperature: Optional[float] = 0.7
    # Streaming flag supplied by the client.
    stream: Optional[bool] = False
67
+
68
+ # ─── Routes ─────────────────────────────────────────────────────────────────
69
+
@app.get("/")
def root():
    """Return a small static status document; no authentication required."""
    info = {}
    info["status"] = "openclaw-api is running"
    info["model"] = "qwen3-0.6b"
    info["backend"] = "llama-cpp-python (CPU)"
    return info
73
+
@app.get("/v1/models", dependencies=[Depends(verify_key)])
def list_models():
    """Return the model listing: a single hard-coded ``qwen3-0.6b`` entry."""
    # "created" is simply the time of the request.
    model_entry = {
        "id": "qwen3-0.6b",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "local",
    }
    return {"object": "list", "data": [model_entry]}
85
+
@app.post("/v1/chat/completions", dependencies=[Depends(verify_key)])
def chat_completions(req: ChatRequest):
    """Run a chat completion against the local model.

    Args:
        req: Parsed request body (messages, sampling options, stream flag).

    Returns:
        A ``chat.completion`` dict, or, when ``req.stream`` is true, a
        ``StreamingResponse`` emitting ``chat.completion.chunk`` events
        followed by a final ``data: [DONE]`` line.
    """
    messages = [{"role": m.role, "content": m.content} for m in req.messages]

    if req.stream:
        # One id/timestamp for the whole stream: clients correlate chunks of
        # a single completion by their shared id.
        completion_id = f"chatcmpl-{uuid.uuid4().hex}"
        created = int(time.time())

        def generate():
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=req.max_tokens,
                temperature=req.temperature,
                stream=True,
            )
            for chunk in stream:
                choice = chunk["choices"][0]
                data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": req.model,
                    "choices": [{
                        "delta": choice.get("delta", {}),
                        "index": 0,
                        # Forward the backend's finish_reason (None until the
                        # last chunk) instead of always reporting None.
                        "finish_reason": choice.get("finish_reason"),
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
    )
    first_choice = result["choices"][0]

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": first_choice["message"]["content"],
            },
            "finish_reason": first_choice.get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }
133
+
@app.post("/v1/completions", dependencies=[Depends(verify_key)])
def completions(req: CompletionRequest):
    """Run a plain text completion against the local model.

    Args:
        req: Parsed request body (prompt, sampling options, stream flag).

    Returns:
        A ``text_completion`` dict, or, when ``req.stream`` is true, a
        ``StreamingResponse`` emitting one event per generated chunk followed
        by a final ``data: [DONE]`` line.
    """
    if req.stream:
        # Honour the stream flag rather than silently ignoring it; the shape
        # mirrors the streaming branch of the chat endpoint.
        completion_id = f"cmpl-{uuid.uuid4().hex}"
        created = int(time.time())

        def generate():
            stream = llm(
                req.prompt,
                max_tokens=req.max_tokens,
                temperature=req.temperature,
                stream=True,
            )
            for chunk in stream:
                choice = chunk["choices"][0]
                data = {
                    "id": completion_id,
                    "object": "text_completion",
                    "created": created,
                    "model": req.model,
                    "choices": [{
                        "text": choice.get("text", ""),
                        "index": 0,
                        "finish_reason": choice.get("finish_reason"),
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    result = llm(
        req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
    )
    first_choice = result["choices"][0]

    return {
        "id": f"cmpl-{uuid.uuid4().hex}",
        "object": "text_completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "text": first_choice["text"],
            "index": 0,
            "finish_reason": first_choice.get("finish_reason", "stop"),
        }],
        "usage": result.get("usage", {}),
    }