mrmadblack committed on
Commit
e032d80
·
verified ·
1 Parent(s): 0a9db98

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +256 -151
server.py CHANGED
@@ -1,4 +1,10 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
  from huggingface_hub import hf_hub_download
@@ -10,20 +16,43 @@ import json
10
  import time
11
  import hashlib
12
  import threading
 
13
 
14
  app = FastAPI()
15
 
 
16
  # ---------------------------
17
- # MODEL CONFIG
18
  # ---------------------------
19
 
20
- MODEL_NAME = "tinyllama"
21
- MODEL_PATH = "models/tinyllama.gguf"
22
-
23
- MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
24
- MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
25
-
26
- LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
  # ---------------------------
@@ -31,111 +60,168 @@ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
31
  # ---------------------------
32
 
33
  class ChatRequest(BaseModel):
34
- model: str
35
  messages: list
36
  stream: bool = True
 
37
 
38
 
39
  class GenerateRequest(BaseModel):
40
- model: str
41
  prompt: str
 
 
42
 
43
 
44
  # ---------------------------
45
- # PROMPT BUILDER
46
  # ---------------------------
47
 
48
- def build_prompt(messages):
49
-
 
 
 
 
 
50
  prompt = ""
 
 
 
 
51
 
52
  for m in messages:
53
- role = m.get("role")
54
  content = m.get("content", "").strip()
55
-
56
  if not content:
57
  continue
58
-
59
- if role == "user":
60
- prompt += f"<|user|>\n{content}\n"
 
61
  elif role == "assistant":
62
- prompt += f"<|assistant|>\n{content}\n"
63
-
64
- prompt += "<|assistant|>\n"
65
 
 
66
  return prompt
67
 
68
 
69
  # ---------------------------
70
- # DOWNLOAD MODEL
71
  # ---------------------------
72
 
73
- os.makedirs("models", exist_ok=True)
 
 
 
 
 
 
 
 
 
74
 
75
- if not os.path.exists(MODEL_PATH):
76
 
77
- print("Downloading model from HuggingFace...")
 
 
78
 
79
- downloaded = hf_hub_download(
80
- repo_id=MODEL_REPO,
81
- filename=MODEL_FILE
82
- )
83
 
84
- os.system(f"cp {downloaded} {MODEL_PATH}")
 
 
 
 
 
85
 
86
- print("Model ready:", MODEL_PATH)
 
87
 
88
 
89
  # ---------------------------
90
- # START LLAMA SERVER
91
  # ---------------------------
92
 
93
- import os
94
- import subprocess
95
- import requests
96
- import time
97
-
98
- def start_llama():
99
 
100
- print("Starting llama-server...")
101
-
102
- threads = str(os.cpu_count() or 2)
103
 
104
  process = subprocess.Popen([
105
  LLAMA_SERVER,
106
- "-m", MODEL_PATH,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- "--host", "0.0.0.0",
109
- "--port", "8080",
110
 
111
- # context window
112
- "-c", "4096",
113
 
114
- # CPU tuning
115
- "--threads", threads,
116
- "--batch-size", "512",
117
 
118
- # ensure CPU-only
119
- "-ngl", "0"
120
- ])
121
 
122
- # wait for llama-server to be ready
123
- for i in range(30):
124
- try:
125
- r = requests.get("http://localhost:8080/health", timeout=2)
126
- if r.status_code == 200:
127
- print("llama-server ready")
128
- return process
129
- except requests.exceptions.RequestException:
130
- pass
131
 
132
- print(f"waiting for llama-server... ({i+1}/30)")
133
- time.sleep(1)
 
 
 
 
134
 
135
- raise RuntimeError("llama-server failed to start")
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
 
138
- threading.Thread(target=start_llama, daemon=True).start()
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  # ---------------------------
@@ -144,148 +230,167 @@ threading.Thread(target=start_llama, daemon=True).start()
144
 
145
  @app.get("/")
146
  def root():
147
- return {"status": "running"}
148
 
149
 
150
  # ---------------------------
151
- # MODEL LIST (Ollama style)
152
  # ---------------------------
153
 
154
  @app.get("/api/tags")
155
  def tags():
 
 
 
 
 
 
156
 
157
- size = os.path.getsize(MODEL_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- with open(MODEL_PATH, "rb") as f:
160
- digest = hashlib.sha256(f.read()).hexdigest()
161
 
162
- return {
163
- "models": [
164
- {
165
- "name": MODEL_NAME,
166
- "model": MODEL_NAME,
167
- "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
168
- "size": size,
169
- "digest": digest,
170
- "details": {
171
- "format": "gguf",
172
- "family": "llama",
173
- "families": ["llama"],
174
- "parameter_size": "1.1B",
175
- "quantization_level": "Q4_K_M"
176
- }
177
- }
178
- ]
179
- }
180
 
181
 
182
  # ---------------------------
183
- # GENERATE (non-stream)
184
  # ---------------------------
185
 
186
  @app.post("/api/generate")
187
  def generate(req: GenerateRequest):
 
 
 
 
 
188
 
189
  r = requests.post(
190
- "http://localhost:8080/completion",
191
- json={
192
- "prompt": req.prompt,
193
- "n_predict": 256
194
- }
195
  )
196
 
197
- data = r.json()
 
 
 
 
 
 
 
198
 
199
- text = data.get("content", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- return {
202
- "model": req.model,
203
- "response": text,
204
- "done": True
205
- }
206
 
207
 
208
  # ---------------------------
209
- # CHAT (Ollama streaming)
210
  # ---------------------------
211
 
212
  @app.post("/api/chat")
213
  def chat(req: ChatRequest):
214
-
 
215
  prompt = build_prompt(req.messages)
 
 
 
216
 
217
  r = requests.post(
218
- "http://localhost:8080/completion",
219
- json={
220
- "prompt": prompt,
221
- "stream": req.stream,
222
- "n_predict": 1024,
223
- "temperature": 0.7,
224
- "top_p": 0.9,
225
- "top_k": 40,
226
- "repeat_penalty": 1.1
227
- },
228
- stream=req.stream
229
  )
230
 
231
  if not req.stream:
232
-
233
- data = r.json()
234
- text = data.get("content", "")
235
-
236
  return JSONResponse({
237
- "model": req.model,
238
- "message": {
239
- "role": "assistant",
240
- "content": text
241
- },
242
- "done": True
243
  })
244
 
245
- def stream_generator():
246
-
247
  for line in r.iter_lines():
248
-
249
  if not line:
250
  continue
251
-
252
  line = line.decode("utf-8").strip()
253
-
254
  if line.startswith("data:"):
255
  line = line[5:].strip()
256
-
257
  try:
258
  data = json.loads(line)
259
- except:
260
  continue
261
-
262
  token = data.get("content", "")
263
-
264
  yield json.dumps({
265
- "model": req.model,
266
- "message": {
267
- "role": "assistant",
268
- "content": token
269
- },
270
- "done": False
271
  }) + "\n"
272
-
 
273
  yield json.dumps({
274
- "model": req.model,
275
- "done": True,
276
- "done_reason": "stop"
277
  }) + "\n"
278
 
279
- return StreamingResponse(
280
- stream_generator(),
281
- media_type="application/x-ndjson",
282
- headers={"Cache-Control": "no-cache"}
283
- )
284
 
285
 
286
  # ---------------------------
287
- # START API
288
  # ---------------------------
289
 
290
  if __name__ == "__main__":
291
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ """
2
+ Ollama-compatible API server
3
+ Models: Qwen3-0.6B (fast) + Qwen3-1.7B (smart)
4
+ Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
5
+ """
6
+
7
+ from fastapi import FastAPI, HTTPException
8
  from fastapi.responses import StreamingResponse, JSONResponse
9
  from pydantic import BaseModel
10
  from huggingface_hub import hf_hub_download
 
16
  import time
17
  import hashlib
18
  import threading
19
+ from typing import Optional
20
 
21
  app = FastAPI()
22
 
23
+
24
  # ---------------------------
25
+ # MODEL CONFIGS
26
  # ---------------------------
27
 
28
+ MODELS = {
29
+ "qwen3.5-0.8b": {
30
+ "path": "models/qwen3.5-0.8b.gguf",
31
+ "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
32
+ "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
33
+ "port": 8080,
34
+ "param_size": "0.8B",
35
+ "family": "qwen3.5",
36
+ # tight tuning for speed on 2 vCPU
37
+ "threads": 2,
38
+ "ctx": 2048,
39
+ "batch": 512,
40
+ },
41
+ "qwen3.5-2b": {
42
+ "path": "models/qwen3.5-2b.gguf",
43
+ "repo": "bartowski/Qwen_Qwen3.5-2B-GGUF",
44
+ "file": "Qwen_Qwen3.5-2B-Q4_K_M.gguf",
45
+ "port": 8081,
46
+ "param_size": "2B",
47
+ "family": "qwen3.5",
48
+ "threads": 2,
49
+ "ctx": 2048,
50
+ "batch": 512,
51
+ },
52
+ }
53
+
54
+ DEFAULT_MODEL = "qwen3.5-0.8b"
55
+ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
56
 
57
 
58
  # ---------------------------
 
60
  # ---------------------------
61
 
62
  class ChatRequest(BaseModel):
63
+ model: str = DEFAULT_MODEL
64
  messages: list
65
  stream: bool = True
66
+ options: Optional[dict] = None
67
 
68
 
69
  class GenerateRequest(BaseModel):
70
+ model: str = DEFAULT_MODEL
71
  prompt: str
72
+ stream: bool = False
73
+ options: Optional[dict] = None
74
 
75
 
76
  # ---------------------------
77
+ # PROMPT BUILDER (Qwen3 ChatML)
78
  # ---------------------------
79
 
80
+ def build_prompt(messages: list) -> str:
81
+ """
82
+ Qwen3 uses ChatML format:
83
+ <|im_start|>system\n…<|im_end|>
84
+ <|im_start|>user\n…<|im_end|>
85
+ <|im_start|>assistant\n
86
+ """
87
  prompt = ""
88
+ has_system = any(m.get("role") == "system" for m in messages)
89
+
90
+ if not has_system:
91
+ prompt += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
92
 
93
  for m in messages:
94
+ role = m.get("role", "user")
95
  content = m.get("content", "").strip()
 
96
  if not content:
97
  continue
98
+ if role == "system":
99
+ prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
100
+ elif role == "user":
101
+ prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
102
  elif role == "assistant":
103
+ prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
 
 
104
 
105
+ prompt += "<|im_start|>assistant\n"
106
  return prompt
107
 
108
 
109
  # ---------------------------
110
+ # MODEL RESOLVER
111
  # ---------------------------
112
 
113
+ def resolve_model(name: str) -> dict:
114
+ """Fuzzy match model name → config. Falls back to default."""
115
+ name = (name or DEFAULT_MODEL).lower().strip()
116
+ if name in MODELS:
117
+ return MODELS[name]
118
+ # partial match
119
+ for key, cfg in MODELS.items():
120
+ if key in name or name in key:
121
+ return cfg
122
+ return MODELS[DEFAULT_MODEL]
123
 
 
124
 
125
+ # ---------------------------
126
+ # DOWNLOAD MODELS
127
+ # ---------------------------
128
 
129
+ os.makedirs("models", exist_ok=True)
 
 
 
130
 
131
+ def download_model(cfg: dict):
132
+ if not os.path.exists(cfg["path"]):
133
+ print(f"Downloading {cfg['file']} ...")
134
+ downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])
135
+ os.system(f"cp '{downloaded}' '{cfg['path']}'")
136
+ print(f" ✓ saved to {cfg['path']}")
137
 
138
+ for m in MODELS.values():
139
+ download_model(m)
140
 
141
 
142
  # ---------------------------
143
+ # START LLAMA SERVERS
144
  # ---------------------------
145
 
146
+ _server_ready: dict[str, bool] = {k: False for k in MODELS}
 
 
 
 
 
147
 
148
+ def start_llama(model_name: str, cfg: dict):
149
+ print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
 
150
 
151
  process = subprocess.Popen([
152
  LLAMA_SERVER,
153
+ "-m", cfg["path"],
154
+ "--host", "0.0.0.0",
155
+ "--port", str(cfg["port"]),
156
+ "-c", str(cfg["ctx"]),
157
+ "--threads", str(cfg["threads"]),
158
+ "--batch-size",str(cfg["batch"]),
159
+ "-ngl", "0", # CPU only
160
+ "--mlock", # pin model in RAM → no swap
161
+ "--flash-attn", # faster attention (if supported, harmless if not)
162
+ "-np", "1", # 1 parallel slot (we only have 2 CPUs)
163
+ "--no-mmap", # mlock + no-mmap = fastest cold reads
164
+ ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
165
+
166
+ url = f"http://localhost:{cfg['port']}/health"
167
+ for i in range(60):
168
+ try:
169
+ r = requests.get(url, timeout=2)
170
+ if r.status_code == 200:
171
+ _server_ready[model_name] = True
172
+ print(f" ✓ {model_name} ready")
173
+ return process
174
+ except Exception:
175
+ pass
176
+ print(f" waiting for {model_name}... ({i+1}/60)")
177
+ time.sleep(2)
178
 
179
+ print(f" {model_name} failed to start")
180
+ return None
181
 
 
 
182
 
183
+ for name, cfg in MODELS.items():
184
+ threading.Thread(target=start_llama, args=(name, cfg), daemon=True).start()
 
185
 
 
 
 
186
 
187
+ # ---------------------------
188
+ # HELPERS
189
+ # ---------------------------
 
 
 
 
 
 
190
 
191
+ def model_meta(name: str, cfg: dict) -> dict:
192
+ size = os.path.getsize(cfg["path"]) if os.path.exists(cfg["path"]) else 0
193
+ digest = ""
194
+ if os.path.exists(cfg["path"]):
195
+ with open(cfg["path"], "rb") as f:
196
+ digest = hashlib.md5(f.read(65536)).hexdigest() # partial hash for speed
197
 
198
+ return {
199
+ "name": name,
200
+ "model": name,
201
+ "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
202
+ "size": size,
203
+ "digest": f"sha256:{digest}",
204
+ "details": {
205
+ "format": "gguf",
206
+ "family": cfg["family"],
207
+ "families": [cfg["family"]],
208
+ "parameter_size": cfg["param_size"],
209
+ "quantization_level": "Q4_K_M",
210
+ },
211
+ }
212
 
213
 
214
+ def llama_params(options: Optional[dict]) -> dict:
215
+ """Map Ollama options → llama.cpp completion params."""
216
+ o = options or {}
217
+ return {
218
+ "temperature": o.get("temperature", 0.7),
219
+ "top_p": o.get("top_p", 0.9),
220
+ "top_k": o.get("top_k", 40),
221
+ "repeat_penalty": o.get("repeat_penalty", 1.1),
222
+ "n_predict": o.get("num_predict", 1024),
223
+ "stop": o.get("stop", ["<|im_end|>", "<|endoftext|>"]),
224
+ }
225
 
226
 
227
  # ---------------------------
 
230
 
231
  @app.get("/")
232
  def root():
233
+ return {"status": "running", "models": list(MODELS.keys())}
234
 
235
 
236
  # ---------------------------
237
+ # /api/tags — model list
238
  # ---------------------------
239
 
240
  @app.get("/api/tags")
241
  def tags():
242
+ return {"models": [model_meta(n, c) for n, c in MODELS.items()]}
243
+
244
+
245
+ # ---------------------------
246
+ # /api/show — model detail (needed by some UIs)
247
+ # ---------------------------
248
 
249
+ @app.post("/api/show")
250
+ def show(body: dict):
251
+ name = body.get("name", DEFAULT_MODEL)
252
+ cfg = resolve_model(name)
253
+ meta = model_meta(name, cfg)
254
+ meta["modelfile"] = f"FROM {name}\n"
255
+ meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
256
+ meta["template"] = (
257
+ "<|im_start|>system\n{{ .System }}<|im_end|>\n"
258
+ "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
259
+ "<|im_start|>assistant\n"
260
+ )
261
+ return meta
262
 
 
 
263
 
264
+ # ---------------------------
265
+ # /api/ps — running models
266
+ # ---------------------------
267
+
268
+ @app.get("/api/ps")
269
+ def ps():
270
+ running = []
271
+ for name, cfg in MODELS.items():
272
+ if _server_ready.get(name):
273
+ m = model_meta(name, cfg)
274
+ m["expires_at"] = "0001-01-01T00:00:00Z"
275
+ m["size_vram"] = 0
276
+ running.append(m)
277
+ return {"models": running}
 
 
 
 
278
 
279
 
280
  # ---------------------------
281
+ # /api/generate
282
  # ---------------------------
283
 
284
  @app.post("/api/generate")
285
  def generate(req: GenerateRequest):
286
+ cfg = resolve_model(req.model)
287
+ port = cfg["port"]
288
+ params = llama_params(req.options)
289
+ params["prompt"] = req.prompt
290
+ params["stream"] = req.stream
291
 
292
  r = requests.post(
293
+ f"http://localhost:{port}/completion",
294
+ json=params,
295
+ stream=req.stream,
296
+ timeout=120,
 
297
  )
298
 
299
+ if not req.stream:
300
+ text = r.json().get("content", "").strip()
301
+ return {
302
+ "model": req.model,
303
+ "response": text,
304
+ "done": True,
305
+ "done_reason":"stop",
306
+ }
307
 
308
+ def stream_gen():
309
+ for line in r.iter_lines():
310
+ if not line:
311
+ continue
312
+ line = line.decode("utf-8").strip()
313
+ if line.startswith("data:"):
314
+ line = line[5:].strip()
315
+ try:
316
+ data = json.loads(line)
317
+ except Exception:
318
+ continue
319
+ token = data.get("content", "")
320
+ done = data.get("stop", False)
321
+ yield json.dumps({
322
+ "model": req.model,
323
+ "response": token,
324
+ "done": done,
325
+ }) + "\n"
326
+ if done:
327
+ break
328
+ yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"
329
 
330
+ return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
331
+ headers={"Cache-Control": "no-cache"})
 
 
 
332
 
333
 
334
  # ---------------------------
335
+ # /api/chat
336
  # ---------------------------
337
 
338
  @app.post("/api/chat")
339
  def chat(req: ChatRequest):
340
+ cfg = resolve_model(req.model)
341
+ port = cfg["port"]
342
  prompt = build_prompt(req.messages)
343
+ params = llama_params(req.options)
344
+ params["prompt"] = prompt
345
+ params["stream"] = req.stream
346
 
347
  r = requests.post(
348
+ f"http://localhost:{port}/completion",
349
+ json=params,
350
+ stream=req.stream,
351
+ timeout=120,
 
 
 
 
 
 
 
352
  )
353
 
354
  if not req.stream:
355
+ text = r.json().get("content", "").strip()
 
 
 
356
  return JSONResponse({
357
+ "model": req.model,
358
+ "message": {"role": "assistant", "content": text},
359
+ "done": True,
360
+ "done_reason": "stop",
 
 
361
  })
362
 
363
+ def stream_gen():
 
364
  for line in r.iter_lines():
 
365
  if not line:
366
  continue
 
367
  line = line.decode("utf-8").strip()
 
368
  if line.startswith("data:"):
369
  line = line[5:].strip()
 
370
  try:
371
  data = json.loads(line)
372
+ except Exception:
373
  continue
 
374
  token = data.get("content", "")
375
+ done = data.get("stop", False)
376
  yield json.dumps({
377
+ "model": req.model,
378
+ "message": {"role": "assistant", "content": token},
379
+ "done": done,
 
 
 
380
  }) + "\n"
381
+ if done:
382
+ break
383
  yield json.dumps({
384
+ "model": req.model, "done": True, "done_reason": "stop"
 
 
385
  }) + "\n"
386
 
387
+ return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
388
+ headers={"Cache-Control": "no-cache"})
 
 
 
389
 
390
 
391
  # ---------------------------
392
+ # START
393
  # ---------------------------
394
 
395
  if __name__ == "__main__":
396
+ uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)