mrmadblack committed on
Commit
4b6c283
Β·
verified Β·
1 Parent(s): 325785f

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +94 -50
server.py CHANGED
@@ -1,13 +1,11 @@
1
  """
2
  Ollama-compatible API server
3
- Models: Qwen3.5-0.8B (fast) + Qwen3.5-2B (smart)
 
 
 
 
4
  Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
5
-
6
- FIXES vs previous version:
7
- 1. Removed --flash-attn / --mlock / --no-mmap (not all llama.cpp builds support them β€” caused silent crash)
8
- 2. llama-server logs go to llama_<model>.log so errors are visible in HF Space terminal
9
- 3. /api/chat and /api/generate now WAIT up to 120s for server readiness
10
- instead of immediately crashing with ConnectionRefused
11
  """
12
 
13
  from fastapi import FastAPI, HTTPException
@@ -32,31 +30,45 @@ app = FastAPI()
32
  # ---------------------------
33
 
34
  MODELS = {
35
- "qwen3.5-0.8b": {
36
- "path": "models/qwen3.5-0.8b.gguf",
37
- "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
38
- "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
39
  "port": 8080,
40
- "param_size": "0.8B",
41
- "family": "qwen3.5",
 
42
  "threads": 2,
43
  "ctx": 2048,
44
  "batch": 512,
45
  },
46
- "qwen3.5-2b": {
47
- "path": "models/qwen3.5-2b.gguf",
48
- "repo": "bartowski/Qwen_Qwen3.5-2B-GGUF",
49
- "file": "Qwen_Qwen3.5-2B-Q4_K_M.gguf",
50
  "port": 8081,
51
- "param_size": "2B",
52
- "family": "qwen3.5",
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  "threads": 2,
54
  "ctx": 2048,
55
  "batch": 512,
56
  },
57
  }
58
 
59
- DEFAULT_MODEL = "qwen3.5-0.8b"
60
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
61
 
62
 
@@ -79,10 +91,26 @@ class GenerateRequest(BaseModel):
79
 
80
 
81
  # ---------------------------
82
- # PROMPT BUILDER (Qwen3.5 ChatML)
83
  # ---------------------------
84
 
85
- def build_prompt(messages: list) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  prompt = ""
87
  has_system = any(m.get("role") == "system" for m in messages)
88
  if not has_system:
@@ -144,7 +172,6 @@ _server_ready: dict = {k: False for k in MODELS}
144
  def start_llama(model_name: str, cfg: dict):
145
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
146
 
147
- # FIX 1: Write logs to file β€” safe flags only, no --flash-attn/--mlock/--no-mmap
148
  log = open(f"llama_{model_name}.log", "w")
149
 
150
  process = subprocess.Popen([
@@ -161,7 +188,7 @@ def start_llama(model_name: str, cfg: dict):
161
 
162
  url = f"http://localhost:{cfg['port']}/health"
163
 
164
- for i in range(90): # up to 3 min
165
  time.sleep(2)
166
  try:
167
  r = requests.get(url, timeout=2)
@@ -172,7 +199,7 @@ def start_llama(model_name: str, cfg: dict):
172
  except Exception:
173
  pass
174
 
175
- # FIX 2: Echo last log line so HF Space logs show real llama-server output
176
  try:
177
  with open(f"llama_{model_name}.log") as lf:
178
  lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
@@ -189,15 +216,11 @@ for name, cfg in MODELS.items():
189
 
190
 
191
  # ---------------------------
192
- # READINESS GUARD ← KEY FIX
193
  # ---------------------------
194
 
195
  def wait_for_model(model_key: str, timeout: int = 120):
196
- """
197
- FIX 3: Block the incoming request until the llama-server is ready.
198
- Instead of crashing with ConnectionRefused, the client gets a clean
199
- response once the model is loaded (or a 503 if it never comes up).
200
- """
201
  deadline = time.time() + timeout
202
  while time.time() < deadline:
203
  if _server_ready.get(model_key):
@@ -235,15 +258,22 @@ def model_meta(name: str, cfg: dict) -> dict:
235
  }
236
 
237
 
238
- def llama_params(options: Optional[dict]) -> dict:
239
  o = options or {}
 
 
 
 
 
 
 
240
  return {
241
  "temperature": o.get("temperature", 0.7),
242
  "top_p": o.get("top_p", 0.9),
243
  "top_k": o.get("top_k", 40),
244
  "repeat_penalty": o.get("repeat_penalty", 1.1),
245
  "n_predict": o.get("num_predict", 1024),
246
- "stop": o.get("stop", ["<|im_end|>", "<|endoftext|>"]),
247
  }
248
 
249
 
@@ -253,7 +283,15 @@ def llama_params(options: Optional[dict]) -> dict:
253
 
254
  @app.get("/")
255
  def root():
256
- return {"status": "running", "models_ready": dict(_server_ready)}
 
 
 
 
 
 
 
 
257
 
258
 
259
  # ---------------------------
@@ -276,11 +314,17 @@ def show(body: dict):
276
  meta = model_meta(key, cfg)
277
  meta["modelfile"] = f"FROM {key}\n"
278
  meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
279
- meta["template"] = (
280
- "<|im_start|>system\n{{ .System }}<|im_end|>\n"
281
- "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
282
- "<|im_start|>assistant\n"
283
- )
 
 
 
 
 
 
284
  return meta
285
 
286
 
@@ -306,18 +350,18 @@ def ps():
306
 
307
  @app.post("/api/generate")
308
  def generate(req: GenerateRequest):
309
- key = resolve_model(req.model)
310
- cfg = MODELS[key]
311
 
312
- wait_for_model(key) # ← blocks until ready, not crash
313
 
314
- params = llama_params(req.options)
315
  params["prompt"] = req.prompt
316
  params["stream"] = req.stream
317
 
318
  r = requests.post(
319
  f"http://localhost:{cfg['port']}/completion",
320
- json=params, stream=req.stream, timeout=120,
321
  )
322
 
323
  if not req.stream:
@@ -352,19 +396,19 @@ def generate(req: GenerateRequest):
352
 
353
  @app.post("/api/chat")
354
  def chat(req: ChatRequest):
355
- key = resolve_model(req.model)
356
- cfg = MODELS[key]
357
 
358
- wait_for_model(key) # ← blocks until ready, not crash
359
 
360
- prompt = build_prompt(req.messages)
361
- params = llama_params(req.options)
362
  params["prompt"] = prompt
363
  params["stream"] = req.stream
364
 
365
  r = requests.post(
366
  f"http://localhost:{cfg['port']}/completion",
367
- json=params, stream=req.stream, timeout=120,
368
  )
369
 
370
  if not req.stream:
 
1
  """
2
  Ollama-compatible API server
3
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
+ ⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
5
+ 🧠 qwen3-4b β†’ thinking, hard problems (port 8081)
6
+ 🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
8
  Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
 
 
 
 
 
 
9
  """
10
 
11
  from fastapi import FastAPI, HTTPException
 
30
  # ---------------------------
31
 
32
  MODELS = {
33
+ "qwen2.5-coder-1.5b": { # ⚑ FAST β€” coding, snippets, quick replies
34
+ "path": "models/qwen2.5-coder-1.5b.gguf",
35
+ "repo": "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF",
36
+ "file": "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf",
37
  "port": 8080,
38
+ "param_size": "1.5B",
39
+ "family": "qwen2.5",
40
+ "fmt": "chatml",
41
  "threads": 2,
42
  "ctx": 2048,
43
  "batch": 512,
44
  },
45
+ "qwen3-4b": { # 🧠 THINKING β€” hard bugs, architecture, logic (/think)
46
+ "path": "models/qwen3-4b.gguf",
47
+ "repo": "bartowski/Qwen_Qwen3-4B-GGUF",
48
+ "file": "Qwen_Qwen3-4B-Q4_K_M.gguf",
49
  "port": 8081,
50
+ "param_size": "4B",
51
+ "family": "qwen3",
52
+ "fmt": "chatml",
53
+ "threads": 2,
54
+ "ctx": 2048,
55
+ "batch": 512,
56
+ },
57
+ "gemma3-4b": { # 🌐 GENERAL β€” translation, Tamil↔English, daily chat
58
+ "path": "models/gemma3-4b.gguf",
59
+ "repo": "bartowski/google_gemma-3-4b-it-GGUF",
60
+ "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
61
+ "port": 8082,
62
+ "param_size": "4B",
63
+ "family": "gemma3",
64
+ "fmt": "gemma",
65
  "threads": 2,
66
  "ctx": 2048,
67
  "batch": 512,
68
  },
69
  }
70
 
71
+ DEFAULT_MODEL = "qwen2.5-coder-1.5b"
72
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
73
 
74
 
 
91
 
92
 
93
  # ---------------------------
94
+ # PROMPT BUILDER
95
  # ---------------------------
96
 
97
+ def build_prompt(messages: list, fmt: str = "chatml") -> str:
98
+
99
+ # ── Gemma3 format ──────────────────────────────────────────
100
+ # <bos><start_of_turn>user\n…<end_of_turn>\n<start_of_turn>model\n
101
+ if fmt == "gemma":
102
+ prompt = "<bos>"
103
+ for m in messages:
104
+ role = m.get("role", "user")
105
+ content = m.get("content", "").strip()
106
+ if not content or role == "system":
107
+ continue # Gemma3 has no system role
108
+ turn = "user" if role == "user" else "model"
109
+ prompt += f"<start_of_turn>{turn}\n{content}<end_of_turn>\n"
110
+ prompt += "<start_of_turn>model\n"
111
+ return prompt
112
+
113
+ # ── ChatML format (Qwen2.5-Coder, Qwen3) ───────────────────
114
  prompt = ""
115
  has_system = any(m.get("role") == "system" for m in messages)
116
  if not has_system:
 
172
  def start_llama(model_name: str, cfg: dict):
173
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
174
 
 
175
  log = open(f"llama_{model_name}.log", "w")
176
 
177
  process = subprocess.Popen([
 
188
 
189
  url = f"http://localhost:{cfg['port']}/health"
190
 
191
+ for i in range(90): # wait up to 3 min
192
  time.sleep(2)
193
  try:
194
  r = requests.get(url, timeout=2)
 
199
  except Exception:
200
  pass
201
 
202
+ # Echo last log line so HF Space logs show real llama-server progress
203
  try:
204
  with open(f"llama_{model_name}.log") as lf:
205
  lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
 
216
 
217
 
218
  # ---------------------------
219
+ # READINESS GUARD
220
  # ---------------------------
221
 
222
  def wait_for_model(model_key: str, timeout: int = 120):
223
+ """Block the request until the llama-server is ready."""
 
 
 
 
224
  deadline = time.time() + timeout
225
  while time.time() < deadline:
226
  if _server_ready.get(model_key):
 
258
  }
259
 
260
 
261
def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict:
    """Translate Ollama-style request options into llama.cpp /completion params.

    Args:
        options: The "options" dict from an Ollama-compatible request body;
            may be None when the client sent none.
        fmt: Prompt-template family of the target model — "gemma" or
            "chatml" (Qwen2.5-Coder, Qwen3).

    Returns:
        Sampling-parameter dict understood by llama-server's /completion
        endpoint.  Client-supplied values win; sensible defaults otherwise.
    """
    o = options or {}

    # Stop tokens differ per model family.
    # FIX: "</think>" was previously in the ChatML default stop list, but a
    # Qwen3 thinking model emits "<think>...</think>" and only THEN the
    # visible answer — stopping at "</think>" truncated every thinking-mode
    # response before the actual reply.  Removed; "<|im_end|>" is the real
    # end-of-turn token for ChatML models.
    if fmt == "gemma":
        default_stop = ["<end_of_turn>", "<eos>"]
    else:
        default_stop = ["<|im_end|>", "<|endoftext|>"]

    return {
        "temperature": o.get("temperature", 0.7),
        "top_p": o.get("top_p", 0.9),
        "top_k": o.get("top_k", 40),
        "repeat_penalty": o.get("repeat_penalty", 1.1),
        # Ollama calls it num_predict; llama.cpp calls it n_predict.
        "n_predict": o.get("num_predict", 1024),
        "stop": o.get("stop", default_stop),
    }
278
 
279
 
 
283
 
284
@app.get("/")
def root():
    """Landing endpoint: backend readiness plus a model-routing cheat sheet."""
    usage_hint = {
        "fast coding": "qwen2.5-coder-1.5b",
        "thinking": "qwen3-4b (add /think to your message)",
        "translation": "gemma3-4b",
    }
    # Snapshot readiness flags so the response isn't a live mutable view.
    return {
        "status": "running",
        "models_ready": dict(_server_ready),
        "usage": usage_hint,
    }
295
 
296
 
297
  # ---------------------------
 
314
  meta = model_meta(key, cfg)
315
  meta["modelfile"] = f"FROM {key}\n"
316
  meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
317
+
318
+ if cfg["fmt"] == "gemma":
319
+ meta["template"] = (
320
+ "{{ .Prompt }}"
321
+ )
322
+ else:
323
+ meta["template"] = (
324
+ "<|im_start|>system\n{{ .System }}<|im_end|>\n"
325
+ "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
326
+ "<|im_start|>assistant\n"
327
+ )
328
  return meta
329
 
330
 
 
350
 
351
  @app.post("/api/generate")
352
  def generate(req: GenerateRequest):
353
+ key = resolve_model(req.model)
354
+ cfg = MODELS[key]
355
 
356
+ wait_for_model(key)
357
 
358
+ params = llama_params(req.options, fmt=cfg["fmt"])
359
  params["prompt"] = req.prompt
360
  params["stream"] = req.stream
361
 
362
  r = requests.post(
363
  f"http://localhost:{cfg['port']}/completion",
364
+ json=params, stream=req.stream, timeout=180,
365
  )
366
 
367
  if not req.stream:
 
396
 
397
  @app.post("/api/chat")
398
  def chat(req: ChatRequest):
399
+ key = resolve_model(req.model)
400
+ cfg = MODELS[key]
401
 
402
+ wait_for_model(key)
403
 
404
+ prompt = build_prompt(req.messages, fmt=cfg["fmt"])
405
+ params = llama_params(req.options, fmt=cfg["fmt"])
406
  params["prompt"] = prompt
407
  params["stream"] = req.stream
408
 
409
  r = requests.post(
410
  f"http://localhost:{cfg['port']}/completion",
411
+ json=params, stream=req.stream, timeout=180,
412
  )
413
 
414
  if not req.stream: