mrmadblack committed on
Commit
4b6c283
Β·
verified Β·
1 Parent(s): 325785f

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +94 -50
server.py CHANGED
@@ -1,13 +1,11 @@
1
  """
2
  Ollama-compatible API server
3
- Models: Qwen3.5-0.8B (fast) + Qwen3.5-2B (smart)
 
 
 
 
4
  Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
5
-
6
- FIXES vs previous version:
7
- 1. Removed --flash-attn / --mlock / --no-mmap (not all llama.cpp builds support them β€” caused silent crash)
8
- 2. llama-server logs go to llama_<model>.log so errors are visible in HF Space terminal
9
- 3. /api/chat and /api/generate now WAIT up to 120s for server readiness
10
- instead of immediately crashing with ConnectionRefused
11
  """
12
 
13
  from fastapi import FastAPI, HTTPException
@@ -32,31 +30,45 @@ app = FastAPI()
32
  # ---------------------------
33
 
34
  MODELS = {
35
- "qwen3.5-0.8b": {
36
- "path": "models/qwen3.5-0.8b.gguf",
37
- "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
38
- "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
39
  "port": 8080,
40
- "param_size": "0.8B",
41
- "family": "qwen3.5",
 
42
  "threads": 2,
43
  "ctx": 2048,
44
  "batch": 512,
45
  },
46
- "qwen3.5-2b": {
47
- "path": "models/qwen3.5-2b.gguf",
48
- "repo": "bartowski/Qwen_Qwen3.5-2B-GGUF",
49
- "file": "Qwen_Qwen3.5-2B-Q4_K_M.gguf",
50
  "port": 8081,
51
- "param_size": "2B",
52
- "family": "qwen3.5",
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  "threads": 2,
54
  "ctx": 2048,
55
  "batch": 512,
56
  },
57
  }
58
 
59
- DEFAULT_MODEL = "qwen3.5-0.8b"
60
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
61
 
62
 
@@ -79,10 +91,26 @@ class GenerateRequest(BaseModel):
79
 
80
 
81
  # ---------------------------
82
- # PROMPT BUILDER (Qwen3.5 ChatML)
83
  # ---------------------------
84
 
85
- def build_prompt(messages: list) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  prompt = ""
87
  has_system = any(m.get("role") == "system" for m in messages)
88
  if not has_system:
@@ -144,7 +172,6 @@ _server_ready: dict = {k: False for k in MODELS}
144
  def start_llama(model_name: str, cfg: dict):
145
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
146
 
147
- # FIX 1: Write logs to file β€” safe flags only, no --flash-attn/--mlock/--no-mmap
148
  log = open(f"llama_{model_name}.log", "w")
149
 
150
  process = subprocess.Popen([
@@ -161,7 +188,7 @@ def start_llama(model_name: str, cfg: dict):
161
 
162
  url = f"http://localhost:{cfg['port']}/health"
163
 
164
- for i in range(90): # up to 3 min
165
  time.sleep(2)
166
  try:
167
  r = requests.get(url, timeout=2)
@@ -172,7 +199,7 @@ def start_llama(model_name: str, cfg: dict):
172
  except Exception:
173
  pass
174
 
175
- # FIX 2: Echo last log line so HF Space logs show real llama-server output
176
  try:
177
  with open(f"llama_{model_name}.log") as lf:
178
  lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
@@ -189,15 +216,11 @@ for name, cfg in MODELS.items():
189
 
190
 
191
  # ---------------------------
192
- # READINESS GUARD ← KEY FIX
193
  # ---------------------------
194
 
195
  def wait_for_model(model_key: str, timeout: int = 120):
196
- """
197
- FIX 3: Block the incoming request until the llama-server is ready.
198
- Instead of crashing with ConnectionRefused, the client gets a clean
199
- response once the model is loaded (or a 503 if it never comes up).
200
- """
201
  deadline = time.time() + timeout
202
  while time.time() < deadline:
203
  if _server_ready.get(model_key):
@@ -235,15 +258,22 @@ def model_meta(name: str, cfg: dict) -> dict:
235
  }
236
 
237
 
238
- def llama_params(options: Optional[dict]) -> dict:
239
  o = options or {}
 
 
 
 
 
 
 
240
  return {
241
  "temperature": o.get("temperature", 0.7),
242
  "top_p": o.get("top_p", 0.9),
243
  "top_k": o.get("top_k", 40),
244
  "repeat_penalty": o.get("repeat_penalty", 1.1),
245
  "n_predict": o.get("num_predict", 1024),
246
- "stop": o.get("stop", ["<|im_end|>", "<|endoftext|>"]),
247
  }
248
 
249
 
@@ -253,7 +283,15 @@ def llama_params(options: Optional[dict]) -> dict:
253
 
254
  @app.get("/")
255
  def root():
256
- return {"status": "running", "models_ready": dict(_server_ready)}
 
 
 
 
 
 
 
 
257
 
258
 
259
  # ---------------------------
@@ -276,11 +314,17 @@ def show(body: dict):
276
  meta = model_meta(key, cfg)
277
  meta["modelfile"] = f"FROM {key}\n"
278
  meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
279
- meta["template"] = (
280
- "<|im_start|>system\n{{ .System }}<|im_end|>\n"
281
- "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
282
- "<|im_start|>assistant\n"
283
- )
 
 
 
 
 
 
284
  return meta
285
 
286
 
@@ -306,18 +350,18 @@ def ps():
306
 
307
  @app.post("/api/generate")
308
  def generate(req: GenerateRequest):
309
- key = resolve_model(req.model)
310
- cfg = MODELS[key]
311
 
312
- wait_for_model(key) # ← blocks until ready, not crash
313
 
314
- params = llama_params(req.options)
315
  params["prompt"] = req.prompt
316
  params["stream"] = req.stream
317
 
318
  r = requests.post(
319
  f"http://localhost:{cfg['port']}/completion",
320
- json=params, stream=req.stream, timeout=120,
321
  )
322
 
323
  if not req.stream:
@@ -352,19 +396,19 @@ def generate(req: GenerateRequest):
352
 
353
  @app.post("/api/chat")
354
  def chat(req: ChatRequest):
355
- key = resolve_model(req.model)
356
- cfg = MODELS[key]
357
 
358
- wait_for_model(key) # ← blocks until ready, not crash
359
 
360
- prompt = build_prompt(req.messages)
361
- params = llama_params(req.options)
362
  params["prompt"] = prompt
363
  params["stream"] = req.stream
364
 
365
  r = requests.post(
366
  f"http://localhost:{cfg['port']}/completion",
367
- json=params, stream=req.stream, timeout=120,
368
  )
369
 
370
  if not req.stream:
 
1
  """
2
  Ollama-compatible API server
3
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
+ ⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
5
+ 🧠 qwen3-4b β†’ thinking, hard problems (port 8081)
6
+ 🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
8
  Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
 
 
 
 
 
 
9
  """
10
 
11
  from fastapi import FastAPI, HTTPException
 
30
  # ---------------------------
31
 
32
  MODELS = {
33
+ "qwen2.5-coder-1.5b": { # ⚑ FAST β€” coding, snippets, quick replies
34
+ "path": "models/qwen2.5-coder-1.5b.gguf",
35
+ "repo": "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF",
36
+ "file": "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf",
37
  "port": 8080,
38
+ "param_size": "1.5B",
39
+ "family": "qwen2.5",
40
+ "fmt": "chatml",
41
  "threads": 2,
42
  "ctx": 2048,
43
  "batch": 512,
44
  },
45
+ "qwen3-4b": { # 🧠 THINKING β€” hard bugs, architecture, logic (/think)
46
+ "path": "models/qwen3-4b.gguf",
47
+ "repo": "bartowski/Qwen_Qwen3-4B-GGUF",
48
+ "file": "Qwen_Qwen3-4B-Q4_K_M.gguf",
49
  "port": 8081,
50
+ "param_size": "4B",
51
+ "family": "qwen3",
52
+ "fmt": "chatml",
53
+ "threads": 2,
54
+ "ctx": 2048,
55
+ "batch": 512,
56
+ },
57
+ "gemma3-4b": { # 🌐 GENERAL β€” translation, Tamil↔English, daily chat
58
+ "path": "models/gemma3-4b.gguf",
59
+ "repo": "bartowski/google_gemma-3-4b-it-GGUF",
60
+ "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
61
+ "port": 8082,
62
+ "param_size": "4B",
63
+ "family": "gemma3",
64
+ "fmt": "gemma",
65
  "threads": 2,
66
  "ctx": 2048,
67
  "batch": 512,
68
  },
69
  }
70
 
71
+ DEFAULT_MODEL = "qwen2.5-coder-1.5b"
72
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
73
 
74
 
 
91
 
92
 
93
  # ---------------------------
94
+ # PROMPT BUILDER
95
  # ---------------------------
96
 
97
+ def build_prompt(messages: list, fmt: str = "chatml") -> str:
98
+
99
+ # ── Gemma3 format ──────────────────────────────────────────
100
+ # <bos><start_of_turn>user\n…<end_of_turn>\n<start_of_turn>model\n
101
+ if fmt == "gemma":
102
+ prompt = "<bos>"
103
+ for m in messages:
104
+ role = m.get("role", "user")
105
+ content = m.get("content", "").strip()
106
+ if not content or role == "system":
107
+ continue # Gemma3 has no system role
108
+ turn = "user" if role == "user" else "model"
109
+ prompt += f"<start_of_turn>{turn}\n{content}<end_of_turn>\n"
110
+ prompt += "<start_of_turn>model\n"
111
+ return prompt
112
+
113
+ # ── ChatML format (Qwen2.5-Coder, Qwen3) ───────────────────
114
  prompt = ""
115
  has_system = any(m.get("role") == "system" for m in messages)
116
  if not has_system:
 
172
  def start_llama(model_name: str, cfg: dict):
173
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
174
 
 
175
  log = open(f"llama_{model_name}.log", "w")
176
 
177
  process = subprocess.Popen([
 
188
 
189
  url = f"http://localhost:{cfg['port']}/health"
190
 
191
+ for i in range(90): # wait up to 3 min
192
  time.sleep(2)
193
  try:
194
  r = requests.get(url, timeout=2)
 
199
  except Exception:
200
  pass
201
 
202
+ # Echo last log line so HF Space logs show real llama-server progress
203
  try:
204
  with open(f"llama_{model_name}.log") as lf:
205
  lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
 
216
 
217
 
218
  # ---------------------------
219
+ # READINESS GUARD
220
  # ---------------------------
221
 
222
  def wait_for_model(model_key: str, timeout: int = 120):
223
+ """Block the request until the llama-server is ready."""
 
 
 
 
224
  deadline = time.time() + timeout
225
  while time.time() < deadline:
226
  if _server_ready.get(model_key):
 
258
  }
259
 
260
 
261
def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict:
    """Translate Ollama-style request options into llama.cpp /completion params.

    Args:
        options: The "options" dict from an Ollama-compatible request body;
            may be None when the client sent none.
        fmt: Prompt-template family of the target model — "gemma" or
            "chatml" (Qwen2.5-Coder, Qwen3).

    Returns:
        Sampling-parameter dict understood by llama-server's /completion
        endpoint.  Client-supplied values win; sensible defaults otherwise.
    """
    o = options or {}

    # Stop tokens differ per model family.
    # FIX: "</think>" was previously in the ChatML default stop list, but a
    # Qwen3 thinking model emits "<think>...</think>" and only THEN the
    # visible answer — stopping at "</think>" truncated every thinking-mode
    # response before the actual reply.  Removed; "<|im_end|>" is the real
    # end-of-turn token for ChatML models.
    if fmt == "gemma":
        default_stop = ["<end_of_turn>", "<eos>"]
    else:
        default_stop = ["<|im_end|>", "<|endoftext|>"]

    return {
        "temperature": o.get("temperature", 0.7),
        "top_p": o.get("top_p", 0.9),
        "top_k": o.get("top_k", 40),
        "repeat_penalty": o.get("repeat_penalty", 1.1),
        # Ollama calls it num_predict; llama.cpp calls it n_predict.
        "n_predict": o.get("num_predict", 1024),
        "stop": o.get("stop", default_stop),
    }
278
 
279
 
 
283
 
284
@app.get("/")
def root():
    """Landing endpoint: backend readiness plus a model-routing cheat sheet."""
    usage_hint = {
        "fast coding": "qwen2.5-coder-1.5b",
        "thinking": "qwen3-4b (add /think to your message)",
        "translation": "gemma3-4b",
    }
    # Snapshot readiness flags so the response isn't a live mutable view.
    return {
        "status": "running",
        "models_ready": dict(_server_ready),
        "usage": usage_hint,
    }
295
 
296
 
297
  # ---------------------------
 
314
  meta = model_meta(key, cfg)
315
  meta["modelfile"] = f"FROM {key}\n"
316
  meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
317
+
318
+ if cfg["fmt"] == "gemma":
319
+ meta["template"] = (
320
+ "{{ .Prompt }}"
321
+ )
322
+ else:
323
+ meta["template"] = (
324
+ "<|im_start|>system\n{{ .System }}<|im_end|>\n"
325
+ "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
326
+ "<|im_start|>assistant\n"
327
+ )
328
  return meta
329
 
330
 
 
350
 
351
  @app.post("/api/generate")
352
  def generate(req: GenerateRequest):
353
+ key = resolve_model(req.model)
354
+ cfg = MODELS[key]
355
 
356
+ wait_for_model(key)
357
 
358
+ params = llama_params(req.options, fmt=cfg["fmt"])
359
  params["prompt"] = req.prompt
360
  params["stream"] = req.stream
361
 
362
  r = requests.post(
363
  f"http://localhost:{cfg['port']}/completion",
364
+ json=params, stream=req.stream, timeout=180,
365
  )
366
 
367
  if not req.stream:
 
396
 
397
  @app.post("/api/chat")
398
  def chat(req: ChatRequest):
399
+ key = resolve_model(req.model)
400
+ cfg = MODELS[key]
401
 
402
+ wait_for_model(key)
403
 
404
+ prompt = build_prompt(req.messages, fmt=cfg["fmt"])
405
+ params = llama_params(req.options, fmt=cfg["fmt"])
406
  params["prompt"] = prompt
407
  params["stream"] = req.stream
408
 
409
  r = requests.post(
410
  f"http://localhost:{cfg['port']}/completion",
411
+ json=params, stream=req.stream, timeout=180,
412
  )
413
 
414
  if not req.stream: