mrmadblack commited on
Commit
325785f
·
verified ·
1 Parent(s): 996a96e

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +291 -174
server.py CHANGED
@@ -1,4 +1,16 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
 
 
 
 
 
 
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
  from huggingface_hub import hf_hub_download
@@ -10,283 +22,388 @@ import json
10
  import time
11
  import hashlib
12
  import threading
 
13
 
14
  app = FastAPI()
15
 
16
- # -------------------------
17
- # MODEL CONFIG
18
- # -------------------------
 
19
 
20
  MODELS = {
21
- "qwen:0.8b": {
22
- "repo": "Qwen/Qwen3.5-0.8B-GGUF",
23
- "file": "qwen3.5-0.8b-q4_k_m.gguf",
24
- "path": "models/qwen_0_8b.gguf",
25
- "port": 8081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  },
27
- "qwen:2b": {
28
- "repo": "Qwen/Qwen3.5-2B-GGUF",
29
- "file": "qwen3.5-2b-q4_k_m.gguf",
30
- "path": "models/qwen_2b.gguf",
31
- "port": 8082
32
- }
33
  }
34
 
35
- LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
 
36
 
37
- os.makedirs("models", exist_ok=True)
38
 
39
- # -------------------------
40
  # REQUEST MODELS
41
- # -------------------------
42
 
43
  class ChatRequest(BaseModel):
44
- model: str
45
  messages: list
46
- stream: bool = True
 
47
 
48
 
49
  class GenerateRequest(BaseModel):
50
- model: str
51
- prompt: str
 
 
52
 
53
 
54
- # -------------------------
55
- # PROMPT BUILDER (QWEN)
56
- # -------------------------
57
-
58
- def build_prompt(messages):
59
 
 
60
  prompt = ""
61
-
 
 
62
  for m in messages:
63
- role = m["role"]
64
- content = m["content"]
65
-
66
- if role == "user":
 
 
 
67
  prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
68
-
69
  elif role == "assistant":
70
  prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
71
-
72
  prompt += "<|im_start|>assistant\n"
73
-
74
  return prompt
75
 
76
 
77
- # -------------------------
78
- # DOWNLOAD MODELS
79
- # -------------------------
80
 
81
- def download_models():
 
 
 
 
 
 
 
 
82
 
83
- for name, m in MODELS.items():
84
 
85
- if os.path.exists(m["path"]):
86
- continue
 
87
 
88
- print("Downloading", name)
89
 
90
- f = hf_hub_download(
91
- repo_id=m["repo"],
92
- filename=m["file"]
93
- )
 
 
94
 
95
- os.system(f"cp {f} {m['path']}")
 
96
 
97
- download_models()
98
 
99
- # -------------------------
100
  # START LLAMA SERVERS
101
- # -------------------------
102
 
103
- def start_model(name, cfg):
104
 
105
- threads = "2"
106
 
107
- print("Starting", name)
 
108
 
109
- subprocess.Popen([
110
- LLAMA_SERVER,
111
 
112
- "-m", cfg["path"],
113
-
114
- "--host", "0.0.0.0",
115
- "--port", str(cfg["port"]),
116
-
117
- "--threads", threads,
118
- "--parallel", "2",
119
-
120
- "--ctx-size", "4096",
121
- "--batch-size", "1024",
122
- "--ubatch-size", "512",
123
-
124
- "-ngl", "0"
125
- ])
126
-
127
- for i in range(30):
128
  try:
129
- r = requests.get(f"http://localhost:{cfg['port']}/health")
130
-
131
  if r.status_code == 200:
132
- print(name, "ready")
133
- return
134
- except:
 
135
  pass
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  time.sleep(1)
 
 
 
 
138
 
139
- raise RuntimeError(name + " failed to start")
140
 
 
 
 
141
 
142
- def start_all_models():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- for name, cfg in MODELS.items():
145
- threading.Thread(
146
- target=start_model,
147
- args=(name, cfg),
148
- daemon=True
149
- ).start()
150
 
151
- start_all_models()
 
 
 
 
 
 
 
 
 
152
 
153
- # -------------------------
 
154
  # ROOT
155
- # -------------------------
156
 
157
  @app.get("/")
158
  def root():
159
- return {"status": "running"}
 
160
 
161
- # -------------------------
162
- # OLLAMA TAGS
163
- # -------------------------
164
 
165
  @app.get("/api/tags")
166
  def tags():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- models = []
169
-
170
- for name, m in MODELS.items():
171
-
172
- size = os.path.getsize(m["path"])
173
 
174
- with open(m["path"], "rb") as f:
175
- digest = hashlib.sha256(f.read()).hexdigest()
 
176
 
177
- models.append({
178
- "name": name,
179
- "model": name,
180
- "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
181
- "size": size,
182
- "digest": digest,
183
- "details": {
184
- "format": "gguf",
185
- "family": "qwen",
186
- "parameter_size": name.split(":")[1]
187
- }
188
- })
189
 
190
- return {"models": models}
191
 
192
- # -------------------------
193
- # GENERATE
194
- # -------------------------
195
 
196
  @app.post("/api/generate")
197
  def generate(req: GenerateRequest):
 
 
 
 
198
 
199
- cfg = MODELS[req.model]
 
 
200
 
201
  r = requests.post(
202
  f"http://localhost:{cfg['port']}/completion",
203
- json={
204
- "prompt": req.prompt,
205
- "n_predict": 512
206
- }
207
  )
208
 
209
- data = r.json()
 
 
210
 
211
- return {
212
- "model": req.model,
213
- "response": data.get("content", ""),
214
- "done": True
215
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- # -------------------------
218
- # CHAT
219
- # -------------------------
 
220
 
221
  @app.post("/api/chat")
222
  def chat(req: ChatRequest):
 
 
223
 
224
- cfg = MODELS[req.model]
225
 
226
  prompt = build_prompt(req.messages)
 
 
 
227
 
228
  r = requests.post(
229
  f"http://localhost:{cfg['port']}/completion",
230
- json={
231
- "prompt": prompt,
232
- "stream": req.stream,
233
- "n_predict": 1024,
234
- "temperature": 0.7
235
- },
236
- stream=req.stream
237
  )
238
 
239
  if not req.stream:
240
-
241
- data = r.json()
242
-
243
  return JSONResponse({
244
- "model": req.model,
245
- "message": {
246
- "role": "assistant",
247
- "content": data.get("content", "")
248
- },
249
- "done": True
250
  })
251
 
252
- def stream():
253
-
254
  for line in r.iter_lines():
255
-
256
  if not line:
257
  continue
258
-
259
- line = line.decode().replace("data:", "").strip()
260
-
261
  try:
262
  data = json.loads(line)
263
- except:
264
  continue
265
-
266
  token = data.get("content", "")
267
-
268
  yield json.dumps({
269
- "model": req.model,
270
- "message": {
271
- "role": "assistant",
272
- "content": token
273
- },
274
- "done": False
275
  }) + "\n"
 
 
 
276
 
277
- yield json.dumps({
278
- "model": req.model,
279
- "done": True
280
- }) + "\n"
281
 
282
- return StreamingResponse(
283
- stream(),
284
- media_type="application/x-ndjson"
285
- )
286
 
287
- # -------------------------
288
- # START API
289
- # -------------------------
290
 
291
  if __name__ == "__main__":
292
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ """
2
+ Ollama-compatible API server
3
+ Models: Qwen3.5-0.8B (fast) + Qwen3.5-2B (smart)
4
+ Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
5
+
6
+ FIXES vs previous version:
7
+ 1. Removed --flash-attn / --mlock / --no-mmap (not all llama.cpp builds support them — caused silent crash)
8
+ 2. llama-server logs go to llama_<model>.log so errors are visible in HF Space terminal
9
+ 3. /api/chat and /api/generate now WAIT up to 120s for server readiness
10
+ instead of immediately crashing with ConnectionRefused
11
+ """
12
+
13
+ from fastapi import FastAPI, HTTPException
14
  from fastapi.responses import StreamingResponse, JSONResponse
15
  from pydantic import BaseModel
16
  from huggingface_hub import hf_hub_download
 
22
  import time
23
  import hashlib
24
  import threading
25
+ from typing import Optional
26
 
27
  app = FastAPI()
28
 
29
+
30
# ---------------------------
# MODEL CONFIGS
# ---------------------------
# Each entry maps an Ollama-style model name to its GGUF file on the HF Hub,
# the local path it is copied to, and the llama-server launch parameters.
# Ports are unique per model so both backend servers can run side by side.

MODELS = {
    "qwen3.5-0.8b": {
        "path": "models/qwen3.5-0.8b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
        "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
        "port": 8080,
        "param_size": "0.8B",
        "family": "qwen3.5",
        "threads": 2,   # HF free tier provides 2 vCPUs
        "ctx": 2048,    # context window passed to llama-server (-c)
        "batch": 512,
    },
    "qwen3.5-2b": {
        "path": "models/qwen3.5-2b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-2B-GGUF",
        "file": "Qwen_Qwen3.5-2B-Q4_K_M.gguf",
        "port": 8081,
        "param_size": "2B",
        "family": "qwen3.5",
        "threads": 2,
        "ctx": 2048,
        "batch": 512,
    },
}

# Fallback model when a request omits the name or it matches nothing.
DEFAULT_MODEL = "qwen3.5-0.8b"
# Path to the llama.cpp server binary built inside the Space image.
LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
61
 
 
62
 
63
+ # ---------------------------
64
  # REQUEST MODELS
65
+ # ---------------------------
66
 
67
class ChatRequest(BaseModel):
    """Body of POST /api/chat (Ollama-compatible)."""
    model: str = DEFAULT_MODEL      # fuzzy-matched via resolve_model()
    messages: list                  # [{"role": ..., "content": ...}, ...]
    stream: bool = True             # Ollama chat defaults to streaming
    options: Optional[dict] = None  # sampling overrides (temperature, top_p, ...)
72
 
73
 
74
class GenerateRequest(BaseModel):
    """Body of POST /api/generate (Ollama-compatible raw completion)."""
    model: str = DEFAULT_MODEL      # fuzzy-matched via resolve_model()
    prompt: str                     # raw prompt, sent to llama.cpp unmodified
    stream: bool = False            # generate defaults to non-streaming
    options: Optional[dict] = None  # sampling overrides (temperature, top_p, ...)
79
 
80
 
81
+ # ---------------------------
82
+ # PROMPT BUILDER (Qwen3.5 ChatML)
83
+ # ---------------------------
 
 
84
 
85
def build_prompt(messages: list) -> str:
    """Render an Ollama-style message list into a Qwen ChatML prompt.

    Injects a default system turn when the caller supplies none, skips
    messages with empty content, and leaves the prompt open with an
    assistant header so the model continues from there.

    Args:
        messages: list of dicts with "role" and "content" keys.

    Returns:
        The assembled ChatML prompt string.
    """
    prompt = ""
    # Qwen chat models expect a system turn; add one if the client didn't.
    if not any(m.get("role") == "system" for m in messages):
        prompt += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    for m in messages:
        role = m.get("role", "user")
        # FIX: content may be None (e.g. tool-call messages) — the previous
        # `m.get("content", "").strip()` raised AttributeError on None.
        content = (m.get("content") or "").strip()
        if not content:
            continue
        if role == "system":
            prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    # Open assistant turn for generation.
    prompt += "<|im_start|>assistant\n"
    return prompt
103
 
104
 
105
+ # ---------------------------
106
+ # MODEL RESOLVER
107
+ # ---------------------------
108
 
109
def resolve_model(name: str) -> str:
    """Map a client-supplied model name onto a key of MODELS.

    Exact (case-insensitive) matches win; otherwise a substring match in
    either direction is accepted; anything else falls back to DEFAULT_MODEL.
    """
    candidate = (name or DEFAULT_MODEL).lower().strip()
    if candidate in MODELS:
        return candidate
    partial = [k for k in MODELS if k in candidate or candidate in k]
    return partial[0] if partial else DEFAULT_MODEL
118
 
 
119
 
120
# ---------------------------
# DOWNLOAD MODELS
# ---------------------------

os.makedirs("models", exist_ok=True)

def download_model(cfg: dict) -> None:
    """Fetch cfg['file'] from cfg['repo'] and place it at cfg['path'].

    No-op when the target already exists (Space restarts reuse the copy).
    FIX: uses shutil.copyfile instead of `os.system("cp ...")` — shell-free,
    safe for paths with spaces/quotes, and raises on failure instead of
    silently ignoring it.
    """
    if os.path.exists(cfg["path"]):
        return
    import shutil  # local import: keeps the file's visible import block unchanged
    print(f"Downloading {cfg['file']} ...")
    downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])
    shutil.copyfile(downloaded, cfg["path"])
    print(f" ✓ saved to {cfg['path']}")
132
 
133
# Download every configured model up front so the first request never blocks
# on a multi-hundred-MB fetch.
for m in MODELS.values():
    download_model(m)
135
 
 
136
 
137
+ # ---------------------------
138
  # START LLAMA SERVERS
139
+ # ---------------------------
140
 
141
# Readiness flags, one per model, flipped to True by start_llama() once the
# corresponding llama-server answers /health. Read by wait_for_model().
_server_ready: dict = {k: False for k in MODELS}


def start_llama(model_name: str, cfg: dict):
    """Launch a llama-server subprocess for `model_name` and wait for health.

    Runs in a daemon thread (see loop below). Server stdout/stderr go to
    llama_<model>.log so crashes are visible in the HF Space terminal.
    Returns the Popen handle on success, None if the server never became
    healthy within ~3 minutes.
    """
    print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")

    # FIX 1: Write logs to file — safe flags only, no --flash-attn/--mlock/--no-mmap
    # NOTE(review): handle is intentionally left open for the child's lifetime.
    log = open(f"llama_{model_name}.log", "w")

    process = subprocess.Popen([
        LLAMA_SERVER,
        "-m", cfg["path"],
        "--host", "0.0.0.0",
        "--port", str(cfg["port"]),
        "-c", str(cfg["ctx"]),
        "--threads", str(cfg["threads"]),
        "--batch-size", str(cfg["batch"]),
        "-ngl", "0",  # CPU only
        "-np", "1",   # 1 parallel slot
    ], stdout=log, stderr=log)

    url = f"http://localhost:{cfg['port']}/health"

    # Poll /health every 2s, up to 90 attempts (~3 min total).
    for i in range(90):
        time.sleep(2)
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                _server_ready[model_name] = True
                print(f" ✓ {model_name} ready (took ~{(i+1)*2}s)")
                return process
        except Exception:
            pass  # server not accepting connections yet — keep polling

        # FIX 2: Echo last log line so HF Space logs show real llama-server output
        try:
            with open(f"llama_{model_name}.log") as lf:
                lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
                print(f" [{model_name}] {lines[-1] if lines else 'starting...'}")
        except Exception:
            print(f" waiting for {model_name}... ({i+1}/90)")

    print(f" ✗ {model_name} failed — check llama_{model_name}.log")
    return None


# Launch each model's server concurrently so the API comes up immediately;
# requests arriving before readiness are held by wait_for_model().
for name, cfg in MODELS.items():
    threading.Thread(target=start_llama, args=(name, cfg), daemon=True).start()
189
+
190
+
191
+ # ---------------------------
192
+ # READINESS GUARD ← KEY FIX
193
+ # ---------------------------
194
+
195
def wait_for_model(model_key: str, timeout: int = 120):
    """Block the request until `model_key`'s llama-server is healthy.

    FIX 3: instead of crashing with ConnectionRefused while the backend is
    still loading, poll the readiness flag once per second; if the server
    never comes up within `timeout` seconds, answer with a clean 503.
    """
    give_up_at = time.time() + timeout
    while time.time() < give_up_at:
        if _server_ready.get(model_key):
            return
        time.sleep(1)
    raise HTTPException(
        status_code=503,
        detail=f"Model '{model_key}' is still loading. Please wait and retry."
    )
210
 
 
211
 
212
+ # ---------------------------
213
+ # HELPERS
214
+ # ---------------------------
215
 
216
def model_meta(name: str, cfg: dict) -> dict:
    """Build an Ollama-style model description for /api/tags, /api/show, /api/ps.

    The digest is a SHA-256 of the first 64 KiB of the GGUF file — cheap to
    compute per request yet stable per file. FIX: the previous version hashed
    with MD5 but labelled the result "sha256:", which misrepresented the
    algorithm. Missing files yield size 0 and an empty digest, not an error.
    """
    path = cfg["path"]
    size = os.path.getsize(path) if os.path.exists(path) else 0
    digest = ""
    if os.path.exists(path):
        with open(path, "rb") as f:
            # Hash only a 64 KiB prefix: full multi-GB hashing per request
            # would be far too slow.
            digest = hashlib.sha256(f.read(65536)).hexdigest()
    return {
        "name": name,
        "model": name,
        "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "size": size,
        "digest": f"sha256:{digest}",
        "details": {
            "format": "gguf",
            "family": cfg["family"],
            "families": [cfg["family"]],
            "parameter_size": cfg["param_size"],
            "quantization_level": "Q4_K_M",
        },
    }
236
 
 
 
 
 
 
 
237
 
238
def llama_params(options: Optional[dict]) -> dict:
    """Translate Ollama request `options` into llama.cpp /completion params.

    Every supported key has a sensible default; unknown keys are ignored.
    """
    opts = options or {}
    sampling_defaults = {
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "repeat_penalty": 1.1,
    }
    params = {key: opts.get(key, dflt) for key, dflt in sampling_defaults.items()}
    # Ollama calls it num_predict; llama.cpp calls it n_predict.
    params["n_predict"] = opts.get("num_predict", 1024)
    params["stop"] = opts.get("stop", ["<|im_end|>", "<|endoftext|>"])
    return params
248
 
249
+
250
+ # ---------------------------
251
  # ROOT
252
+ # ---------------------------
253
 
254
@app.get("/")
def root():
    """Status endpoint: reports which backend llama servers are healthy."""
    ready_snapshot = dict(_server_ready)
    return {"status": "running", "models_ready": ready_snapshot}
257
+
258
 
259
+ # ---------------------------
260
+ # /api/tags
261
+ # ---------------------------
262
 
263
@app.get("/api/tags")
def tags():
    """Ollama /api/tags: list every configured model with its metadata."""
    listed = []
    for model_name, model_cfg in MODELS.items():
        listed.append(model_meta(model_name, model_cfg))
    return {"models": listed}
266
+
267
+
268
+ # ---------------------------
269
+ # /api/show
270
+ # ---------------------------
271
+
272
@app.post("/api/show")
def show(body: dict):
    """Ollama /api/show: model metadata plus Modelfile-style fields."""
    key = resolve_model(body.get("name", DEFAULT_MODEL))
    meta = model_meta(key, MODELS[key])
    meta["modelfile"] = f"FROM {key}\n"
    meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
    # Qwen ChatML template in Ollama's Go-template syntax.
    template_parts = [
        "<|im_start|>system\n{{ .System }}<|im_end|>\n",
        "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    meta["template"] = "".join(template_parts)
    return meta
285
 
 
 
 
 
 
286
 
287
+ # ---------------------------
288
+ # /api/ps
289
+ # ---------------------------
290
 
291
@app.get("/api/ps")
def ps():
    """Ollama /api/ps: models whose llama-server has reported healthy."""
    running = [
        {
            **model_meta(model_name, model_cfg),
            "expires_at": "0001-01-01T00:00:00Z",  # never unloaded
            "size_vram": 0,                         # CPU-only deployment
        }
        for model_name, model_cfg in MODELS.items()
        if _server_ready.get(model_name)
    ]
    return {"models": running}
 
 
301
 
 
302
 
303
+ # ---------------------------
304
+ # /api/generate
305
+ # ---------------------------
306
 
307
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Ollama /api/generate: raw-prompt completion, optionally streamed.

    Resolves the model name, blocks until its llama-server is healthy
    (FIX 3), then proxies the request to llama.cpp's /completion endpoint.
    Streamed upstream chunks are re-emitted as Ollama-style NDJSON lines.
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]

    wait_for_model(key)  # ← blocks until ready, not crash

    params = llama_params(req.options)
    params["prompt"] = req.prompt
    params["stream"] = req.stream

    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=120,
    )

    if not req.stream:
        # Non-streaming: single JSON body from llama.cpp.
        text = r.json().get("content", "").strip()
        return {"model": req.model, "response": text, "done": True, "done_reason": "stop"}

    def stream_gen():
        # llama.cpp emits SSE-style "data: {...}" lines; strip the prefix,
        # parse, and re-wrap each token as an Ollama NDJSON chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip malformed/partial lines rather than abort the stream
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({"model": req.model, "response": token, "done": done}) + "\n"
            if done:
                break
        # Terminal chunk — emitted even when upstream already signalled stop.
        yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
347
 
348
+
349
+ # ---------------------------
350
+ # /api/chat
351
+ # ---------------------------
352
 
353
@app.post("/api/chat")
def chat(req: ChatRequest):
    """Ollama /api/chat: multi-turn chat completion, streamed by default.

    Builds a Qwen ChatML prompt from the message list, blocks until the
    model's llama-server is healthy (FIX 3), then proxies to llama.cpp's
    /completion endpoint, re-emitting tokens as Ollama chat NDJSON chunks.
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]

    wait_for_model(key)  # blocks until ready, not crash

    prompt = build_prompt(req.messages)
    params = llama_params(req.options)
    params["prompt"] = prompt
    params["stream"] = req.stream

    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=120,
    )

    if not req.stream:
        # Non-streaming: return the whole assistant message at once.
        text = r.json().get("content", "").strip()
        return JSONResponse({
            "model": req.model,
            "message": {"role": "assistant", "content": text},
            "done": True, "done_reason": "stop",
        })

    def stream_gen():
        # llama.cpp emits SSE-style "data: {...}" lines; strip the prefix,
        # parse, and wrap each token in an Ollama chat-message chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip malformed/partial lines rather than abort the stream
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({
                "model": req.model,
                "message": {"role": "assistant", "content": token},
                "done": done,
            }) + "\n"
            if done:
                break
        # Terminal chunk — emitted even when upstream already signalled stop.
        yield json.dumps({"model": req.model, "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
 
 
402
 
 
 
 
 
403
 
404
# ---------------------------
# START
# ---------------------------

if __name__ == "__main__":
    # Single worker: process-local state (_server_ready, llama subprocesses)
    # would be duplicated per worker, so keep workers=1.
    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)