mrmadblack committed on
Commit
e032d80
·
verified ·
1 Parent(s): 0a9db98

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +256 -151
server.py CHANGED
@@ -1,4 +1,10 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
2
  from fastapi.responses import StreamingResponse, JSONResponse
3
  from pydantic import BaseModel
4
  from huggingface_hub import hf_hub_download
@@ -10,20 +16,43 @@ import json
10
  import time
11
  import hashlib
12
  import threading
 
13
 
14
  app = FastAPI()
15
 
 
16
  # ---------------------------
17
- # MODEL CONFIG
18
  # ---------------------------
19
 
20
- MODEL_NAME = "tinyllama"
21
- MODEL_PATH = "models/tinyllama.gguf"
22
-
23
- MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
24
- MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
25
-
26
- LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
  # ---------------------------
@@ -31,111 +60,168 @@ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
31
  # ---------------------------
32
 
33
  class ChatRequest(BaseModel):
34
- model: str
35
  messages: list
36
  stream: bool = True
 
37
 
38
 
39
  class GenerateRequest(BaseModel):
40
- model: str
41
  prompt: str
 
 
42
 
43
 
44
  # ---------------------------
45
- # PROMPT BUILDER
46
  # ---------------------------
47
 
48
- def build_prompt(messages):
49
-
 
 
 
 
 
50
  prompt = ""
 
 
 
 
51
 
52
  for m in messages:
53
- role = m.get("role")
54
  content = m.get("content", "").strip()
55
-
56
  if not content:
57
  continue
58
-
59
- if role == "user":
60
- prompt += f"<|user|>\n{content}\n"
 
61
  elif role == "assistant":
62
- prompt += f"<|assistant|>\n{content}\n"
63
-
64
- prompt += "<|assistant|>\n"
65
 
 
66
  return prompt
67
 
68
 
69
  # ---------------------------
70
- # DOWNLOAD MODEL
71
  # ---------------------------
72
 
73
- os.makedirs("models", exist_ok=True)
 
 
 
 
 
 
 
 
 
74
 
75
- if not os.path.exists(MODEL_PATH):
76
 
77
- print("Downloading model from HuggingFace...")
 
 
78
 
79
- downloaded = hf_hub_download(
80
- repo_id=MODEL_REPO,
81
- filename=MODEL_FILE
82
- )
83
 
84
- os.system(f"cp {downloaded} {MODEL_PATH}")
 
 
 
 
 
85
 
86
- print("Model ready:", MODEL_PATH)
 
87
 
88
 
89
  # ---------------------------
90
- # START LLAMA SERVER
91
  # ---------------------------
92
 
93
- import os
94
- import subprocess
95
- import requests
96
- import time
97
-
98
- def start_llama():
99
 
100
- print("Starting llama-server...")
101
-
102
- threads = str(os.cpu_count() or 2)
103
 
104
  process = subprocess.Popen([
105
  LLAMA_SERVER,
106
- "-m", MODEL_PATH,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- "--host", "0.0.0.0",
109
- "--port", "8080",
110
 
111
- # context window
112
- "-c", "4096",
113
 
114
- # CPU tuning
115
- "--threads", threads,
116
- "--batch-size", "512",
117
 
118
- # ensure CPU-only
119
- "-ngl", "0"
120
- ])
121
 
122
- # wait for llama-server to be ready
123
- for i in range(30):
124
- try:
125
- r = requests.get("http://localhost:8080/health", timeout=2)
126
- if r.status_code == 200:
127
- print("llama-server ready")
128
- return process
129
- except requests.exceptions.RequestException:
130
- pass
131
 
132
- print(f"waiting for llama-server... ({i+1}/30)")
133
- time.sleep(1)
 
 
 
 
134
 
135
- raise RuntimeError("llama-server failed to start")
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
 
138
- threading.Thread(target=start_llama, daemon=True).start()
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  # ---------------------------
@@ -144,148 +230,167 @@ threading.Thread(target=start_llama, daemon=True).start()
144
 
145
  @app.get("/")
146
  def root():
147
- return {"status": "running"}
148
 
149
 
150
  # ---------------------------
151
- # MODEL LIST (Ollama style)
152
  # ---------------------------
153
 
154
  @app.get("/api/tags")
155
  def tags():
 
 
 
 
 
 
156
 
157
- size = os.path.getsize(MODEL_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- with open(MODEL_PATH, "rb") as f:
160
- digest = hashlib.sha256(f.read()).hexdigest()
161
 
162
- return {
163
- "models": [
164
- {
165
- "name": MODEL_NAME,
166
- "model": MODEL_NAME,
167
- "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
168
- "size": size,
169
- "digest": digest,
170
- "details": {
171
- "format": "gguf",
172
- "family": "llama",
173
- "families": ["llama"],
174
- "parameter_size": "1.1B",
175
- "quantization_level": "Q4_K_M"
176
- }
177
- }
178
- ]
179
- }
180
 
181
 
182
  # ---------------------------
183
- # GENERATE (non-stream)
184
  # ---------------------------
185
 
186
  @app.post("/api/generate")
187
  def generate(req: GenerateRequest):
 
 
 
 
 
188
 
189
  r = requests.post(
190
- "http://localhost:8080/completion",
191
- json={
192
- "prompt": req.prompt,
193
- "n_predict": 256
194
- }
195
  )
196
 
197
- data = r.json()
 
 
 
 
 
 
 
198
 
199
- text = data.get("content", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- return {
202
- "model": req.model,
203
- "response": text,
204
- "done": True
205
- }
206
 
207
 
208
  # ---------------------------
209
- # CHAT (Ollama streaming)
210
  # ---------------------------
211
 
212
  @app.post("/api/chat")
213
  def chat(req: ChatRequest):
214
-
 
215
  prompt = build_prompt(req.messages)
 
 
 
216
 
217
  r = requests.post(
218
- "http://localhost:8080/completion",
219
- json={
220
- "prompt": prompt,
221
- "stream": req.stream,
222
- "n_predict": 1024,
223
- "temperature": 0.7,
224
- "top_p": 0.9,
225
- "top_k": 40,
226
- "repeat_penalty": 1.1
227
- },
228
- stream=req.stream
229
  )
230
 
231
  if not req.stream:
232
-
233
- data = r.json()
234
- text = data.get("content", "")
235
-
236
  return JSONResponse({
237
- "model": req.model,
238
- "message": {
239
- "role": "assistant",
240
- "content": text
241
- },
242
- "done": True
243
  })
244
 
245
- def stream_generator():
246
-
247
  for line in r.iter_lines():
248
-
249
  if not line:
250
  continue
251
-
252
  line = line.decode("utf-8").strip()
253
-
254
  if line.startswith("data:"):
255
  line = line[5:].strip()
256
-
257
  try:
258
  data = json.loads(line)
259
- except:
260
  continue
261
-
262
  token = data.get("content", "")
263
-
264
  yield json.dumps({
265
- "model": req.model,
266
- "message": {
267
- "role": "assistant",
268
- "content": token
269
- },
270
- "done": False
271
  }) + "\n"
272
-
 
273
  yield json.dumps({
274
- "model": req.model,
275
- "done": True,
276
- "done_reason": "stop"
277
  }) + "\n"
278
 
279
- return StreamingResponse(
280
- stream_generator(),
281
- media_type="application/x-ndjson",
282
- headers={"Cache-Control": "no-cache"}
283
- )
284
 
285
 
286
  # ---------------------------
287
- # START API
288
  # ---------------------------
289
 
290
  if __name__ == "__main__":
291
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ """
2
+ Ollama-compatible API server
3
+ Models: Qwen3-0.6B (fast) + Qwen3-1.7B (smart)
4
+ Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
5
+ """
6
+
7
+ from fastapi import FastAPI, HTTPException
8
  from fastapi.responses import StreamingResponse, JSONResponse
9
  from pydantic import BaseModel
10
  from huggingface_hub import hf_hub_download
 
16
  import time
17
  import hashlib
18
  import threading
19
+ from typing import Optional
20
 
21
  app = FastAPI()
22
 
23
+
24
  # ---------------------------
25
+ # MODEL CONFIGS
26
  # ---------------------------
27
 
28
+ MODELS = {
29
+ "qwen3.5-0.8b": {
30
+ "path": "models/qwen3.5-0.8b.gguf",
31
+ "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
32
+ "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
33
+ "port": 8080,
34
+ "param_size": "0.8B",
35
+ "family": "qwen3.5",
36
+ # tight tuning for speed on 2 vCPU
37
+ "threads": 2,
38
+ "ctx": 2048,
39
+ "batch": 512,
40
+ },
41
+ "qwen3.5-2b": {
42
+ "path": "models/qwen3.5-2b.gguf",
43
+ "repo": "bartowski/Qwen_Qwen3.5-2B-GGUF",
44
+ "file": "Qwen_Qwen3.5-2B-Q4_K_M.gguf",
45
+ "port": 8081,
46
+ "param_size": "2B",
47
+ "family": "qwen3.5",
48
+ "threads": 2,
49
+ "ctx": 2048,
50
+ "batch": 512,
51
+ },
52
+ }
53
+
54
+ DEFAULT_MODEL = "qwen3.5-0.8b"
55
+ LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
56
 
57
 
58
  # ---------------------------
 
60
  # ---------------------------
61
 
62
  class ChatRequest(BaseModel):
63
+ model: str = DEFAULT_MODEL
64
  messages: list
65
  stream: bool = True
66
+ options: Optional[dict] = None
67
 
68
 
69
  class GenerateRequest(BaseModel):
70
+ model: str = DEFAULT_MODEL
71
  prompt: str
72
+ stream: bool = False
73
+ options: Optional[dict] = None
74
 
75
 
76
  # ---------------------------
77
+ # PROMPT BUILDER (Qwen3 ChatML)
78
  # ---------------------------
79
 
80
+ def build_prompt(messages: list) -> str:
81
+ """
82
+ Qwen3 uses ChatML format:
83
+ <|im_start|>system\n…<|im_end|>
84
+ <|im_start|>user\n…<|im_end|>
85
+ <|im_start|>assistant\n
86
+ """
87
  prompt = ""
88
+ has_system = any(m.get("role") == "system" for m in messages)
89
+
90
+ if not has_system:
91
+ prompt += "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
92
 
93
  for m in messages:
94
+ role = m.get("role", "user")
95
  content = m.get("content", "").strip()
 
96
  if not content:
97
  continue
98
+ if role == "system":
99
+ prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
100
+ elif role == "user":
101
+ prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
102
  elif role == "assistant":
103
+ prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
 
 
104
 
105
+ prompt += "<|im_start|>assistant\n"
106
  return prompt
107
 
108
 
109
  # ---------------------------
110
+ # MODEL RESOLVER
111
  # ---------------------------
112
 
113
+ def resolve_model(name: str) -> dict:
114
+ """Fuzzy match model name → config. Falls back to default."""
115
+ name = (name or DEFAULT_MODEL).lower().strip()
116
+ if name in MODELS:
117
+ return MODELS[name]
118
+ # partial match
119
+ for key, cfg in MODELS.items():
120
+ if key in name or name in key:
121
+ return cfg
122
+ return MODELS[DEFAULT_MODEL]
123
 
 
124
 
125
+ # ---------------------------
126
+ # DOWNLOAD MODELS
127
+ # ---------------------------
128
 
129
+ os.makedirs("models", exist_ok=True)
 
 
 
130
 
131
+ def download_model(cfg: dict):
132
+ if not os.path.exists(cfg["path"]):
133
+ print(f"Downloading {cfg['file']} ...")
134
+ downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])
135
+ os.system(f"cp '{downloaded}' '{cfg['path']}'")
136
+ print(f" ✓ saved to {cfg['path']}")
137
 
138
+ for m in MODELS.values():
139
+ download_model(m)
140
 
141
 
142
  # ---------------------------
143
+ # START LLAMA SERVERS
144
  # ---------------------------
145
 
146
+ _server_ready: dict[str, bool] = {k: False for k in MODELS}
 
 
 
 
 
147
 
148
+ def start_llama(model_name: str, cfg: dict):
149
+ print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
 
150
 
151
  process = subprocess.Popen([
152
  LLAMA_SERVER,
153
+ "-m", cfg["path"],
154
+ "--host", "0.0.0.0",
155
+ "--port", str(cfg["port"]),
156
+ "-c", str(cfg["ctx"]),
157
+ "--threads", str(cfg["threads"]),
158
+ "--batch-size",str(cfg["batch"]),
159
+ "-ngl", "0", # CPU only
160
+ "--mlock", # pin model in RAM → no swap
161
+ "--flash-attn", # faster attention (if supported, harmless if not)
162
+ "-np", "1", # 1 parallel slot (we only have 2 CPUs)
163
+ "--no-mmap", # mlock + no-mmap = fastest cold reads
164
+ ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
165
+
166
+ url = f"http://localhost:{cfg['port']}/health"
167
+ for i in range(60):
168
+ try:
169
+ r = requests.get(url, timeout=2)
170
+ if r.status_code == 200:
171
+ _server_ready[model_name] = True
172
+ print(f" ✓ {model_name} ready")
173
+ return process
174
+ except Exception:
175
+ pass
176
+ print(f" waiting for {model_name}... ({i+1}/60)")
177
+ time.sleep(2)
178
 
179
+ print(f" {model_name} failed to start")
180
+ return None
181
 
 
 
182
 
183
+ for name, cfg in MODELS.items():
184
+ threading.Thread(target=start_llama, args=(name, cfg), daemon=True).start()
 
185
 
 
 
 
186
 
187
+ # ---------------------------
188
+ # HELPERS
189
+ # ---------------------------
 
 
 
 
 
 
190
 
191
+ def model_meta(name: str, cfg: dict) -> dict:
192
+ size = os.path.getsize(cfg["path"]) if os.path.exists(cfg["path"]) else 0
193
+ digest = ""
194
+ if os.path.exists(cfg["path"]):
195
+ with open(cfg["path"], "rb") as f:
196
+ digest = hashlib.md5(f.read(65536)).hexdigest() # partial hash for speed
197
 
198
+ return {
199
+ "name": name,
200
+ "model": name,
201
+ "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
202
+ "size": size,
203
+ "digest": f"sha256:{digest}",
204
+ "details": {
205
+ "format": "gguf",
206
+ "family": cfg["family"],
207
+ "families": [cfg["family"]],
208
+ "parameter_size": cfg["param_size"],
209
+ "quantization_level": "Q4_K_M",
210
+ },
211
+ }
212
 
213
 
214
+ def llama_params(options: Optional[dict]) -> dict:
215
+ """Map Ollama options → llama.cpp completion params."""
216
+ o = options or {}
217
+ return {
218
+ "temperature": o.get("temperature", 0.7),
219
+ "top_p": o.get("top_p", 0.9),
220
+ "top_k": o.get("top_k", 40),
221
+ "repeat_penalty": o.get("repeat_penalty", 1.1),
222
+ "n_predict": o.get("num_predict", 1024),
223
+ "stop": o.get("stop", ["<|im_end|>", "<|endoftext|>"]),
224
+ }
225
 
226
 
227
  # ---------------------------
 
230
 
231
  @app.get("/")
232
  def root():
233
+ return {"status": "running", "models": list(MODELS.keys())}
234
 
235
 
236
  # ---------------------------
237
+ # /api/tags — model list
238
  # ---------------------------
239
 
240
  @app.get("/api/tags")
241
  def tags():
242
+ return {"models": [model_meta(n, c) for n, c in MODELS.items()]}
243
+
244
+
245
+ # ---------------------------
246
+ # /api/show — model detail (needed by some UIs)
247
+ # ---------------------------
248
 
249
+ @app.post("/api/show")
250
+ def show(body: dict):
251
+ name = body.get("name", DEFAULT_MODEL)
252
+ cfg = resolve_model(name)
253
+ meta = model_meta(name, cfg)
254
+ meta["modelfile"] = f"FROM {name}\n"
255
+ meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
256
+ meta["template"] = (
257
+ "<|im_start|>system\n{{ .System }}<|im_end|>\n"
258
+ "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
259
+ "<|im_start|>assistant\n"
260
+ )
261
+ return meta
262
 
 
 
263
 
264
+ # ---------------------------
265
+ # /api/ps — running models
266
+ # ---------------------------
267
+
268
+ @app.get("/api/ps")
269
+ def ps():
270
+ running = []
271
+ for name, cfg in MODELS.items():
272
+ if _server_ready.get(name):
273
+ m = model_meta(name, cfg)
274
+ m["expires_at"] = "0001-01-01T00:00:00Z"
275
+ m["size_vram"] = 0
276
+ running.append(m)
277
+ return {"models": running}
 
 
 
 
278
 
279
 
280
  # ---------------------------
281
+ # /api/generate
282
  # ---------------------------
283
 
284
  @app.post("/api/generate")
285
  def generate(req: GenerateRequest):
286
+ cfg = resolve_model(req.model)
287
+ port = cfg["port"]
288
+ params = llama_params(req.options)
289
+ params["prompt"] = req.prompt
290
+ params["stream"] = req.stream
291
 
292
  r = requests.post(
293
+ f"http://localhost:{port}/completion",
294
+ json=params,
295
+ stream=req.stream,
296
+ timeout=120,
 
297
  )
298
 
299
+ if not req.stream:
300
+ text = r.json().get("content", "").strip()
301
+ return {
302
+ "model": req.model,
303
+ "response": text,
304
+ "done": True,
305
+ "done_reason":"stop",
306
+ }
307
 
308
+ def stream_gen():
309
+ for line in r.iter_lines():
310
+ if not line:
311
+ continue
312
+ line = line.decode("utf-8").strip()
313
+ if line.startswith("data:"):
314
+ line = line[5:].strip()
315
+ try:
316
+ data = json.loads(line)
317
+ except Exception:
318
+ continue
319
+ token = data.get("content", "")
320
+ done = data.get("stop", False)
321
+ yield json.dumps({
322
+ "model": req.model,
323
+ "response": token,
324
+ "done": done,
325
+ }) + "\n"
326
+ if done:
327
+ break
328
+ yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"
329
 
330
+ return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
331
+ headers={"Cache-Control": "no-cache"})
 
 
 
332
 
333
 
334
  # ---------------------------
335
+ # /api/chat
336
  # ---------------------------
337
 
338
  @app.post("/api/chat")
339
  def chat(req: ChatRequest):
340
+ cfg = resolve_model(req.model)
341
+ port = cfg["port"]
342
  prompt = build_prompt(req.messages)
343
+ params = llama_params(req.options)
344
+ params["prompt"] = prompt
345
+ params["stream"] = req.stream
346
 
347
  r = requests.post(
348
+ f"http://localhost:{port}/completion",
349
+ json=params,
350
+ stream=req.stream,
351
+ timeout=120,
 
 
 
 
 
 
 
352
  )
353
 
354
  if not req.stream:
355
+ text = r.json().get("content", "").strip()
 
 
 
356
  return JSONResponse({
357
+ "model": req.model,
358
+ "message": {"role": "assistant", "content": text},
359
+ "done": True,
360
+ "done_reason": "stop",
 
 
361
  })
362
 
363
+ def stream_gen():
 
364
  for line in r.iter_lines():
 
365
  if not line:
366
  continue
 
367
  line = line.decode("utf-8").strip()
 
368
  if line.startswith("data:"):
369
  line = line[5:].strip()
 
370
  try:
371
  data = json.loads(line)
372
+ except Exception:
373
  continue
 
374
  token = data.get("content", "")
375
+ done = data.get("stop", False)
376
  yield json.dumps({
377
+ "model": req.model,
378
+ "message": {"role": "assistant", "content": token},
379
+ "done": done,
 
 
 
380
  }) + "\n"
381
+ if done:
382
+ break
383
  yield json.dumps({
384
+ "model": req.model, "done": True, "done_reason": "stop"
 
 
385
  }) + "\n"
386
 
387
+ return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
388
+ headers={"Cache-Control": "no-cache"})
 
 
 
389
 
390
 
391
  # ---------------------------
392
+ # START
393
  # ---------------------------
394
 
395
  if __name__ == "__main__":
396
+ uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)