# llm-api / server.py
# (Hugging Face Space page header: mrmadblack — "Update server.py", commit 2304b0a verified)
"""
Ollama-compatible API server
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
🧠 qwen3.5-4b β†’ thinking, hard problems (port 8081)
🌐 gemma3-4b β†’ translation, general chat (port 8082)
πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
NO extra packages β€” web search uses only requests (already installed)
Downloads + server starts run in background β€” port 7860 binds instantly
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
import hashlib
import html
import json
import os
import re
import shutil
import subprocess
import threading
import time
import urllib.parse
from typing import Optional

import requests
import uvicorn
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import StreamingResponse, JSONResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
app = FastAPI()
# ---------------------------
# MODEL CONFIGS
# ---------------------------
# One entry per model; each model is served by its own llama-server process.
#   path       - local GGUF file location (under ./models)
#   repo/file  - Hugging Face repo id and filename to download from
#   port       - localhost port this model's llama-server listens on
#   param_size - human-readable parameter count (reported via /api/tags)
#   family     - model family string (reported via /api/tags)
#   fmt        - prompt template: "chatml" (Qwen) or "gemma" (Gemma 3)
#   web_search - when True, /api/chat injects DuckDuckGo results as context
#   threads    - CPU threads for llama-server
#   ctx        - context window size (tokens)
#   batch      - llama-server batch size
MODELS = {
    "qwen2.5-coder-1.5b": {
        "path": "models/qwen2.5-coder-1.5b.gguf",
        "repo": "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF",
        "file": "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf",
        "port": 8080,
        "param_size": "1.5B",
        "family": "qwen2.5",
        "fmt": "chatml",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "qwen3.5-4b": {
        "path": "models/qwen3.5-4b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
        "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
        "port": 8081,
        "param_size": "4B",
        "family": "qwen3.5",
        "fmt": "chatml",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "gemma3-4b": {
        "path": "models/gemma3-4b.gguf",
        "repo": "bartowski/google_gemma-3-4b-it-GGUF",
        "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
        "port": 8082,
        "param_size": "4B",
        "family": "gemma3",
        "fmt": "gemma",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "qwen3.5-0.8b": {
        "path": "models/qwen3.5-0.8b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
        "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
        "port": 8083,
        "param_size": "0.8B",
        "family": "qwen3.5",
        "fmt": "chatml",
        "web_search": True,  # the "internet" model: every chat gets web context
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
}
# Fallback model when the requested name doesn't resolve to anything.
DEFAULT_MODEL = "qwen2.5-coder-1.5b"
# Path to the llama.cpp server binary (expected to be pre-built in the image).
LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
# Per-model readiness flags, flipped to True by start_llama() once /health is up.
_server_ready: dict = {k: False for k in MODELS}
# ---------------------------
# REQUEST MODELS
# ---------------------------
class ChatRequest(BaseModel):
    """Body of an Ollama-style POST /api/chat request."""
    model: str = DEFAULT_MODEL      # model name; fuzzy-resolved by resolve_model()
    messages: list                  # list of {"role": ..., "content": ...} dicts
    stream: bool = True             # NDJSON streaming response when True
    options: Optional[dict] = None  # Ollama-style sampling options (temperature, ...)
class GenerateRequest(BaseModel):
    """Body of an Ollama-style POST /api/generate request (raw prompt, no chat)."""
    model: str = DEFAULT_MODEL      # model name; fuzzy-resolved by resolve_model()
    prompt: str                     # raw prompt text forwarded verbatim
    stream: bool = False            # NDJSON streaming response when True
    options: Optional[dict] = None  # Ollama-style sampling options (temperature, ...)
# ---------------------------
# WEB SEARCH β€” pure requests, no extra package
# ---------------------------
def web_search(query: str, max_results: int = 3) -> str:
    """
    DuckDuckGo search using only the `requests` library.

    Scrapes DDG's HTML endpoint β€” no API key, no JS, no extra packages β€” and
    returns a formatted context block for prompt injection.

    Args:
        query: free-text search query.
        max_results: maximum number of results to include.

    Returns:
        A "=== Web Search Results ===" block, or "" on any failure
        (network error, non-200 response, no parseable results).
    """
    try:
        encoded = urllib.parse.quote(query)
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)",
        }
        # Use DDG lite HTML endpoint β€” most reliable, no JS required
        resp = requests.get(
            f"https://html.duckduckgo.com/html/?q={encoded}",
            headers=headers,
            timeout=8,
        )
        if resp.status_code != 200:
            return ""
        page = resp.text
        # Extract titles / snippets / urls with simple regexes; a real HTML
        # parser would be sturdier, but the goal is zero extra dependencies.
        titles = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', page, re.DOTALL)
        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</div>', page, re.DOTALL)
        urls = re.findall(r'class="result__url"[^>]*>(.*?)</span>', page, re.DOTALL)

        def strip_tags(text: str) -> str:
            # Strip residual markup AND decode entities (&amp;, &#39;, ...).
            # Fix: the original left entities in place, leaking raw "&amp;"
            # and friends into the injected prompt context.
            return html.unescape(re.sub(r'<[^>]+>', '', text)).strip()

        count = min(max_results, len(titles), len(snippets))
        if count == 0:
            return ""
        context = "=== Web Search Results ===\n"
        for i in range(count):
            title = strip_tags(titles[i])
            snippet = strip_tags(snippets[i])
            url = strip_tags(urls[i]) if i < len(urls) else ""
            context += f"\n[{i+1}] {title}\n{snippet}\nSource: {url}\n"
        context += "\n=== End of Web Results ===\n"
        return context
    except Exception as e:
        # Best-effort: a search failure must never break the chat request.
        print(f" [web_search] error: {e}")
        return ""
def inject_web_context(messages: list) -> list:
    """Insert a system message carrying DuckDuckGo results before the last user turn.

    Returns the original list unchanged when there are no messages, no user
    message, or the search produced no results.
    """
    if not messages:
        return messages
    last_user = None
    for msg in reversed(messages):
        if msg.get("role") == "user":
            last_user = msg
            break
    if last_user is None:
        return messages
    user_text = last_user.get("content", "")
    print(f" [web_search] searching: {user_text[:60]}...")
    context = web_search(user_text)
    if not context:
        print(" [web_search] no results, continuing without web context")
        return messages
    print(f" [web_search] injected {len(context)} chars of context")
    web_system = {
        "role": "system",
        "content": (
            "You have access to the following real-time web search results. "
            "Use them to answer the user's question accurately and concisely. "
            "Always mention the source when using web data. "
            "If the results are not relevant, rely on your own knowledge.\n\n"
            + context
        )
    }
    # Rebuild the list with the web context immediately before the last user turn.
    insert_at = next(i for i, msg in enumerate(messages) if msg is last_user)
    return messages[:insert_at] + [web_system] + messages[insert_at:]
# ---------------------------
# PROMPT BUILDER
# ---------------------------
def build_prompt(messages: list, fmt: str = "chatml") -> str:
    """Render an Ollama-style message list into a raw prompt string.

    fmt="gemma" uses Gemma's <start_of_turn> template (system messages are
    folded into user turns, since Gemma has no system role); any other fmt
    uses ChatML (Qwen 2.5 / 3.5). Messages with empty/whitespace-only
    content are skipped; a default system prompt is added for ChatML when
    the conversation has none.
    """
    if fmt == "gemma":
        pieces = ["<bos>"]
        for msg in messages:
            role = msg.get("role", "user")
            text = msg.get("content", "").strip()
            if not text:
                continue
            if role == "system":
                # Gemma has no system role; fold it into a tagged user turn.
                pieces.append(f"<start_of_turn>user\n[Context] {text}<end_of_turn>\n")
            elif role == "user":
                pieces.append(f"<start_of_turn>user\n{text}<end_of_turn>\n")
            elif role == "assistant":
                pieces.append(f"<start_of_turn>model\n{text}<end_of_turn>\n")
        pieces.append("<start_of_turn>model\n")
        return "".join(pieces)
    # ChatML (Qwen2.5, Qwen3.5)
    pieces = []
    if not any(msg.get("role") == "system" for msg in messages):
        pieces.append("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
    known_roles = ("system", "user", "assistant")
    for msg in messages:
        role = msg.get("role", "user")
        text = msg.get("content", "").strip()
        if not text or role not in known_roles:
            continue
        pieces.append(f"<|im_start|>{role}\n{text}<|im_end|>\n")
    pieces.append("<|im_start|>assistant\n")
    return "".join(pieces)
# ---------------------------
# MODEL RESOLVER
# ---------------------------
def resolve_model(name: str) -> str:
    """Map a requested model name onto a configured MODELS key.

    Exact (case-insensitive) matches win; otherwise the first key related by
    substring in either direction (e.g. "qwen3.5" -> "qwen3.5-4b", or a
    tagged name like "gemma3-4b:latest") is used; falls back to DEFAULT_MODEL.
    """
    wanted = (name or DEFAULT_MODEL).lower().strip()
    if wanted in MODELS:
        return wanted
    candidates = [key for key in MODELS if key in wanted or wanted in key]
    return candidates[0] if candidates else DEFAULT_MODEL
# ---------------------------
# DOWNLOAD + START (all in background)
# ---------------------------
def download_model(cfg: dict):
    """Fetch the model's GGUF from Hugging Face if it is not already on disk.

    Downloads into the HF cache via hf_hub_download, then copies the file to
    cfg["path"] so llama-server has a stable, predictable local path.
    No-op when cfg["path"] already exists.
    """
    if not os.path.exists(cfg["path"]):
        print(f"Downloading {cfg['file']} ...")
        downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])
        # Fix: use shutil.copy instead of os.system("cp '...'") β€” portable,
        # no shell invocation, and immune to quoting issues in cache paths.
        shutil.copy(downloaded, cfg["path"])
        print(f" βœ“ saved to {cfg['path']}")
def start_llama(model_name: str, cfg: dict):
    """Download (if needed) and launch the llama-server process for one model.

    Blocks until the server's /health endpoint returns 200 β€” polled every 2s
    for up to 90 attempts (~180s) β€” then flips _server_ready[model_name].

    Returns:
        The subprocess.Popen handle on success, None on timeout.
    """
    download_model(cfg)
    print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
    # stdout + stderr go to a per-model log file so startup errors are inspectable.
    log = open(f"llama_{model_name}.log", "w")
    process = subprocess.Popen([
        LLAMA_SERVER,
        "-m", cfg["path"],
        "--host", "0.0.0.0",
        "--port", str(cfg["port"]),
        "-c", str(cfg["ctx"]),
        "--threads", str(cfg["threads"]),
        "--batch-size", str(cfg["batch"]),
        "-ngl", "0",   # no GPU layers β€” CPU-only inference
        "-np", "1",    # a single parallel slot per server
    ], stdout=log, stderr=log)
    # Poll /health until the model has finished loading into memory.
    url = f"http://localhost:{cfg['port']}/health"
    for i in range(90):
        time.sleep(2)
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                _server_ready[model_name] = True
                print(f" βœ“ {model_name} ready (took ~{(i+1)*2}s)")
                return process
        except Exception:
            pass  # server not accepting connections yet β€” keep waiting
        # While waiting, echo the last log line as progress feedback.
        try:
            with open(f"llama_{model_name}.log") as lf:
                lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
                print(f" [{model_name}] {lines[-1] if lines else 'starting...'}")
        except Exception:
            print(f" waiting for {model_name}... ({i+1}/90)")
    print(f" βœ— {model_name} failed β€” check llama_{model_name}.log")
    return None
def setup_all():
    """Background bootstrap: download every model and start its llama-server.

    Each model gets its own daemon thread so downloads/startups overlap and
    nothing blocks uvicorn from binding port 7860 immediately.
    """
    os.makedirs("models", exist_ok=True)
    for name in MODELS:
        worker = threading.Thread(
            target=start_llama, args=(name, MODELS[name]), daemon=True
        )
        worker.start()


# Kick off everything in background immediately β€” uvicorn binds port 7860 first
threading.Thread(target=setup_all, daemon=True).start()
# ---------------------------
# READINESS GUARD
# ---------------------------
def wait_for_model(model_key: str, timeout: int = 300):
    """Block until the model's llama-server reports ready.

    Polls the _server_ready flag once per second; raises HTTPException 503
    if the model has not come up within `timeout` seconds.
    """
    started = time.monotonic()
    while time.monotonic() - started < timeout:
        if _server_ready.get(model_key):
            return
        time.sleep(1)
    raise HTTPException(
        status_code=503,
        detail=f"Model '{model_key}' is still loading. Please wait and retry."
    )
# ---------------------------
# HELPERS
# ---------------------------
def model_meta(name: str, cfg: dict) -> dict:
    """Build an Ollama-style model description for /api/tags, /api/show, /api/ps.

    The digest hashes only the first 64 KiB of the GGUF β€” a cheap fingerprint,
    not a full-file checksum. A missing file yields size 0 and an empty digest.
    """
    size = os.path.getsize(cfg["path"]) if os.path.exists(cfg["path"]) else 0
    digest = ""
    if os.path.exists(cfg["path"]):
        with open(cfg["path"], "rb") as f:
            # Fix: use SHA-256 so the value matches the advertised "sha256:"
            # prefix (previously an MD5 hash was mislabeled as sha256).
            digest = hashlib.sha256(f.read(65536)).hexdigest()
    return {
        "name": name,
        "model": name,
        # Fix: format UTC (gmtime) to match the trailing "Z"; the original
        # formatted local time while claiming Zulu.
        "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "size": size,
        "digest": f"sha256:{digest}",
        "details": {
            "format": "gguf",
            "family": cfg["family"],
            "families": [cfg["family"]],
            "parameter_size": cfg["param_size"],
            "quantization_level": "Q4_K_M",
        },
    }
def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict:
    """Translate Ollama-style sampling options into llama.cpp /completion params.

    Unspecified options fall back to sensible defaults; the stop-token list
    defaults to the template's end-of-turn markers for the given `fmt`.
    """
    opts = options or {}
    default_stop = (
        ["<end_of_turn>", "<eos>"]
        if fmt == "gemma"
        else ["<|im_end|>", "<|endoftext|>", "</think>"]
    )
    params = {
        "temperature": opts.get("temperature", 0.7),
        "top_p": opts.get("top_p", 0.9),
        "top_k": opts.get("top_k", 40),
        "repeat_penalty": opts.get("repeat_penalty", 1.1),
        "n_predict": opts.get("num_predict", 1024),
        "stop": opts.get("stop", default_stop),
    }
    return params
# ---------------------------
# ROOT
# ---------------------------
@app.get("/")
def root():
    """Landing endpoint: per-model readiness plus human-readable routing hints."""
    usage = {
        "⚑ fast coding": "qwen2.5-coder-1.5b",
        "🧠 thinking": "qwen3.5-4b (add /think to message)",
        "🌐 translation": "gemma3-4b",
        "πŸ” internet/news": "qwen3.5-0.8b (auto web search every message)",
    }
    return {
        "status": "running",
        "models_ready": dict(_server_ready),
        "usage": usage,
    }
# ---------------------------
# /health (HEAD for UptimeRobot)
# ---------------------------
@app.head("/health")
def health_head():
    """Bare 200 for HEAD-based uptime monitors (e.g. UptimeRobot)."""
    return Response(status_code=200)
@app.get("/health")
def health_get():
    """Health probe: `ready` is True only once every model server is up."""
    all_up = all(flag for flag in _server_ready.values())
    return {"status": "ok", "ready": all_up}
# ---------------------------
# /api/tags
# ---------------------------
@app.get("/api/tags")
def tags():
    """Ollama /api/tags: list every configured model with its metadata."""
    listing = []
    for model_name, cfg in MODELS.items():
        listing.append(model_meta(model_name, cfg))
    return {"models": listing}
# ---------------------------
# /api/show
# ---------------------------
@app.post("/api/show")
def show(body: dict):
    """Ollama /api/show: metadata, modelfile and prompt template for one model."""
    key = resolve_model(body.get("name", DEFAULT_MODEL))
    cfg = MODELS[key]
    meta = model_meta(key, cfg)
    meta["modelfile"] = f"FROM {key}\n"
    meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
    if cfg["fmt"] == "gemma":
        template = "{{ .Prompt }}"
    else:
        # ChatML template in Ollama's Go-template syntax.
        template = (
            "<|im_start|>system\n{{ .System }}<|im_end|>\n"
            "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
    meta["template"] = template
    return meta
# ---------------------------
# /api/ps
# ---------------------------
@app.get("/api/ps")
def ps():
    """Ollama /api/ps: list only the models whose llama-server is ready."""
    running = []
    for model_name, cfg in MODELS.items():
        if not _server_ready.get(model_name):
            continue
        entry = model_meta(model_name, cfg)
        entry["expires_at"] = "0001-01-01T00:00:00Z"  # never unloaded
        entry["size_vram"] = 0  # CPU-only: nothing resident in VRAM
        running.append(entry)
    return {"models": running}
# ---------------------------
# /api/generate
# ---------------------------
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Ollama /api/generate: raw-prompt completion proxied to llama-server.

    Waits for the target model to be ready (503 if still loading), forwards
    the prompt to that model's llama.cpp /completion endpoint, and returns
    either a single JSON object (stream=False) or NDJSON chunks (stream=True).
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)  # raises HTTPException 503 while the model loads
    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = req.prompt
    params["stream"] = req.stream
    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=180,
    )
    if not req.stream:
        text = r.json().get("content", "").strip()
        return {"model": req.model, "response": text, "done": True, "done_reason": "stop"}

    def stream_gen():
        # llama.cpp streams SSE-style lines ("data: {...}"); convert each
        # token payload into an Ollama-style NDJSON chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip keep-alives / non-JSON noise
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({"model": req.model, "response": token, "done": done}) + "\n"
            if done:
                break
        # Final terminator chunk so clients always see a done=True record.
        yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
# ---------------------------
# /api/chat
# ---------------------------
@app.post("/api/chat")
def chat(req: ChatRequest):
    """Ollama /api/chat: chat completion proxied to llama-server.

    Resolves the model, waits until it is ready (503 if still loading),
    injects DuckDuckGo context for models configured with web_search=True,
    renders the model-specific prompt, and forwards it to that model's
    llama.cpp /completion endpoint. Returns a single JSON object
    (stream=False) or Ollama-style NDJSON chunks (stream=True).
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)  # raises HTTPException 503 while the model loads
    messages = req.messages
    if cfg.get("web_search", False):
        messages = inject_web_context(messages)
    prompt = build_prompt(messages, fmt=cfg["fmt"])
    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = prompt
    params["stream"] = req.stream
    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=180,
    )
    if not req.stream:
        text = r.json().get("content", "").strip()
        return JSONResponse({
            "model": req.model,
            "message": {"role": "assistant", "content": text},
            "done": True, "done_reason": "stop",
        })

    def stream_gen():
        # llama.cpp streams SSE-style lines ("data: {...}"); convert each
        # token payload into an Ollama-style chat NDJSON chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip keep-alives / non-JSON noise
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({
                "model": req.model,
                "message": {"role": "assistant", "content": token},
                "done": done,
            }) + "\n"
            if done:
                break
        # Final terminator chunk so clients always see a done=True record.
        yield json.dumps({"model": req.model, "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
# ---------------------------
# START
# ---------------------------
if __name__ == "__main__":
    # 7860 is the standard Hugging Face Spaces port; model downloads and
    # llama-server startups already run in background threads (setup_all),
    # so this binds immediately.
    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)