# llm-api / server.py
# (Hugging Face Space page header: mrmadblack — "Update server.py", commit 2304b0a verified)
"""
Ollama-compatible API server
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
🧠 qwen3.5-4b β†’ thinking, hard problems (port 8081)
🌐 gemma3-4b β†’ translation, general chat (port 8082)
πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
NO extra packages β€” web search uses only requests (already installed)
Downloads + server starts run in background β€” port 7860 binds instantly
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
import hashlib
import html
import json
import os
import re
import shutil
import subprocess
import threading
import time
import urllib.parse
from typing import Optional

import requests
import uvicorn
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import StreamingResponse, JSONResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
app = FastAPI()
# ---------------------------
# MODEL CONFIGS
# ---------------------------
# One entry per model; each model is served by its own llama-server process.
#   path       - local GGUF file location (under ./models)
#   repo/file  - Hugging Face repo id and filename to download from
#   port       - localhost port this model's llama-server listens on
#   param_size - human-readable parameter count (reported via /api/tags)
#   family     - model family string (reported via /api/tags)
#   fmt        - prompt template: "chatml" (Qwen) or "gemma" (Gemma 3)
#   web_search - when True, /api/chat injects DuckDuckGo results as context
#   threads    - CPU threads for llama-server
#   ctx        - context window size (tokens)
#   batch      - llama-server batch size
MODELS = {
    "qwen2.5-coder-1.5b": {
        "path": "models/qwen2.5-coder-1.5b.gguf",
        "repo": "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF",
        "file": "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf",
        "port": 8080,
        "param_size": "1.5B",
        "family": "qwen2.5",
        "fmt": "chatml",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "qwen3.5-4b": {
        "path": "models/qwen3.5-4b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
        "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
        "port": 8081,
        "param_size": "4B",
        "family": "qwen3.5",
        "fmt": "chatml",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "gemma3-4b": {
        "path": "models/gemma3-4b.gguf",
        "repo": "bartowski/google_gemma-3-4b-it-GGUF",
        "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
        "port": 8082,
        "param_size": "4B",
        "family": "gemma3",
        "fmt": "gemma",
        "web_search": False,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
    "qwen3.5-0.8b": {
        "path": "models/qwen3.5-0.8b.gguf",
        "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
        "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
        "port": 8083,
        "param_size": "0.8B",
        "family": "qwen3.5",
        "fmt": "chatml",
        "web_search": True,  # the "internet" model: every chat gets web context
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    },
}
# Fallback model when the requested name doesn't resolve to anything.
DEFAULT_MODEL = "qwen2.5-coder-1.5b"
# Path to the llama.cpp server binary (expected to be pre-built in the image).
LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
# Per-model readiness flags, flipped to True by start_llama() once /health is up.
_server_ready: dict = {k: False for k in MODELS}
# ---------------------------
# REQUEST MODELS
# ---------------------------
class ChatRequest(BaseModel):
    """Body of an Ollama-style POST /api/chat request."""
    model: str = DEFAULT_MODEL      # model name; fuzzy-resolved by resolve_model()
    messages: list                  # list of {"role": ..., "content": ...} dicts
    stream: bool = True             # NDJSON streaming response when True
    options: Optional[dict] = None  # Ollama-style sampling options (temperature, ...)
class GenerateRequest(BaseModel):
    """Body of an Ollama-style POST /api/generate request (raw prompt, no chat)."""
    model: str = DEFAULT_MODEL      # model name; fuzzy-resolved by resolve_model()
    prompt: str                     # raw prompt text forwarded verbatim
    stream: bool = False            # NDJSON streaming response when True
    options: Optional[dict] = None  # Ollama-style sampling options (temperature, ...)
# ---------------------------
# WEB SEARCH β€” pure requests, no extra package
# ---------------------------
def web_search(query: str, max_results: int = 3) -> str:
    """
    DuckDuckGo search using only the `requests` library.

    Scrapes DDG's HTML endpoint β€” no API key, no JS, no extra packages β€” and
    returns a formatted context block for prompt injection.

    Args:
        query: free-text search query.
        max_results: maximum number of results to include.

    Returns:
        A "=== Web Search Results ===" block, or "" on any failure
        (network error, non-200 response, no parseable results).
    """
    try:
        encoded = urllib.parse.quote(query)
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)",
        }
        # Use DDG lite HTML endpoint β€” most reliable, no JS required
        resp = requests.get(
            f"https://html.duckduckgo.com/html/?q={encoded}",
            headers=headers,
            timeout=8,
        )
        if resp.status_code != 200:
            return ""
        page = resp.text
        # Extract titles / snippets / urls with simple regexes; a real HTML
        # parser would be sturdier, but the goal is zero extra dependencies.
        titles = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', page, re.DOTALL)
        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</div>', page, re.DOTALL)
        urls = re.findall(r'class="result__url"[^>]*>(.*?)</span>', page, re.DOTALL)

        def strip_tags(text: str) -> str:
            # Strip residual markup AND decode entities (&amp;, &#39;, ...).
            # Fix: the original left entities in place, leaking raw "&amp;"
            # and friends into the injected prompt context.
            return html.unescape(re.sub(r'<[^>]+>', '', text)).strip()

        count = min(max_results, len(titles), len(snippets))
        if count == 0:
            return ""
        context = "=== Web Search Results ===\n"
        for i in range(count):
            title = strip_tags(titles[i])
            snippet = strip_tags(snippets[i])
            url = strip_tags(urls[i]) if i < len(urls) else ""
            context += f"\n[{i+1}] {title}\n{snippet}\nSource: {url}\n"
        context += "\n=== End of Web Results ===\n"
        return context
    except Exception as e:
        # Best-effort: a search failure must never break the chat request.
        print(f" [web_search] error: {e}")
        return ""
def inject_web_context(messages: list) -> list:
    """Insert a system message carrying DuckDuckGo results before the last user turn.

    Returns the original list unchanged when there are no messages, no user
    message, or the search produced no results.
    """
    if not messages:
        return messages
    last_user = None
    for msg in reversed(messages):
        if msg.get("role") == "user":
            last_user = msg
            break
    if last_user is None:
        return messages
    user_text = last_user.get("content", "")
    print(f" [web_search] searching: {user_text[:60]}...")
    context = web_search(user_text)
    if not context:
        print(" [web_search] no results, continuing without web context")
        return messages
    print(f" [web_search] injected {len(context)} chars of context")
    web_system = {
        "role": "system",
        "content": (
            "You have access to the following real-time web search results. "
            "Use them to answer the user's question accurately and concisely. "
            "Always mention the source when using web data. "
            "If the results are not relevant, rely on your own knowledge.\n\n"
            + context
        )
    }
    # Rebuild the list with the web context immediately before the last user turn.
    insert_at = next(i for i, msg in enumerate(messages) if msg is last_user)
    return messages[:insert_at] + [web_system] + messages[insert_at:]
# ---------------------------
# PROMPT BUILDER
# ---------------------------
def build_prompt(messages: list, fmt: str = "chatml") -> str:
    """Render an Ollama-style message list into a raw prompt string.

    fmt="gemma" uses Gemma's <start_of_turn> template (system messages are
    folded into user turns, since Gemma has no system role); any other fmt
    uses ChatML (Qwen 2.5 / 3.5). Messages with empty/whitespace-only
    content are skipped; a default system prompt is added for ChatML when
    the conversation has none.
    """
    if fmt == "gemma":
        pieces = ["<bos>"]
        for msg in messages:
            role = msg.get("role", "user")
            text = msg.get("content", "").strip()
            if not text:
                continue
            if role == "system":
                # Gemma has no system role; fold it into a tagged user turn.
                pieces.append(f"<start_of_turn>user\n[Context] {text}<end_of_turn>\n")
            elif role == "user":
                pieces.append(f"<start_of_turn>user\n{text}<end_of_turn>\n")
            elif role == "assistant":
                pieces.append(f"<start_of_turn>model\n{text}<end_of_turn>\n")
        pieces.append("<start_of_turn>model\n")
        return "".join(pieces)
    # ChatML (Qwen2.5, Qwen3.5)
    pieces = []
    if not any(msg.get("role") == "system" for msg in messages):
        pieces.append("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
    known_roles = ("system", "user", "assistant")
    for msg in messages:
        role = msg.get("role", "user")
        text = msg.get("content", "").strip()
        if not text or role not in known_roles:
            continue
        pieces.append(f"<|im_start|>{role}\n{text}<|im_end|>\n")
    pieces.append("<|im_start|>assistant\n")
    return "".join(pieces)
# ---------------------------
# MODEL RESOLVER
# ---------------------------
def resolve_model(name: str) -> str:
    """Map a requested model name onto a configured MODELS key.

    Exact (case-insensitive) matches win; otherwise the first key related by
    substring in either direction (e.g. "qwen3.5" -> "qwen3.5-4b", or a
    tagged name like "gemma3-4b:latest") is used; falls back to DEFAULT_MODEL.
    """
    wanted = (name or DEFAULT_MODEL).lower().strip()
    if wanted in MODELS:
        return wanted
    candidates = [key for key in MODELS if key in wanted or wanted in key]
    return candidates[0] if candidates else DEFAULT_MODEL
# ---------------------------
# DOWNLOAD + START (all in background)
# ---------------------------
def download_model(cfg: dict):
    """Fetch the model's GGUF from Hugging Face if it is not already on disk.

    Downloads into the HF cache via hf_hub_download, then copies the file to
    cfg["path"] so llama-server has a stable, predictable local path.
    No-op when cfg["path"] already exists.
    """
    if not os.path.exists(cfg["path"]):
        print(f"Downloading {cfg['file']} ...")
        downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])
        # Fix: use shutil.copy instead of os.system("cp '...'") β€” portable,
        # no shell invocation, and immune to quoting issues in cache paths.
        shutil.copy(downloaded, cfg["path"])
        print(f" βœ“ saved to {cfg['path']}")
def start_llama(model_name: str, cfg: dict):
    """Download (if needed) and launch the llama-server process for one model.

    Blocks until the server's /health endpoint returns 200 β€” polled every 2s
    for up to 90 attempts (~180s) β€” then flips _server_ready[model_name].

    Returns:
        The subprocess.Popen handle on success, None on timeout.
    """
    download_model(cfg)
    print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
    # stdout + stderr go to a per-model log file so startup errors are inspectable.
    log = open(f"llama_{model_name}.log", "w")
    process = subprocess.Popen([
        LLAMA_SERVER,
        "-m", cfg["path"],
        "--host", "0.0.0.0",
        "--port", str(cfg["port"]),
        "-c", str(cfg["ctx"]),
        "--threads", str(cfg["threads"]),
        "--batch-size", str(cfg["batch"]),
        "-ngl", "0",   # no GPU layers β€” CPU-only inference
        "-np", "1",    # a single parallel slot per server
    ], stdout=log, stderr=log)
    # Poll /health until the model has finished loading into memory.
    url = f"http://localhost:{cfg['port']}/health"
    for i in range(90):
        time.sleep(2)
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                _server_ready[model_name] = True
                print(f" βœ“ {model_name} ready (took ~{(i+1)*2}s)")
                return process
        except Exception:
            pass  # server not accepting connections yet β€” keep waiting
        # While waiting, echo the last log line as progress feedback.
        try:
            with open(f"llama_{model_name}.log") as lf:
                lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
                print(f" [{model_name}] {lines[-1] if lines else 'starting...'}")
        except Exception:
            print(f" waiting for {model_name}... ({i+1}/90)")
    print(f" βœ— {model_name} failed β€” check llama_{model_name}.log")
    return None
def setup_all():
    """Background bootstrap: download every model and start its llama-server.

    Each model gets its own daemon thread so downloads/startups overlap and
    nothing blocks uvicorn from binding port 7860 immediately.
    """
    os.makedirs("models", exist_ok=True)
    for name in MODELS:
        worker = threading.Thread(
            target=start_llama, args=(name, MODELS[name]), daemon=True
        )
        worker.start()


# Kick off everything in background immediately β€” uvicorn binds port 7860 first
threading.Thread(target=setup_all, daemon=True).start()
# ---------------------------
# READINESS GUARD
# ---------------------------
def wait_for_model(model_key: str, timeout: int = 300):
    """Block until the model's llama-server reports ready.

    Polls the _server_ready flag once per second; raises HTTPException 503
    if the model has not come up within `timeout` seconds.
    """
    started = time.monotonic()
    while time.monotonic() - started < timeout:
        if _server_ready.get(model_key):
            return
        time.sleep(1)
    raise HTTPException(
        status_code=503,
        detail=f"Model '{model_key}' is still loading. Please wait and retry."
    )
# ---------------------------
# HELPERS
# ---------------------------
def model_meta(name: str, cfg: dict) -> dict:
    """Build an Ollama-style model description for /api/tags, /api/show, /api/ps.

    The digest hashes only the first 64 KiB of the GGUF β€” a cheap fingerprint,
    not a full-file checksum. A missing file yields size 0 and an empty digest.
    """
    size = os.path.getsize(cfg["path"]) if os.path.exists(cfg["path"]) else 0
    digest = ""
    if os.path.exists(cfg["path"]):
        with open(cfg["path"], "rb") as f:
            # Fix: use SHA-256 so the value matches the advertised "sha256:"
            # prefix (previously an MD5 hash was mislabeled as sha256).
            digest = hashlib.sha256(f.read(65536)).hexdigest()
    return {
        "name": name,
        "model": name,
        # Fix: format UTC (gmtime) to match the trailing "Z"; the original
        # formatted local time while claiming Zulu.
        "modified_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "size": size,
        "digest": f"sha256:{digest}",
        "details": {
            "format": "gguf",
            "family": cfg["family"],
            "families": [cfg["family"]],
            "parameter_size": cfg["param_size"],
            "quantization_level": "Q4_K_M",
        },
    }
def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict:
    """Translate Ollama-style sampling options into llama.cpp /completion params.

    Unspecified options fall back to sensible defaults; the stop-token list
    defaults to the template's end-of-turn markers for the given `fmt`.
    """
    opts = options or {}
    default_stop = (
        ["<end_of_turn>", "<eos>"]
        if fmt == "gemma"
        else ["<|im_end|>", "<|endoftext|>", "</think>"]
    )
    params = {
        "temperature": opts.get("temperature", 0.7),
        "top_p": opts.get("top_p", 0.9),
        "top_k": opts.get("top_k", 40),
        "repeat_penalty": opts.get("repeat_penalty", 1.1),
        "n_predict": opts.get("num_predict", 1024),
        "stop": opts.get("stop", default_stop),
    }
    return params
# ---------------------------
# ROOT
# ---------------------------
@app.get("/")
def root():
    """Landing endpoint: per-model readiness plus human-readable routing hints."""
    usage = {
        "⚑ fast coding": "qwen2.5-coder-1.5b",
        "🧠 thinking": "qwen3.5-4b (add /think to message)",
        "🌐 translation": "gemma3-4b",
        "πŸ” internet/news": "qwen3.5-0.8b (auto web search every message)",
    }
    return {
        "status": "running",
        "models_ready": dict(_server_ready),
        "usage": usage,
    }
# ---------------------------
# /health (HEAD for UptimeRobot)
# ---------------------------
@app.head("/health")
def health_head():
    """Bare 200 for HEAD-based uptime monitors (e.g. UptimeRobot)."""
    return Response(status_code=200)
@app.get("/health")
def health_get():
    """Health probe: `ready` is True only once every model server is up."""
    all_up = all(flag for flag in _server_ready.values())
    return {"status": "ok", "ready": all_up}
# ---------------------------
# /api/tags
# ---------------------------
@app.get("/api/tags")
def tags():
    """Ollama /api/tags: list every configured model with its metadata."""
    listing = []
    for model_name, cfg in MODELS.items():
        listing.append(model_meta(model_name, cfg))
    return {"models": listing}
# ---------------------------
# /api/show
# ---------------------------
@app.post("/api/show")
def show(body: dict):
    """Ollama /api/show: metadata, modelfile and prompt template for one model."""
    key = resolve_model(body.get("name", DEFAULT_MODEL))
    cfg = MODELS[key]
    meta = model_meta(key, cfg)
    meta["modelfile"] = f"FROM {key}\n"
    meta["parameters"] = "num_ctx 2048\nnum_predict 1024"
    if cfg["fmt"] == "gemma":
        template = "{{ .Prompt }}"
    else:
        # ChatML template in Ollama's Go-template syntax.
        template = (
            "<|im_start|>system\n{{ .System }}<|im_end|>\n"
            "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
    meta["template"] = template
    return meta
# ---------------------------
# /api/ps
# ---------------------------
@app.get("/api/ps")
def ps():
    """Ollama /api/ps: list only the models whose llama-server is ready."""
    running = []
    for model_name, cfg in MODELS.items():
        if not _server_ready.get(model_name):
            continue
        entry = model_meta(model_name, cfg)
        entry["expires_at"] = "0001-01-01T00:00:00Z"  # never unloaded
        entry["size_vram"] = 0  # CPU-only: nothing resident in VRAM
        running.append(entry)
    return {"models": running}
# ---------------------------
# /api/generate
# ---------------------------
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Ollama /api/generate: raw-prompt completion proxied to llama-server.

    Waits for the target model to be ready (503 if still loading), forwards
    the prompt to that model's llama.cpp /completion endpoint, and returns
    either a single JSON object (stream=False) or NDJSON chunks (stream=True).
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)  # raises HTTPException 503 while the model loads
    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = req.prompt
    params["stream"] = req.stream
    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=180,
    )
    if not req.stream:
        text = r.json().get("content", "").strip()
        return {"model": req.model, "response": text, "done": True, "done_reason": "stop"}

    def stream_gen():
        # llama.cpp streams SSE-style lines ("data: {...}"); convert each
        # token payload into an Ollama-style NDJSON chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip keep-alives / non-JSON noise
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({"model": req.model, "response": token, "done": done}) + "\n"
            if done:
                break
        # Final terminator chunk so clients always see a done=True record.
        yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
# ---------------------------
# /api/chat
# ---------------------------
@app.post("/api/chat")
def chat(req: ChatRequest):
    """Ollama /api/chat: chat completion proxied to llama-server.

    Resolves the model, waits until it is ready (503 if still loading),
    injects DuckDuckGo context for models configured with web_search=True,
    renders the model-specific prompt, and forwards it to that model's
    llama.cpp /completion endpoint. Returns a single JSON object
    (stream=False) or Ollama-style NDJSON chunks (stream=True).
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)  # raises HTTPException 503 while the model loads
    messages = req.messages
    if cfg.get("web_search", False):
        messages = inject_web_context(messages)
    prompt = build_prompt(messages, fmt=cfg["fmt"])
    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = prompt
    params["stream"] = req.stream
    r = requests.post(
        f"http://localhost:{cfg['port']}/completion",
        json=params, stream=req.stream, timeout=180,
    )
    if not req.stream:
        text = r.json().get("content", "").strip()
        return JSONResponse({
            "model": req.model,
            "message": {"role": "assistant", "content": text},
            "done": True, "done_reason": "stop",
        })

    def stream_gen():
        # llama.cpp streams SSE-style lines ("data: {...}"); convert each
        # token payload into an Ollama-style chat NDJSON chunk.
        for line in r.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8").strip()
            if line.startswith("data:"):
                line = line[5:].strip()
            try:
                data = json.loads(line)
            except Exception:
                continue  # skip keep-alives / non-JSON noise
            token = data.get("content", "")
            done = data.get("stop", False)
            yield json.dumps({
                "model": req.model,
                "message": {"role": "assistant", "content": token},
                "done": done,
            }) + "\n"
            if done:
                break
        # Final terminator chunk so clients always see a done=True record.
        yield json.dumps({"model": req.model, "done": True, "done_reason": "stop"}) + "\n"

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
# ---------------------------
# START
# ---------------------------
if __name__ == "__main__":
    # 7860 is the standard Hugging Face Spaces port; model downloads and
    # llama-server startups already run in background threads (setup_all),
    # so this binds immediately.
    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)