Spaces:
Running
Running
| """ | |
| Ollama-compatible API server | |
| ───────────────────────────────────────── | |
| - qwen2.5-coder-1.5b — coding, quick replies (port 8080) | |
| - qwen3.5-4b — thinking, hard problems (port 8081) | |
| - gemma3-4b — translation, general chat (port 8082) | |
| - qwen3.5-0.8b — internet queries, news, fast (port 8083) | |
| ───────────────────────────────────────── | |
| NO extra packages — web search uses only requests (already installed) | |
| Downloads + server starts run in background — port 7860 binds instantly | |
| ───────────────────────────────────────── | |
| """ | |
| from fastapi import FastAPI, HTTPException, Response | |
| from fastapi.responses import StreamingResponse, JSONResponse | |
| from pydantic import BaseModel | |
| from huggingface_hub import hf_hub_download | |
| import subprocess | |
| import requests | |
| import uvicorn | |
| import os | |
| import json | |
| import time | |
| import hashlib | |
| import threading | |
| import urllib.parse | |
| from typing import Optional | |
# ASGI application object; it is the object handed to uvicorn.run() in the
# __main__ guard at the bottom of this file.
app = FastAPI()
| # --------------------------- | |
| # MODEL CONFIGS | |
| # --------------------------- | |
# One row per served model:
# (name, HF repo, HF filename, llama-server port, parameter size, family,
#  prompt format, web-search enabled)
_MODEL_TABLE = (
    (
        "qwen2.5-coder-1.5b",
        "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF",
        "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf",
        8080, "1.5B", "qwen2.5", "chatml", False,
    ),
    (
        "qwen3.5-4b",
        "bartowski/Qwen_Qwen3.5-4B-GGUF",
        "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
        8081, "4B", "qwen3.5", "chatml", False,
    ),
    (
        "gemma3-4b",
        "bartowski/google_gemma-3-4b-it-GGUF",
        "google_gemma-3-4b-it-Q4_K_M.gguf",
        8082, "4B", "gemma3", "gemma", False,
    ),
    (
        "qwen3.5-0.8b",
        "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
        "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
        8083, "0.8B", "qwen3.5", "chatml", True,
    ),
)

# Per-model configuration keyed by model name. Every model is stored locally
# as models/<name>.gguf and shares the same thread/context/batch settings.
MODELS = {
    model_name: {
        "path": f"models/{model_name}.gguf",
        "repo": hf_repo,
        "file": hf_file,
        "port": server_port,
        "param_size": param_size,
        "family": family,
        "fmt": prompt_fmt,
        "web_search": uses_web,
        "threads": 2,
        "ctx": 8192,
        "batch": 512,
    }
    for (
        model_name, hf_repo, hf_file, server_port,
        param_size, family, prompt_fmt, uses_web,
    ) in _MODEL_TABLE
}

# Model used when a request names no model or an unknown one.
DEFAULT_MODEL = "qwen2.5-coder-1.5b"

# Path of the llama.cpp server binary launched once per model.
LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"

# Readiness flag per model; start_llama() sets an entry to True once that
# model's /health endpoint answers 200.
_server_ready: dict = dict.fromkeys(MODELS, False)
| # --------------------------- | |
| # REQUEST MODELS | |
| # --------------------------- | |
class ChatRequest(BaseModel):
    """Request body consumed by the chat() handler."""

    # Requested model name; chat() maps it to a configured model via resolve_model().
    model: str = DEFAULT_MODEL
    # Conversation turns. The code reads "role" and "content" from each item
    # with .get(), so items are presumably dicts — verify against clients.
    messages: list
    # When True the reply is returned as newline-delimited JSON chunks.
    stream: bool = True
    # Optional sampling overrides, translated by llama_params().
    options: Optional[dict] = None
class GenerateRequest(BaseModel):
    """Request body consumed by the generate() handler."""

    # Requested model name; generate() maps it to a configured model via resolve_model().
    model: str = DEFAULT_MODEL
    # Prompt text forwarded unchanged to llama-server (no chat template is applied).
    prompt: str
    # When True the reply is returned as newline-delimited JSON chunks.
    stream: bool = False
    # Optional sampling overrides, translated by llama_params().
    options: Optional[dict] = None
| # --------------------------- | |
| # WEB SEARCH — pure requests, no extra package | |
| # --------------------------- | |
def web_search(query: str, max_results: int = 3) -> str:
    """Search DuckDuckGo and return the top hits as a plain-text context block.

    The query is sent to DuckDuckGo's HTML endpoint with `requests` and the
    result page is scraped with regular expressions; no API key is involved.
    This is best effort: any failure is logged and reported as "no results".

    Args:
        query: Search text. Blank or non-string queries are not sent.
        max_results: Maximum number of hits to include. Values <= 0 yield "".

    Returns:
        A block delimited by "=== Web Search Results ===" and
        "=== End of Web Results ===" with one numbered entry per hit
        (title, snippet, source URL), or "" when nothing usable was found.
    """
    import html as html_lib
    import re

    # Nothing to search for: skip the network round-trip entirely.
    if not isinstance(query, str) or not query.strip() or max_results <= 0:
        return ""

    def clean(fragment):
        # Drop markup, then decode entities such as &amp; / &#x27; so the
        # model sees plain text rather than HTML escapes.
        return html_lib.unescape(re.sub(r'<[^>]+>', '', fragment)).strip()

    try:
        encoded = urllib.parse.quote(query)
        resp = requests.get(
            f"https://html.duckduckgo.com/html/?q={encoded}",
            headers={"User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)"},
            timeout=8,
        )
        if resp.status_code != 200:
            return ""
        page = resp.text

        titles = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', page, re.DOTALL)
        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</div>', page, re.DOTALL)
        urls = re.findall(r'class="result__url"[^>]*>(.*?)</span>', page, re.DOTALL)

        # Only hits that have both a title and a snippet are usable.
        count = min(max_results, len(titles), len(snippets))
        if count <= 0:
            return ""

        parts = ["=== Web Search Results ===\n"]
        for idx in range(count):
            title = clean(titles[idx])
            snippet = clean(snippets[idx])
            # URLs are optional: the page may list fewer of them than titles.
            url = clean(urls[idx]) if idx < len(urls) else ""
            parts.append(f"\n[{idx + 1}] {title}\n{snippet}\nSource: {url}\n")
        parts.append("\n=== End of Web Results ===\n")
        return "".join(parts)
    except Exception as e:
        # Deliberate best-effort boundary: a failed search must never break chat.
        print(f" [web_search] error: {e}")
        return ""
def inject_web_context(messages: list) -> list:
    """Insert web search results as a system message before the last user turn.

    The content of the most recent "user" message is used as the search
    query. When results come back, a new list is returned with an extra
    system message placed immediately before that user message; the input
    list itself is never modified.

    Args:
        messages: Chat messages, each read via .get("role") / .get("content").

    Returns:
        The original list object when there is nothing to search for or no
        results were found, otherwise a new list including the web context.
    """
    if not messages:
        return messages
    last_user = next(
        (m for m in reversed(messages) if m.get("role") == "user"), None
    )
    if not last_user:
        return messages

    user_text = last_user.get("content", "")
    # A missing, None, non-text or blank message cannot be used as a query;
    # previously `content: None` crashed on the slice below.
    if not isinstance(user_text, str) or not user_text.strip():
        return messages

    print(f" [web_search] searching: {user_text[:60]}...")
    context = web_search(user_text)
    if not context:
        print(" [web_search] no results, continuing without web context")
        return messages
    print(f" [web_search] injected {len(context)} chars of context")

    web_system = {
        "role": "system",
        "content": (
            "You have access to the following real-time web search results. "
            "Use them to answer the user's question accurately and concisely. "
            "Always mention the source when using web data. "
            "If the results are not relevant, rely on your own knowledge.\n\n"
            + context
        )
    }

    # Locate the message by identity, not equality, so an earlier user turn
    # with identical text is not mistaken for the last one.
    position = next(i for i, m in enumerate(messages) if m is last_user)
    return messages[:position] + [web_system] + messages[position:]
| # --------------------------- | |
| # PROMPT BUILDER | |
| # --------------------------- | |
def build_prompt(messages: list, fmt: str = "chatml") -> str:
    """Render chat messages into a single prompt string for llama-server.

    Args:
        messages: Chat messages, each read via .get("role") / .get("content").
            Messages whose content is missing, None or blank are skipped, as
            are roles other than system / user / assistant.
        fmt: "gemma" for Gemma turn markers; anything else produces ChatML.

    Returns:
        The prompt, ending with the opening marker of the model's next turn.
        For ChatML a default system prompt is prepended when no non-blank
        system message is supplied.
    """
    # Normalise once: `content` may be absent or None, and blank turns carry
    # no information for the model.
    turns = []
    for m in messages:
        text = (m.get("content") or "").strip()
        if text:
            turns.append((m.get("role", "user"), text))

    if fmt == "gemma":
        parts = ["<bos>"]
        for role, text in turns:
            if role == "system":
                # This template emits only user/model turns, so system text
                # is sent as a tagged user turn.
                parts.append(f"<start_of_turn>user\n[Context] {text}<end_of_turn>\n")
            elif role == "user":
                parts.append(f"<start_of_turn>user\n{text}<end_of_turn>\n")
            elif role == "assistant":
                parts.append(f"<start_of_turn>model\n{text}<end_of_turn>\n")
        parts.append("<start_of_turn>model\n")
        return "".join(parts)

    # ChatML (Qwen2.5, Qwen3.5)
    parts = []
    # Decide on the *rendered* turns: a blank system message is skipped above
    # and must not suppress the default system prompt.
    if not any(role == "system" for role, _ in turns):
        parts.append("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
    for role, text in turns:
        if role in ("system", "user", "assistant"):
            parts.append(f"<|im_start|>{role}\n{text}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)
| # --------------------------- | |
| # MODEL RESOLVER | |
| # --------------------------- | |
def resolve_model(name: str) -> str:
    """Map a client-supplied model name onto a key of MODELS.

    The name is lower-cased and stripped (an empty name means DEFAULT_MODEL).
    An exact key wins; otherwise the first configured key that contains the
    name, or is contained in it, is used. DEFAULT_MODEL is the last resort.
    """
    wanted = (name or DEFAULT_MODEL).lower().strip()
    if wanted in MODELS:
        return wanted
    # Substring match in either direction, in MODELS insertion order.
    candidates = (key for key in MODELS if key in wanted or wanted in key)
    return next(candidates, DEFAULT_MODEL)
| # --------------------------- | |
| # DOWNLOAD + START (all in background) | |
| # --------------------------- | |
def download_model(cfg: dict):
    """Ensure the model file described by `cfg` exists at cfg["path"].

    Does nothing when the file is already present. Otherwise the file is
    fetched with hf_hub_download(cfg["repo"], cfg["file"]) and copied to
    cfg["path"].

    Args:
        cfg: Model configuration providing "path", "repo" and "file".

    Raises:
        OSError: If the downloaded file cannot be copied into place.
    """
    import shutil

    target = cfg["path"]
    if os.path.exists(target):
        return

    print(f"Downloading {cfg['file']} ...")
    downloaded = hf_hub_download(repo_id=cfg["repo"], filename=cfg["file"])

    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Copy next to the destination and rename at the end: an interrupted copy
    # then never leaves a truncated file at `target` that a later run would
    # mistake for a complete download. shutil also avoids building a shell
    # command from paths, which broke on quotes and hid copy failures.
    partial = f"{target}.part"
    shutil.copyfile(downloaded, partial)
    os.replace(partial, target)
    print(f" β saved to {target}")
def start_llama(model_name: str, cfg: dict):
    """Download a model if needed, launch its llama-server and wait for health.

    The server's stdout/stderr go to llama_<model_name>.log. The /health
    endpoint is polled every 2 seconds, up to 90 times; on the first 200
    response the model is marked ready in _server_ready.

    Args:
        model_name: Key of the model in MODELS; also used for the log name.
        cfg: Model configuration ("path", "port", "ctx", "threads", "batch").

    Returns:
        The subprocess.Popen handle once the server is healthy, or None when
        the server exited or did not become healthy in time.
    """
    download_model(cfg)
    print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")

    log_path = f"llama_{model_name}.log"
    command = [
        LLAMA_SERVER,
        "-m", cfg["path"],
        "--host", "0.0.0.0",
        "--port", str(cfg["port"]),
        "-c", str(cfg["ctx"]),
        "--threads", str(cfg["threads"]),
        "--batch-size", str(cfg["batch"]),
        "-ngl", "0",
        "-np", "1",
    ]
    # The child receives its own copy of the descriptor at spawn time, so the
    # parent's handle can be closed immediately instead of leaking.
    with open(log_path, "w") as log:
        process = subprocess.Popen(command, stdout=log, stderr=log)

    url = f"http://localhost:{cfg['port']}/health"
    max_attempts = 90
    for attempt in range(1, max_attempts + 1):
        time.sleep(2)

        # If the server process has already exited it will never become
        # healthy; stop polling instead of waiting out the full 3 minutes.
        if process.poll() is not None:
            break

        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                _server_ready[model_name] = True
                print(f" β {model_name} ready (took ~{attempt * 2}s)")
                return process
        except requests.RequestException:
            # Connection refused / timeout while the model is still loading.
            pass

        # Progress feedback: echo the most recent non-empty log line.
        try:
            with open(log_path, errors="replace") as lf:
                lines = [l.strip() for l in lf.read().splitlines() if l.strip()]
            print(f" [{model_name}] {lines[-1] if lines else 'starting...'}")
        except OSError:
            print(f" waiting for {model_name}... ({attempt}/{max_attempts})")

    print(f" β {model_name} failed β check llama_{model_name}.log")
    return None
def setup_all():
    """Create the models directory and launch one start_llama() worker per model.

    Each worker is a daemon thread, so downloads and server start-ups for
    the different models proceed independently of each other.
    """
    os.makedirs("models", exist_ok=True)
    for model_name, model_cfg in MODELS.items():
        worker = threading.Thread(
            target=start_llama,
            args=(model_name, model_cfg),
            daemon=True,
        )
        worker.start()
# Kick off all downloads and llama-server launches on a daemon thread as soon
# as this module is loaded, so that work never blocks uvicorn from binding
# port 7860.
threading.Thread(target=setup_all, daemon=True).start()
| # --------------------------- | |
| # READINESS GUARD | |
| # --------------------------- | |
def wait_for_model(model_key: str, timeout: int = 300):
    """Block until the given model's server is marked ready.

    Args:
        model_key: Key into _server_ready (a key of MODELS).
        timeout: Maximum number of seconds to wait.

    Raises:
        HTTPException: 503 when the model is still not ready once the
            timeout has elapsed.
    """
    # A monotonic clock keeps the deadline correct even if the system
    # wall-clock is adjusted while waiting.
    deadline = time.monotonic() + timeout
    while True:
        # Check before looking at the clock so an already-ready model
        # succeeds even with timeout=0.
        if _server_ready.get(model_key):
            return
        if time.monotonic() >= deadline:
            break
        time.sleep(1)
    raise HTTPException(
        status_code=503,
        detail=f"Model '{model_key}' is still loading. Please wait and retry."
    )
| # --------------------------- | |
| # HELPERS | |
| # --------------------------- | |
def model_meta(name: str, cfg: dict) -> dict:
    """Build the Ollama-style metadata record for one model.

    Args:
        name: Model name reported in the "name" and "model" fields.
        cfg: Model configuration providing "path", "family" and "param_size".

    Returns:
        A dict with name, model, modified_at, size, digest and details.
        When the model file is missing or unreadable, size is 0, the digest
        is the bare "sha256:" prefix and modified_at is the current UTC time.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    path = cfg["path"]
    try:
        info = os.stat(path)
        with open(path, "rb") as f:
            head = f.read(65536)
    except OSError:
        # Model not downloaded yet (or not readable): report placeholders.
        size = 0
        digest = ""
        modified = time.gmtime()
    else:
        size = info.st_size
        # Only the first 64 KiB are hashed to keep listings fast on multi-GB
        # files. The algorithm now matches the "sha256:" label; it used to be
        # MD5 published under that prefix.
        digest = hashlib.sha256(head).hexdigest()
        # The trailing "Z" promises UTC, so format the file's mtime in UTC
        # rather than the current local time.
        modified = time.gmtime(info.st_mtime)

    return {
        "name": name,
        "model": name,
        "modified_at": time.strftime(stamp_format, modified),
        "size": size,
        "digest": f"sha256:{digest}",
        "details": {
            "format": "gguf",
            "family": cfg["family"],
            "families": [cfg["family"]],
            "parameter_size": cfg["param_size"],
            "quantization_level": "Q4_K_M",
        },
    }
def llama_params(options: Optional[dict], fmt: str = "chatml") -> dict:
    """Translate Ollama-style options into llama-server completion parameters.

    Args:
        options: Client-supplied overrides, or None for all defaults.
        fmt: Prompt format; "gemma" selects Gemma stop markers, anything
            else the ChatML ones.

    Returns:
        A dict with temperature, top_p, top_k, repeat_penalty, n_predict
        (read from the "num_predict" option) and stop.
    """
    overrides = options if options else {}

    # NOTE(review): "</think>" as a ChatML stop marker presumably ends
    # generation as soon as a reasoning block closes — confirm this is intended.
    stop_markers = (
        ["<end_of_turn>", "<eos>"]
        if fmt == "gemma"
        else ["<|im_end|>", "<|endoftext|>", "</think>"]
    )

    # (llama-server parameter, client option name, fallback value)
    mapping = (
        ("temperature", "temperature", 0.7),
        ("top_p", "top_p", 0.9),
        ("top_k", "top_k", 40),
        ("repeat_penalty", "repeat_penalty", 1.1),
        ("n_predict", "num_predict", 1024),
        ("stop", "stop", stop_markers),
    )
    return {
        target: overrides.get(source, fallback)
        for target, source, fallback in mapping
    }
| # --------------------------- | |
| # ROOT | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it as GET "/".
# NOTE(review): path chosen from the "ROOT" section banner — confirm.
@app.get("/")
def root():
    """Return service status, per-model readiness and a short usage guide."""
    return {
        "status": "running",
        # Copy so the response is built from a snapshot of the shared flags.
        "models_ready": dict(_server_ready),
        "usage": {
            "β‘ fast coding": "qwen2.5-coder-1.5b",
            "π§ thinking": "qwen3.5-4b (add /think to message)",
            "π translation": "gemma3-4b",
            "π internet/news": "qwen3.5-0.8b (auto web search every message)",
        }
    }
| # --------------------------- | |
| # /health (HEAD for UptimeRobot) | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it as HEAD "/health".
# NOTE(review): method and path taken from the section banner — confirm.
@app.head("/health")
def health_head():
    """Answer HEAD probes with an empty 200 response."""
    return Response(status_code=200)
# No route registration is visible for this handler, which leaves it
# unreachable; register it as GET "/health".
# NOTE(review): path taken from the section banner — confirm.
@app.get("/health")
def health_get():
    """Report liveness plus whether every configured model is ready."""
    return {"status": "ok", "ready": all(_server_ready.values())}
| # --------------------------- | |
| # /api/tags | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it at the path named in the section banner.
@app.get("/api/tags")
def tags():
    """List every configured model with its metadata record."""
    return {"models": [model_meta(n, c) for n, c in MODELS.items()]}
| # --------------------------- | |
| # /api/show | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it at the path named in the section banner.
@app.post("/api/show")
def show(body: dict):
    """Return metadata, modelfile, parameters and template for one model.

    Args:
        body: JSON request body. The model is taken from "name", falling
            back to "model" and then to DEFAULT_MODEL.

    Returns:
        The model_meta() record extended with "modelfile", "parameters" and
        "template".
    """
    # "name" keeps its original precedence; "model" is accepted as an
    # alternative key so requests using it are not silently mapped to the
    # default model.
    requested = body.get("name") or body.get("model") or DEFAULT_MODEL
    key = resolve_model(requested)
    cfg = MODELS[key]
    meta = model_meta(key, cfg)
    meta["modelfile"] = f"FROM {key}\n"
    # Report the context size the llama-server is actually started with
    # instead of a hard-coded value that disagreed with the configuration.
    meta["parameters"] = f"num_ctx {cfg['ctx']}\nnum_predict 1024"
    if cfg["fmt"] == "gemma":
        meta["template"] = "{{ .Prompt }}"
    else:
        meta["template"] = (
            "<|im_start|>system\n{{ .System }}<|im_end|>\n"
            "<|im_start|>user\n{{ .Prompt }}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
    return meta
| # --------------------------- | |
| # /api/ps | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it at the path named in the section banner.
@app.get("/api/ps")
def ps():
    """List the models whose llama-server is currently marked ready."""
    running = []
    for name, cfg in MODELS.items():
        if not _server_ready.get(name):
            continue
        entry = model_meta(name, cfg)
        entry["expires_at"] = "0001-01-01T00:00:00Z"
        # Servers are launched with "-ngl 0", i.e. no layers offloaded to GPU.
        entry["size_vram"] = 0
        running.append(entry)
    return {"models": running}
| # --------------------------- | |
| # /api/generate | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it at the path named in the section banner.
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Run a raw-prompt completion against the selected model's llama-server.

    The prompt is forwarded unchanged (no chat template is applied).

    Args:
        req: Model name, prompt, stream flag and optional sampling options.

    Returns:
        A dict with the full response when req.stream is False, otherwise a
        StreamingResponse of newline-delimited JSON chunks that ends with
        exactly one chunk carrying "done": true.

    Raises:
        HTTPException: 503 while the model is still loading; 502 when the
            llama-server cannot be reached or answers with an error status.
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)

    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = req.prompt
    params["stream"] = req.stream

    try:
        r = requests.post(
            f"http://localhost:{cfg['port']}/completion",
            json=params, stream=req.stream, timeout=180,
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"llama-server request failed: {exc}")
    if r.status_code != 200:
        status = r.status_code
        r.close()
        raise HTTPException(status_code=502, detail=f"llama-server returned HTTP {status}")

    if not req.stream:
        text = r.json().get("content", "").strip()
        return {"model": req.model, "response": text, "done": True, "done_reason": "stop"}

    def stream_gen():
        try:
            for line in r.iter_lines():
                if not line:
                    continue
                line = line.decode("utf-8").strip()
                if line.startswith("data:"):
                    line = line[5:].strip()
                try:
                    data = json.loads(line)
                except Exception:
                    # Skip anything on the stream that is not a JSON payload.
                    continue
                token = data.get("content", "")
                stopped = data.get("stop", False)
                # Text is always sent as a non-final chunk; the single final
                # chunk is emitted below. This previously produced two
                # "done": true chunks per stream.
                if token or not stopped:
                    yield json.dumps({"model": req.model, "response": token, "done": False}) + "\n"
                if stopped:
                    break
            yield json.dumps({"model": req.model, "response": "", "done": True, "done_reason": "stop"}) + "\n"
        finally:
            # Release the upstream connection even if the client disconnects
            # mid-stream.
            r.close()

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
| # --------------------------- | |
| # /api/chat | |
| # --------------------------- | |
# No route registration is visible for this handler, which leaves it
# unreachable; register it at the path named in the section banner.
@app.post("/api/chat")
def chat(req: ChatRequest):
    """Run a chat completion against the selected model's llama-server.

    Messages are rendered with build_prompt() in the model's prompt format.
    For models configured with "web_search", search results are injected as
    an extra system message first.

    Args:
        req: Model name, messages, stream flag and optional sampling options.

    Returns:
        A JSONResponse with the full assistant message when req.stream is
        False, otherwise a StreamingResponse of newline-delimited JSON chunks
        that ends with exactly one chunk carrying "done": true.

    Raises:
        HTTPException: 503 while the model is still loading; 502 when the
            llama-server cannot be reached or answers with an error status.
    """
    key = resolve_model(req.model)
    cfg = MODELS[key]
    wait_for_model(key)

    messages = req.messages
    if cfg.get("web_search", False):
        messages = inject_web_context(messages)

    prompt = build_prompt(messages, fmt=cfg["fmt"])
    params = llama_params(req.options, fmt=cfg["fmt"])
    params["prompt"] = prompt
    params["stream"] = req.stream

    try:
        r = requests.post(
            f"http://localhost:{cfg['port']}/completion",
            json=params, stream=req.stream, timeout=180,
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"llama-server request failed: {exc}")
    if r.status_code != 200:
        status = r.status_code
        r.close()
        raise HTTPException(status_code=502, detail=f"llama-server returned HTTP {status}")

    if not req.stream:
        text = r.json().get("content", "").strip()
        return JSONResponse({
            "model": req.model,
            "message": {"role": "assistant", "content": text},
            "done": True, "done_reason": "stop",
        })

    def stream_gen():
        try:
            for line in r.iter_lines():
                if not line:
                    continue
                line = line.decode("utf-8").strip()
                if line.startswith("data:"):
                    line = line[5:].strip()
                try:
                    data = json.loads(line)
                except Exception:
                    # Skip anything on the stream that is not a JSON payload.
                    continue
                token = data.get("content", "")
                stopped = data.get("stop", False)
                # Text is always sent as a non-final chunk; the single final
                # chunk is emitted below. This previously produced two
                # "done": true chunks per stream.
                if token or not stopped:
                    yield json.dumps({
                        "model": req.model,
                        "message": {"role": "assistant", "content": token},
                        "done": False,
                    }) + "\n"
                if stopped:
                    break
            # The final chunk keeps the same shape as the others so clients
            # that read chunk["message"] unconditionally do not fail on it.
            yield json.dumps({
                "model": req.model,
                "message": {"role": "assistant", "content": ""},
                "done": True, "done_reason": "stop",
            }) + "\n"
        finally:
            # Release the upstream connection even if the client disconnects
            # mid-stream.
            r.close()

    return StreamingResponse(stream_gen(), media_type="application/x-ndjson",
                             headers={"Cache-Control": "no-cache"})
| # --------------------------- | |
| # START | |
| # --------------------------- | |
# Script entry point: serve `app` with uvicorn on all interfaces, port 7860,
# using a single worker process.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860, workers=1)