Update server.py
server.py
CHANGED
@@ -6,8 +6,8 @@ Ollama-compatible API server
 gemma3-4b → translation, general chat (port 8082)
 qwen3.5-0.8b → internet queries, news, fast (port 8083)
 ─────────────────────────────────────────
-
-
+NO extra packages → web search uses only requests (already installed)
+Downloads + server starts run in background → port 7860 binds instantly
 ─────────────────────────────────────────
 """
 
@@ -15,7 +15,6 @@ from fastapi import FastAPI, HTTPException, Response
 from fastapi.responses import StreamingResponse, JSONResponse
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
-from ddgs import DDGS
 import subprocess
 import requests
 import uvicorn
@@ -24,6 +23,7 @@ import json
 import time
 import hashlib
 import threading
+import urllib.parse
 from typing import Optional
 
 app = FastAPI()
@@ -90,8 +90,6 @@ MODELS = {
 
 DEFAULT_MODEL = "qwen2.5-coder-1.5b"
 LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
-
-# Tracks readiness of each model
 _server_ready: dict = {k: False for k in MODELS}
 
 
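Note: this diff never shows where _server_ready flips to True; presumably start_llama does that once its llama-server answers health checks. A minimal sketch of that missing step, assuming llama-server's real /health endpoint and a per-model port; the helper name is ours, not the PR's:

    import requests, time

    def mark_ready_when_up(model_name: str, port: int) -> None:
        # Hypothetical helper: poll the backend's /health endpoint until it
        # answers 200, then flip the flag that wait_for_model polls.
        while not _server_ready[model_name]:
            try:
                r = requests.get(f"http://127.0.0.1:{port}/health", timeout=2)
                if r.status_code == 200:
                    _server_ready[model_name] = True
            except requests.RequestException:
                pass
            time.sleep(2)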
@@ -114,33 +112,69 @@ class GenerateRequest(BaseModel):
 
 
 # ---------------------------
-# WEB SEARCH
+# WEB SEARCH → pure requests, no extra package
 # ---------------------------
 
 def web_search(query: str, max_results: int = 3) -> str:
-    """
+    """
+    DuckDuckGo search using only the `requests` library.
+    Uses DDG's no-JS HTML endpoint → no API key, no extra packages.
+    """
     try:
-        with DDGS() as ddgs:
-            results = list(ddgs.text(query, max_results=max_results))
-        if not results:
+        # URL-encode the query for the GET request
+        encoded = urllib.parse.quote(query)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)",
+        }
+
+        # Use DDG lite HTML endpoint → most reliable, no JS required
+        resp = requests.get(
+            f"https://html.duckduckgo.com/html/?q={encoded}",
+            headers=headers,
+            timeout=8,
+        )
+
+        if resp.status_code != 200:
             return ""
+
+        # Parse results from the HTML using simple string extraction
+        html = resp.text
+        results = []
+
+        # Pull titles, snippets and URLs out of the result markup
+        import re
+        # (regexes keyed to DDG's result__* class names)
+        titles = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', html, re.DOTALL)
+        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</div>', html, re.DOTALL)
+        urls = re.findall(r'class="result__url"[^>]*>(.*?)</span>', html, re.DOTALL)
+
+        # Clean HTML tags from extracted text
+        def strip_tags(text):
+            return re.sub(r'<[^>]+>', '', text).strip()
+
+        count = min(max_results, len(titles), len(snippets))
+        if count == 0:
+            return ""
+
         context = "=== Web Search Results ===\n"
-        for i, r in enumerate(results, 1):
-            title = r.get("title", "")
-            body = r.get("body", "")
-            href = r.get("href", "")
-            context += f"\n[{i}] {title}\n{body}\nSource: {href}\n"
+        for i in range(count):
+            title = strip_tags(titles[i])
+            snippet = strip_tags(snippets[i])
+            url = strip_tags(urls[i]) if i < len(urls) else ""
+            context += f"\n[{i+1}] {title}\n{snippet}\nSource: {url}\n"
         context += "\n=== End of Web Results ===\n"
         return context
+
     except Exception as e:
         print(f" [web_search] error: {e}")
         return ""
 
 
 def inject_web_context(messages: list) -> list:
-    """
+    """Inject DuckDuckGo results as system context before last user message."""
     if not messages:
         return messages
+
     last_user = next(
         (m for m in reversed(messages) if m.get("role") == "user"), None
     )
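Review note: the new web_search depends on DuckDuckGo's HTML markup (the result__title, result__snippet and result__url classes), which can change without notice. A standalone sketch for exercising that scraping step outside the server, reusing the diff's own endpoint and title regex; the function name is ours, not the PR's:

    import re
    import urllib.parse
    import requests

    def ddg_titles(query: str) -> list:
        # Fetch the no-JS HTML endpoint and pull out result titles only.
        url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=8).text
        raw = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', html, re.DOTALL)
        return [re.sub(r"<[^>]+>", "", t).strip() for t in raw]

    print(ddg_titles("llama.cpp server"))

If DDG's markup drifts, web_search degrades gracefully: count hits 0 and it returns "".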
@@ -235,7 +269,7 @@ def resolve_model(name: str) -> str:
 
 
 # ---------------------------
-# DOWNLOAD + START (background)
+# DOWNLOAD + START (all in background)
 # ---------------------------
 
 def download_model(cfg: dict):
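The body of download_model sits outside this hunk. Given the hf_hub_download import above, it plausibly looks like the sketch below; the cfg keys "repo" and "file" are assumptions, not taken from the diff:

    from huggingface_hub import hf_hub_download

    def download_model_sketch(cfg: dict) -> str:
        # Fetch the GGUF weights into ./models/ (a no-op if already cached)
        # and return the local path for llama-server's -m flag.
        return hf_hub_download(
            repo_id=cfg["repo"],    # assumed cfg key
            filename=cfg["file"],   # assumed cfg key
            local_dir="models",
        )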
@@ -247,7 +281,6 @@ def download_model(cfg: dict):
 
 
 def start_llama(model_name: str, cfg: dict):
-    # Download first (blocks only this thread)
     download_model(cfg)
 
     print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
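The actual subprocess launch is also outside the hunk. A sketch of the usual llama-server invocation, using flags the binary really accepts (-m, --host, --port); the argument names are again assumptions:

    import subprocess

    def launch_llama_sketch(model_path: str, port: int) -> subprocess.Popen:
        # Start one llama-server per model; each listens on its own local port,
        # which is how the routing in the module docstring (8082, 8083, ...) works.
        return subprocess.Popen([
            LLAMA_SERVER,        # ./llama.cpp/build/bin/llama-server
            "-m", model_path,    # GGUF file from download_model
            "--host", "127.0.0.1",
            "--port", str(port),
        ])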
@@ -288,21 +321,13 @@ def start_llama(model_name: str, cfg: dict):
 
 
 def setup_all():
-    """
-    KEY FIX: Run all downloads + llama-server starts in ONE background thread.
-    This lets uvicorn bind to port 7860 immediately on startup.
-    Models become available as they finish loading (readiness guard handles the rest).
-    """
+    """All downloads + server starts run here in background. Port 7860 binds instantly."""
     os.makedirs("models", exist_ok=True)
-    threads = []
     for name, cfg in MODELS.items():
-        t = threading.Thread(target=start_llama, args=(name, cfg), daemon=True)
-        t.start()
-        threads.append(t)
-        # Don't join → let them run in background
+        threading.Thread(target=start_llama, args=(name, cfg), daemon=True).start()
 
 
-#
+# Kick off everything in background immediately → uvicorn binds port 7860 first
 threading.Thread(target=setup_all, daemon=True).start()
 
 
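Why this matters: uvicorn.run() blocks, so anything launched before it must not. With setup_all on a daemon thread, the entrypoint (not shown in the diff) can bind port 7860, the Hugging Face Spaces default, immediately. A sketch of the likely tail of server.py under that assumption:

    if __name__ == "__main__":
        # Binds right away; models keep loading on the daemon threads and
        # become usable one by one as _server_ready flips to True.
        uvicorn.run(app, host="0.0.0.0", port=7860)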
@@ -311,10 +336,6 @@ threading.Thread(target=setup_all, daemon=True).start()
 # ---------------------------
 
 def wait_for_model(model_key: str, timeout: int = 300):
-    """
-    Block the request until the model is ready.
-    Timeout is 300s (5 min) to cover cold download + load time.
-    """
     deadline = time.time() + timeout
     while time.time() < deadline:
         if _server_ready.get(model_key):
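The hunk cuts off mid-loop. Consistent with the removed docstring (block until ready, 5-minute timeout to cover cold download + load), the remainder presumably returns on success and fails the request on timeout; a sketch of that shape, not the PR's verbatim code:

    def wait_for_model_sketch(model_key: str, timeout: int = 300):
        # Same loop as above, with the assumed tail: return once ready,
        # otherwise answer 503 after the deadline passes.
        deadline = time.time() + timeout
        while time.time() < deadline:
            if _server_ready.get(model_key):
                return
            time.sleep(1)
        raise HTTPException(status_code=503, detail=f"{model_key} is still loading")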