mrmadblack committed on
Commit
4f32f3f
·
verified ·
1 Parent(s): d4d90dd

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +39 -52
server.py CHANGED
@@ -2,15 +2,13 @@
2
  Ollama-compatible API server
3
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
  ⚡ qwen2.5-coder-1.5b → coding, quick replies (port 8080)
5
- 🧠 qwen3.5-4b → thinking, hard problems (port 8081) ← upgraded
6
  🌐 gemma3-4b → translation, general chat (port 8082)
 
7
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
8
- πŸ” Web search (RAG) β†’ auto-injects DuckDuckGo results into prompt
9
- triggers on: latest, today, news, price,
10
- who is, current, 2025, weather, etc.
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
12
- Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
13
-
14
  pip install: duckduckgo-search
15
  """
16
 
@@ -27,7 +25,6 @@ import json
27
  import time
28
  import hashlib
29
  import threading
30
- import re
31
  from typing import Optional
32
 
33
  app = FastAPI()
@@ -46,12 +43,12 @@ MODELS = {
46
  "param_size": "1.5B",
47
  "family": "qwen2.5",
48
  "fmt": "chatml",
49
- "web_search": True, # coding model — no web search needed
50
  "threads": 2,
51
  "ctx": 2048,
52
  "batch": 512,
53
  },
54
- "qwen3.5-4b": { # 🧠 THINKING — hard bugs, architecture, logic (/think)
55
  "path": "models/qwen3.5-4b.gguf",
56
  "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
57
  "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
@@ -59,12 +56,12 @@ MODELS = {
59
  "param_size": "4B",
60
  "family": "qwen3.5",
61
  "fmt": "chatml",
62
- "web_search": True, # thinking model — benefits from web context
63
  "threads": 2,
64
  "ctx": 2048,
65
  "batch": 512,
66
  },
67
- "gemma3-4b": { # 🌐 GENERAL — translation, Tamil↔English, daily chat
68
  "path": "models/gemma3-4b.gguf",
69
  "repo": "bartowski/google_gemma-3-4b-it-GGUF",
70
  "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
@@ -72,7 +69,20 @@ MODELS = {
72
  "param_size": "4B",
73
  "family": "gemma3",
74
  "fmt": "gemma",
75
- "web_search": True, # general model — benefits from web context
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "threads": 2,
77
  "ctx": 2048,
78
  "batch": 512,
@@ -105,25 +115,8 @@ class GenerateRequest(BaseModel):
105
  # WEB SEARCH (DuckDuckGo RAG)
106
  # ---------------------------
107
 
108
- # Keywords that suggest the user needs current/internet info
109
- WEB_TRIGGERS = [
110
- "latest", "current", "today", "now", "news", "2025", "2026",
111
- "price", "weather", "who is", "what is the", "when did", "when is",
112
- "recent", "new version", "update", "release", "stock", "score",
113
- "result", "live", "trending", "announced", "launched", "happened",
114
- ]
115
-
116
- def needs_web_search(text: str) -> bool:
117
- """Detect if the user message needs live internet info."""
118
- text_lower = text.lower()
119
- return any(trigger in text_lower for trigger in WEB_TRIGGERS)
120
-
121
-
122
  def web_search(query: str, max_results: int = 3) -> str:
123
- """
124
- Search DuckDuckGo and return a formatted context block.
125
- Free, no API key needed.
126
- """
127
  try:
128
  with DDGS() as ddgs:
129
  results = list(ddgs.text(query, max_results=max_results))
@@ -147,14 +140,12 @@ def web_search(query: str, max_results: int = 3) -> str:
147
 
148
  def inject_web_context(messages: list) -> list:
149
  """
150
- Check the last user message. If it needs web info,
151
- search DuckDuckGo and inject results as a system message
152
- right before the user's message.
153
  """
154
  if not messages:
155
  return messages
156
 
157
- # Get the last user message
158
  last_user = next(
159
  (m for m in reversed(messages) if m.get("role") == "user"),
160
  None
@@ -164,23 +155,21 @@ def inject_web_context(messages: list) -> list:
164
 
165
  user_text = last_user.get("content", "")
166
 
167
- if not needs_web_search(user_text):
168
- return messages # no search needed
169
-
170
- print(f" [web_search] triggered for: {user_text[:60]}...")
171
  context = web_search(user_text)
172
 
173
  if not context:
174
- return messages # search failed silently, carry on without it
 
175
 
176
  print(f" [web_search] injected {len(context)} chars of context")
177
 
178
- # Build new message list with web context injected as a system message
179
  web_system = {
180
  "role": "system",
181
  "content": (
182
  "You have access to the following real-time web search results. "
183
- "Use them to answer the user's question accurately. "
 
184
  "If the results are not relevant, rely on your own knowledge.\n\n"
185
  + context
186
  )
@@ -213,7 +202,6 @@ def build_prompt(messages: list, fmt: str = "chatml") -> str:
213
  if not content:
214
  continue
215
  if role == "system":
216
- # Gemma3 has no system role β€” prepend as first user turn
217
  prompt += f"<start_of_turn>user\n[Context] {content}<end_of_turn>\n"
218
  elif role == "user":
219
  prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
@@ -222,7 +210,7 @@ def build_prompt(messages: list, fmt: str = "chatml") -> str:
222
  prompt += "<start_of_turn>model\n"
223
  return prompt
224
 
225
- # ── ChatML format (Qwen2.5-Coder, Qwen3.5) ─────────────────
226
  prompt = ""
227
  has_system = any(m.get("role") == "system" for m in messages)
228
  if not has_system:
@@ -294,13 +282,13 @@ def start_llama(model_name: str, cfg: dict):
294
  "-c", str(cfg["ctx"]),
295
  "--threads", str(cfg["threads"]),
296
  "--batch-size", str(cfg["batch"]),
297
- "-ngl", "0", # CPU only
298
- "-np", "1", # 1 parallel slot
299
  ], stdout=log, stderr=log)
300
 
301
  url = f"http://localhost:{cfg['port']}/health"
302
 
303
- for i in range(90): # wait up to 3 min
304
  time.sleep(2)
305
  try:
306
  r = requests.get(url, timeout=2)
@@ -395,10 +383,10 @@ def root():
395
  "status": "running",
396
  "models_ready": dict(_server_ready),
397
  "usage": {
398
- "fast coding": "qwen2.5-coder-1.5b",
399
- "thinking": "qwen3.5-4b (add /think to your message)",
400
- "translation": "gemma3-4b",
401
- "web_search": "auto — triggers on keywords like: latest, today, news, price..."
402
  }
403
  }
404
 
@@ -520,8 +508,7 @@ def chat(req: ChatRequest):
520
 
521
  wait_for_model(key)
522
 
523
- # ── Web Search RAG ──────────────────────────────────────────
524
- # Only for models with web_search enabled (qwen3.5-4b, gemma3-4b)
525
  messages = req.messages
526
  if cfg.get("web_search", False):
527
  messages = inject_web_context(messages)
 
2
  Ollama-compatible API server
3
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
  ⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
5
+ 🧠 qwen3.5-4b → thinking, hard problems (port 8081)
6
  🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
+ πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083) ← NEW
8
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9
+ Web search only on qwen3.5-0.8b — big models stay fast
10
+ Total RAM: ~6.8GB / 16GB
 
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
 
12
  pip install: duckduckgo-search
13
  """
14
 
 
25
  import time
26
  import hashlib
27
  import threading
 
28
  from typing import Optional
29
 
30
  app = FastAPI()
 
43
  "param_size": "1.5B",
44
  "family": "qwen2.5",
45
  "fmt": "chatml",
46
+ "web_search": False,
47
  "threads": 2,
48
  "ctx": 2048,
49
  "batch": 512,
50
  },
51
+ "qwen3.5-4b": { # 🧠 THINKING — hard bugs, architecture (/think)
52
  "path": "models/qwen3.5-4b.gguf",
53
  "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
54
  "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
 
56
  "param_size": "4B",
57
  "family": "qwen3.5",
58
  "fmt": "chatml",
59
+ "web_search": False, # ← removed, stays fast now
60
  "threads": 2,
61
  "ctx": 2048,
62
  "batch": 512,
63
  },
64
+ "gemma3-4b": { # 🌐 GENERAL — translation, Tamil↔English, chat
65
  "path": "models/gemma3-4b.gguf",
66
  "repo": "bartowski/google_gemma-3-4b-it-GGUF",
67
  "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
 
69
  "param_size": "4B",
70
  "family": "gemma3",
71
  "fmt": "gemma",
72
+ "web_search": False, # ← removed, stays fast now
73
+ "threads": 2,
74
+ "ctx": 2048,
75
+ "batch": 512,
76
+ },
77
+ "qwen3.5-0.8b": { # 🔍 INTERNET — news, prices, latest info (small+fast)
78
+ "path": "models/qwen3.5-0.8b.gguf",
79
+ "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
80
+ "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
81
+ "port": 8083,
82
+ "param_size": "0.8B",
83
+ "family": "qwen3.5",
84
+ "fmt": "chatml",
85
+ "web_search": True, # ← ONLY this model does web search
86
  "threads": 2,
87
  "ctx": 2048,
88
  "batch": 512,
 
115
  # WEB SEARCH (DuckDuckGo RAG)
116
  # ---------------------------
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def web_search(query: str, max_results: int = 3) -> str:
119
+ """Search DuckDuckGo — free, no API key needed."""
 
 
 
120
  try:
121
  with DDGS() as ddgs:
122
  results = list(ddgs.text(query, max_results=max_results))
 
140
 
141
  def inject_web_context(messages: list) -> list:
142
  """
143
+ Always search DuckDuckGo using the last user message as query.
144
+ Inject results as a system message right before the user turn.
 
145
  """
146
  if not messages:
147
  return messages
148
 
 
149
  last_user = next(
150
  (m for m in reversed(messages) if m.get("role") == "user"),
151
  None
 
155
 
156
  user_text = last_user.get("content", "")
157
 
158
+ print(f" [web_search] searching: {user_text[:60]}...")
 
 
 
159
  context = web_search(user_text)
160
 
161
  if not context:
162
+ print(" [web_search] no results, continuing without web context")
163
+ return messages
164
 
165
  print(f" [web_search] injected {len(context)} chars of context")
166
 
 
167
  web_system = {
168
  "role": "system",
169
  "content": (
170
  "You have access to the following real-time web search results. "
171
+ "Use them to answer the user's question accurately and concisely. "
172
+ "Always mention the source when using web data. "
173
  "If the results are not relevant, rely on your own knowledge.\n\n"
174
  + context
175
  )
 
202
  if not content:
203
  continue
204
  if role == "system":
 
205
  prompt += f"<start_of_turn>user\n[Context] {content}<end_of_turn>\n"
206
  elif role == "user":
207
  prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
 
210
  prompt += "<start_of_turn>model\n"
211
  return prompt
212
 
213
+ # ── ChatML format (Qwen2.5-Coder, Qwen3.5-4B, Qwen3.5-0.8B) ─
214
  prompt = ""
215
  has_system = any(m.get("role") == "system" for m in messages)
216
  if not has_system:
 
282
  "-c", str(cfg["ctx"]),
283
  "--threads", str(cfg["threads"]),
284
  "--batch-size", str(cfg["batch"]),
285
+ "-ngl", "0",
286
+ "-np", "1",
287
  ], stdout=log, stderr=log)
288
 
289
  url = f"http://localhost:{cfg['port']}/health"
290
 
291
+ for i in range(90):
292
  time.sleep(2)
293
  try:
294
  r = requests.get(url, timeout=2)
 
383
  "status": "running",
384
  "models_ready": dict(_server_ready),
385
  "usage": {
386
+ "⚡ fast coding": "qwen2.5-coder-1.5b",
387
+ "🧠 thinking": "qwen3.5-4b (add /think to message)",
388
+ "🌐 translation": "gemma3-4b",
389
+ "πŸ” internet/news": "qwen3.5-0.8b (auto web search on every message)",
390
  }
391
  }
392
 
 
508
 
509
  wait_for_model(key)
510
 
511
+ # Web search RAG — ONLY for qwen3.5-0.8b
 
512
  messages = req.messages
513
  if cfg.get("web_search", False):
514
  messages = inject_web_context(messages)