mrmadblack committed on
Commit
4f32f3f
·
verified ·
1 Parent(s): d4d90dd

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +39 -52
server.py CHANGED
@@ -2,15 +2,13 @@
2
  Ollama-compatible API server
3
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
  ⚡ qwen2.5-coder-1.5b → coding, quick replies (port 8080)
5
- 🧠 qwen3.5-4b → thinking, hard problems (port 8081) ← upgraded
6
  🌐 gemma3-4b → translation, general chat (port 8082)
 
7
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
8
- πŸ” Web search (RAG) β†’ auto-injects DuckDuckGo results into prompt
9
- triggers on: latest, today, news, price,
10
- who is, current, 2025, weather, etc.
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
12
- Optimized for HuggingFace free tier: 2 vCPU, 16GB RAM
13
-
14
  pip install: duckduckgo-search
15
  """
16
 
@@ -27,7 +25,6 @@ import json
27
  import time
28
  import hashlib
29
  import threading
30
- import re
31
  from typing import Optional
32
 
33
  app = FastAPI()
@@ -46,12 +43,12 @@ MODELS = {
46
  "param_size": "1.5B",
47
  "family": "qwen2.5",
48
  "fmt": "chatml",
49
- "web_search": True, # coding model — no web search needed
50
  "threads": 2,
51
  "ctx": 2048,
52
  "batch": 512,
53
  },
54
- "qwen3.5-4b": { # 🧠 THINKING — hard bugs, architecture, logic (/think)
55
  "path": "models/qwen3.5-4b.gguf",
56
  "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
57
  "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
@@ -59,12 +56,12 @@ MODELS = {
59
  "param_size": "4B",
60
  "family": "qwen3.5",
61
  "fmt": "chatml",
62
- "web_search": True, # thinking model — benefits from web context
63
  "threads": 2,
64
  "ctx": 2048,
65
  "batch": 512,
66
  },
67
- "gemma3-4b": { # 🌐 GENERAL — translation, Tamil↔English, daily chat
68
  "path": "models/gemma3-4b.gguf",
69
  "repo": "bartowski/google_gemma-3-4b-it-GGUF",
70
  "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
@@ -72,7 +69,20 @@ MODELS = {
72
  "param_size": "4B",
73
  "family": "gemma3",
74
  "fmt": "gemma",
75
- "web_search": True, # general model — benefits from web context
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "threads": 2,
77
  "ctx": 2048,
78
  "batch": 512,
@@ -105,25 +115,8 @@ class GenerateRequest(BaseModel):
105
  # WEB SEARCH (DuckDuckGo RAG)
106
  # ---------------------------
107
 
108
- # Keywords that suggest the user needs current/internet info
109
- WEB_TRIGGERS = [
110
- "latest", "current", "today", "now", "news", "2025", "2026",
111
- "price", "weather", "who is", "what is the", "when did", "when is",
112
- "recent", "new version", "update", "release", "stock", "score",
113
- "result", "live", "trending", "announced", "launched", "happened",
114
- ]
115
-
116
- def needs_web_search(text: str) -> bool:
117
- """Detect if the user message needs live internet info."""
118
- text_lower = text.lower()
119
- return any(trigger in text_lower for trigger in WEB_TRIGGERS)
120
-
121
-
122
  def web_search(query: str, max_results: int = 3) -> str:
123
- """
124
- Search DuckDuckGo and return a formatted context block.
125
- Free, no API key needed.
126
- """
127
  try:
128
  with DDGS() as ddgs:
129
  results = list(ddgs.text(query, max_results=max_results))
@@ -147,14 +140,12 @@ def web_search(query: str, max_results: int = 3) -> str:
147
 
148
  def inject_web_context(messages: list) -> list:
149
  """
150
- Check the last user message. If it needs web info,
151
- search DuckDuckGo and inject results as a system message
152
- right before the user's message.
153
  """
154
  if not messages:
155
  return messages
156
 
157
- # Get the last user message
158
  last_user = next(
159
  (m for m in reversed(messages) if m.get("role") == "user"),
160
  None
@@ -164,23 +155,21 @@ def inject_web_context(messages: list) -> list:
164
 
165
  user_text = last_user.get("content", "")
166
 
167
- if not needs_web_search(user_text):
168
- return messages # no search needed
169
-
170
- print(f" [web_search] triggered for: {user_text[:60]}...")
171
  context = web_search(user_text)
172
 
173
  if not context:
174
- return messages # search failed silently, carry on without it
 
175
 
176
  print(f" [web_search] injected {len(context)} chars of context")
177
 
178
- # Build new message list with web context injected as a system message
179
  web_system = {
180
  "role": "system",
181
  "content": (
182
  "You have access to the following real-time web search results. "
183
- "Use them to answer the user's question accurately. "
 
184
  "If the results are not relevant, rely on your own knowledge.\n\n"
185
  + context
186
  )
@@ -213,7 +202,6 @@ def build_prompt(messages: list, fmt: str = "chatml") -> str:
213
  if not content:
214
  continue
215
  if role == "system":
216
- # Gemma3 has no system role β€” prepend as first user turn
217
  prompt += f"<start_of_turn>user\n[Context] {content}<end_of_turn>\n"
218
  elif role == "user":
219
  prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
@@ -222,7 +210,7 @@ def build_prompt(messages: list, fmt: str = "chatml") -> str:
222
  prompt += "<start_of_turn>model\n"
223
  return prompt
224
 
225
- # ── ChatML format (Qwen2.5-Coder, Qwen3.5) ─────────────────
226
  prompt = ""
227
  has_system = any(m.get("role") == "system" for m in messages)
228
  if not has_system:
@@ -294,13 +282,13 @@ def start_llama(model_name: str, cfg: dict):
294
  "-c", str(cfg["ctx"]),
295
  "--threads", str(cfg["threads"]),
296
  "--batch-size", str(cfg["batch"]),
297
- "-ngl", "0", # CPU only
298
- "-np", "1", # 1 parallel slot
299
  ], stdout=log, stderr=log)
300
 
301
  url = f"http://localhost:{cfg['port']}/health"
302
 
303
- for i in range(90): # wait up to 3 min
304
  time.sleep(2)
305
  try:
306
  r = requests.get(url, timeout=2)
@@ -395,10 +383,10 @@ def root():
395
  "status": "running",
396
  "models_ready": dict(_server_ready),
397
  "usage": {
398
- "fast coding": "qwen2.5-coder-1.5b",
399
- "thinking": "qwen3.5-4b (add /think to your message)",
400
- "translation": "gemma3-4b",
401
- "web_search": "auto — triggers on keywords like: latest, today, news, price..."
402
  }
403
  }
404
 
@@ -520,8 +508,7 @@ def chat(req: ChatRequest):
520
 
521
  wait_for_model(key)
522
 
523
- # ── Web Search RAG ──────────────────────────────────────────
524
- # Only for models with web_search enabled (qwen3.5-4b, gemma3-4b)
525
  messages = req.messages
526
  if cfg.get("web_search", False):
527
  messages = inject_web_context(messages)
 
2
  Ollama-compatible API server
3
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4
  ⚑ qwen2.5-coder-1.5b β†’ coding, quick replies (port 8080)
5
+ 🧠 qwen3.5-4b → thinking, hard problems (port 8081)
6
  🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
+ πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083) ← NEW
8
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9
+ Web search only on qwen3.5-0.8b — big models stay fast
10
+ Total RAM: ~6.8GB / 16GB
 
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
 
12
  pip install: duckduckgo-search
13
  """
14
 
 
25
  import time
26
  import hashlib
27
  import threading
 
28
  from typing import Optional
29
 
30
  app = FastAPI()
 
43
  "param_size": "1.5B",
44
  "family": "qwen2.5",
45
  "fmt": "chatml",
46
+ "web_search": False,
47
  "threads": 2,
48
  "ctx": 2048,
49
  "batch": 512,
50
  },
51
+ "qwen3.5-4b": { # 🧠 THINKING — hard bugs, architecture (/think)
52
  "path": "models/qwen3.5-4b.gguf",
53
  "repo": "bartowski/Qwen_Qwen3.5-4B-GGUF",
54
  "file": "Qwen_Qwen3.5-4B-Q4_K_M.gguf",
 
56
  "param_size": "4B",
57
  "family": "qwen3.5",
58
  "fmt": "chatml",
59
+ "web_search": False, # ← removed, stays fast now
60
  "threads": 2,
61
  "ctx": 2048,
62
  "batch": 512,
63
  },
64
+ "gemma3-4b": { # 🌐 GENERAL — translation, Tamil↔English, chat
65
  "path": "models/gemma3-4b.gguf",
66
  "repo": "bartowski/google_gemma-3-4b-it-GGUF",
67
  "file": "google_gemma-3-4b-it-Q4_K_M.gguf",
 
69
  "param_size": "4B",
70
  "family": "gemma3",
71
  "fmt": "gemma",
72
+ "web_search": False, # ← removed, stays fast now
73
+ "threads": 2,
74
+ "ctx": 2048,
75
+ "batch": 512,
76
+ },
77
+ "qwen3.5-0.8b": { # 🔍 INTERNET — news, prices, latest info (small+fast)
78
+ "path": "models/qwen3.5-0.8b.gguf",
79
+ "repo": "bartowski/Qwen_Qwen3.5-0.8B-GGUF",
80
+ "file": "Qwen_Qwen3.5-0.8B-Q4_K_M.gguf",
81
+ "port": 8083,
82
+ "param_size": "0.8B",
83
+ "family": "qwen3.5",
84
+ "fmt": "chatml",
85
+ "web_search": True, # ← ONLY this model does web search
86
  "threads": 2,
87
  "ctx": 2048,
88
  "batch": 512,
 
115
  # WEB SEARCH (DuckDuckGo RAG)
116
  # ---------------------------
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def web_search(query: str, max_results: int = 3) -> str:
119
+ """Search DuckDuckGo — free, no API key needed."""
 
 
 
120
  try:
121
  with DDGS() as ddgs:
122
  results = list(ddgs.text(query, max_results=max_results))
 
140
 
141
  def inject_web_context(messages: list) -> list:
142
  """
143
+ Always search DuckDuckGo using the last user message as query.
144
+ Inject results as a system message right before the user turn.
 
145
  """
146
  if not messages:
147
  return messages
148
 
 
149
  last_user = next(
150
  (m for m in reversed(messages) if m.get("role") == "user"),
151
  None
 
155
 
156
  user_text = last_user.get("content", "")
157
 
158
+ print(f" [web_search] searching: {user_text[:60]}...")
 
 
 
159
  context = web_search(user_text)
160
 
161
  if not context:
162
+ print(" [web_search] no results, continuing without web context")
163
+ return messages
164
 
165
  print(f" [web_search] injected {len(context)} chars of context")
166
 
 
167
  web_system = {
168
  "role": "system",
169
  "content": (
170
  "You have access to the following real-time web search results. "
171
+ "Use them to answer the user's question accurately and concisely. "
172
+ "Always mention the source when using web data. "
173
  "If the results are not relevant, rely on your own knowledge.\n\n"
174
  + context
175
  )
 
202
  if not content:
203
  continue
204
  if role == "system":
 
205
  prompt += f"<start_of_turn>user\n[Context] {content}<end_of_turn>\n"
206
  elif role == "user":
207
  prompt += f"<start_of_turn>user\n{content}<end_of_turn>\n"
 
210
  prompt += "<start_of_turn>model\n"
211
  return prompt
212
 
213
+ # ── ChatML format (Qwen2.5-Coder, Qwen3.5-4B, Qwen3.5-0.8B) ─
214
  prompt = ""
215
  has_system = any(m.get("role") == "system" for m in messages)
216
  if not has_system:
 
282
  "-c", str(cfg["ctx"]),
283
  "--threads", str(cfg["threads"]),
284
  "--batch-size", str(cfg["batch"]),
285
+ "-ngl", "0",
286
+ "-np", "1",
287
  ], stdout=log, stderr=log)
288
 
289
  url = f"http://localhost:{cfg['port']}/health"
290
 
291
+ for i in range(90):
292
  time.sleep(2)
293
  try:
294
  r = requests.get(url, timeout=2)
 
383
  "status": "running",
384
  "models_ready": dict(_server_ready),
385
  "usage": {
386
+ "⚡ fast coding": "qwen2.5-coder-1.5b",
387
+ "🧠 thinking": "qwen3.5-4b (add /think to message)",
388
+ "🌐 translation": "gemma3-4b",
389
+ "πŸ” internet/news": "qwen3.5-0.8b (auto web search on every message)",
390
  }
391
  }
392
 
 
508
 
509
  wait_for_model(key)
510
 
511
+ # Web search RAG — ONLY for qwen3.5-0.8b
 
512
  messages = req.messages
513
  if cfg.get("web_search", False):
514
  messages = inject_web_context(messages)