mrmadblack committed on
Commit
fac1d0b
Β·
verified Β·
1 Parent(s): b1758f7

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +54 -33
server.py CHANGED
@@ -6,8 +6,8 @@ Ollama-compatible API server
6
  🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
  πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083)
8
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9
- KEY FIX: downloads + llama-server starts run in background thread
10
- so uvicorn binds to port 7860 IMMEDIATELY β€” no HF startup timeout
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
12
  """
13
 
@@ -15,7 +15,6 @@ from fastapi import FastAPI, HTTPException, Response
15
  from fastapi.responses import StreamingResponse, JSONResponse
16
  from pydantic import BaseModel
17
  from huggingface_hub import hf_hub_download
18
- from ddgs import DDGS
19
  import subprocess
20
  import requests
21
  import uvicorn
@@ -24,6 +23,7 @@ import json
24
  import time
25
  import hashlib
26
  import threading
 
27
  from typing import Optional
28
 
29
  app = FastAPI()
@@ -90,8 +90,6 @@ MODELS = {
90
 
91
  DEFAULT_MODEL = "qwen2.5-coder-1.5b"
92
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
93
-
94
- # Tracks readiness of each model
95
  _server_ready: dict = {k: False for k in MODELS}
96
 
97
 
@@ -114,33 +112,69 @@ class GenerateRequest(BaseModel):
114
 
115
 
116
  # ---------------------------
117
- # WEB SEARCH (DuckDuckGo RAG)
118
  # ---------------------------
119
 
120
  def web_search(query: str, max_results: int = 3) -> str:
121
- """Search DuckDuckGo β€” free, no API key needed."""
 
 
 
122
  try:
123
- with DDGS() as ddgs:
124
- results = list(ddgs.text(query, max_results=max_results))
125
- if not results:
 
 
 
 
 
 
 
 
 
 
 
126
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  context = "=== Web Search Results ===\n"
128
- for i, r in enumerate(results, 1):
129
- title = r.get("title", "").strip()
130
- body = r.get("body", "").strip()
131
- href = r.get("href", "").strip()
132
- context += f"\n[{i}] {title}\n{body}\nSource: {href}\n"
133
  context += "\n=== End of Web Results ===\n"
134
  return context
 
135
  except Exception as e:
136
  print(f" [web_search] error: {e}")
137
  return ""
138
 
139
 
140
  def inject_web_context(messages: list) -> list:
141
- """Search DuckDuckGo using last user message, inject as system context."""
142
  if not messages:
143
  return messages
 
144
  last_user = next(
145
  (m for m in reversed(messages) if m.get("role") == "user"), None
146
  )
@@ -235,7 +269,7 @@ def resolve_model(name: str) -> str:
235
 
236
 
237
  # ---------------------------
238
- # DOWNLOAD + START (background)
239
  # ---------------------------
240
 
241
  def download_model(cfg: dict):
@@ -247,7 +281,6 @@ def download_model(cfg: dict):
247
 
248
 
249
  def start_llama(model_name: str, cfg: dict):
250
- # Download first (blocks only this thread)
251
  download_model(cfg)
252
 
253
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
@@ -288,21 +321,13 @@ def start_llama(model_name: str, cfg: dict):
288
 
289
 
290
  def setup_all():
291
- """
292
- KEY FIX: Run all downloads + llama-server starts in ONE background thread.
293
- This lets uvicorn bind to port 7860 immediately on startup.
294
- Models become available as they finish loading (readiness guard handles the rest).
295
- """
296
  os.makedirs("models", exist_ok=True)
297
- threads = []
298
  for name, cfg in MODELS.items():
299
- t = threading.Thread(target=start_llama, args=(name, cfg), daemon=True)
300
- t.start()
301
- threads.append(t)
302
- # Don't join β€” let them run in background
303
 
304
 
305
- # Start everything in background immediately
306
  threading.Thread(target=setup_all, daemon=True).start()
307
 
308
 
@@ -311,10 +336,6 @@ threading.Thread(target=setup_all, daemon=True).start()
311
  # ---------------------------
312
 
313
  def wait_for_model(model_key: str, timeout: int = 300):
314
- """
315
- Block the request until the model is ready.
316
- Timeout is 300s (5 min) to cover cold download + load time.
317
- """
318
  deadline = time.time() + timeout
319
  while time.time() < deadline:
320
  if _server_ready.get(model_key):
 
6
  🌐 gemma3-4b β†’ translation, general chat (port 8082)
7
  πŸ” qwen3.5-0.8b β†’ internet queries, news, fast (port 8083)
8
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9
+ NO extra packages β€” web search uses only requests (already installed)
10
+ Downloads + server starts run in background β€” port 7860 binds instantly
11
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
12
  """
13
 
 
15
  from fastapi.responses import StreamingResponse, JSONResponse
16
  from pydantic import BaseModel
17
  from huggingface_hub import hf_hub_download
 
18
  import subprocess
19
  import requests
20
  import uvicorn
 
23
  import time
24
  import hashlib
25
  import threading
26
+ import urllib.parse
27
  from typing import Optional
28
 
29
  app = FastAPI()
 
90
 
91
  DEFAULT_MODEL = "qwen2.5-coder-1.5b"
92
  LLAMA_SERVER = "./llama.cpp/build/bin/llama-server"
 
 
93
  _server_ready: dict = {k: False for k in MODELS}
94
 
95
 
 
112
 
113
 
114
  # ---------------------------
115
+ # WEB SEARCH β€” pure requests, no extra package
116
  # ---------------------------
117
 
118
def web_search(query: str, max_results: int = 3) -> str:
    """Search DuckDuckGo and return formatted results as a context string.

    Scrapes DuckDuckGo's HTML endpoint (html.duckduckgo.com) using only the
    `requests` library -- no API key and no extra packages required.
    (Note: this is HTML scraping, not a JSON API.)

    Args:
        query: Search query text; URL-encoded before the request.
        max_results: Maximum number of results to include in the output.

    Returns:
        A formatted "=== Web Search Results ===" text block, or "" on any
        failure (non-200 response, zero results, network/parse error).
    """
    import re  # local import: keeps the module's top-level imports unchanged

    try:
        encoded = urllib.parse.quote(query)
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; LLM-Search/1.0)",
        }

        # DDG's HTML endpoint is the most reliable no-JS entry point.
        resp = requests.get(
            f"https://html.duckduckgo.com/html/?q={encoded}",
            headers=headers,
            timeout=8,
        )
        if resp.status_code != 200:
            return ""

        html = resp.text

        # Pull titles / snippets / display URLs out of the result markup.
        # NOTE(review): brittle by nature -- breaks if DDG changes its HTML.
        titles = re.findall(r'class="result__title"[^>]*>.*?<a[^>]*>(.*?)</a>', html, re.DOTALL)
        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</div>', html, re.DOTALL)
        urls = re.findall(r'class="result__url"[^>]*>(.*?)</span>', html, re.DOTALL)

        def strip_tags(text: str) -> str:
            """Remove HTML tags and surrounding whitespace from extracted text."""
            return re.sub(r'<[^>]+>', '', text).strip()

        count = min(max_results, len(titles), len(snippets))
        if count == 0:
            return ""

        # Build with a list + join instead of repeated string concatenation.
        parts = ["=== Web Search Results ===\n"]
        for i in range(count):
            title = strip_tags(titles[i])
            snippet = strip_tags(snippets[i])
            url = strip_tags(urls[i]) if i < len(urls) else ""
            parts.append(f"\n[{i + 1}] {title}\n{snippet}\nSource: {url}\n")
        parts.append("\n=== End of Web Results ===\n")
        return "".join(parts)

    except Exception as e:
        # Search is best-effort: any failure degrades to "no web context".
        print(f" [web_search] error: {e}")
        return ""
171
 
172
 
173
  def inject_web_context(messages: list) -> list:
174
+ """Inject DuckDuckGo results as system context before last user message."""
175
  if not messages:
176
  return messages
177
+
178
  last_user = next(
179
  (m for m in reversed(messages) if m.get("role") == "user"), None
180
  )
 
269
 
270
 
271
  # ---------------------------
272
+ # DOWNLOAD + START (all in background)
273
  # ---------------------------
274
 
275
  def download_model(cfg: dict):
 
281
 
282
 
283
  def start_llama(model_name: str, cfg: dict):
 
284
  download_model(cfg)
285
 
286
  print(f"Starting llama-server for {model_name} on port {cfg['port']} ...")
 
321
 
322
 
323
def setup_all():
    """Launch every model's download + llama-server start on background threads.

    Returns immediately after spawning the daemon threads, so uvicorn can
    bind port 7860 without waiting for any model to finish loading.
    """
    os.makedirs("models", exist_ok=True)
    for model_name, model_cfg in MODELS.items():
        worker = threading.Thread(
            target=start_llama,
            args=(model_name, model_cfg),
            daemon=True,
        )
        worker.start()
 
 
 
328
 
329
 
330
+ # Kick off everything in background immediately β€” uvicorn binds port 7860 first
331
  threading.Thread(target=setup_all, daemon=True).start()
332
 
333
 
 
336
  # ---------------------------
337
 
338
  def wait_for_model(model_key: str, timeout: int = 300):
 
 
 
 
339
  deadline = time.time() + timeout
340
  while time.time() < deadline:
341
  if _server_ready.get(model_key):