neuralbroker commited on
Commit
6561a10
·
verified ·
1 Parent(s): df1ef5e

Update clean backend-only project docs and eval

Browse files
Files changed (1) hide show
  1. server.py +266 -60
server.py CHANGED
@@ -2,9 +2,9 @@
2
  """
3
  BlitzKode backend server.
4
 
5
- Serves the bundled frontend and proxies prompts to a local GGUF model
6
- through llama.cpp. Model is loaded lazily so the module stays importable
7
- in tests and environments where the model artifact is not present yet.
8
  """
9
 
10
  from __future__ import annotations
@@ -13,24 +13,26 @@ import asyncio
13
  import json
14
  import logging
15
  import os
16
- import queue
17
  import threading
18
  import time
19
  import urllib.error
20
  import urllib.parse
21
  import urllib.request
 
 
22
  from contextlib import asynccontextmanager, suppress
23
  from dataclasses import dataclass
24
  from dataclasses import field as dataclass_field
 
25
  from pathlib import Path
26
  from typing import Any, Literal, cast
27
 
28
  import llama_cpp
29
  import uvicorn
30
- from fastapi import FastAPI, HTTPException, Request
31
  from fastapi.middleware.cors import CORSMiddleware
32
- from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
33
- from fastapi.staticfiles import StaticFiles
34
  from pydantic import BaseModel, Field
35
  from starlette.middleware.base import BaseHTTPMiddleware
36
 
@@ -39,23 +41,26 @@ APP_VERSION = "2.0"
39
  CREATOR = "Sajad"
40
  ROOT_DIR = Path(__file__).resolve().parent
41
  DEFAULT_MODEL_PATH = ROOT_DIR / "blitzkode.gguf"
42
- DEFAULT_FRONTEND_DIST_PATH = ROOT_DIR / "frontend" / "dist" / "index.html"
43
  DEFAULT_CONTEXT = 2048
44
  DEFAULT_MAX_PROMPT_LENGTH = 4000
45
  DEFAULT_MAX_TOKENS = 512
46
  DEFAULT_RATE_LIMIT_MAX = 30
47
  DEFAULT_MAX_SEARCH_RESULTS = 5
48
  DEFAULT_SEARCH_TIMEOUT_SECONDS = 8
 
49
  DEFAULT_MAX_MESSAGES = 20
 
 
50
  STOP_TOKENS = ["<|im_end|>", "<|im_start|>user"]
51
 
52
  SYSTEM_PROMPT = (
53
  "<|im_start|>system\n"
54
  "You are BlitzKode, an AI coding assistant created by Sajad. "
55
  "You are an expert in Python, JavaScript, Java, C++, and other programming languages. "
56
- "For coding work, first understand the user's goal and constraints, then provide a short plan before code when useful. "
57
- "Do not invent APIs, file contents, citations, or execution results. "
58
- "If evidence is missing, say what is unknown and give a safe next step. "
 
59
  "Write clean, efficient, and well-documented code. Keep responses concise and practical.<|im_end|>"
60
  )
61
 
@@ -84,13 +89,6 @@ def _path_from_env(name: str, default: Path) -> Path:
84
  return Path(value) if value else default
85
 
86
 
87
- def _frontend_path_from_env() -> Path:
88
- value = os.getenv("BLITZKODE_FRONTEND_PATH")
89
- if value:
90
- return Path(value)
91
- return DEFAULT_FRONTEND_DIST_PATH
92
-
93
-
94
  def _validate_prompt(prompt: str, max_length: int) -> tuple[str, JSONResponse | None]:
95
  prompt = prompt.strip()
96
  if not prompt:
@@ -103,17 +101,51 @@ def _validate_prompt(prompt: str, max_length: int) -> tuple[str, JSONResponse |
103
  return prompt, None
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  @dataclass(slots=True)
107
  class Settings:
108
  root_dir: Path = ROOT_DIR
109
  model_path: Path = dataclass_field(default_factory=lambda: _path_from_env("BLITZKODE_MODEL_PATH", DEFAULT_MODEL_PATH))
110
- frontend_path: Path = dataclass_field(default_factory=_frontend_path_from_env)
111
  host: str = os.getenv("BLITZKODE_HOST", "0.0.0.0")
112
  port: int = _int_from_env("BLITZKODE_PORT", 7860)
113
  n_gpu_layers: int = _int_from_env("BLITZKODE_GPU_LAYERS", 0)
114
  n_ctx: int = _int_from_env("BLITZKODE_N_CTX", DEFAULT_CONTEXT)
115
  n_threads: int = _int_from_env("BLITZKODE_THREADS", max(1, min(8, os.cpu_count() or 1)))
116
- n_batch: int = _int_from_env("BLITZKODE_BATCH", 128)
 
 
 
 
 
 
 
117
  max_prompt_length: int = _int_from_env("BLITZKODE_MAX_PROMPT_LENGTH", DEFAULT_MAX_PROMPT_LENGTH)
118
  preload_model: bool = _bool_from_env("BLITZKODE_PRELOAD_MODEL", default=False)
119
  cors_origins: str = os.getenv("BLITZKODE_CORS_ORIGINS", "http://localhost:7860")
@@ -121,6 +153,7 @@ class Settings:
121
  web_search_enabled: bool = _bool_from_env("BLITZKODE_WEB_SEARCH", default=True)
122
  search_timeout_seconds: int = _int_from_env("BLITZKODE_SEARCH_TIMEOUT", DEFAULT_SEARCH_TIMEOUT_SECONDS)
123
  max_search_results: int = _int_from_env("BLITZKODE_MAX_SEARCH_RESULTS", DEFAULT_MAX_SEARCH_RESULTS)
 
124
 
125
 
126
  class MessageItem(BaseModel):
@@ -166,9 +199,74 @@ class SearchResult:
166
  }
167
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  class WebSearchService:
170
  def __init__(self, settings: Settings):
171
  self.settings = settings
 
 
172
 
173
  @property
174
  def enabled(self) -> bool:
@@ -208,6 +306,37 @@ class WebSearchService:
208
  title = text.split(" - ", 1)[0]
209
  self._append_result(results, seen_urls, title, url, text, max_results)
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def search(self, query: str, max_results: int = DEFAULT_MAX_SEARCH_RESULTS, deep: bool = False) -> list[dict[str, str]]:
212
  if not self.enabled:
213
  raise RuntimeError("Web search is disabled. Set BLITZKODE_WEB_SEARCH=true to enable it.")
@@ -217,6 +346,13 @@ class WebSearchService:
217
  raise ValueError("Search query is required")
218
 
219
  limit = min(max_results, max(1, self.settings.max_search_results), 10)
 
 
 
 
 
 
 
220
  results: list[SearchResult] = []
221
  seen_urls: set[str] = set()
222
 
@@ -235,8 +371,14 @@ class WebSearchService:
235
  f"https://api.duckduckgo.com/?{params}",
236
  headers={"User-Agent": f"{APP_NAME}/{APP_VERSION}"},
237
  )
238
- with urllib.request.urlopen(request, timeout=self.settings.search_timeout_seconds) as response:
239
- payload = json.loads(response.read().decode("utf-8"))
 
 
 
 
 
 
240
 
241
  self._append_result(
242
  results,
@@ -248,7 +390,10 @@ class WebSearchService:
248
  )
249
  self._collect_related_topics(payload.get("RelatedTopics", []), results, seen_urls, limit)
250
 
251
- return [result.as_dict() for result in results]
 
 
 
252
 
253
 
254
  class ModelService:
@@ -294,20 +439,18 @@ class ModelService:
294
 
295
  start_time = time.perf_counter()
296
  try:
297
- self._llm = llama_cpp.Llama(
298
- model_path=str(self.settings.model_path),
299
- n_gpu_layers=self.settings.n_gpu_layers,
300
- n_ctx=self.settings.n_ctx,
301
- n_threads=self.settings.n_threads,
302
- n_batch=self.settings.n_batch,
303
- verbose=False,
304
- use_mmap=True,
305
- use_mlock=False,
306
- seed=-1,
307
- )
308
  self._load_time_seconds = time.perf_counter() - start_time
309
  self._last_error = None
310
- logger.info("Model loaded in %.2fs (gpu_layers=%d)", self._load_time_seconds, self.settings.n_gpu_layers)
 
 
 
 
 
 
 
 
311
  except Exception as exc:
312
  self._last_error = str(exc)
313
  logger.error("Model load failed: %s", exc)
@@ -315,6 +458,45 @@ class ModelService:
315
 
316
  return self._llm
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  def build_prompt(self, req: GenerateRequest) -> str:
319
  parts = [SYSTEM_PROMPT]
320
  for msg in req.messages:
@@ -345,7 +527,7 @@ class ModelService:
345
  )
346
  if len(research_prompt) > max_length:
347
  research_prompt = research_prompt[: max_length - 120].rstrip() + "\n\n[Context truncated to fit prompt limit.]"
348
- return req.model_copy(update={"prompt": research_prompt})
349
 
350
  def _gen_params(self, req: GenerateRequest) -> dict:
351
  return {
@@ -372,8 +554,8 @@ class ModelService:
372
  finally:
373
  self._busy = False
374
 
375
- def _run_stream(self, req: GenerateRequest, out_q: queue.Queue):
376
- """Runs streaming inference in a worker thread, puts tokens into out_q."""
377
  try:
378
  llm = self.load_model()
379
  self._busy = True
@@ -386,16 +568,16 @@ class ModelService:
386
  text = token["choices"][0].get("text", "")
387
  if text:
388
  token_count += 1
389
- out_q.put(f"data: {json.dumps({'token': text})}\n\n")
390
  elapsed = time.perf_counter() - start
391
  logger.info("Streamed %d tokens in %.2fs", token_count, elapsed)
392
- out_q.put("data: [DONE]\n\n")
393
  except Exception as exc:
394
  logger.error("Stream error: %s", exc)
395
- out_q.put(f"data: {json.dumps({'error': str(exc)})}\n\n")
396
  finally:
397
  self._busy = False
398
- out_q.put(None)
399
 
400
 
401
  def _check_api_key(request: Request, settings: Settings) -> JSONResponse | None:
@@ -417,7 +599,7 @@ class RateLimitMiddleware(BaseHTTPMiddleware):
417
  super().__init__(app)
418
  self._max = max_requests
419
  self._window = window_seconds
420
- self._clients: dict[str, list[float]] = {}
421
  self._lock = threading.Lock()
422
  self._cleanup_done = 0
423
 
@@ -431,11 +613,13 @@ class RateLimitMiddleware(BaseHTTPMiddleware):
431
  self._cleanup_done = 0
432
  with self._lock:
433
  cutoff = now - self._window
434
- self._clients = {ip: [t for t in ts if t >= cutoff] for ip, ts in self._clients.items() if ts}
435
 
436
  with self._lock:
437
- timestamps = self._clients.get(client_ip, [])
438
- timestamps = [t for t in timestamps if now - t < self._window]
 
 
439
  if len(timestamps) >= self._max:
440
  return JSONResponse(
441
  {"error": "Rate limit exceeded. Try again later."},
@@ -443,7 +627,6 @@ class RateLimitMiddleware(BaseHTTPMiddleware):
443
  headers={"Retry-After": str(self._window)},
444
  )
445
  timestamps.append(now)
446
- self._clients[client_ip] = timestamps
447
  return await call_next(request)
448
 
449
 
@@ -483,10 +666,6 @@ def create_app(settings: Settings | None = None) -> FastAPI:
483
  app.state.model_service = model_service
484
  app.state.search_service = search_service
485
 
486
- frontend_assets_path = settings.frontend_path.parent / "assets"
487
- if frontend_assets_path.exists():
488
- app.mount("/assets", StaticFiles(directory=str(frontend_assets_path)), name="frontend-assets")
489
-
490
  cors_origins = [o.strip() for o in settings.cors_origins.split(",") if o.strip()]
491
  app.add_middleware(
492
  CORSMiddleware,
@@ -501,20 +680,23 @@ def create_app(settings: Settings | None = None) -> FastAPI:
501
 
502
  @app.get("/")
503
  async def root():
504
- if not settings.frontend_path.exists():
505
- raise HTTPException(status_code=404, detail="Frontend build is missing. Run `npm install` and `npm run build` in frontend/.")
506
- return FileResponse(str(settings.frontend_path))
 
 
 
 
507
 
508
  @app.get("/health")
509
  async def health():
510
- status = "healthy" if model_service.model_exists else "degraded"
511
  return JSONResponse(
512
  {
513
  "status": status,
514
  "model_loaded": model_service.model_loaded,
515
  "model_path": str(settings.model_path),
516
  "model_exists": model_service.model_exists,
517
- "frontend_exists": settings.frontend_path.exists(),
518
  "version": APP_VERSION,
519
  "gpu_layers": settings.n_gpu_layers,
520
  "last_error": model_service.last_error,
@@ -532,6 +714,12 @@ def create_app(settings: Settings | None = None) -> FastAPI:
532
  if err:
533
  return err
534
 
 
 
 
 
 
 
535
  async with model_lock:
536
  try:
537
  sanitized = req.model_copy(update={"prompt": prompt})
@@ -584,20 +772,33 @@ def create_app(settings: Settings | None = None) -> FastAPI:
584
  if not model_service.model_exists:
585
  return JSONResponse({"error": f"Model not found at {settings.model_path}"}, status_code=503)
586
 
 
 
 
 
 
 
 
 
 
587
  sanitized = req.model_copy(update={"prompt": prompt})
588
 
589
  async def _locked_stream():
590
  async with model_lock:
591
- token_q: queue.Queue = queue.Queue()
 
 
 
 
 
592
  thread = threading.Thread(
593
  target=model_service._run_stream,
594
- args=(sanitized, token_q),
595
  daemon=True,
596
  )
597
  thread.start()
598
- # Use thread-safe queue.get() instead of deprecated get_running_loop()
599
  while True:
600
- chunk = await asyncio.to_thread(token_q.get)
601
  if chunk is None:
602
  break
603
  yield chunk
@@ -636,6 +837,11 @@ def create_app(settings: Settings | None = None) -> FastAPI:
636
  "mode": f"{'GPU' if settings.n_gpu_layers > 0 else 'CPU'} (llama.cpp)",
637
  "gpu_layers": settings.n_gpu_layers,
638
  "context_window": settings.n_ctx,
 
 
 
 
 
639
  "model_loaded": model_service.model_loaded,
640
  "load_time_seconds": model_service.load_time_seconds,
641
  "busy": model_service.busy,
 
2
  """
3
  BlitzKode backend server.
4
 
5
+ Exposes a FastAPI backend for local GGUF inference through llama.cpp.
6
+ Model is loaded lazily so the module stays importable in tests and
7
+ environments where the model artifact is not present yet.
8
  """
9
 
10
  from __future__ import annotations
 
13
  import json
14
  import logging
15
  import os
16
+ import re
17
  import threading
18
  import time
19
  import urllib.error
20
  import urllib.parse
21
  import urllib.request
22
+ from collections import deque
23
+ from collections.abc import Callable
24
  from contextlib import asynccontextmanager, suppress
25
  from dataclasses import dataclass
26
  from dataclasses import field as dataclass_field
27
+ from html.parser import HTMLParser
28
  from pathlib import Path
29
  from typing import Any, Literal, cast
30
 
31
  import llama_cpp
32
  import uvicorn
33
+ from fastapi import FastAPI, Request
34
  from fastapi.middleware.cors import CORSMiddleware
35
+ from fastapi.responses import JSONResponse, StreamingResponse
 
36
  from pydantic import BaseModel, Field
37
  from starlette.middleware.base import BaseHTTPMiddleware
38
 
 
41
  CREATOR = "Sajad"
42
  ROOT_DIR = Path(__file__).resolve().parent
43
  DEFAULT_MODEL_PATH = ROOT_DIR / "blitzkode.gguf"
 
44
  DEFAULT_CONTEXT = 2048
45
  DEFAULT_MAX_PROMPT_LENGTH = 4000
46
  DEFAULT_MAX_TOKENS = 512
47
  DEFAULT_RATE_LIMIT_MAX = 30
48
  DEFAULT_MAX_SEARCH_RESULTS = 5
49
  DEFAULT_SEARCH_TIMEOUT_SECONDS = 8
50
+ DEFAULT_SEARCH_CACHE_TTL_SECONDS = 300
51
  DEFAULT_MAX_MESSAGES = 20
52
+ DEFAULT_BATCH = 256
53
+ DEFAULT_PROMPT_CACHE_BYTES = 64 * 1024 * 1024
54
  STOP_TOKENS = ["<|im_end|>", "<|im_start|>user"]
55
 
56
  SYSTEM_PROMPT = (
57
  "<|im_start|>system\n"
58
  "You are BlitzKode, an AI coding assistant created by Sajad. "
59
  "You are an expert in Python, JavaScript, Java, C++, and other programming languages. "
60
+ "For coding work, first identify the user's goal, constraints, and any unknowns. "
61
+ "If asked about a library, API, file, function, citation, execution result, or repository detail that is not provided, "
62
+ "do not fabricate it. Say you do not know and explain how to verify it. "
63
+ "Prefer safe minimal fixes over speculative code. "
64
  "Write clean, efficient, and well-documented code. Keep responses concise and practical.<|im_end|>"
65
  )
66
 
 
89
  return Path(value) if value else default
90
 
91
 
 
 
 
 
 
 
 
92
  def _validate_prompt(prompt: str, max_length: int) -> tuple[str, JSONResponse | None]:
93
  prompt = prompt.strip()
94
  if not prompt:
 
101
  return prompt, None
102
 
103
 
104
+ _SIGNATURE_QUERY_RE = re.compile(
105
+ r"(?:signature|how\s+(?:do|can)\s+i\s+use|usage\s+of|docs?\s+for).{0,120}?"
106
+ r"(?P<symbol>[A-Za-z_][\w.]*\s*\([^)]*\)|[A-Za-z_][\w.]*\s+function)",
107
+ re.IGNORECASE | re.DOTALL,
108
+ )
109
+ _CODE_CONTEXT_RE = re.compile(r"```|\b(def|class|function|interface|type|import|from)\b|\{\s*\"|\bsource\s+code\b", re.IGNORECASE)
110
+
111
+
112
+ def _grounding_guard_response(prompt: str, has_external_context: bool = False) -> str | None:
113
+ """Prevents confident fabricated API/function signatures when no source/docs are provided.
114
+
115
+ Small local models can ignore system instructions around unknown symbols. This guardrail only triggers for direct API/signature
116
+ lookup questions that contain no pasted source/docs context. It does not block normal code-generation tasks.
117
+ """
118
+ if has_external_context or _CODE_CONTEXT_RE.search(prompt):
119
+ return None
120
+
121
+ match = _SIGNATURE_QUERY_RE.search(prompt)
122
+ if not match:
123
+ return None
124
+
125
+ symbol = " ".join(match.group("symbol").split())
126
+ return (
127
+ f"I don't have enough verified context to know the signature or usage of `{symbol}`. "
128
+ "Please provide the source code or official documentation, or enable research mode so I can ground the answer."
129
+ )
130
+
131
+
132
  @dataclass(slots=True)
133
  class Settings:
134
  root_dir: Path = ROOT_DIR
135
  model_path: Path = dataclass_field(default_factory=lambda: _path_from_env("BLITZKODE_MODEL_PATH", DEFAULT_MODEL_PATH))
 
136
  host: str = os.getenv("BLITZKODE_HOST", "0.0.0.0")
137
  port: int = _int_from_env("BLITZKODE_PORT", 7860)
138
  n_gpu_layers: int = _int_from_env("BLITZKODE_GPU_LAYERS", 0)
139
  n_ctx: int = _int_from_env("BLITZKODE_N_CTX", DEFAULT_CONTEXT)
140
  n_threads: int = _int_from_env("BLITZKODE_THREADS", max(1, min(8, os.cpu_count() or 1)))
141
+ n_threads_batch: int = _int_from_env("BLITZKODE_THREADS_BATCH", max(1, min(8, os.cpu_count() or 1)))
142
+ n_batch: int = _int_from_env("BLITZKODE_BATCH", DEFAULT_BATCH)
143
+ n_ubatch: int = _int_from_env("BLITZKODE_UBATCH", min(DEFAULT_BATCH, 128))
144
+ prompt_cache_enabled: bool = _bool_from_env("BLITZKODE_PROMPT_CACHE", default=True)
145
+ prompt_cache_bytes: int = _int_from_env("BLITZKODE_PROMPT_CACHE_BYTES", DEFAULT_PROMPT_CACHE_BYTES)
146
+ use_mmap: bool = _bool_from_env("BLITZKODE_USE_MMAP", default=True)
147
+ use_mlock: bool = _bool_from_env("BLITZKODE_USE_MLOCK", default=False)
148
+ offload_kqv: bool = _bool_from_env("BLITZKODE_OFFLOAD_KQV", default=True)
149
  max_prompt_length: int = _int_from_env("BLITZKODE_MAX_PROMPT_LENGTH", DEFAULT_MAX_PROMPT_LENGTH)
150
  preload_model: bool = _bool_from_env("BLITZKODE_PRELOAD_MODEL", default=False)
151
  cors_origins: str = os.getenv("BLITZKODE_CORS_ORIGINS", "http://localhost:7860")
 
153
  web_search_enabled: bool = _bool_from_env("BLITZKODE_WEB_SEARCH", default=True)
154
  search_timeout_seconds: int = _int_from_env("BLITZKODE_SEARCH_TIMEOUT", DEFAULT_SEARCH_TIMEOUT_SECONDS)
155
  max_search_results: int = _int_from_env("BLITZKODE_MAX_SEARCH_RESULTS", DEFAULT_MAX_SEARCH_RESULTS)
156
+ search_cache_ttl_seconds: int = _int_from_env("BLITZKODE_SEARCH_CACHE_TTL", DEFAULT_SEARCH_CACHE_TTL_SECONDS)
157
 
158
 
159
  class MessageItem(BaseModel):
 
199
  }
200
 
201
 
202
+ class DuckDuckGoHTMLParser(HTMLParser):
203
+ def __init__(self, max_results: int):
204
+ super().__init__(convert_charrefs=True)
205
+ self.max_results = max_results
206
+ self.results: list[dict[str, str]] = []
207
+ self._active_field: Literal["title", "snippet"] | None = None
208
+ self._active_href = ""
209
+ self._text_parts: list[str] = []
210
+
211
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
212
+ if tag != "a":
213
+ return
214
+
215
+ attr_map = {name: value or "" for name, value in attrs}
216
+ classes = set(attr_map.get("class", "").split())
217
+ if "result__a" in classes:
218
+ if len(self.results) >= self.max_results:
219
+ return
220
+ self._active_field = "title"
221
+ elif "result__snippet" in classes:
222
+ self._active_field = "snippet"
223
+ else:
224
+ return
225
+
226
+ self._active_href = attr_map.get("href", "")
227
+ self._text_parts = []
228
+
229
+ def handle_data(self, data: str) -> None:
230
+ if self._active_field:
231
+ self._text_parts.append(data)
232
+
233
+ def handle_endtag(self, tag: str) -> None:
234
+ if tag != "a" or not self._active_field:
235
+ return
236
+
237
+ text = " ".join("".join(self._text_parts).split())
238
+ url = self._unwrap_result_url(self._active_href)
239
+
240
+ if self._active_field == "title" and text and url and len(self.results) < self.max_results:
241
+ self.results.append({"title": text, "url": url, "snippet": ""})
242
+ elif self._active_field == "snippet" and text and self.results:
243
+ target = next((item for item in reversed(self.results) if not item["snippet"]), None)
244
+ if target:
245
+ target["snippet"] = text
246
+
247
+ self._active_field = None
248
+ self._active_href = ""
249
+ self._text_parts = []
250
+
251
+ @staticmethod
252
+ def _unwrap_result_url(href: str) -> str:
253
+ href = href.strip()
254
+ if href.startswith("//"):
255
+ href = f"https:{href}"
256
+
257
+ parsed = urllib.parse.urlparse(href)
258
+ if "duckduckgo.com" in parsed.netloc and parsed.path.startswith("/l/"):
259
+ target = urllib.parse.parse_qs(parsed.query).get("uddg", [""])[0]
260
+ if target:
261
+ return target
262
+ return href
263
+
264
+
265
  class WebSearchService:
266
  def __init__(self, settings: Settings):
267
  self.settings = settings
268
+ self._cache: dict[tuple[str, int, bool], tuple[float, list[dict[str, str]]]] = {}
269
+ self._cache_lock = threading.Lock()
270
 
271
  @property
272
  def enabled(self) -> bool:
 
306
  title = text.split(" - ", 1)[0]
307
  self._append_result(results, seen_urls, title, url, text, max_results)
308
 
309
+ def _read_search_payload(self, request: urllib.request.Request) -> dict[str, Any]:
310
+ with urllib.request.urlopen(request, timeout=self.settings.search_timeout_seconds) as response:
311
+ raw = response.read().decode("utf-8")
312
+ try:
313
+ payload = json.loads(raw)
314
+ except json.JSONDecodeError as exc:
315
+ raise RuntimeError("Search provider returned an invalid JSON response") from exc
316
+ if not isinstance(payload, dict):
317
+ raise RuntimeError("Search provider returned an unexpected response shape")
318
+ return payload
319
+
320
+ def _search_html(
321
+ self,
322
+ query: str,
323
+ results: list[SearchResult],
324
+ seen_urls: set[str],
325
+ max_results: int,
326
+ ) -> None:
327
+ params = urllib.parse.urlencode({"q": query})
328
+ request = urllib.request.Request(
329
+ f"https://html.duckduckgo.com/html/?{params}",
330
+ headers={"User-Agent": f"Mozilla/5.0 {APP_NAME}/{APP_VERSION}"},
331
+ )
332
+ with urllib.request.urlopen(request, timeout=self.settings.search_timeout_seconds) as response:
333
+ raw = response.read().decode("utf-8", errors="replace")
334
+
335
+ parser = DuckDuckGoHTMLParser(max_results)
336
+ parser.feed(raw)
337
+ for item in parser.results:
338
+ self._append_result(results, seen_urls, item["title"], item["url"], item["snippet"], max_results)
339
+
340
  def search(self, query: str, max_results: int = DEFAULT_MAX_SEARCH_RESULTS, deep: bool = False) -> list[dict[str, str]]:
341
  if not self.enabled:
342
  raise RuntimeError("Web search is disabled. Set BLITZKODE_WEB_SEARCH=true to enable it.")
 
346
  raise ValueError("Search query is required")
347
 
348
  limit = min(max_results, max(1, self.settings.max_search_results), 10)
349
+ cache_key = (query.lower(), limit, deep)
350
+ now = time.monotonic()
351
+ with self._cache_lock:
352
+ cached = self._cache.get(cache_key)
353
+ if cached and now - cached[0] < self.settings.search_cache_ttl_seconds:
354
+ return [dict(item) for item in cached[1]]
355
+
356
  results: list[SearchResult] = []
357
  seen_urls: set[str] = set()
358
 
 
371
  f"https://api.duckduckgo.com/?{params}",
372
  headers={"User-Agent": f"{APP_NAME}/{APP_VERSION}"},
373
  )
374
+ try:
375
+ payload = self._read_search_payload(request)
376
+ except RuntimeError as exc:
377
+ result_count = len(results)
378
+ self._search_html(variant, results, seen_urls, limit)
379
+ if len(results) == result_count:
380
+ raise exc
381
+ continue
382
 
383
  self._append_result(
384
  results,
 
390
  )
391
  self._collect_related_topics(payload.get("RelatedTopics", []), results, seen_urls, limit)
392
 
393
+ search_payload = [result.as_dict() for result in results]
394
+ with self._cache_lock:
395
+ self._cache[cache_key] = (time.monotonic(), search_payload)
396
+ return [dict(item) for item in search_payload]
397
 
398
 
399
  class ModelService:
 
439
 
440
  start_time = time.perf_counter()
441
  try:
442
+ self._llm = self._create_llama()
 
 
 
 
 
 
 
 
 
 
443
  self._load_time_seconds = time.perf_counter() - start_time
444
  self._last_error = None
445
+ self._configure_prompt_cache(self._llm)
446
+ logger.info(
447
+ "Model loaded in %.2fs (gpu_layers=%d, ctx=%d, threads=%d, batch=%d)",
448
+ self._load_time_seconds,
449
+ self.settings.n_gpu_layers,
450
+ self.settings.n_ctx,
451
+ self.settings.n_threads,
452
+ self.settings.n_batch,
453
+ )
454
  except Exception as exc:
455
  self._last_error = str(exc)
456
  logger.error("Model load failed: %s", exc)
 
458
 
459
  return self._llm
460
 
461
+ def _create_llama(self) -> llama_cpp.Llama:
462
+ kwargs: dict[str, Any] = {
463
+ "model_path": str(self.settings.model_path),
464
+ "n_gpu_layers": self.settings.n_gpu_layers,
465
+ "n_ctx": self.settings.n_ctx,
466
+ "n_threads": self.settings.n_threads,
467
+ "n_threads_batch": self.settings.n_threads_batch,
468
+ "n_batch": self.settings.n_batch,
469
+ "n_ubatch": self.settings.n_ubatch,
470
+ "offload_kqv": self.settings.offload_kqv,
471
+ "verbose": False,
472
+ "use_mmap": self.settings.use_mmap,
473
+ "use_mlock": self.settings.use_mlock,
474
+ "seed": -1,
475
+ }
476
+ try:
477
+ return llama_cpp.Llama(**kwargs)
478
+ except TypeError as exc:
479
+ message = str(exc)
480
+ unsupported = [key for key in ("n_threads_batch", "n_ubatch", "offload_kqv") if key in message]
481
+ if not unsupported:
482
+ raise
483
+ for key in unsupported:
484
+ kwargs.pop(key, None)
485
+ logger.warning("Retrying model load without unsupported llama.cpp options: %s", ", ".join(unsupported))
486
+ return llama_cpp.Llama(**kwargs)
487
+
488
+ def _configure_prompt_cache(self, llm: llama_cpp.Llama) -> None:
489
+ if not self.settings.prompt_cache_enabled or self.settings.prompt_cache_bytes <= 0:
490
+ return
491
+ cache_cls = getattr(llama_cpp, "LlamaRAMCache", None)
492
+ set_cache = getattr(llm, "set_cache", None)
493
+ if cache_cls is None or set_cache is None:
494
+ return
495
+ try:
496
+ set_cache(cache_cls(capacity_bytes=self.settings.prompt_cache_bytes))
497
+ except Exception as exc:
498
+ logger.warning("Prompt cache setup skipped: %s", exc)
499
+
500
  def build_prompt(self, req: GenerateRequest) -> str:
501
  parts = [SYSTEM_PROMPT]
502
  for msg in req.messages:
 
527
  )
528
  if len(research_prompt) > max_length:
529
  research_prompt = research_prompt[: max_length - 120].rstrip() + "\n\n[Context truncated to fit prompt limit.]"
530
+ return cast(GenerateRequest, req.model_copy(update={"prompt": research_prompt}))
531
 
532
  def _gen_params(self, req: GenerateRequest) -> dict:
533
  return {
 
554
  finally:
555
  self._busy = False
556
 
557
+ def _run_stream(self, req: GenerateRequest, emit: Callable[[str | None], None]):
558
+ """Runs streaming inference in a worker thread and emits SSE chunks."""
559
  try:
560
  llm = self.load_model()
561
  self._busy = True
 
568
  text = token["choices"][0].get("text", "")
569
  if text:
570
  token_count += 1
571
+ emit(f"data: {json.dumps({'token': text})}\n\n")
572
  elapsed = time.perf_counter() - start
573
  logger.info("Streamed %d tokens in %.2fs", token_count, elapsed)
574
+ emit("data: [DONE]\n\n")
575
  except Exception as exc:
576
  logger.error("Stream error: %s", exc)
577
+ emit(f"data: {json.dumps({'error': str(exc)})}\n\n")
578
  finally:
579
  self._busy = False
580
+ emit(None)
581
 
582
 
583
  def _check_api_key(request: Request, settings: Settings) -> JSONResponse | None:
 
599
  super().__init__(app)
600
  self._max = max_requests
601
  self._window = window_seconds
602
+ self._clients: dict[str, deque[float]] = {}
603
  self._lock = threading.Lock()
604
  self._cleanup_done = 0
605
 
 
613
  self._cleanup_done = 0
614
  with self._lock:
615
  cutoff = now - self._window
616
+ self._clients = {ip: deque(t for t in ts if t >= cutoff) for ip, ts in self._clients.items() if ts}
617
 
618
  with self._lock:
619
+ timestamps = self._clients.setdefault(client_ip, deque())
620
+ cutoff = now - self._window
621
+ while timestamps and timestamps[0] < cutoff:
622
+ timestamps.popleft()
623
  if len(timestamps) >= self._max:
624
  return JSONResponse(
625
  {"error": "Rate limit exceeded. Try again later."},
 
627
  headers={"Retry-After": str(self._window)},
628
  )
629
  timestamps.append(now)
 
630
  return await call_next(request)
631
 
632
 
 
666
  app.state.model_service = model_service
667
  app.state.search_service = search_service
668
 
 
 
 
 
669
  cors_origins = [o.strip() for o in settings.cors_origins.split(",") if o.strip()]
670
  app.add_middleware(
671
  CORSMiddleware,
 
680
 
681
  @app.get("/")
682
  async def root():
683
+ return JSONResponse(
684
+ {
685
+ "name": APP_NAME,
686
+ "version": APP_VERSION,
687
+ "message": "BlitzKode backend API is running. Use /info for endpoint details.",
688
+ }
689
+ )
690
 
691
  @app.get("/health")
692
  async def health():
693
+ status = "healthy" if model_service.model_exists and not model_service.last_error else "degraded"
694
  return JSONResponse(
695
  {
696
  "status": status,
697
  "model_loaded": model_service.model_loaded,
698
  "model_path": str(settings.model_path),
699
  "model_exists": model_service.model_exists,
 
700
  "version": APP_VERSION,
701
  "gpu_layers": settings.n_gpu_layers,
702
  "last_error": model_service.last_error,
 
714
  if err:
715
  return err
716
 
717
+ guard_response = _grounding_guard_response(prompt)
718
+ if guard_response:
719
+ return JSONResponse(
720
+ {"response": guard_response, "creator": CREATOR, "model": APP_NAME, "version": APP_VERSION, "guarded": True}
721
+ )
722
+
723
  async with model_lock:
724
  try:
725
  sanitized = req.model_copy(update={"prompt": prompt})
 
772
  if not model_service.model_exists:
773
  return JSONResponse({"error": f"Model not found at {settings.model_path}"}, status_code=503)
774
 
775
+ guard_response = _grounding_guard_response(prompt)
776
+ if guard_response:
777
+
778
+ async def _guarded_stream():
779
+ yield f"data: {json.dumps({'token': guard_response})}\n\n"
780
+ yield "data: [DONE]\n\n"
781
+
782
+ return StreamingResponse(_guarded_stream(), media_type="text/event-stream")
783
+
784
  sanitized = req.model_copy(update={"prompt": prompt})
785
 
786
  async def _locked_stream():
787
  async with model_lock:
788
+ loop = asyncio.get_running_loop()
789
+ token_q: asyncio.Queue[str | None] = asyncio.Queue()
790
+
791
+ def emit(chunk: str | None) -> None:
792
+ loop.call_soon_threadsafe(token_q.put_nowait, chunk)
793
+
794
  thread = threading.Thread(
795
  target=model_service._run_stream,
796
+ args=(sanitized, emit),
797
  daemon=True,
798
  )
799
  thread.start()
 
800
  while True:
801
+ chunk = await token_q.get()
802
  if chunk is None:
803
  break
804
  yield chunk
 
837
  "mode": f"{'GPU' if settings.n_gpu_layers > 0 else 'CPU'} (llama.cpp)",
838
  "gpu_layers": settings.n_gpu_layers,
839
  "context_window": settings.n_ctx,
840
+ "threads": settings.n_threads,
841
+ "threads_batch": settings.n_threads_batch,
842
+ "batch": settings.n_batch,
843
+ "ubatch": settings.n_ubatch,
844
+ "prompt_cache_enabled": settings.prompt_cache_enabled,
845
  "model_loaded": model_service.model_loaded,
846
  "load_time_seconds": model_service.load_time_seconds,
847
  "busy": model_service.busy,