Spaces:
Running on Zero
Running on Zero
| """ | |
| llama.cpp HTTP client wrapper for FormScout. | |
| Wraps the llama.cpp server's /v1/chat/completions and /embedding endpoints. | |
| Falls back gracefully when the server is unavailable. | |
| Model: Qwen3-VL-8B-Instruct (Q4_K_M GGUF) for VLM inference. | |
| Model: Qwen3-VL-Embedding-8B (Q4_K_M GGUF) for embeddings. | |
| Params: 8B each (shared backbone). | |
| License: Apache-2.0. | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from typing import Any | |
| import requests | |
| from formscout import config | |
| logger = logging.getLogger(__name__) | |
| _TIMEOUT = 120 # seconds — VLM can be slow | |
class LlamaCppClient:
    """HTTP client for a llama.cpp server instance (chat completions).

    ``complete`` reports HTTP/transport failures as an
    ``{"error": ..., "text": ""}`` dict instead of raising, so callers can
    degrade gracefully when the server is down.
    """

    def __init__(self, host: str | None = None, port: int | None = None):
        """Build the base URL, falling back to ``config`` for missing values.

        Args:
            host: Server host; ``config.LLAMA_CPP_HOST`` when falsy.
            port: Server port; ``config.LLAMA_CPP_PORT_VLM`` when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_VLM
        self.base_url = f"http://{self.host}:{self.port}"

    def available(self) -> bool:
        """Return True when ``GET /health`` answers with HTTP 200.

        Any ``requests`` transport failure (connection refused, timeout,
        malformed URL, ...) is treated as "not available".
        """
        try:
            r = requests.get(f"{self.base_url}/health", timeout=5)
        except requests.RequestException:
            return False
        return r.status_code == 200

    def complete(
        self,
        prompt: str,
        images: list[str] | None = None,
        max_tokens: int = 512,
        temperature: float = 0.1,
        stop: list[str] | None = None,
    ) -> dict[str, Any]:
        """Send a chat-completion request to ``/v1/chat/completions``.

        The OpenAI-compatible endpoint is used because it is required for
        multimodal input: llama-server routes images through the mmproj
        only on this endpoint.

        Args:
            prompt: The text prompt (system + user combined).
            images: Optional list of base64-encoded JPEGs or file paths.
            max_tokens: Max generation tokens.
            temperature: Sampling temperature.
            stop: Stop sequences (default: none — JSON output must not be
                truncated).

        Returns:
            The parsed JSON object when the model reply is a JSON object,
            otherwise ``{"text": raw_text}``. On any request/parse failure,
            ``{"error": message, "text": ""}``.

        Raises:
            OSError: If an image argument names an existing path that
                cannot be read.
        """
        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        for img in images or []:
            b64 = self._encode_image(img)
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
            })

        payload: dict[str, Any] = {
            "messages": [{"role": "user", "content": content}],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        if stop:
            payload["stop"] = stop

        try:
            r = requests.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
                timeout=_TIMEOUT,
            )
            r.raise_for_status()
            result = r.json()
            # "content" may be null in the response; normalise to "".
            text = result["choices"][0]["message"]["content"] or ""
            return self._parse_json_reply(text)
        except requests.ConnectionError:
            logger.warning("llama.cpp server not available at %s", self.base_url)
            return {"error": "llama.cpp server not available", "text": ""}
        except requests.Timeout:
            logger.warning("llama.cpp server timeout after %ss", _TIMEOUT)
            return {"error": "llama.cpp server timeout", "text": ""}
        except Exception as e:
            # Best-effort boundary: HTTP errors, malformed responses, etc.
            logger.warning("llama.cpp completion failed: %s", e)
            return {"error": str(e), "text": ""}

    @staticmethod
    def _encode_image(img: str) -> str:
        """Return base64 image data for ``img``.

        ``img`` is read and base64-encoded when it is a short string naming
        an existing path; otherwise it is assumed to already be base64 and
        is returned unchanged.
        """
        # Only strings shorter than 4096 chars are probed as paths; longer
        # ones are taken to be base64 payloads.
        if len(img) < 4096:
            try:
                is_path = Path(img).exists()
            except (OSError, ValueError):
                # A short base64 string can be an invalid path (e.g. a name
                # component that is too long); treat it as inline data.
                is_path = False
            if is_path:
                with open(img, "rb") as f:
                    return base64.b64encode(f.read()).decode()
        return img  # already base64

    @staticmethod
    def _parse_json_reply(text: str) -> dict[str, Any]:
        """Parse model output as JSON, tolerating markdown fences.

        Declared static because it takes no ``self`` but is invoked as
        ``self._parse_json_reply(text)`` from ``complete``.

        Returns:
            The decoded object when ``text`` is a JSON object (optionally
            wrapped in a ``` fence); otherwise ``{"text": text}`` with the
            original, unstripped text.
        """
        stripped = text.strip()
        if stripped.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and the closing fence.
            stripped = stripped.split("\n", 1)[-1]
            stripped = stripped.rsplit("```", 1)[0].strip()
        try:
            parsed = json.loads(stripped)
        except (json.JSONDecodeError, TypeError):
            return {"text": text}
        # Only JSON objects are returned as-is; arrays/scalars are wrapped.
        if isinstance(parsed, dict):
            return parsed
        return {"text": text}
class EmbeddingClient:
    """HTTP client for the llama.cpp embedding server."""

    def __init__(self, host: str | None = None, port: int | None = None):
        """Build the base URL, falling back to ``config`` for missing values.

        Args:
            host: Server host; ``config.LLAMA_CPP_HOST`` when falsy.
            port: Server port; ``config.LLAMA_CPP_PORT_EMBED`` when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_EMBED
        self.base_url = f"http://{self.host}:{self.port}"

    def available(self) -> bool:
        """Return True when ``GET /health`` answers with HTTP 200.

        Any ``requests`` transport failure is treated as "not available".
        """
        try:
            r = requests.get(f"{self.base_url}/health", timeout=5)
        except requests.RequestException:
            return False
        return r.status_code == 200

    def embed(self, text: str) -> list[float] | None:
        """Get embedding vector for text. Returns None on failure.

        Posts ``{"content": text}`` to ``/embedding``. Any error (transport,
        HTTP status, undecodable body) is logged and reported as ``None``.
        """
        try:
            r = requests.post(
                f"{self.base_url}/embedding",
                json={"content": text},
                timeout=30,
            )
            r.raise_for_status()
            return self._extract_embedding(r.json())
        except Exception as exc:
            # Deliberate best-effort: callers treat None as "no embedding".
            logger.warning("llama.cpp embedding request failed: %s", exc)
            return None

    @staticmethod
    def _extract_embedding(data: Any) -> list[float] | None:
        """Pull the embedding vector out of a decoded ``/embedding`` response.

        Accepts the object form ``{"embedding": [...]}`` and a list of such
        objects (the first entry is used).
        NOTE(review): newer llama.cpp server builds appear to answer with the
        list form and a nested ``[[...]]`` vector — confirm against the
        deployed server version.

        Returns:
            The vector, with a single-row nested list unwrapped to a flat
            list; any other ``"embedding"`` value is passed through
            unchanged. ``None`` when no ``"embedding"`` entry is found.
        """
        if isinstance(data, list):
            data = data[0] if data else None
        if not isinstance(data, dict):
            return None
        vector = data.get("embedding")
        if (
            isinstance(vector, list)
            and len(vector) == 1
            and isinstance(vector[0], list)
        ):
            vector = vector[0]
        return vector