"""llama.cpp HTTP client wrapper for FormScout.

Wraps the llama.cpp server's OpenAI-compatible /v1/chat/completions endpoint
(VLM inference) and its /embedding endpoint. Falls back gracefully when the
server is unavailable.

Model: Qwen3-VL-8B-Instruct (Q4_K_M GGUF) for VLM inference.
Model: Qwen3-VL-Embedding-8B (Q4_K_M GGUF) for embeddings.
Params: 8B each (shared backbone).
License: Apache-2.0.
"""

from __future__ import annotations

import base64
import json
import logging
import os
from pathlib import Path
from typing import Any

import requests

from formscout import config

logger = logging.getLogger(__name__)

_TIMEOUT = 120  # seconds — VLM can be slow
_EMBED_TIMEOUT = 30  # seconds, per embedding request
_HEALTH_TIMEOUT = 5  # seconds, for the /health probe
# Strings at least this long are never probed as file paths; they are assumed
# to be base64 payloads.
_MAX_PATH_LEN = 4096


def _server_healthy(base_url: str) -> bool:
    """Return True when ``GET {base_url}/health`` answers with HTTP 200.

    Any ``requests`` failure (refused connection, timeout, malformed URL, ...)
    is reported as "not healthy" rather than raised, so callers can use this
    from a property without guarding it.
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=_HEALTH_TIMEOUT)
    except requests.RequestException:
        return False
    return response.status_code == 200


class LlamaCppClient:
    """HTTP client for a llama.cpp server instance."""

    def __init__(self, host: str | None = None, port: int | None = None):
        """Store the server address.

        Args:
            host: Server host; falls back to ``config.LLAMA_CPP_HOST`` when
                falsy.
            port: Server port; falls back to ``config.LLAMA_CPP_PORT_VLM``
                when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_VLM
        self.base_url = f"http://{self.host}:{self.port}"

    @property
    def available(self) -> bool:
        """Check if the server is reachable."""
        return _server_healthy(self.base_url)

    def complete(
        self,
        prompt: str,
        images: list[str] | None = None,
        max_tokens: int = 512,
        temperature: float = 0.1,
        stop: list[str] | None = None,
    ) -> dict[str, Any]:
        """Send a chat-completion request.

        Uses the OpenAI-compatible /v1/chat/completions endpoint — required
        for multimodal: llama-server routes images through the mmproj only on
        this endpoint.

        Args:
            prompt: The text prompt (system + user combined).
            images: Optional list of images. Each entry may be a file path,
                a base64-encoded JPEG, or a ready-made ``data:image/...`` URL.
            max_tokens: Max generation tokens.
            temperature: Sampling temperature.
            stop: Stop sequences (default: none — JSON output must not be
                truncated).

        Returns:
            The parsed JSON object if the reply contains one, otherwise
            ``{"text": raw_text}``. On any failure, a dict with an ``"error"``
            message and an empty ``"text"``; this method does not raise for
            transport, server, or image-read errors.
        """
        try:
            content = self._build_content(prompt, images)
        except OSError as exc:
            # A path that exists but cannot be read must not crash the caller.
            logger.warning("Could not read image for llama.cpp request: %r", exc)
            return {"error": f"cannot read image: {exc}", "text": ""}

        payload: dict[str, Any] = {
            "messages": [{"role": "user", "content": content}],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        if stop:
            payload["stop"] = stop

        try:
            response = requests.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
                timeout=_TIMEOUT,
            )
            response.raise_for_status()
            reply = response.json()["choices"][0]["message"]["content"] or ""
            return self._parse_json_reply(reply)
        except requests.ConnectionError:
            logger.warning("llama.cpp server not reachable at %s", self.base_url)
            return {"error": "llama.cpp server not available", "text": ""}
        except requests.Timeout:
            logger.warning(
                "llama.cpp server timed out after %ss at %s", _TIMEOUT, self.base_url
            )
            return {"error": "llama.cpp server timeout", "text": ""}
        except Exception as exc:
            # Deliberate catch-all at the service boundary (HTTP errors,
            # malformed response bodies, ...): report instead of raising.
            logger.warning("llama.cpp completion failed: %r", exc)
            return {"error": str(exc), "text": ""}

    @classmethod
    def _build_content(
        cls, prompt: str, images: list[str] | None
    ) -> list[dict[str, Any]]:
        """Build the chat ``content`` list: the text part, then one part per image."""
        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        content.extend(
            {"type": "image_url", "image_url": {"url": cls._image_to_data_url(img)}}
            for img in images or []
        )
        return content

    @staticmethod
    def _image_to_data_url(img: str) -> str:
        """Turn a file path, base64 string, or data URL into a data URL.

        Raises:
            OSError: If ``img`` names an existing file that cannot be read.
        """
        if img.startswith("data:image/"):
            return img  # already a complete data URL; do not prefix it again
        # os.path.isfile never raises on odd inputs (over-long names, embedded
        # NULs), so a short base64 string simply fails the probe.
        if len(img) < _MAX_PATH_LEN and os.path.isfile(img):
            b64 = base64.b64encode(Path(img).read_bytes()).decode("ascii")
        else:
            b64 = img  # already base64
        return f"data:image/jpeg;base64,{b64}"

    @staticmethod
    def _parse_json_reply(text: str) -> dict[str, Any]:
        """Parse model output as a JSON object, tolerating fences and prose.

        Candidates are tried in order: the stripped text as-is, the text with
        a leading markdown fence removed, and the span from the first ``{`` to
        the last ``}``. The first candidate that decodes to a dict wins.

        Returns:
            The decoded dict, or ``{"text": text}`` when no candidate is a
            JSON object.
        """
        stripped = text.strip()
        candidates = [stripped]

        if stripped.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and the closing fence.
            unfenced = stripped.split("\n", 1)[-1]
            candidates.append(unfenced.rsplit("```", 1)[0].strip())

        # Outermost-brace span: covers inline fences and surrounding prose.
        # Truncated JSON still fails to decode and falls through to raw text.
        start, end = stripped.find("{"), stripped.rfind("}")
        if 0 <= start < end:
            candidates.append(stripped[start : end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except (json.JSONDecodeError, TypeError):
                continue
            if isinstance(parsed, dict):
                return parsed
        return {"text": text}


class EmbeddingClient:
    """HTTP client for the llama.cpp embedding server."""

    def __init__(self, host: str | None = None, port: int | None = None):
        """Store the server address.

        Args:
            host: Server host; falls back to ``config.LLAMA_CPP_HOST`` when
                falsy.
            port: Server port; falls back to ``config.LLAMA_CPP_PORT_EMBED``
                when falsy.
        """
        self.host = host or config.LLAMA_CPP_HOST
        self.port = port or config.LLAMA_CPP_PORT_EMBED
        self.base_url = f"http://{self.host}:{self.port}"

    @property
    def available(self) -> bool:
        """Check if the embedding server is reachable."""
        return _server_healthy(self.base_url)

    def embed(self, text: str) -> list[float] | None:
        """Get embedding vector for text. Returns None on failure."""
        try:
            response = requests.post(
                f"{self.base_url}/embedding",
                json={"content": text},
                timeout=_EMBED_TIMEOUT,
            )
            response.raise_for_status()
            return self._extract_embedding(response.json())
        except Exception as exc:
            # Best-effort by design: callers treat None as "no embedding".
            logger.warning("llama.cpp embedding request failed: %r", exc)
            return None

    @staticmethod
    def _extract_embedding(data: Any) -> list[float] | None:
        """Pull a single vector out of an /embedding response body.

        Accepts ``{"embedding": [...]}`` as well as a list of such objects
        whose ``"embedding"`` is wrapped in an outer list.
        NOTE(review): the list-shaped form is what newer llama.cpp servers are
        believed to return — confirm against the deployed server version.

        Returns:
            The vector, or None when the body has no usable single vector.
        """
        if isinstance(data, list):
            data = data[0] if data else None
        if not isinstance(data, dict):
            return None

        vector = data.get("embedding")
        if isinstance(vector, list) and vector and isinstance(vector[0], list):
            if len(vector) != 1:
                # Several rows: presumably per-token output from a server
                # running without pooling — not a single text embedding.
                logger.warning(
                    "llama.cpp returned %d embedding rows; expected one pooled vector",
                    len(vector),
                )
                return None
            vector = vector[0]
        return vector