BladeSzaSza's picture
fix: define REPO_NAME in hf_upload.sh (ensure_blade_space referenced it)
4948993 verified
Raw
History Blame Contribute Delete
5.05 kB
"""
llama.cpp HTTP client wrapper for FormScout.
Wraps the llama.cpp server's /v1/chat/completions and /embedding endpoints.
Falls back gracefully when the server is unavailable.
Model: Qwen3-VL-8B-Instruct (Q4_K_M GGUF) for VLM inference.
Model: Qwen3-VL-Embedding-8B (Q4_K_M GGUF) for embeddings.
Params: 8B each (shared backbone).
License: Apache-2.0.
"""
from __future__ import annotations
import base64
import json
import logging
from pathlib import Path
from typing import Any
import requests
from formscout import config
logger = logging.getLogger(__name__)
_TIMEOUT = 120 # seconds — VLM can be slow
class LlamaCppClient:
"""HTTP client for a llama.cpp server instance."""
def __init__(self, host: str | None = None, port: int | None = None):
self.host = host or config.LLAMA_CPP_HOST
self.port = port or config.LLAMA_CPP_PORT_VLM
self.base_url = f"http://{self.host}:{self.port}"
@property
def available(self) -> bool:
"""Check if the server is reachable."""
try:
r = requests.get(f"{self.base_url}/health", timeout=5)
return r.status_code == 200
except (requests.ConnectionError, requests.Timeout):
return False
def complete(
self,
prompt: str,
images: list[str] | None = None,
max_tokens: int = 512,
temperature: float = 0.1,
stop: list[str] | None = None,
) -> dict[str, Any]:
"""
Send a chat-completion request (OpenAI-compatible /v1/chat/completions —
required for multimodal: llama-server routes images through the mmproj
only on this endpoint). Returns parsed JSON if the response is JSON,
otherwise returns {"text": raw_text}.
Args:
prompt: The text prompt (system + user combined).
images: Optional list of base64-encoded JPEGs or file paths.
max_tokens: Max generation tokens.
temperature: Sampling temperature.
stop: Stop sequences (default: none — JSON output must not be truncated).
"""
content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
for img in images or []:
if len(img) < 4096 and Path(img).exists():
with open(img, "rb") as f:
b64 = base64.b64encode(f.read()).decode()
else:
b64 = img # already base64
content.append({
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{b64}"},
})
payload: dict[str, Any] = {
"messages": [{"role": "user", "content": content}],
"max_tokens": max_tokens,
"temperature": temperature,
}
if stop:
payload["stop"] = stop
try:
r = requests.post(
f"{self.base_url}/v1/chat/completions",
json=payload,
timeout=_TIMEOUT,
)
r.raise_for_status()
result = r.json()
text = result["choices"][0]["message"]["content"] or ""
return self._parse_json_reply(text)
except requests.ConnectionError:
return {"error": "llama.cpp server not available", "text": ""}
except requests.Timeout:
return {"error": "llama.cpp server timeout", "text": ""}
except Exception as e:
return {"error": str(e), "text": ""}
@staticmethod
def _parse_json_reply(text: str) -> dict[str, Any]:
"""Parse model output as JSON, tolerating markdown fences."""
stripped = text.strip()
if stripped.startswith("```"):
stripped = stripped.split("\n", 1)[-1]
stripped = stripped.rsplit("```", 1)[0].strip()
try:
parsed = json.loads(stripped)
if isinstance(parsed, dict):
return parsed
except (json.JSONDecodeError, TypeError):
pass
return {"text": text}
class EmbeddingClient:
"""HTTP client for the llama.cpp embedding server."""
def __init__(self, host: str | None = None, port: int | None = None):
self.host = host or config.LLAMA_CPP_HOST
self.port = port or config.LLAMA_CPP_PORT_EMBED
self.base_url = f"http://{self.host}:{self.port}"
@property
def available(self) -> bool:
try:
r = requests.get(f"{self.base_url}/health", timeout=5)
return r.status_code == 200
except (requests.ConnectionError, requests.Timeout):
return False
def embed(self, text: str) -> list[float] | None:
"""Get embedding vector for text. Returns None on failure."""
try:
r = requests.post(
f"{self.base_url}/embedding",
json={"content": text},
timeout=30,
)
r.raise_for_status()
data = r.json()
return data.get("embedding")
except Exception:
return None