Spaces:

build-small-hackathon
/

small-functional-movement-screening

Running on Zero

App Files Files Community

small-functional-movement-screening / formscout /serving /llama_cpp.py

BladeSzaSza

fix: define REPO_NAME in hf_upload.sh (ensure_blade_space referenced it)

4948993 verified 19 days ago

Raw

History Blame Contribute Delete

5.05 kB

	"""
	llama.cpp HTTP client wrapper for FormScout.

	Wraps the llama.cpp server's /v1/chat/completions and /embedding endpoints.
	Falls back gracefully when the server is unavailable.

	Model: Qwen3-VL-8B-Instruct (Q4_K_M GGUF) for VLM inference.
	Model: Qwen3-VL-Embedding-8B (Q4_K_M GGUF) for embeddings.
	Params: 8B each (shared backbone).
	License: Apache-2.0.
	"""
	from __future__ import annotations

	import base64
	import json
	import logging
	from pathlib import Path
	from typing import Any

	import requests

	from formscout import config

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)

	# Timeout passed to the chat-completion POST request.
	_TIMEOUT = 120 # seconds — VLM can be slow


	class LlamaCppClient:
	    """HTTP client for a llama.cpp server instance (chat / VLM endpoint).

	    Failures inside :meth:`complete` are never raised to the caller; they are
	    reported as ``{"error": <message>, "text": ""}`` so callers can degrade
	    gracefully when the server is down or an image cannot be read.
	    """

	    def __init__(self, host: str | None = None, port: int | None = None):
	        """Build the server base URL.

	        Args:
	            host: Server hostname; a falsy value uses ``config.LLAMA_CPP_HOST``.
	            port: Server port; a falsy value uses ``config.LLAMA_CPP_PORT_VLM``.
	        """
	        self.host = host or config.LLAMA_CPP_HOST
	        self.port = port or config.LLAMA_CPP_PORT_VLM
	        self.base_url = f"http://{self.host}:{self.port}"

	    @property
	    def available(self) -> bool:
	        """Return True when ``GET /health`` answers with HTTP 200."""
	        try:
	            r = requests.get(f"{self.base_url}/health", timeout=5)
	        except requests.RequestException:
	            # RequestException is the base of ConnectionError and Timeout and
	            # also covers malformed-URL / transport errors, so a health probe
	            # can never raise.
	            return False
	        return r.status_code == 200

	    def complete(
	        self,
	        prompt: str,
	        images: list[str] | None = None,
	        max_tokens: int = 512,
	        temperature: float = 0.1,
	        stop: list[str] | None = None,
	    ) -> dict[str, Any]:
	        """
	        Send a chat-completion request (OpenAI-compatible /v1/chat/completions —
	        required for multimodal: llama-server routes images through the mmproj
	        only on this endpoint).

	        Args:
	            prompt: The text prompt (system + user combined).
	            images: Optional list of base64-encoded JPEGs or file paths.
	            max_tokens: Max generation tokens.
	            temperature: Sampling temperature.
	            stop: Stop sequences (default: none — JSON output must not be truncated).

	        Returns:
	            The parsed JSON object when the model reply is a JSON object,
	            otherwise ``{"text": raw_text}``. On any failure (unreadable
	            image, server down, timeout, HTTP error, malformed response)
	            returns ``{"error": message, "text": ""}``.
	        """
	        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
	        try:
	            for img in images or []:
	                b64 = self._encode_image(img)
	                content.append({
	                    "type": "image_url",
	                    "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
	                })
	        except OSError as e:
	            # An image file that exists but cannot be read must not crash the
	            # caller; report it through the same error channel as HTTP issues.
	            logger.warning("Could not read image for llama.cpp request: %s", e)
	            return {"error": f"cannot read image: {e}", "text": ""}

	        payload: dict[str, Any] = {
	            "messages": [{"role": "user", "content": content}],
	            "max_tokens": max_tokens,
	            "temperature": temperature,
	        }
	        if stop:
	            payload["stop"] = stop

	        try:
	            r = requests.post(
	                f"{self.base_url}/v1/chat/completions",
	                json=payload,
	                timeout=_TIMEOUT,
	            )
	            r.raise_for_status()
	            result = r.json()
	            # ``content`` may be null in the response; normalise to "".
	            text = result["choices"][0]["message"]["content"] or ""
	            return self._parse_json_reply(text)
	        except requests.ConnectionError:
	            return {"error": "llama.cpp server not available", "text": ""}
	        except requests.Timeout:
	            return {"error": "llama.cpp server timeout", "text": ""}
	        except Exception as e:
	            # Deliberate best-effort boundary, but leave a trace in the logs
	            # instead of swallowing the failure silently.
	            logger.warning("llama.cpp completion request failed: %s", e)
	            return {"error": str(e), "text": ""}

	    @staticmethod
	    def _encode_image(img: str) -> str:
	        """Return base64 data for *img*.

	        *img* is read from disk and encoded when it names an existing file;
	        otherwise it is assumed to already be base64 and returned unchanged.

	        Raises:
	            OSError: If *img* is an existing file that cannot be read.
	        """
	        # Strings of 4096+ characters are never probed as filesystem paths.
	        if len(img) < 4096:
	            try:
	                is_file = Path(img).is_file()
	            except (OSError, ValueError):
	                # Probing a short base64 string as a path can fail (e.g.
	                # "File name too long"); that simply means it is not a path.
	                is_file = False
	            if is_file:
	                with open(img, "rb") as f:
	                    return base64.b64encode(f.read()).decode()
	        return img  # already base64

	    @staticmethod
	    def _parse_json_reply(text: str) -> dict[str, Any]:
	        """Parse model output as a JSON object, tolerating markdown fences.

	        Handles both multi-line fences (```` ```json\\n{...}\\n``` ````) and
	        single-line fences (```` ```{...}``` ````). Anything that does not
	        decode to a JSON object is returned as ``{"text": text}``.
	        """
	        stripped = text.strip()
	        if stripped.startswith("```"):
	            inner = stripped[3:]
	            head, newline, tail = inner.partition("\n")
	            if newline and "{" not in head:
	                # First line is only the (possibly empty) language tag.
	                inner = tail
	            else:
	                # Payload starts on the fence line; skip a glued tag ("json").
	                start = inner.find("{")
	                if start > 0:
	                    inner = inner[start:]
	            # Drop the closing fence and anything after it.
	            stripped = inner.rsplit("```", 1)[0].strip()
	        try:
	            parsed = json.loads(stripped)
	            if isinstance(parsed, dict):
	                return parsed
	        except (json.JSONDecodeError, TypeError):
	            pass
	        return {"text": text}


	class EmbeddingClient:
	    """HTTP client for the llama.cpp embedding server."""

	    def __init__(self, host: str | None = None, port: int | None = None):
	        """Build the server base URL.

	        Args:
	            host: Server hostname; a falsy value uses ``config.LLAMA_CPP_HOST``.
	            port: Server port; a falsy value uses ``config.LLAMA_CPP_PORT_EMBED``.
	        """
	        self.host = host or config.LLAMA_CPP_HOST
	        self.port = port or config.LLAMA_CPP_PORT_EMBED
	        self.base_url = f"http://{self.host}:{self.port}"

	    @property
	    def available(self) -> bool:
	        """Return True when ``GET /health`` answers with HTTP 200."""
	        try:
	            r = requests.get(f"{self.base_url}/health", timeout=5)
	        except requests.RequestException:
	            # Base class of ConnectionError/Timeout; also covers malformed-URL
	            # and transport errors so a health probe can never raise.
	            return False
	        return r.status_code == 200

	    def embed(self, text: str) -> list[float] | None:
	        """Get embedding vector for text. Returns None on failure.

	        Failure covers an unreachable server, an HTTP error, a non-JSON body,
	        and a response from which no single vector can be extracted.
	        """
	        try:
	            r = requests.post(
	                f"{self.base_url}/embedding",
	                json={"content": text},
	                timeout=30,
	            )
	            r.raise_for_status()
	            data = r.json()
	        except Exception as e:
	            # Best-effort by contract, but leave a trace in the logs.
	            logger.warning("llama.cpp embedding request failed: %s", e)
	            return None

	        vector = self._extract_embedding(data)
	        if vector is None:
	            logger.warning("Unexpected llama.cpp /embedding response shape")
	        return vector

	    @staticmethod
	    def _extract_embedding(data: Any) -> list[float] | None:
	        """Pull one embedding vector out of a decoded ``/embedding`` response.

	        Accepts the response shapes llama-server has used:

	        * ``{"embedding": [...]}`` — a single object with a flat vector;
	        * ``[{"index": 0, "embedding": [[...]]}]`` — a list of per-input
	          results whose ``embedding`` is a list of rows.

	        Returns None when no single vector can be identified (missing key,
	        wrong types, empty result list, or more than one row, which has no
	        single-vector representation).
	        """
	        if isinstance(data, list):
	            # One input was sent, so only the first result is relevant.
	            data = data[0] if data else None
	        if not isinstance(data, dict):
	            return None

	        vector = data.get("embedding")
	        if not isinstance(vector, list):
	            return None
	        if vector and isinstance(vector[0], list):
	            # Row-wrapped form: unwrap only when exactly one row is present.
	            return vector[0] if len(vector) == 1 else None
	        return vector