Instructions to use LiquidAI/LFM2.5-VL-1.6B-Extract with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use LiquidAI/LFM2.5-VL-1.6B-Extract with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build an "image-text-to-text" pipeline backed by this checkpoint.
pipe = pipeline("image-text-to-text", model="LiquidAI/LFM2.5-VL-1.6B-Extract")
# A single user turn whose content pairs an image (given by URL) with a text question.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Run the pipeline on the chat-formatted messages.
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

# The processor and the model are loaded from the same checkpoint id.
processor = AutoProcessor.from_pretrained("LiquidAI/LFM2.5-VL-1.6B-Extract")
model = AutoModelForImageTextToText.from_pretrained("LiquidAI/LFM2.5-VL-1.6B-Extract")
# A single user turn whose content pairs an image (given by URL) with a text question.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Apply the chat template, tokenize, and return a dict of PyTorch tensors
# moved onto the model's device.
inputs = processor.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the first `input_ids` length worth of positions so only the
# tokens after the prompt are decoded and printed.
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use LiquidAI/LFM2.5-VL-1.6B-Extract with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "LiquidAI/LFM2.5-VL-1.6B-Extract"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LiquidAI/LFM2.5-VL-1.6B-Extract",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/LiquidAI/LFM2.5-VL-1.6B-Extract

SGLang

How to use LiquidAI/LFM2.5-VL-1.6B-Extract with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "LiquidAI/LFM2.5-VL-1.6B-Extract" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LiquidAI/LFM2.5-VL-1.6B-Extract",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

# Start the SGLang server from the lmsysorg/sglang:latest Docker image:
# publishes port 30000, mounts the local Hugging Face cache into the
# container, and passes HF_TOKEN through the environment (replace <secret>).
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "LiquidAI/LFM2.5-VL-1.6B-Extract" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "LiquidAI/LFM2.5-VL-1.6B-Extract",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use LiquidAI/LFM2.5-VL-1.6B-Extract with Docker Model Runner:
```
docker model run hf.co/LiquidAI/LFM2.5-VL-1.6B-Extract
```

LFM2.5-VL-1.6B-Extract / model_eval / judge.py

zhkleciel

Super-squash branch 'main' using huggingface_hub

21073aa 4 days ago

raw

history blame contribute delete

10.5 kB

	"""OpenRouter-backed VLM judge.

	One prompt per sample (batched-over-keys in one JSON answer), run
	concurrently via asyncio. The VLM judge scores every key against the
	image and is the only quality signal we ship.

	Requires `OPENROUTER_API_KEY` in the environment. OpenRouter is
	OpenAI-compatible, so we point the `openai` SDK's `base_url` at it.
	"""

	from __future__ import annotations

	import asyncio
	import base64
	import json
	import logging
	import os
	import re
	from pathlib import Path
	from string import Template
	from typing import Any

	from openai import AsyncOpenAI
	from tqdm.asyncio import tqdm_asyncio

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Prompt templates live in a `prompts/` directory next to this file.
	_PROMPT_DIR = Path(__file__).resolve().parent / "prompts"
	# The batch judge prompt is read once at import time; it is a
	# `string.Template` that is filled through its `attributes` placeholder.
	_VLM_JUDGE_TPL = Template((_PROMPT_DIR / "vlm_judge_batch.txt").read_text(encoding="utf-8"))

	# Base URL handed to the OpenAI-compatible client so requests go to OpenRouter.
	OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

	# Reasoning suppression for the VLM judge. Default model
	# (`qwen/qwen3.5-35b-a3b`) requires `enable_thinking=False` to avoid
	# burning the full token budget on internal thought and returning
	# `finish_reason=length` with no visible output. Both layers (top-level
	# `reasoning` + `chat_template_kwargs`) are sent so whichever the
	# underlying provider honors gets used; the other is ignored.
	_VLM_JUDGE_REASONING = {
	"reasoning": {"enabled": False},
	"chat_template_kwargs": {"enable_thinking": False},
	}


	# ─── per-key pre-classification ────────────────────────────────────────────


	def normalize_bool(gt_val: object, pred_val: object) -> object:
	    """Coerce a yes/no/true/false string prediction to bool when GT is a bool.

	    Returns `pred_val` unchanged when the ground truth is not a bool, the
	    prediction is not a string, or the string is not one of the four
	    recognised tokens (matched case-insensitively after stripping).
	    """
	    if not (isinstance(gt_val, bool) and isinstance(pred_val, str)):
	        return pred_val
	    token = pred_val.strip().lower()
	    truth_table = {"yes": True, "true": True, "no": False, "false": False}
	    # Unknown tokens fall back to the untouched prediction.
	    return truth_table.get(token, pred_val)


	def initialize_per_key_evals(records: list[dict[str, Any]]) -> None:
	    """Build a `per_key` mapping `{key: {gt, pred}}` for the VLM judge to score.

	    Mutates each record in place by setting `rec["per_key"]`. One entry is
	    created per ground-truth key; a key absent from the prediction gets
	    `pred=None`. Non-None predictions are passed through `normalize_bool`
	    so that "yes"/"no" strings line up with boolean ground truth.

	    A record whose `prediction_json` is missing, `None`, or not a dict
	    (e.g. the model output could not be parsed) is treated as an empty
	    prediction instead of raising.
	    """
	    for rec in records:
	        gt = rec["ground_truth"]
	        pred = rec.get("prediction_json")
	        if not isinstance(pred, dict):
	            # Unparseable / absent prediction: every key is scored as missing.
	            pred = {}
	        per_key: dict[str, dict[str, Any]] = {}
	        for key, gt_val in gt.items():
	            pred_val = pred.get(key)
	            if pred_val is not None:
	                pred_val = normalize_bool(gt_val, pred_val)
	            per_key[key] = {"gt": gt_val, "pred": pred_val}
	        rec["per_key"] = per_key


	# ─── prompt building ───────────────────────────────────────────────────────


	def _vlm_attr_block(entries: list[tuple[str, str, object]]) -> str:
	    """Format `(key, description, prediction)` triples as the prompt's attribute list.

	    Each entry becomes a bullet with the quoted key and its description,
	    followed by a line carrying the JSON-encoded prediction (non-ASCII
	    characters are kept as-is). Entries are joined with newlines.
	    """
	    return "\n".join(
	        f'- "{key}": {desc}\n predicted: {json.dumps(pred, ensure_ascii=False)}'
	        for key, desc, pred in entries
	    )


	def build_vlm_judge_prompt(entries: list[tuple[str, str, object]]) -> str:
	    """Render the batch judge prompt for one sample.

	    The entries are formatted by `_vlm_attr_block` and substituted into the
	    template's `attributes` placeholder.
	    """
	    attribute_block = _vlm_attr_block(entries)
	    return _VLM_JUDGE_TPL.substitute(attributes=attribute_block)


	# ─── response parsing ──────────────────────────────────────────────────────


	# First number in free text. Group 1 is the whole number:
	#   * an optional sign, honoured only when it is not glued to a preceding
	#     word character or dot (so "key-0.5" still reads as 0.5, while
	#     "score: -0.5" reads as -0.5),
	#   * digits with an optional fraction, or a bare fraction such as ".5",
	#   * an optional exponent.
	_NUM_RE = re.compile(r"((?:(?<![\w.])[-+])?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)")


	def _clamp(x: float) -> float:
	    """Clamp `x` into [0.0, 1.0]; NaN maps to 0.0."""
	    if x != x:
	        # NaN compares false against both bounds and would otherwise leak
	        # through and poison every average computed from it.
	        return 0.0
	    return 0.0 if x < 0.0 else (1.0 if x > 1.0 else x)


	def parse_score(text: str | None) -> float:
	    """Parse a judge score from free text and clamp it to [0.0, 1.0].

	    The whole (stripped) string is tried as a float first; failing that,
	    the first number found in the text is used. Empty/None input, text
	    without any number, and NaN all yield 0.0.
	    """
	    if not text:
	        return 0.0
	    text = text.strip()
	    try:
	        return _clamp(float(text))
	    except ValueError:
	        pass
	    m = _NUM_RE.search(text)
	    if m is None:
	        return 0.0
	    try:
	        return _clamp(float(m.group(1)))
	    except ValueError:
	        return 0.0


	def parse_batch_scores(text: str | None, expected_keys: list[str]) -> dict[str, float]:
	    """Parse `{key: score}` from judge output; missing keys default to 0.0.

	    Only keys listed in `expected_keys` are returned. A key keeps its 0.0
	    default when the judge output is empty, does not parse to a JSON
	    object, omits the key, or carries a value that is a bool, NaN, or
	    neither a number nor a string. Numeric values are clamped to
	    [0.0, 1.0]; string values go through `parse_score`.
	    """
	    result = {k: 0.0 for k in expected_keys}
	    if not text:
	        return result
	    # Reuse the JSON extractor from extract.py — same logic.
	    from extract import extract_json_strict_first

	    parsed, _ = extract_json_strict_first(text)
	    if not isinstance(parsed, dict):
	        return result
	    for k in expected_keys:
	        v = parsed.get(k)
	        if isinstance(v, bool):
	            # Defensive: bool is a subclass of int. Don't accept it as a score.
	            continue
	        if isinstance(v, (int, float)):
	            number = float(v)
	            if number != number:
	                # JSON parsers accept the NaN literal; keep the 0.0 default
	                # rather than letting NaN reach the per-sample average.
	                continue
	            result[k] = _clamp(number)
	        elif isinstance(v, str):
	            # Judge sometimes returns string-formatted numbers like "0.8";
	            # parse_score extracts the leading numeric.
	            result[k] = parse_score(v)
	    return result


	def per_sample_judge_avg(per_key: dict[str, dict[str, Any]], score_field: str) -> float | None:
	    """Average a judge score across all keys of one sample. None if no scores.

	    Only entries whose `score_field` value is an int or float (bools
	    excluded) contribute; entries without a usable score are skipped
	    rather than counted as zero.
	    """
	    scores: list[float] = []
	    for entry in per_key.values():
	        value = entry.get(score_field)
	        # bool is a subclass of int, so it has to be rejected explicitly.
	        if isinstance(value, bool) or not isinstance(value, (int, float)):
	            continue
	        scores.append(float(value))
	    return sum(scores) / len(scores) if scores else None


	# ─── OpenRouter client + concurrent dispatch ───────────────────────────────


	def _img_to_data_url(img_bytes: bytes) -> str:
	    """Encode raw image bytes as a base64 `data:` URL.

	    The MIME type is chosen from the leading file signature for PNG, GIF
	    and WebP; anything else (including JPEG and empty input) is labelled
	    `image/jpeg`, which was the previous fixed behaviour.
	    """
	    if img_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
	        mime = "image/png"
	    elif img_bytes.startswith((b"GIF87a", b"GIF89a")):
	        mime = "image/gif"
	    elif img_bytes[:4] == b"RIFF" and img_bytes[8:12] == b"WEBP":
	        mime = "image/webp"
	    else:
	        mime = "image/jpeg"
	    payload = base64.b64encode(img_bytes).decode("ascii")
	    return f"data:{mime};base64,{payload}"


	def _make_client(api_key: str | None) -> AsyncOpenAI:
	    """Create an `AsyncOpenAI` client pointed at OpenRouter.

	    Uses `api_key` when given, otherwise the `OPENROUTER_API_KEY`
	    environment variable. The client is configured with 3 retries and a
	    120-second timeout.

	    Raises:
	        RuntimeError: if neither source provides a non-empty key.
	    """
	    key = api_key or os.environ.get("OPENROUTER_API_KEY")
	    if not key:
	        raise RuntimeError(
	            "OPENROUTER_API_KEY is not set. Get a key at https://openrouter.ai/keys "
	            "and `export OPENROUTER_API_KEY=...` before running."
	        )
	    return AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=key, max_retries=3, timeout=120.0)


	async def _one_chat(
	    client: AsyncOpenAI,
	    *,
	    model: str,
	    system: str,
	    user_text: str,
	    image_bytes: bytes | None,
	    max_tokens: int,
	    semaphore: asyncio.Semaphore,
	    extra_body: dict[str, Any] | None = None,
	    max_retries_on_empty: int = 3,
	) -> str:
	    """Single OpenRouter chat call, rate-limited by `semaphore`.

	    Some OpenRouter providers occasionally return HTTP 200 with empty content
	    (no model output). Treating that as success silently fails the per-key
	    JSON parse — so we retry up to `max_retries_on_empty` times before
	    giving up.

	    The request is sent with temperature 0.0. When `image_bytes` is
	    non-empty the image is placed before the text in the user message.
	    `extra_body`, when given, is forwarded to the SDK unchanged.

	    Returns the model's text, or "" when every attempt came back empty or
	    the call raised (best-effort: the failure is logged, not propagated).
	    A negative `max_retries_on_empty` is treated as 0, so one request is
	    always attempted.
	    """
	    user_content: list[dict[str, Any]] = [{"type": "text", "text": user_text}]
	    if image_bytes:
	        user_content.insert(0, {"type": "image_url", "image_url": {"url": _img_to_data_url(image_bytes)}})
	    messages = [
	        {"role": "system", "content": system},
	        {"role": "user", "content": user_content},
	    ]
	    kwargs: dict[str, Any] = {
	        "model": model,
	        "messages": messages,
	        "max_tokens": max_tokens,
	        "temperature": 0.0,
	    }
	    if extra_body:
	        kwargs["extra_body"] = extra_body

	    retries = max(0, max_retries_on_empty)
	    # The semaphore is held across retries, so a retrying sample keeps its
	    # slot instead of re-queueing behind the other samples.
	    async with semaphore:
	        for attempt in range(retries + 1):
	            try:
	                resp = await client.chat.completions.create(**kwargs)
	                text = resp.choices[0].message.content or ""
	                finish = getattr(resp.choices[0], "finish_reason", None)
	            except Exception as e:
	                # Deliberate best-effort boundary: one failed sample must
	                # not abort the whole concurrent batch.
	                logger.warning("OpenRouter call failed (%s); returning empty string.", e)
	                return ""
	            if text.strip():
	                return text
	            if attempt < retries:
	                logger.warning(
	                    "Empty response from %s (finish_reason=%s); retrying %d/%d.",
	                    model, finish, attempt + 1, retries,
	                )
	                continue
	            logger.warning(
	                "Empty response from %s after %d retries (finish_reason=%s); giving up.",
	                model, retries, finish,
	            )
	    return ""


	async def _run_concurrent(
	    coros: list[Any],
	    *,
	    desc: str,
	) -> list[str]:
	    """Await all `coros` together via `tqdm_asyncio.gather`, labelling the progress bar `desc`."""
	    results = await tqdm_asyncio.gather(*coros, desc=desc)
	    return results


	# ─── VLM judge ─────────────────────────────────────────────────────────────


	def run_vlm_judge(
	    records: list[dict[str, Any]],
	    *,
	    sample_images: dict[str, bytes],
	    model: str,
	    max_tokens: int = 1024,
	    concurrency: int = 16,
	    api_key: str | None = None,
	) -> None:
	    """Score every key against the image via an OpenRouter VLM.

	    Mutates `records` in place:
	      * `rec["per_key"][k]["vlm_score"]` for every key of each judged record,
	      * `rec["vlm_judge_raw"]` with the judge's raw text for judged records,
	      * `rec["vlm_judge_avg"]` on every record (None when nothing was scored).

	    Records with an empty or missing `per_key`, or with no image in
	    `sample_images` under `rec["key"]`, are not sent to the judge.

	    Args:
	        records: sample dicts; judged ones need `key`, `schema` and `per_key`.
	        sample_images: image bytes looked up by `rec["key"]`.
	        model: OpenRouter model id used as the judge.
	        max_tokens: completion token budget per request.
	        concurrency: maximum in-flight requests (values below 1 are treated as 1).
	        api_key: OpenRouter key; falls back to `OPENROUTER_API_KEY`.

	    Raises:
	        RuntimeError: if there is work to do and no API key is available.
	    """
	    plans: list[dict[str, Any]] = []
	    for rec in records:
	        per_key = rec.get("per_key") or {}
	        if not per_key:
	            continue
	        keys = list(per_key)
	        entries = [(k, rec["schema"].get(k, ""), per_key[k].get("pred")) for k in keys]
	        img = sample_images.get(rec["key"], b"")
	        if not img:
	            logger.warning("VLM judge: missing image for sample %s", rec["key"])
	            continue
	        plans.append(
	            {
	                "record": rec,
	                "keys": keys,
	                "prompt": build_vlm_judge_prompt(entries),
	                "image": img,
	            }
	        )

	    if not plans:
	        for rec in records:
	            rec["vlm_judge_avg"] = None
	        return

	    logger.info("VLM judge: scoring %d sample(s) × all keys via %s.", len(plans), model)

	    # Built before the event loop starts so a missing API key fails fast.
	    client = _make_client(api_key)

	    async def _judge_all() -> list[str]:
	        # The semaphore is created inside the loop started by asyncio.run:
	        # before Python 3.10 asyncio primitives bind to the loop that is
	        # current at construction time, which would be a different loop.
	        sem = asyncio.Semaphore(max(1, concurrency))
	        coros = [
	            _one_chat(
	                client,
	                model=model,
	                system="You are a meticulous visual evaluator.",
	                user_text=plan["prompt"],
	                image_bytes=plan["image"],
	                max_tokens=max_tokens,
	                semaphore=sem,
	                extra_body=_VLM_JUDGE_REASONING,
	            )
	            for plan in plans
	        ]
	        try:
	            return await _run_concurrent(coros, desc="VLM judge")
	        finally:
	            # Release the HTTP connections while the loop is still alive.
	            await client.close()

	    raw_outputs = asyncio.run(_judge_all())

	    for plan, text in zip(plans, raw_outputs):
	        scores = parse_batch_scores(text, plan["keys"])
	        rec = plan["record"]
	        rec["vlm_judge_raw"] = text
	        for k in plan["keys"]:
	            rec["per_key"][k]["vlm_score"] = scores.get(k, 0.0)

	    for rec in records:
	        # Tolerate records without `per_key`, matching the first loop.
	        rec["vlm_judge_avg"] = per_sample_judge_avg(rec.get("per_key") or {}, "vlm_score")