Image-Text-to-Text
Transformers
Safetensors
English
Chinese
llava_onevision2
multimodal
vision-language
video-text-to-text
llava
llava-onevision-2
qwen3
conversational
custom_code
Instructions to use lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained("lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
Use Docker
docker model run hf.co/lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct
- SGLang
How to use lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
- Docker Model Runner
How to use lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct with Docker Model Runner:
docker model run hf.co/lmms-lab-encoder/LLaVA-OneVision-2-8B-Instruct
Add codec video backend & docs (processing_llava_onevision2.py)
Browse files
- processing_llava_onevision2.py +114 -0
processing_llava_onevision2.py
CHANGED
|
@@ -79,6 +79,7 @@ class LlavaOnevision2Processor:
|
|
| 79 |
tokenizer=None,
|
| 80 |
video_processor=None,
|
| 81 |
chat_template: Optional[str] = None,
|
|
|
|
| 82 |
):
|
| 83 |
self.image_processor = image_processor
|
| 84 |
self.tokenizer = tokenizer
|
|
@@ -94,6 +95,9 @@ class LlavaOnevision2Processor:
|
|
| 94 |
getattr(image_processor, "merge_size", 2) if image_processor is not None else 2
|
| 95 |
)
|
| 96 |
|
|
|
|
|
|
|
|
|
|
| 97 |
# ------------------------------------------------------------------ utils
|
| 98 |
|
| 99 |
@classmethod
|
|
@@ -114,6 +118,7 @@ class LlavaOnevision2Processor:
|
|
| 114 |
kwargs.pop("_from_auto", None)
|
| 115 |
kwargs.pop("trust_remote_code", None)
|
| 116 |
kwargs.pop("code_revision", None)
|
|
|
|
| 117 |
|
| 118 |
# Use the SLOW Qwen2VLImageProcessor: the Fast variant has small
|
| 119 |
# normalization rounding differences that change pixel_values bit-for-bit.
|
|
@@ -141,10 +146,34 @@ class LlavaOnevision2Processor:
|
|
| 141 |
patch_size=getattr(image_processor, "patch_size", 14),
|
| 142 |
spatial_merge_size=getattr(image_processor, "merge_size", 2),
|
| 143 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
return cls(
|
| 145 |
image_processor=image_processor,
|
| 146 |
tokenizer=tokenizer,
|
| 147 |
video_processor=video_processor,
|
|
|
|
| 148 |
)
|
| 149 |
|
| 150 |
# ------------------------------------------------------------- chat helpers
|
|
@@ -167,6 +196,14 @@ class LlavaOnevision2Processor:
|
|
| 167 |
num_frames: Optional[int] = None,
|
| 168 |
max_frames: Optional[int] = None,
|
| 169 |
target_fps: Optional[float] = None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
**kwargs,
|
| 171 |
):
|
| 172 |
"""Process an aligned (text, images, videos) batch.
|
|
@@ -200,6 +237,83 @@ class LlavaOnevision2Processor:
|
|
| 200 |
|
| 201 |
out: dict = {}
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# ---------------- VIDEO PATH ----------------
|
| 204 |
# Process videos first so we can rewrite their placeholders into the
|
| 205 |
# text before tokenization.
|
|
|
|
| 79 |
tokenizer=None,
|
| 80 |
video_processor=None,
|
| 81 |
chat_template: Optional[str] = None,
|
| 82 |
+
codec_config: Optional[dict] = None,
|
| 83 |
):
|
| 84 |
self.image_processor = image_processor
|
| 85 |
self.tokenizer = tokenizer
|
|
|
|
| 95 |
getattr(image_processor, "merge_size", 2) if image_processor is not None else 2
|
| 96 |
)
|
| 97 |
|
| 98 |
+
# Codec config defaults (overridden per-call via ``codec_config=``).
|
| 99 |
+
self._codec_config_defaults: dict = dict(codec_config or {})
|
| 100 |
+
|
| 101 |
# ------------------------------------------------------------------ utils
|
| 102 |
|
| 103 |
@classmethod
|
|
|
|
| 118 |
kwargs.pop("_from_auto", None)
|
| 119 |
kwargs.pop("trust_remote_code", None)
|
| 120 |
kwargs.pop("code_revision", None)
|
| 121 |
+
codec_config_override = kwargs.pop("codec_config", None)
|
| 122 |
|
| 123 |
# Use the SLOW Qwen2VLImageProcessor: the Fast variant has small
|
| 124 |
# normalization rounding differences that change pixel_values bit-for-bit.
|
|
|
|
| 146 |
patch_size=getattr(image_processor, "patch_size", 14),
|
| 147 |
spatial_merge_size=getattr(image_processor, "merge_size", 2),
|
| 148 |
)
|
| 149 |
+
|
| 150 |
+
# Codec defaults are read from preprocessor_config.json's "codec" field.
|
| 151 |
+
# We load the JSON directly because Qwen2VLImageProcessor.from_pretrained
|
| 152 |
+
# may not preserve unknown top-level keys as attributes.
|
| 153 |
+
if codec_config_override is not None:
|
| 154 |
+
codec_defaults = codec_config_override
|
| 155 |
+
else:
|
| 156 |
+
codec_defaults = {}
|
| 157 |
+
try:
|
| 158 |
+
import json as _json
|
| 159 |
+
import os as _os
|
| 160 |
+
# Try local file first (downloaded snapshot), then HF Hub.
|
| 161 |
+
cfg_path = _os.path.join(pretrained_model_name_or_path, "preprocessor_config.json")
|
| 162 |
+
if _os.path.isfile(cfg_path):
|
| 163 |
+
with open(cfg_path, "r", encoding="utf-8") as _f:
|
| 164 |
+
codec_defaults = _json.load(_f).get("codec", {}) or {}
|
| 165 |
+
else:
|
| 166 |
+
from huggingface_hub import hf_hub_download
|
| 167 |
+
cfg_path = hf_hub_download(pretrained_model_name_or_path, "preprocessor_config.json")
|
| 168 |
+
with open(cfg_path, "r", encoding="utf-8") as _f:
|
| 169 |
+
codec_defaults = _json.load(_f).get("codec", {}) or {}
|
| 170 |
+
except Exception:
|
| 171 |
+
codec_defaults = {}
|
| 172 |
return cls(
|
| 173 |
image_processor=image_processor,
|
| 174 |
tokenizer=tokenizer,
|
| 175 |
video_processor=video_processor,
|
| 176 |
+
codec_config=codec_defaults,
|
| 177 |
)
|
| 178 |
|
| 179 |
# ------------------------------------------------------------- chat helpers
|
|
|
|
| 196 |
num_frames: Optional[int] = None,
|
| 197 |
max_frames: Optional[int] = None,
|
| 198 |
target_fps: Optional[float] = None,
|
| 199 |
+
# Codec video backend (in-processor codec preprocessing). When
|
| 200 |
+
# ``video_backend="codec"`` and ``videos`` is set, the codec pipeline
|
| 201 |
+
# (cv-preinfer) replaces the frame-sampling VideoProcessor. The codec
|
| 202 |
+
# canvas pixel budget is taken from ``max_pixels`` so the user only
|
| 203 |
+
# configures one pixel knob.
|
| 204 |
+
video_backend: str = "frames",
|
| 205 |
+
max_pixels: Optional[int] = None,
|
| 206 |
+
codec_config: Optional[dict] = None,
|
| 207 |
**kwargs,
|
| 208 |
):
|
| 209 |
"""Process an aligned (text, images, videos) batch.
|
|
|
|
| 237 |
|
| 238 |
out: dict = {}
|
| 239 |
|
| 240 |
+
# ---------------- CODEC VIDEO BACKEND ----------------
|
| 241 |
+
# Codec path: replaces the frame-sampling VideoProcessor entirely.
|
| 242 |
+
# Each video -> N canvases + src_patch_position; we rewrite the
|
| 243 |
+
# <|vision_start|>...<|vision_end|> span in `text` based on the codec
|
| 244 |
+
# patch_positions (one canvas worth of <|image_pad|>s per timestamp).
|
| 245 |
+
if videos is not None and str(video_backend).lower() == "codec":
|
| 246 |
+
try:
|
| 247 |
+
from .codec_video_processing_llava_onevision2 import (
|
| 248 |
+
CodecConfig, process_codec_video, drop_padding_canvases,
|
| 249 |
+
codec_positions_for_processor, rewrite_text_with_codec_positions,
|
| 250 |
+
codec_image_processor_outputs,
|
| 251 |
+
)
|
| 252 |
+
except ImportError:
|
| 253 |
+
from codec_video_processing_llava_onevision2 import (
|
| 254 |
+
CodecConfig, process_codec_video, drop_padding_canvases,
|
| 255 |
+
codec_positions_for_processor, rewrite_text_with_codec_positions,
|
| 256 |
+
codec_image_processor_outputs,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
# Normalise to list[video_url].
|
| 260 |
+
if isinstance(videos, str):
|
| 261 |
+
videos_list = [videos]
|
| 262 |
+
else:
|
| 263 |
+
videos_list = list(videos)
|
| 264 |
+
|
| 265 |
+
# Build effective codec config: defaults < class-level < per-call.
|
| 266 |
+
cfg_kwargs = dict(self._codec_config_defaults)
|
| 267 |
+
if codec_config:
|
| 268 |
+
cfg_kwargs.update(codec_config)
|
| 269 |
+
# Unify pixel budget with image_processor.
|
| 270 |
+
effective_max_pixels = int(
|
| 271 |
+
max_pixels
|
| 272 |
+
if max_pixels is not None
|
| 273 |
+
else cfg_kwargs.get("max_pixels", getattr(self.image_processor, "max_pixels", 150000))
|
| 274 |
+
)
|
| 275 |
+
cfg_kwargs["max_pixels"] = effective_max_pixels
|
| 276 |
+
cfg = CodecConfig(**cfg_kwargs)
|
| 277 |
+
|
| 278 |
+
all_pixel_values, all_grid_thw, all_patch_positions = [], [], []
|
| 279 |
+
rewritten_texts = list(text)
|
| 280 |
+
if len(rewritten_texts) != len(videos_list):
|
| 281 |
+
if len(rewritten_texts) == 1 and len(videos_list) >= 1:
|
| 282 |
+
rewritten_texts = rewritten_texts * len(videos_list)
|
| 283 |
+
else:
|
| 284 |
+
raise ValueError(
|
| 285 |
+
f"codec video backend: got {len(rewritten_texts)} texts but {len(videos_list)} videos"
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
for idx, video_url in enumerate(videos_list):
|
| 289 |
+
payload = process_codec_video(video_url, cfg)
|
| 290 |
+
imgs, src_positions, _ = drop_padding_canvases(
|
| 291 |
+
payload["images"], payload["src_positions"]
|
| 292 |
+
)
|
| 293 |
+
if not imgs:
|
| 294 |
+
raise RuntimeError(f"codec produced no usable canvases for {video_url}")
|
| 295 |
+
image_data = codec_image_processor_outputs(
|
| 296 |
+
self.image_processor, imgs, max_pixels=effective_max_pixels
|
| 297 |
+
)
|
| 298 |
+
image_grid_thw = image_data["image_grid_thw"]
|
| 299 |
+
patch_positions = codec_positions_for_processor(
|
| 300 |
+
src_positions, image_grid_thw, device=image_grid_thw.device,
|
| 301 |
+
)
|
| 302 |
+
rewritten_texts[idx] = rewrite_text_with_codec_positions(
|
| 303 |
+
rewritten_texts[idx], patch_positions,
|
| 304 |
+
fps=float(payload["fps"]), decimals=1,
|
| 305 |
+
)
|
| 306 |
+
all_pixel_values.append(image_data["pixel_values"])
|
| 307 |
+
all_grid_thw.append(image_grid_thw)
|
| 308 |
+
all_patch_positions.append(patch_positions)
|
| 309 |
+
|
| 310 |
+
out["pixel_values"] = torch.cat(all_pixel_values, dim=0)
|
| 311 |
+
out["image_grid_thw"] = torch.cat(all_grid_thw, dim=0)
|
| 312 |
+
out["patch_positions"] = torch.cat(all_patch_positions, dim=0)
|
| 313 |
+
text = rewritten_texts
|
| 314 |
+
# Codec branch handled the video. Suppress the frame-sampling block below.
|
| 315 |
+
videos = None
|
| 316 |
+
|
| 317 |
# ---------------- VIDEO PATH ----------------
|
| 318 |
# Process videos first so we can rewrite their placeholders into the
|
| 319 |
# text before tokenization.
|