yiyexy committed on
Commit
9e7205f
·
verified ·
1 Parent(s): 193adfa

Initial release: LLaVA-OneVision2-8B-Instruct

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,101 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: image-text-to-text
4
+ license: apache-2.0
5
+ tags:
6
+ - multimodal
7
+ - vision-language
8
+ - image-text-to-text
9
+ - video-text-to-text
10
+ - llava
11
+ - llava-onevision
12
+ - qwen3
13
+ language:
14
+ - en
15
+ - zh
16
+ ---
17
+
18
+ # LLaVA-OneVision2-8B-Instruct
19
+
20
+ A multimodal vision-language model that handles **single-image, multi-image, and video** inputs, built on a Qwen3-8B language backbone with a OneVision-style vision encoder.
21
+
22
+ The model is distributed as a HuggingFace `transformers` checkpoint with custom code (`trust_remote_code=True`).
23
+
24
+ ## Requirements
25
+
26
+ ```bash
27
+ pip install "transformers>=5.7.0" "torch>=2.4" pillow requests decord
28
+ ```
29
+
30
+ ## Quick start
31
+
32
+ The repository ships a ready-to-run `demo_inference.py` that covers both image and video paths.
33
+
34
+ ```bash
35
+ # Image (default sample image; no auth required)
36
+ python demo_inference.py
37
+
38
+ # Image, custom file + prompt
39
+ python demo_inference.py --mode image --media /path/to/cat.jpg \
40
+ --prompt "What is the cat doing?"
41
+
42
+ # Video (16 uniformly-sampled frames; max-pixels caps per-frame resolution for memory)
43
+ python demo_inference.py --mode video --media /path/to/clip.mp4 \
44
+ --num-frames 16 --max-pixels 200704 \
45
+ --prompt "Describe what happens in this video."
46
+ ```
47
+
48
+ ## Programmatic use
49
+
50
+ ```python
51
+ import torch
52
+ from transformers import AutoProcessor, AutoModelForImageTextToText
53
+ from PIL import Image
54
+
55
+ MODEL_ID = "lmms-lab-encoder/LLaVA-OneVision2-8B-Instruct"
56
+
57
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
58
+ model = AutoModelForImageTextToText.from_pretrained(
59
+ MODEL_ID, trust_remote_code=True, dtype=torch.bfloat16, device_map="cuda",
60
+ ).eval()
61
+
62
+ # ----- Image -----
63
+ image = Image.open("cat.jpg").convert("RGB")
64
+ messages = [{"role": "user", "content": [
65
+ {"type": "image"},
66
+ {"type": "text", "text": "Describe this image in detail."},
67
+ ]}]
68
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
69
+ inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
70
+ inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}
71
+
72
+ out = model.generate(**inputs, max_new_tokens=256, do_sample=False)
73
+ print(processor.tokenizer.decode(out[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
74
+
75
+ # ----- Video -----
76
+ # Lower max_pixels if you hit OOM on long videos.
77
+ processor.video_processor.max_pixels = 200704
78
+
79
+ messages = [{"role": "user", "content": [
80
+ {"type": "video"},
81
+ {"type": "text", "text": "Describe what happens in this video."},
82
+ ]}]
83
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
84
+ inputs = processor(
85
+ text=[text], videos=["clip.mp4"], return_tensors="pt", padding=True,
86
+ num_frames=16, # exact frame count; or use target_fps / max_frames
87
+ )
88
+ inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}
89
+ out = model.generate(**inputs, max_new_tokens=256, do_sample=False)
90
+ print(processor.tokenizer.decode(out[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
91
+ ```
92
+
93
+ ## Notes
94
+
95
+ - The vision tower is a OneVision-style encoder; the language backbone is **Qwen3-8B**.
96
+ - `chat_template.jinja` follows the Qwen3 chat format and emits `<|vision_start|>...<|vision_end|>` placeholders; the processor expands them per-frame for video.
97
+ - Inference was validated to be bit-exact at the pixel level and prefix-identical at the token level against the original reference implementation.
98
+
99
+ ## License
100
+
101
+ Apache-2.0 (model weights and code in this repository). The Qwen3-8B base is subject to its own license — see [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlavaOnevision2ForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_llava_onevision2.LlavaOnevision2Config",
7
+ "AutoModel": "modeling_llava_onevision2.LlavaOnevision2Model",
8
+ "AutoModelForImageTextToText": "modeling_llava_onevision2.LlavaOnevision2ForConditionalGeneration",
9
+ "AutoProcessor": "processing_llava_onevision2.LlavaOnevision2Processor",
10
+ "AutoVideoProcessor": "video_processing_llava_onevision2.LlavaOnevision2VideoProcessor"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "dtype": "bfloat16",
14
+ "eos_token_id": 151645,
15
+ "image_token_id": 151655,
16
+ "model_type": "llava_onevision2",
17
+ "text_config": {
18
+ "_name_or_path": "Qwen/Qwen3-8B",
19
+ "attention_bias": false,
20
+ "attention_dropout": 0.0,
21
+ "bos_token_id": 151643,
22
+ "dtype": "bfloat16",
23
+ "eos_token_id": 151645,
24
+ "head_dim": 128,
25
+ "hidden_act": "silu",
26
+ "hidden_size": 4096,
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 12288,
29
+ "layer_types": [
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention"
66
+ ],
67
+ "max_position_embeddings": 262144,
68
+ "max_window_layers": 36,
69
+ "model_type": "qwen3",
70
+ "num_attention_heads": 32,
71
+ "num_hidden_layers": 36,
72
+ "num_key_value_heads": 8,
73
+ "pad_token_id": null,
74
+ "rms_norm_eps": 1e-06,
75
+ "rope_parameters": {
76
+ "rope_theta": 8000000,
77
+ "rope_type": "default"
78
+ },
79
+ "sliding_window": null,
80
+ "tie_word_embeddings": false,
81
+ "use_cache": true,
82
+ "use_sliding_window": false,
83
+ "vocab_size": 151936
84
+ },
85
+ "tie_word_embeddings": false,
86
+ "transformers_version": "5.7.0",
87
+ "video_token_id": 151656,
88
+ "vision_config": {
89
+ "attention_dropout": 0.0,
90
+ "frame_windows_size": 4,
91
+ "hidden_act": "gelu",
92
+ "hidden_size": 1024,
93
+ "image_size": 448,
94
+ "initializer_range": 0.02,
95
+ "intermediate_size": 4096,
96
+ "layer_norm_eps": 1e-06,
97
+ "layer_norm_type": "layer_norm",
98
+ "max_position_embeddings": 8192,
99
+ "model_type": "onevision_encoder",
100
+ "num_attention_heads": 16,
101
+ "num_channels": 3,
102
+ "num_hidden_layers": 24,
103
+ "out_hidden_size": 4096,
104
+ "patch_position_encoding_type": "absolute",
105
+ "patch_size": 14,
106
+ "rope_theta": 10000.0,
107
+ "spatial_merge_size": 2,
108
+ "text_hidden_size": 4096,
109
+ "tokens_per_second": 1,
110
+ "use_head": false,
111
+ "use_patch_position_encoding": false
112
+ },
113
+ "vision_end_token_id": 151653,
114
+ "vision_start_token_id": 151652
115
+ }
configuration_llava_onevision2.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub.dataclasses import strict
2
+
3
+ from transformers import CONFIG_MAPPING, AutoConfig
4
+ from transformers.configuration_utils import PreTrainedConfig
5
+
6
+
7
@strict
class LlavaOnevision2VisionConfig(PreTrainedConfig):
    """Configuration of the vision encoder used by `LlavaOnevision2Config`.

    The class only declares typed fields with defaults. It is registered under
    the ``vision_config`` key of the parent configuration (see
    ``base_config_key``) and reports ``model_type = "onevision_encoder"``.

    The values below are class defaults; a checkpoint's ``config.json`` may
    override any of them.
    """

    model_type = "onevision_encoder"
    base_config_key = "vision_config"

    # Transformer dimensions of the encoder.
    hidden_size: int = 1024
    intermediate_size: int = 4096
    num_hidden_layers: int = 24
    num_attention_heads: int = 16
    # Input geometry.
    num_channels: int = 3
    image_size: int = 448
    patch_size: int = 14
    # Activation, normalization, and initialization settings.
    hidden_act: str = "gelu"
    layer_norm_eps: float = 1e-6
    layer_norm_type: str = "layer_norm"
    attention_dropout: float = 0.0
    initializer_range: float = 0.02
    rope_theta: float = 10000.0
    use_head: bool = False
    # NOTE(review): the config.json shipped in this commit sets
    # `out_hidden_size` to 4096 (equal to the text hidden size); the 1024
    # default here only applies when the key is absent.
    out_hidden_size: int = 1024
    # presumably the side length of the patch group merged into one token —
    # TODO confirm against the modeling code
    spatial_merge_size: int = 2
    tokens_per_second: int = 1
    frame_windows_size: int = 4
    # Patch position-encoding options.
    use_patch_position_encoding: bool = False
    patch_position_encoding_type: str = "absolute"
    max_position_embeddings: int = 8192
33
+
34
+
35
@strict
class LlavaOnevision2Config(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlavaOnevision2Model`]. It is used to instantiate a
    LlavaOnevision2Model model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Llava-Onevision 1.5 [lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3Config`):
            The config object or dictionary of the text backbone. A dictionary is resolved through
            `CONFIG_MAPPING` using its `model_type` key (`"qwen3"` when the key is missing).
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `LlavaOnevision2VisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151655):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151656):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 151652):
            The token index to denote start of vision input.
        vision_end_token_id (`int`, *optional*, defaults to 151653):
            The token index to denote end of vision input.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Top-level `tie_word_embeddings` flag.
        bos_token_id (`int`, *optional*):
            Copied from `text_config` in `__post_init__` when left as `None`.
        eos_token_id (`int` or `list[int]`, *optional*):
            Copied from `text_config` in `__post_init__` when left as `None`.
        pad_token_id (`int`, *optional*):
            Copied from `text_config` in `__post_init__` when left as `None`.
    """

    model_type = "llava_onevision2"
    # `text_config` is resolved dynamically based on its `model_type` (defaults to `qwen3`),
    # so we use `AutoConfig` here as a placeholder; `__post_init__` swaps it for the
    # concrete config class via `CONFIG_MAPPING`.
    sub_configs = {"vision_config": LlavaOnevision2VisionConfig, "text_config": AutoConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    text_config: dict | PreTrainedConfig | None = None
    vision_config: dict | PreTrainedConfig | None = None
    image_token_id: int = 151655
    video_token_id: int = 151656
    vision_start_token_id: int = 151652
    vision_end_token_id: int = 151653
    tie_word_embeddings: bool = False
    # Generation-related token ids are mirrored from `text_config` in `__post_init__`
    # so downstream tools (e.g. `generate`, vLLM) that read them at the top level keep working.
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None
    pad_token_id: int | None = None

    def __post_init__(self, **kwargs):
        """Turn the sub-config fields into config objects and mirror token ids.

        A `dict` or `None` in `vision_config` / `text_config` is replaced by a
        config instance; a value that is already a config object is kept
        untouched. Any of `bos_token_id`, `eos_token_id`, `pad_token_id` that
        is still `None` at the top level is then copied from `text_config`
        before delegating to the parent `__post_init__`.
        """
        # Resolve vision_config
        if isinstance(self.vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()

        # Resolve text_config dynamically via CONFIG_MAPPING (defaults to qwen3)
        if isinstance(self.text_config, dict):
            text_model_type = self.text_config.get("model_type", "qwen3")
            # NOTE(review): this writes into the dict object that was passed in,
            # so the caller's dictionary gains a `model_type` key.
            self.text_config["model_type"] = text_model_type
            text_config_cls = CONFIG_MAPPING[text_model_type]
            # NOTE(review): `sub_configs` is a class attribute, so this item
            # assignment is visible to every instance of the class, not only
            # `self` — confirm this is intended when configs with different
            # text backbones coexist in one process.
            self.sub_configs["text_config"] = text_config_cls
            self.text_config = text_config_cls(**self.text_config)
        elif self.text_config is None:
            text_config_cls = CONFIG_MAPPING["qwen3"]
            self.sub_configs["text_config"] = text_config_cls
            self.text_config = text_config_cls()

        # Mirror generation-related token ids from text_config to the top level so
        # downstream tools (e.g. `generate`, chat templates, vLLM) that read them
        # from the top-level config keep working.
        for tok_key in ("bos_token_id", "eos_token_id", "pad_token_id"):
            text_val = getattr(self.text_config, tok_key, None)
            # An explicitly set top-level value wins; only `None` is filled in.
            if text_val is not None and getattr(self, tok_key, None) is None:
                setattr(self, tok_key, text_val)

        super().__post_init__(**kwargs)
109
+
110
+
111
+ __all__ = ["LlavaOnevision2Config", "LlavaOnevision2VisionConfig"]
demo_inference.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """End-to-end inference demo for LlavaOnevision2 (image + video).
2
+
3
+ This script shows the two canonical inference paths supported by the model:
4
+
5
+ * Image captioning (``--mode image``, default)
6
+ * Video captioning (``--mode video``)
7
+
8
+ Both modes share the same loading pattern:
9
+
10
+ from transformers import AutoProcessor, AutoModelForImageTextToText
11
+ processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
12
+ model = AutoModelForImageTextToText.from_pretrained(
13
+ model_dir, trust_remote_code=True, dtype=torch.bfloat16, device_map="cuda",
14
+ )
15
+
16
+ Examples
17
+ --------
18
+ # Image (default sample image from the web)
19
+ python demo_inference.py
20
+
21
+ # Image with a local file and a custom prompt
22
+ python demo_inference.py --mode image --media /path/to/cat.jpg \
23
+ --prompt "What is the cat doing?"
24
+
25
+ # Video
26
+ # - ``--num-frames`` selects exactly N frames (uniform sampling).
27
+ # - ``--max-pixels`` caps each frame's pixel budget. Lower it to fit smaller
28
+ # GPUs; 200704 (=448*448) is a safe default for a single ~80GB card.
29
+ python demo_inference.py --mode video --media /path/to/clip.mp4 \
30
+ --num-frames 16 --max-pixels 200704 \
31
+ --prompt "Describe what happens in this video."
32
+
33
+ Tested with:
34
+ transformers == 5.7.0
35
+ torch >= 2.4
36
+ decord, Pillow, requests
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import argparse
42
+ import io
43
+ import os
44
+ import sys
45
+
46
+ import torch
47
+
48
# Default media, used by `main()` when `--media` is not given.
# The image is a public sample picture that needs no authentication.
DEFAULT_IMAGE_URL = "https://www.ilankelman.org/stopsigns/australia.jpg"
# Placeholder only: this path is not expected to exist, so video mode requires
# an explicit `--media` (otherwise `run_video` raises FileNotFoundError).
DEFAULT_VIDEO_PATH = "/path/to/your/video.mp4"  # <-- replace me

# Default prompts, used by `main()` when `--prompt` is not given.
DEFAULT_IMAGE_PROMPT = "Describe this image in detail."
DEFAULT_VIDEO_PROMPT = "Describe what happens in this video in detail."

# Default model. Override with ``--model /local/path`` to use a local checkpoint.
DEFAULT_MODEL = "lmms-lab-encoder/LLaVA-OneVision2-8B-Instruct"
58
+
59
+
60
def load_image(source: str):
    """Load a PIL image from a local path or an http(s) URL.

    Args:
        source: A filesystem path, or a URL starting with ``http://`` or
            ``https://`` (downloaded with a 30 second timeout).

    Returns:
        The image converted to ``RGB`` mode.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    from PIL import Image

    if source.startswith(("http://", "https://")):
        import requests

        # The full body is needed before decoding, so a streamed response buys
        # nothing; the context manager releases the connection right away.
        with requests.get(source, timeout=30) as resp:
            resp.raise_for_status()
            payload = resp.content
        opened = Image.open(io.BytesIO(payload))
    else:
        opened = Image.open(source)

    # `convert` returns a decoded copy, so the underlying file/buffer can be
    # closed deterministically instead of waiting for garbage collection.
    with opened as img:
        return img.convert("RGB")
73
+
74
+
75
def run_image(model, processor, media: str, prompt: str, max_new_tokens: int, device: str) -> str:
    """Caption a single image.

    Args:
        model: Model exposing ``generate``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            and a ``tokenizer`` attribute.
        media: Image path or http(s) URL, loaded through ``load_image``.
        prompt: User text sent together with the image.
        max_new_tokens: Upper bound on generated tokens.
        device: Device the tensor inputs are moved to.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    image = load_image(media)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = processor(
        text=[text],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    # Only tensors are moved; any non-tensor entries are passed through as-is.
    inputs = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}

    tok = processor.tokenizer
    # Explicit None check: a pad token id of 0 is valid and must not be
    # replaced by the EOS id, which `pad or eos` would do.
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    with torch.inference_mode():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            use_cache=True,
            eos_token_id=tok.eos_token_id,
            pad_token_id=pad_id,
        )
    # Keep only the tokens produced after the prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    new_ids = out_ids[:, prompt_len:]
    return tok.batch_decode(new_ids, skip_special_tokens=True)[0].strip()
113
+
114
+
115
def run_video(
    model,
    processor,
    media: str,
    prompt: str,
    max_new_tokens: int,
    device: str,
    num_frames: int,
    max_pixels: int,
) -> str:
    """Caption an mp4/avi/... video file.

    Key processor knobs (all passed through ``__call__``):
      * ``num_frames`` : force exactly N uniformly-sampled frames.
      * ``max_frames`` : cap on auto-selected frame count (used when num_frames is None).
      * ``target_fps`` : sample at this FPS, capped by ``max_frames``.

    For memory control, lower the per-frame resolution by overriding
    ``processor.video_processor.max_pixels`` before calling the processor.

    Args:
        model: Model exposing ``generate``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``,
            ``tokenizer`` and ``video_processor``.
        media: Path to an existing video file.
        prompt: User text sent together with the video.
        max_new_tokens: Upper bound on generated tokens.
        device: Device the tensor inputs are moved to.
        num_frames: Frame count forwarded to the processor.
        max_pixels: Value assigned to ``processor.video_processor.max_pixels``.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.

    Raises:
        FileNotFoundError: If ``media`` does not exist.
    """
    if not os.path.exists(media):
        raise FileNotFoundError(
            f"Video file not found: {media!r}. Pass --media <path/to/video.mp4>."
        )

    # Constrain per-frame pixel budget (memory-friendly default for a single ~80GB GPU).
    # Note that this mutates the processor object passed in by the caller.
    processor.video_processor.max_pixels = max_pixels

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = processor(
        text=[text],
        videos=[media],
        return_tensors="pt",
        padding=True,
        num_frames=num_frames,  # force exactly N frames
    )
    # Only tensors are moved; any non-tensor entries are passed through as-is.
    inputs = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}

    tok = processor.tokenizer
    # Explicit None check: a pad token id of 0 is valid and must not be
    # replaced by the EOS id, which `pad or eos` would do.
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    with torch.inference_mode():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            use_cache=True,
            eos_token_id=tok.eos_token_id,
            pad_token_id=pad_id,
        )
    # Keep only the tokens produced after the prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    new_ids = out_ids[:, prompt_len:]
    return tok.batch_decode(new_ids, skip_special_tokens=True)[0].strip()
178
+
179
+
180
def _build_arg_parser():
    """Create the command-line parser for the demo (options in display order)."""
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"HF repo id or local path to the model checkpoint (default: {DEFAULT_MODEL}).",
    )
    cli.add_argument(
        "--mode",
        choices=["image", "video"],
        default="image",
        help="Inference mode (default: image).",
    )
    media_help = (
        "Image path/URL (image mode) or video path (video mode). "
        f"Defaults: image={DEFAULT_IMAGE_URL!r}, video={DEFAULT_VIDEO_PATH!r}."
    )
    cli.add_argument("--media", default=None, help=media_help)
    cli.add_argument("--prompt", default=None, help="User prompt sent alongside the media.")
    cli.add_argument("--max-new-tokens", type=int, default=256)
    fallback_device = "cuda" if torch.cuda.is_available() else "cpu"
    cli.add_argument("--device", default=fallback_device, help="Device to load the model on.")
    cli.add_argument(
        "--dtype",
        default="bfloat16",
        choices=["bfloat16", "float16", "float32"],
        help="Model dtype.",
    )
    # The two options below only matter in video mode.
    cli.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="[video] Number of frames to sample (default: 16).",
    )
    cli.add_argument(
        "--max-pixels",
        type=int,
        default=200704,
        help="[video] Per-frame max pixel count (default: 200704 = 448*448).",
    )
    return cli


def main():
    """Parse CLI options, load processor and model, run one caption, print it.

    Returns:
        ``0`` after the caption has been printed.
    """
    args = _build_arg_parser().parse_args()
    image_mode = args.mode == "image"

    # Fill in the mode-dependent defaults for anything the user left out.
    if args.media is None:
        args.media = DEFAULT_IMAGE_URL if image_mode else DEFAULT_VIDEO_PATH
    if args.prompt is None:
        args.prompt = DEFAULT_IMAGE_PROMPT if image_mode else DEFAULT_VIDEO_PROMPT

    torch_dtype = getattr(torch, args.dtype)

    # Imported here so that `--help` and argument errors do not pay for it.
    from transformers import AutoModelForImageTextToText, AutoProcessor

    print(f"[demo_inference] Loading processor from: {args.model}", flush=True)
    processor = AutoProcessor.from_pretrained(args.model, trust_remote_code=True)

    print(f"[demo_inference] Loading model on {args.device} ({args.dtype})...", flush=True)
    model = AutoModelForImageTextToText.from_pretrained(
        args.model,
        trust_remote_code=True,
        dtype=torch_dtype,
        device_map=args.device,
    )
    model.eval()

    print(f"[demo_inference] Mode={args.mode} media={args.media}", flush=True)
    shared = (model, processor, args.media, args.prompt, args.max_new_tokens, args.device)
    if image_mode:
        result = run_image(*shared)
    else:
        result = run_video(*shared, num_frames=args.num_frames, max_pixels=args.max_pixels)

    print("\n========== OUTPUT ==========")
    print(result)
    print("============================")
    return 0
266
+
267
+
268
+ if __name__ == "__main__":
269
+ sys.exit(main())
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "5.7.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6510d6eb5c999d9f3c869f127016bd8bada90269c1e76d0bea6ab59ffe3ac876
3
+ size 5288213408
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e927a5f586ab4e43b0d13e29c0ae43d9b838850ebce106fc5d290507a8c4b3c3
3
+ size 4546819264
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:608b6d6d544935773de4653cb2c560c12fe2899b1d6bc7602f58ea2dbecb9fdc
3
+ size 5346916160
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99ef8daea1b127d97b724add790b1dfd28f0ae12a81a22cdfdf2f38c92177538
3
+ size 1872566720
model.safetensors.index.json ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 17054427136
4
+ },
5
+ "weight_map": {
6
+ "model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
8
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
9
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
12
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
13
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
16
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
23
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
24
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
27
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
30
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
34
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
35
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
36
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
37
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
38
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
39
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
40
+ "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors",
41
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
42
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
43
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
44
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
45
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
46
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
47
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
48
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
49
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
50
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
51
+ "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
52
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
53
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
54
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
55
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
56
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
57
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
58
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
59
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
60
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
61
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
62
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
63
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
64
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
65
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
66
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
67
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
68
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
69
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
70
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
71
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
72
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
73
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
74
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
75
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
76
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
77
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
78
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
79
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
80
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
81
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
82
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
83
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
84
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
85
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
86
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
87
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
88
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
89
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
90
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
91
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
92
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
93
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
94
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
95
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
96
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
98
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
99
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
100
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
102
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
103
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
104
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
105
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
106
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
107
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
109
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
110
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
111
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
112
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
113
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
114
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
115
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
116
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
117
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
118
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
119
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
120
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
121
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
122
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
123
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
124
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
127
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
128
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
129
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
130
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
131
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
132
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
133
+ "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
134
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
135
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
138
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
139
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
142
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
143
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
145
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
147
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
148
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
149
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
150
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
151
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
153
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
155
+ "model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
156
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
157
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
158
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
160
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
161
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
163
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
164
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
165
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
167
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
168
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
169
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
171
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
172
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
174
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
175
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
176
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
177
+ "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
178
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
179
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
181
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
182
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
183
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
186
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
187
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
189
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
191
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
192
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
193
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
194
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
196
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
197
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
198
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
199
+ "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
200
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
201
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
202
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
203
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
204
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
205
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
206
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
207
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
208
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
209
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
210
+ "model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
211
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
212
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
213
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
214
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
215
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
216
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
217
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
218
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
219
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
220
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
222
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
223
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
224
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
225
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
226
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
227
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
228
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
229
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
230
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
231
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
232
+ "model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
233
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
234
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
235
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
236
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
237
+ "model.language_model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
238
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
239
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
240
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
241
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
242
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
244
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
246
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
247
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
249
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
251
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
252
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
253
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
254
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
255
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
256
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
257
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
258
+ "lm_head.weight": "model-00003-of-00004.safetensors",
259
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
265
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
266
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
267
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
270
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
271
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
272
+ "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
273
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
277
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
278
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
281
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
282
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
284
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
288
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
289
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
292
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
299
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
300
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
301
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
303
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
306
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
310
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
311
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
314
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
315
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
317
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
318
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
321
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
322
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
325
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
328
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
330
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
331
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
332
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
333
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
334
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
336
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
337
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
338
+ "model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
339
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
340
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
342
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
343
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
344
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
345
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
346
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
347
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
348
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
350
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
351
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
352
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
354
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
355
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
356
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
358
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
359
+ "model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
360
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
361
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
362
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
363
+ "model.language_model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
364
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
365
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
366
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
367
+ "model.language_model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
368
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
369
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
370
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
371
+ "model.language_model.layers.35.input_layernorm.weight": "model-00003-of-00004.safetensors",
372
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
373
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
374
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
375
+ "model.language_model.norm.weight": "model-00003-of-00004.safetensors",
376
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
377
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
378
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
379
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
380
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
381
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
382
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
383
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
384
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
385
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
386
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
387
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
388
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
389
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
390
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
391
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
392
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
393
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
394
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
395
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
396
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
397
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
398
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
399
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
400
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
401
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
402
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
403
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
404
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
405
+ "model.visual.embeddings.patch_embedding.weight": "model-00004-of-00004.safetensors",
406
+ "model.visual.encoder.layers.0.layer_norm1.bias": "model-00004-of-00004.safetensors",
407
+ "model.visual.encoder.layers.0.layer_norm1.weight": "model-00004-of-00004.safetensors",
408
+ "model.visual.encoder.layers.0.layer_norm2.bias": "model-00004-of-00004.safetensors",
409
+ "model.visual.encoder.layers.0.layer_norm2.weight": "model-00004-of-00004.safetensors",
410
+ "model.visual.encoder.layers.0.mlp.fc1.bias": "model-00004-of-00004.safetensors",
411
+ "model.visual.encoder.layers.0.mlp.fc1.weight": "model-00004-of-00004.safetensors",
412
+ "model.visual.encoder.layers.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
413
+ "model.visual.encoder.layers.0.mlp.fc2.weight": "model-00004-of-00004.safetensors",
414
+ "model.visual.encoder.layers.0.self_attn.proj.bias": "model-00004-of-00004.safetensors",
415
+ "model.visual.encoder.layers.0.self_attn.proj.weight": "model-00004-of-00004.safetensors",
416
+ "model.visual.encoder.layers.0.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
417
+ "model.visual.encoder.layers.0.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
418
+ "model.visual.encoder.layers.1.layer_norm1.bias": "model-00004-of-00004.safetensors",
419
+ "model.visual.encoder.layers.1.layer_norm1.weight": "model-00004-of-00004.safetensors",
420
+ "model.visual.encoder.layers.1.layer_norm2.bias": "model-00004-of-00004.safetensors",
421
+ "model.visual.encoder.layers.1.layer_norm2.weight": "model-00004-of-00004.safetensors",
422
+ "model.visual.encoder.layers.1.mlp.fc1.bias": "model-00004-of-00004.safetensors",
423
+ "model.visual.encoder.layers.1.mlp.fc1.weight": "model-00004-of-00004.safetensors",
424
+ "model.visual.encoder.layers.1.mlp.fc2.bias": "model-00004-of-00004.safetensors",
425
+ "model.visual.encoder.layers.1.mlp.fc2.weight": "model-00004-of-00004.safetensors",
426
+ "model.visual.encoder.layers.1.self_attn.proj.bias": "model-00004-of-00004.safetensors",
427
+ "model.visual.encoder.layers.1.self_attn.proj.weight": "model-00004-of-00004.safetensors",
428
+ "model.visual.encoder.layers.1.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
429
+ "model.visual.encoder.layers.1.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
430
+ "model.visual.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
431
+ "model.visual.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
432
+ "model.visual.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
433
+ "model.visual.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
434
+ "model.visual.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
435
+ "model.visual.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
436
+ "model.visual.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
437
+ "model.visual.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
438
+ "model.visual.encoder.layers.10.self_attn.proj.bias": "model-00004-of-00004.safetensors",
439
+ "model.visual.encoder.layers.10.self_attn.proj.weight": "model-00004-of-00004.safetensors",
440
+ "model.visual.encoder.layers.10.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
441
+ "model.visual.encoder.layers.10.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
442
+ "model.visual.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
443
+ "model.visual.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
444
+ "model.visual.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
445
+ "model.visual.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
446
+ "model.visual.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
447
+ "model.visual.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
448
+ "model.visual.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
449
+ "model.visual.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
450
+ "model.visual.encoder.layers.11.self_attn.proj.bias": "model-00004-of-00004.safetensors",
451
+ "model.visual.encoder.layers.11.self_attn.proj.weight": "model-00004-of-00004.safetensors",
452
+ "model.visual.encoder.layers.11.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
453
+ "model.visual.encoder.layers.11.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
454
+ "model.visual.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
455
+ "model.visual.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
456
+ "model.visual.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
457
+ "model.visual.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
458
+ "model.visual.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
459
+ "model.visual.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
460
+ "model.visual.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
461
+ "model.visual.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
462
+ "model.visual.encoder.layers.12.self_attn.proj.bias": "model-00004-of-00004.safetensors",
463
+ "model.visual.encoder.layers.12.self_attn.proj.weight": "model-00004-of-00004.safetensors",
464
+ "model.visual.encoder.layers.12.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
465
+ "model.visual.encoder.layers.12.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
466
+ "model.visual.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
467
+ "model.visual.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
468
+ "model.visual.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
469
+ "model.visual.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
470
+ "model.visual.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
471
+ "model.visual.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
472
+ "model.visual.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
473
+ "model.visual.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
474
+ "model.visual.encoder.layers.13.self_attn.proj.bias": "model-00004-of-00004.safetensors",
475
+ "model.visual.encoder.layers.13.self_attn.proj.weight": "model-00004-of-00004.safetensors",
476
+ "model.visual.encoder.layers.13.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
477
+ "model.visual.encoder.layers.13.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
478
+ "model.visual.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
479
+ "model.visual.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
480
+ "model.visual.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
481
+ "model.visual.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
482
+ "model.visual.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
483
+ "model.visual.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
484
+ "model.visual.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
485
+ "model.visual.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
486
+ "model.visual.encoder.layers.14.self_attn.proj.bias": "model-00004-of-00004.safetensors",
487
+ "model.visual.encoder.layers.14.self_attn.proj.weight": "model-00004-of-00004.safetensors",
488
+ "model.visual.encoder.layers.14.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
489
+ "model.visual.encoder.layers.14.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
490
+ "model.visual.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
491
+ "model.visual.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
492
+ "model.visual.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
493
+ "model.visual.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
494
+ "model.visual.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
495
+ "model.visual.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
496
+ "model.visual.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
497
+ "model.visual.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
498
+ "model.visual.encoder.layers.15.self_attn.proj.bias": "model-00004-of-00004.safetensors",
499
+ "model.visual.encoder.layers.15.self_attn.proj.weight": "model-00004-of-00004.safetensors",
500
+ "model.visual.encoder.layers.15.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
501
+ "model.visual.encoder.layers.15.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
502
+ "model.visual.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
503
+ "model.visual.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
504
+ "model.visual.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
505
+ "model.visual.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
506
+ "model.visual.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
507
+ "model.visual.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
508
+ "model.visual.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
509
+ "model.visual.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
510
+ "model.visual.encoder.layers.16.self_attn.proj.bias": "model-00004-of-00004.safetensors",
511
+ "model.visual.encoder.layers.16.self_attn.proj.weight": "model-00004-of-00004.safetensors",
512
+ "model.visual.encoder.layers.16.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
513
+ "model.visual.encoder.layers.16.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
514
+ "model.visual.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
515
+ "model.visual.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
516
+ "model.visual.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
517
+ "model.visual.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
518
+ "model.visual.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
519
+ "model.visual.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
520
+ "model.visual.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
521
+ "model.visual.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
522
+ "model.visual.encoder.layers.17.self_attn.proj.bias": "model-00004-of-00004.safetensors",
523
+ "model.visual.encoder.layers.17.self_attn.proj.weight": "model-00004-of-00004.safetensors",
524
+ "model.visual.encoder.layers.17.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
525
+ "model.visual.encoder.layers.17.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
526
+ "model.visual.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
527
+ "model.visual.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
528
+ "model.visual.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
529
+ "model.visual.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
530
+ "model.visual.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
531
+ "model.visual.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
532
+ "model.visual.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
533
+ "model.visual.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
534
+ "model.visual.encoder.layers.18.self_attn.proj.bias": "model-00004-of-00004.safetensors",
535
+ "model.visual.encoder.layers.18.self_attn.proj.weight": "model-00004-of-00004.safetensors",
536
+ "model.visual.encoder.layers.18.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
537
+ "model.visual.encoder.layers.18.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
538
+ "model.visual.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
539
+ "model.visual.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
540
+ "model.visual.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
541
+ "model.visual.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
542
+ "model.visual.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
543
+ "model.visual.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
544
+ "model.visual.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
545
+ "model.visual.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
546
+ "model.visual.encoder.layers.19.self_attn.proj.bias": "model-00004-of-00004.safetensors",
547
+ "model.visual.encoder.layers.19.self_attn.proj.weight": "model-00004-of-00004.safetensors",
548
+ "model.visual.encoder.layers.19.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
549
+ "model.visual.encoder.layers.19.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
550
+ "model.visual.encoder.layers.2.layer_norm1.bias": "model-00004-of-00004.safetensors",
551
+ "model.visual.encoder.layers.2.layer_norm1.weight": "model-00004-of-00004.safetensors",
552
+ "model.visual.encoder.layers.2.layer_norm2.bias": "model-00004-of-00004.safetensors",
553
+ "model.visual.encoder.layers.2.layer_norm2.weight": "model-00004-of-00004.safetensors",
554
+ "model.visual.encoder.layers.2.mlp.fc1.bias": "model-00004-of-00004.safetensors",
555
+ "model.visual.encoder.layers.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
556
+ "model.visual.encoder.layers.2.mlp.fc2.bias": "model-00004-of-00004.safetensors",
557
+ "model.visual.encoder.layers.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
558
+ "model.visual.encoder.layers.2.self_attn.proj.bias": "model-00004-of-00004.safetensors",
559
+ "model.visual.encoder.layers.2.self_attn.proj.weight": "model-00004-of-00004.safetensors",
560
+ "model.visual.encoder.layers.2.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
561
+ "model.visual.encoder.layers.2.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
562
+ "model.visual.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
563
+ "model.visual.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
564
+ "model.visual.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
565
+ "model.visual.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
566
+ "model.visual.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
567
+ "model.visual.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
568
+ "model.visual.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
569
+ "model.visual.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
570
+ "model.visual.encoder.layers.20.self_attn.proj.bias": "model-00004-of-00004.safetensors",
571
+ "model.visual.encoder.layers.20.self_attn.proj.weight": "model-00004-of-00004.safetensors",
572
+ "model.visual.encoder.layers.20.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
573
+ "model.visual.encoder.layers.20.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
574
+ "model.visual.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
575
+ "model.visual.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
576
+ "model.visual.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
577
+ "model.visual.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
578
+ "model.visual.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
579
+ "model.visual.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
580
+ "model.visual.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
581
+ "model.visual.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
582
+ "model.visual.encoder.layers.21.self_attn.proj.bias": "model-00004-of-00004.safetensors",
583
+ "model.visual.encoder.layers.21.self_attn.proj.weight": "model-00004-of-00004.safetensors",
584
+ "model.visual.encoder.layers.21.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
585
+ "model.visual.encoder.layers.21.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
586
+ "model.visual.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
587
+ "model.visual.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
588
+ "model.visual.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
589
+ "model.visual.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
590
+ "model.visual.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
591
+ "model.visual.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
592
+ "model.visual.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
593
+ "model.visual.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
594
+ "model.visual.encoder.layers.22.self_attn.proj.bias": "model-00004-of-00004.safetensors",
595
+ "model.visual.encoder.layers.22.self_attn.proj.weight": "model-00004-of-00004.safetensors",
596
+ "model.visual.encoder.layers.22.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
597
+ "model.visual.encoder.layers.22.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
598
+ "model.visual.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
599
+ "model.visual.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
600
+ "model.visual.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
601
+ "model.visual.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
602
+ "model.visual.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
603
+ "model.visual.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
604
+ "model.visual.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
605
+ "model.visual.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
606
+ "model.visual.encoder.layers.23.self_attn.proj.bias": "model-00004-of-00004.safetensors",
607
+ "model.visual.encoder.layers.23.self_attn.proj.weight": "model-00004-of-00004.safetensors",
608
+ "model.visual.encoder.layers.23.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
609
+ "model.visual.encoder.layers.23.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
610
+ "model.visual.encoder.layers.3.layer_norm1.bias": "model-00004-of-00004.safetensors",
611
+ "model.visual.encoder.layers.3.layer_norm1.weight": "model-00004-of-00004.safetensors",
612
+ "model.visual.encoder.layers.3.layer_norm2.bias": "model-00004-of-00004.safetensors",
613
+ "model.visual.encoder.layers.3.layer_norm2.weight": "model-00004-of-00004.safetensors",
614
+ "model.visual.encoder.layers.3.mlp.fc1.bias": "model-00004-of-00004.safetensors",
615
+ "model.visual.encoder.layers.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
616
+ "model.visual.encoder.layers.3.mlp.fc2.bias": "model-00004-of-00004.safetensors",
617
+ "model.visual.encoder.layers.3.mlp.fc2.weight": "model-00004-of-00004.safetensors",
618
+ "model.visual.encoder.layers.3.self_attn.proj.bias": "model-00004-of-00004.safetensors",
619
+ "model.visual.encoder.layers.3.self_attn.proj.weight": "model-00004-of-00004.safetensors",
620
+ "model.visual.encoder.layers.3.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
621
+ "model.visual.encoder.layers.3.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
622
+ "model.visual.encoder.layers.4.layer_norm1.bias": "model-00004-of-00004.safetensors",
623
+ "model.visual.encoder.layers.4.layer_norm1.weight": "model-00004-of-00004.safetensors",
624
+ "model.visual.encoder.layers.4.layer_norm2.bias": "model-00004-of-00004.safetensors",
625
+ "model.visual.encoder.layers.4.layer_norm2.weight": "model-00004-of-00004.safetensors",
626
+ "model.visual.encoder.layers.4.mlp.fc1.bias": "model-00004-of-00004.safetensors",
627
+ "model.visual.encoder.layers.4.mlp.fc1.weight": "model-00004-of-00004.safetensors",
628
+ "model.visual.encoder.layers.4.mlp.fc2.bias": "model-00004-of-00004.safetensors",
629
+ "model.visual.encoder.layers.4.mlp.fc2.weight": "model-00004-of-00004.safetensors",
630
+ "model.visual.encoder.layers.4.self_attn.proj.bias": "model-00004-of-00004.safetensors",
631
+ "model.visual.encoder.layers.4.self_attn.proj.weight": "model-00004-of-00004.safetensors",
632
+ "model.visual.encoder.layers.4.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
633
+ "model.visual.encoder.layers.4.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
634
+ "model.visual.encoder.layers.5.layer_norm1.bias": "model-00004-of-00004.safetensors",
635
+ "model.visual.encoder.layers.5.layer_norm1.weight": "model-00004-of-00004.safetensors",
636
+ "model.visual.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
637
+ "model.visual.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
638
+ "model.visual.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
639
+ "model.visual.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
640
+ "model.visual.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
641
+ "model.visual.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
642
+ "model.visual.encoder.layers.5.self_attn.proj.bias": "model-00004-of-00004.safetensors",
643
+ "model.visual.encoder.layers.5.self_attn.proj.weight": "model-00004-of-00004.safetensors",
644
+ "model.visual.encoder.layers.5.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
645
+ "model.visual.encoder.layers.5.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
646
+ "model.visual.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
647
+ "model.visual.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
648
+ "model.visual.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
649
+ "model.visual.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
650
+ "model.visual.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
651
+ "model.visual.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
652
+ "model.visual.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
653
+ "model.visual.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
654
+ "model.visual.encoder.layers.6.self_attn.proj.bias": "model-00004-of-00004.safetensors",
655
+ "model.visual.encoder.layers.6.self_attn.proj.weight": "model-00004-of-00004.safetensors",
656
+ "model.visual.encoder.layers.6.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
657
+ "model.visual.encoder.layers.6.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
658
+ "model.visual.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
659
+ "model.visual.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
660
+ "model.visual.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
661
+ "model.visual.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
662
+ "model.visual.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
663
+ "model.visual.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
664
+ "model.visual.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
665
+ "model.visual.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
666
+ "model.visual.encoder.layers.7.self_attn.proj.bias": "model-00004-of-00004.safetensors",
667
+ "model.visual.encoder.layers.7.self_attn.proj.weight": "model-00004-of-00004.safetensors",
668
+ "model.visual.encoder.layers.7.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
669
+ "model.visual.encoder.layers.7.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
670
+ "model.visual.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
671
+ "model.visual.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
672
+ "model.visual.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
673
+ "model.visual.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
674
+ "model.visual.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
675
+ "model.visual.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
676
+ "model.visual.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
677
+ "model.visual.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
678
+ "model.visual.encoder.layers.8.self_attn.proj.bias": "model-00004-of-00004.safetensors",
679
+ "model.visual.encoder.layers.8.self_attn.proj.weight": "model-00004-of-00004.safetensors",
680
+ "model.visual.encoder.layers.8.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
681
+ "model.visual.encoder.layers.8.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
682
+ "model.visual.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
683
+ "model.visual.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
684
+ "model.visual.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
685
+ "model.visual.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
686
+ "model.visual.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
687
+ "model.visual.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
688
+ "model.visual.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
689
+ "model.visual.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
690
+ "model.visual.encoder.layers.9.self_attn.proj.bias": "model-00004-of-00004.safetensors",
691
+ "model.visual.encoder.layers.9.self_attn.proj.weight": "model-00004-of-00004.safetensors",
692
+ "model.visual.encoder.layers.9.self_attn.qkv.bias": "model-00004-of-00004.safetensors",
693
+ "model.visual.encoder.layers.9.self_attn.qkv.weight": "model-00004-of-00004.safetensors",
694
+ "model.visual.layernorm_pre.bias": "model-00004-of-00004.safetensors",
695
+ "model.visual.layernorm_pre.weight": "model-00004-of-00004.safetensors",
696
+ "model.visual.merger.ln_q.bias": "model-00004-of-00004.safetensors",
697
+ "model.visual.merger.ln_q.weight": "model-00004-of-00004.safetensors",
698
+ "model.visual.merger.mlp.0.bias": "model-00004-of-00004.safetensors",
699
+ "model.visual.merger.mlp.0.weight": "model-00004-of-00004.safetensors",
700
+ "model.visual.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
701
+ "model.visual.merger.mlp.2.weight": "model-00004-of-00004.safetensors"
702
+ }
703
+ }
modeling_llava_onevision2.py ADDED
@@ -0,0 +1,1607 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Callable
2
+ from dataclasses import dataclass
3
+ from typing import Any, Optional, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import LayerNorm
8
+
9
+ from transformers import AutoModel
10
+ from transformers.cache_utils import Cache
11
+ from transformers.generation import GenerationMixin
12
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
13
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
14
+ from transformers.models.siglip.modeling_siglip import SiglipMLP
15
+ from transformers.processing_utils import Unpack
16
+ from transformers.utils import (
17
+ TransformersKwargs,
18
+ auto_docstring,
19
+ can_return_tuple,
20
+ replace_return_docstrings,
21
+ )
22
+ from transformers.utils.generic import is_flash_attention_requested
23
+
24
+ from .configuration_llava_onevision2 import LlavaOnevision2Config, LlavaOnevision2VisionConfig
25
+
26
+
27
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for LLaVA-OneVision2 outputs, with hidden states and attentions.
    """
)
class LlavaOnevision2ModelOutputWithPast(ModelOutput):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    """

    # Every field is optional and defaults to ``None``; the field order is part of the
    # positional interface of the output object, so it must not be rearranged.
    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
46
+
47
+
48
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for LLaVA-OneVision2 causal language model (or autoregressive) outputs.
    """
)
class LlavaOnevision2CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    """

    # Every field is optional and defaults to ``None``; the field order is part of the
    # positional interface of the output object, so it must not be rearranged.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # Vision Rotary Embedding
76
+ # ---------------------------------------------------------------------------
77
+
78
+
79
class VisionRotaryEmbedding(nn.Module):
    """
    Rotary frequency tables for 3D (T, H, W) patch positions.

    The rotary half-dimension (``head_dim // 2``) is divided between the temporal,
    height and width axes in a 4:6:6 ratio. Frequencies can be produced from a batch
    of ``grid_thw`` rows, from explicit per-patch positions, or from a single
    ``(t, h, w)`` triple.
    """

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        head_dim = config.hidden_size // config.num_attention_heads
        base = config.rope_theta

        assert head_dim % 2 == 0, "head_dim must be even for rotary."
        assert head_dim % 16 == 0, "head_dim must be divisible by 16."
        half = head_dim // 2
        assert half % 16 == 0, "head_dim//2 must also be divisible by 16 to split into 4:6:6."

        self.head_dim = head_dim
        self.half = half
        self.base = base

        # 4 + 6 + 6 = 16 units, so one unit is a sixteenth of the half dimension.
        unit = half // 16
        self.t_size = 4 * unit
        self.h_size = 6 * unit
        self.w_size = 6 * unit

        # One inverse-frequency buffer per axis (inv_freq_t / inv_freq_h / inv_freq_w).
        # They are non-persistent, i.e. excluded from the state dict.
        for axis in ("t", "h", "w"):
            size = getattr(self, f"{axis}_size")
            exponents = torch.arange(size, dtype=torch.float32) / size
            self.register_buffer(f"inv_freq_{axis}", 1.0 / (base**exponents), persistent=False)

    def forward(self, grid_thw: torch.Tensor) -> torch.Tensor:
        """
        Build the frequency table for every sample described by ``grid_thw``.

        Args:
            grid_thw: [num_samples, 3] tensor holding ``[t, h, w]`` per sample.

        Returns:
            freqs: [total_seq_len, half] tensor, the per-sample tables concatenated
            along the sequence axis in the order the samples appear.
        """
        device = grid_thw.device
        per_sample = []
        for sample_thw in grid_thw:
            t, h, w = (sample_thw[axis].item() for axis in range(3))
            per_sample.append(self.forward_with_thw(t, h, w, device=device))
        return torch.cat(per_sample, dim=0)

    def forward_from_positions(self, patch_positions: torch.Tensor) -> torch.Tensor:
        """
        Build the frequency table from explicit patch coordinates.

        Args:
            patch_positions: [seq_len, 3] tensor with the ``[t, h, w]`` position of each patch.

        Returns:
            freqs: [seq_len, half] tensor of position frequencies.
        """
        device = patch_positions.device
        per_axis = []
        for column, inv_freq in enumerate((self.inv_freq_t, self.inv_freq_h, self.inv_freq_w)):
            coords = patch_positions[:, column].float()
            per_axis.append(torch.outer(coords, inv_freq.to(device=device)))
        return torch.cat(per_axis, dim=-1)

    def forward_with_thw(self, t: int, h: int, w: int, device=None) -> torch.Tensor:
        """
        Build the frequency table for a dense ``t x h x w`` grid.

        Patches are enumerated with ``t`` varying slowest and ``w`` varying fastest.

        Args:
            t: Number of temporal frames.
            h: Number of height patches.
            w: Number of width patches.
            device: Target device; defaults to the device of the frequency buffers.

        Returns:
            freqs: [t*h*w, half] tensor of position frequencies.
        """
        if device is None:
            device = self.inv_freq_t.device

        def axis_table(extent, inv_freq):
            # Row i holds i * inv_freq, i.e. the angles for coordinate i on this axis.
            steps = torch.arange(extent, device=device, dtype=torch.float32)
            return torch.outer(steps, inv_freq.to(device=device))

        ft = axis_table(t, self.inv_freq_t)
        fh = axis_table(h, self.inv_freq_h)
        fw = axis_table(w, self.inv_freq_w)

        # Coordinate of every patch along each axis, in enumeration order.
        t_ids = torch.arange(t, device=device).repeat_interleave(h * w)
        h_ids = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
        w_ids = torch.arange(w, device=device).repeat(h).repeat(t)

        return torch.cat([ft[t_ids], fh[h_ids], fw[w_ids]], dim=-1)
211
+
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # Patch Embedding
215
+ # ---------------------------------------------------------------------------
216
+
217
+
218
class OneVisionEncoderEmbeddings(nn.Module):
    """
    Patch embedding layer: one embedding vector per pre-extracted patch.

    The convolution uses ``kernel_size == stride == patch_size``, so each
    ``patch_size x patch_size`` input patch collapses to a single vector.

    Input format: anything viewable as [total_patches, num_channels, patch_size, patch_size]
    Output format: [total_patches, embed_dim]
    """

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.in_channels = config.num_channels

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Embed the patches and return a [total_patches, embed_dim] tensor."""
        patches = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size)
        # Run the convolution in the dtype of its own weights.
        patches = patches.to(dtype=self.patch_embedding.weight.dtype)
        return self.patch_embedding(patches).view(-1, self.embed_dim)
251
+
252
+
253
+ # ---------------------------------------------------------------------------
254
+ # Patch Merger
255
+ # ---------------------------------------------------------------------------
256
+
257
+
258
class LlavaOnevision2VisionPatchMerger(nn.Module):
    """
    Merge every ``spatial_merge_size ** 2`` consecutive patch embeddings into one.

    The module relies on the caller having ordered the patches so that the members of
    each merge group are adjacent in the sequence; it performs no reordering itself.
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
        layer_norm_eps: float = 1e-05,
        use_patch_position_encoding: bool = False,
        patch_position_encoding_type: str = "absolute",
        max_position_embeddings: int = 8192,
    ) -> None:
        super().__init__()
        group_size = spatial_merge_size**2
        # Width of one merge group after its members are concatenated.
        self.hidden_size = context_dim * group_size
        self.ln_q = LayerNorm(context_dim, eps=layer_norm_eps)
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Linear(self.hidden_size, dim),
        )
        self.spatial_merge_size = spatial_merge_size
        self.use_patch_position_encoding = use_patch_position_encoding
        self.patch_position_encoding_type = patch_position_encoding_type

        if not self.use_patch_position_encoding:
            return
        if self.patch_position_encoding_type != "absolute":
            raise ValueError(
                f"Unknown patch_position_encoding_type: {self.patch_position_encoding_type}. "
                "Only 'absolute' is supported."
            )
        # Learned absolute embeddings, one table per spatial axis.
        self.pos_emb_h = nn.Embedding(max_position_embeddings, dim)
        self.pos_emb_w = nn.Embedding(max_position_embeddings, dim)

    def forward(self, x: torch.Tensor, patch_positions: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Normalise, group and project the patch embeddings.

        Args:
            x: Patch embeddings whose last dimension is ``context_dim``. Any leading
                dimensions are flattened together when the groups are formed.
            patch_positions: Optional ``[t, h, w]`` position per patch, shaped
                [seq_len, 3] or [1, seq_len, 3]. Only used when patch position
                encoding is enabled.

        Returns:
            Tensor of shape [num_patches // spatial_merge_size^2, dim]. The result is
            always 2D, regardless of the rank of ``x``.
        """
        if patch_positions is not None and patch_positions.dim() == 3:
            patch_positions = patch_positions.squeeze(0)

        # LayerNorm acts per patch; the view then concatenates each group's members.
        grouped = self.ln_q(x).view(-1, self.hidden_size)
        merged = self.mlp(grouped)

        if not self.use_patch_position_encoding or patch_positions is None:
            return merged

        # Take the first patch of every group as its anchor and rescale its
        # coordinates to the merged grid.
        anchors = patch_positions.view(-1, self.spatial_merge_size**2, 3)[:, 0, :]
        merged_coords = (anchors // self.spatial_merge_size).long()

        return merged + self.pos_emb_h(merged_coords[:, 1]) + self.pos_emb_w(merged_coords[:, 2])
326
+
327
+
328
def rotate_half(x):
    """
    Rotate adjacent feature pairs along the last dimension (interleaved layout).

    Every consecutive pair ``(a, b)`` becomes ``(-b, a)``, e.g.
    (x1, x2, x3, x4) -> (-x2, x1, -x4, x3).
    """
    # Expose the pairs as a trailing axis of size two: (..., D) -> (..., D // 2, 2).
    pairs = x.reshape(*x.shape[:-1], x.shape[-1] // 2, 2)
    first, second = pairs[..., 0], pairs[..., 1]
    rotated = torch.stack((-second, first), dim=-1)
    return rotated.flatten(-2)
336
+
337
+
338
+ def get_norm_layer(config):
339
+ if config.layer_norm_type == "rms_norm":
340
+ return nn.RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
341
+ else:
342
+ return nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
343
+
344
+
345
def apply_rotary_pos_emb(q, k, freqs):
    """
    Apply rotary position embeddings to the query and key tensors.

    Args:
        q, k: tensors laid out as (B, H, L, D).
        freqs: rotation angles laid out as (B, L, D); a head axis is inserted so
            they broadcast across heads.

    Returns:
        The rotated ``(q, k)`` pair, each cast back to the dtype it arrived in.
    """
    q_dtype, k_dtype = q.dtype, k.dtype

    # The rotation is evaluated in float32; only the results are cast back.
    # (B, L, D) -> (B, 1, L, D)
    cos = freqs.cos().unsqueeze(1).float()
    sin = freqs.sin().unsqueeze(1).float()

    def _rotate(states):
        states = states.float()
        return (states * cos) + (rotate_half(states) * sin)

    return _rotate(q).to(q_dtype), _rotate(k).to(k_dtype)
362
+
363
+
364
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Plain scaled dot-product attention.

    ``query`` / ``key`` / ``value`` are expected as ``(B, H, L, D)``. When given,
    ``attention_mask`` is added to the scaled scores before the softmax. Dropout is
    only active while ``module.training`` is true. Extra keyword arguments are ignored.

    Returns:
        ``(attn_output, attn_weights)`` with ``attn_output`` laid out as ``(B, L, H, D)``.
    """
    scores = (query @ key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask

    # Softmax is evaluated in float32 and cast back to the query dtype.
    probs = torch.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = (probs @ value).transpose(1, 2).contiguous()  # (B, L, H, D)
    return context, probs
383
+
384
+
385
class OneVisionEncoderAttention(nn.Module):
    """
    Multi-head self-attention for the vision encoder, with optional rotary embeddings.

    The attention kernel is looked up in :data:`ALL_ATTENTION_FUNCTIONS` using
    ``config._attn_implementation``; :func:`eager_attention_forward` is the fallback
    handed to that lookup.
    """

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.embed_dim % self.num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )

        self.num_key_value_groups = 1  # required by repeat_kv-aware eager paths
        self.scale = self.head_dim**-0.5
        self.scaling = self.scale  # alias expected by some attention interfaces
        self.attention_dropout = config.attention_dropout
        self.is_causal = False
        # Fused projection producing query, key and value in a single matmul.
        self.qkv = nn.Linear(self.embed_dim, self.embed_dim * 3)
        self.proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Run self-attention over ``hidden_states`` of shape (B, L, embed_dim).

        Three execution paths exist:

        * ``cu_seqlens`` given and flash attention requested: one variable-length call.
        * ``cu_seqlens`` given otherwise: the sequence is split at the ``cu_seqlens``
          boundaries and attention runs independently on every chunk.
        * no ``cu_seqlens``: one dense call using ``attention_mask`` (if any).

        ``output_attentions`` is accepted for interface compatibility, but attention
        weights are never returned: the second element of the result is always ``None``.
        """
        batch_size, q_len, _ = hidden_states.size()

        # (B, L, 3*H*D) -> (B, L, 3, H, D); unbinding axis 2 gives three (B, L, H, D)
        # tensors, each of which is then moved to (B, H, L, D).
        fused = self.qkv(hidden_states).reshape(batch_size, q_len, 3, self.num_heads, self.head_dim)
        query_states, key_states, value_states = (part.transpose(1, 2) for part in fused.unbind(2))

        if rotary_pos_emb is not None:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, rotary_pos_emb)

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )
        dropout = self.attention_dropout if self.training else 0.0

        def run_attention(q, k, v, mask, **extra):
            # Arguments shared by every path; only the first returned element is used.
            return attention_interface(
                self,
                q,
                k,
                v,
                attention_mask=mask,
                scaling=self.scale,
                dropout=dropout,
                is_causal=False,
                **extra,
                **kwargs,
            )[0]

        if cu_seqlens is None:
            # Dense path: bring the mask to (B, 1, L, L) so it broadcasts over heads.
            attn_mask = attention_mask
            if attn_mask is not None:
                if attn_mask.dim() == 2:
                    attn_mask = attn_mask.unsqueeze(0)
                if attn_mask.shape[0] == 1 and batch_size > 1:
                    attn_mask = attn_mask.expand(batch_size, -1, -1)
                attn_mask = attn_mask.unsqueeze(1)
            attn_output = run_attention(query_states, key_states, value_states, attn_mask)
        elif is_flash_attention_requested(self.config):
            # Flash Attention varlen path: pass cu_seq_lens / max_length kwargs.
            if max_seqlen is None:
                max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
            attn_output = run_attention(
                query_states,
                key_states,
                value_states,
                None,
                cu_seq_lens_q=cu_seqlens,
                cu_seq_lens_k=cu_seqlens,
                max_length_q=max_seqlen,
                max_length_k=max_seqlen,
            )
        else:
            # Other implementations get no cu_seqlens: split along the L axis of
            # (B, H, L, D) and attend within each chunk separately.
            lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
            q_chunks, k_chunks, v_chunks = (
                torch.split(states, lengths, dim=2) for states in (query_states, key_states, value_states)
            )
            chunk_outputs = [
                run_attention(q_chunk, k_chunk, v_chunk, None)
                for q_chunk, k_chunk, v_chunk in zip(q_chunks, k_chunks, v_chunks)
            ]
            # interface output is (B, l_i, H, D); concat along the L axis
            attn_output = torch.cat(chunk_outputs, dim=1)

        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
        return self.proj(attn_output), None
507
+
508
+
509
class OneVisionEncoderEncoderLayer(nn.Module):
    """Pre-norm vision encoder block: self-attention then MLP, each with a residual connection."""

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = OneVisionEncoderAttention(config)
        self.layer_norm1 = get_norm_layer(config)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = get_norm_layer(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Apply the block to ``hidden_states``.

        Returns:
            ``(hidden_states,)``, or ``(hidden_states, attn_weights)`` when
            ``output_attentions`` is truthy. ``attn_weights`` is whatever the
            attention module returned as its second element.
        """
        # Attention sub-layer: normalise first, then add the result to the input.
        attn_out, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            attention_mask=attention_mask,
            rotary_pos_emb=rotary_pos_emb,
            output_attentions=output_attentions,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-layer, same pre-norm + residual pattern.
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
549
+
550
+
551
class OneVisionEncoderEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` vision encoder layers applied in sequence."""

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([OneVisionEncoderEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing support
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Run every layer in order.

        When ``output_hidden_states`` is set, the collected states contain the input to
        each layer followed by the final output. When ``output_attentions`` is set, the
        second element of each layer's output is collected.

        Returns:
            A ``BaseModelOutput``, or, when ``return_dict`` is false, a tuple of the
            non-``None`` entries among (last hidden state, hidden states, attentions).
        """
        hidden_trace = [] if output_hidden_states else None
        attention_trace = [] if output_attentions else None
        use_checkpointing = self.gradient_checkpointing and self.training

        for layer in self.layers:
            if hidden_trace is not None:
                hidden_trace.append(hidden_states)

            if use_checkpointing:
                # The checkpointing function receives the layer arguments positionally.
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    output_attentions,
                    cu_seqlens,
                    max_seqlen,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    rotary_pos_emb=rotary_pos_emb,
                    output_attentions=output_attentions,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=max_seqlen,
                )

            hidden_states = layer_outputs[0]

            if attention_trace is not None:
                attention_trace.append(layer_outputs[1])

        if hidden_trace is not None:
            hidden_trace.append(hidden_states)

        all_hidden_states = None if hidden_trace is None else tuple(hidden_trace)
        all_self_attentions = None if attention_trace is None else tuple(attention_trace)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
613
+
614
+
615
class LlavaOnevision2PreTrainedModel(PreTrainedModel):
    """Shared ``PreTrainedModel`` base carrying the class-level settings and weight initialisation."""

    config_class = LlavaOnevision2Config
    base_model_prefix = "model"
    input_modalities = ("image", "video", "text")
    supports_gradient_checkpointing = True
    _no_split_modules = ["OneVisionEncoderEncoderLayer", "Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """
        Initialise ``module`` via the parent class, then refill rotary frequency buffers.

        The ``inv_freq_*`` buffers of ``VisionRotaryEmbedding`` are registered as
        non-persistent, so they are not part of a checkpoint's state dict. Per the
        original author's note, they can be left uninitialised when ``from_pretrained``
        materialises the model from meta tensors, so they are recomputed here with the
        same formula the module uses in its constructor.
        """
        super()._init_weights(module)
        if not isinstance(module, VisionRotaryEmbedding):
            return

        with torch.no_grad():
            for axis in ("t", "h", "w"):
                size = getattr(module, f"{axis}_size")
                buffer = getattr(module, f"inv_freq_{axis}")
                values = 1.0 / (module.base ** (torch.arange(size, dtype=torch.float32) / size))
                # Copy in place so the registered buffer object itself is kept.
                buffer.copy_(values.to(buffer.device))
641
+
642
+
643
class Siglip2MultiheadAttentionPoolingHead(nn.Module):
    """
    Attention pooling with a single learned probe (PMA-style).

    The probe is the query and the input sequence supplies keys and values, so every
    batch element is reduced to one vector.
    """

    def __init__(self, config: LlavaOnevision2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.norm = nn.RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_states):
        """Pool ``hidden_states`` (batch first) down to one vector per batch element."""
        # One copy of the learned probe per batch element.
        queries = self.probe.repeat(hidden_states.shape[0], 1, 1)
        pooled, _ = self.attention(queries, hidden_states, hidden_states)

        # Pre-norm MLP with a residual connection around it.
        pooled = pooled + self.mlp(self.norm(pooled))

        # The probe contributes a single query position; drop that axis.
        return pooled[:, 0]
667
+
668
+
669
+ # ---------------------------------------------------------------------------
670
+ # Vision Model
671
+ # ---------------------------------------------------------------------------
672
+
673
+
674
+ class LlavaOnevision2VisionPretrainedModel(LlavaOnevision2PreTrainedModel):
675
+ """
676
+ LLaVA-OneVision 2.0 Vision Model.
677
+
678
+ This vision model is designed to work with Qwen2VL-style image processing:
679
+ - Receives pre-processed patches in 2x2 block spatial order
680
+ - Applies RoPE with matching 2x2 block layout conversion
681
+ - Accepts explicit patch_positions for RoPE computation
682
+
683
+ Input format:
684
+ hidden_state: [total_patches, num_channels, patch_size, patch_size]
685
+ grid_thw: [num_samples, 3] with [t, h, w] for each sample
686
+ """
687
+
688
def __init__(self, config: LlavaOnevision2VisionConfig):
    """Assemble the vision tower: patch embedding, pre-norm, encoder, rotary tables, optional head, merger."""
    super().__init__(config)
    self.config = config
    self.spatial_merge_size = config.spatial_merge_size

    # Vision components
    self.embeddings = OneVisionEncoderEmbeddings(config)
    self.layernorm_pre = get_norm_layer(config)
    self.encoder = OneVisionEncoderEncoder(config)
    self.video_rope = VisionRotaryEmbedding(config)

    # The post-norm and pooling head exist only when the config asks for a head.
    self.layernorm_post = get_norm_layer(config) if config.use_head else None
    self.head = Siglip2MultiheadAttentionPoolingHead(config) if config.use_head else None

    # Position-encoding settings are optional on the config and fall back to defaults.
    self.merger = LlavaOnevision2VisionPatchMerger(
        dim=config.out_hidden_size,
        context_dim=config.hidden_size,
        spatial_merge_size=config.spatial_merge_size,
        layer_norm_eps=config.layer_norm_eps,
        use_patch_position_encoding=getattr(config, "use_patch_position_encoding", False),
        patch_position_encoding_type=getattr(config, "patch_position_encoding_type", "absolute"),
        max_position_embeddings=getattr(config, "max_position_embeddings", 8192),
    )

    self.post_init()
717
+
718
+ def _build_cu_seqlens(
719
+ self,
720
+ grid_thw: torch.Tensor,
721
+ total_patches: int,
722
+ fixed_t: Optional[int] = 4,
723
+ device: Optional[torch.device] = None,
724
+ ) -> tuple[torch.Tensor, int]:
725
+ if grid_thw is None or grid_thw.numel() == 0:
726
+ # Fallback for no grid_thw: treat as single sequence
727
+ return torch.tensor([0, total_patches], dtype=torch.int32, device=device), total_patches
728
+
729
+ if device is None:
730
+ device = grid_thw.device
731
+
732
+ cu_seqlens = [0]
733
+ max_seqlen = 0
734
+ total_entries = grid_thw.shape[0]
735
+ current_len = 0
736
+
737
+ # Calculate cumulative lengths: split sequences based on fixed_t if provided
738
+ for idx in range(total_entries):
739
+ t_val = grid_thw[idx, 0].item()
740
+ h_val = grid_thw[idx, 1].item()
741
+ w_val = grid_thw[idx, 2].item()
742
+
743
+ if fixed_t is not None and fixed_t > 0 and t_val > fixed_t:
744
+ # Split large t into chunks of fixed_t
745
+ num_full_windows = t_val // fixed_t
746
+ remainder = t_val % fixed_t
747
+
748
+ # Add full windows
749
+ for _ in range(num_full_windows):
750
+ chunk_patches = fixed_t * int(h_val) * int(w_val)
751
+ current_len += chunk_patches
752
+ max_seqlen = max(max_seqlen, chunk_patches)
753
+ cu_seqlens.append(current_len)
754
+
755
+ # Add remainder if any
756
+ if remainder > 0:
757
+ chunk_patches = remainder * int(h_val) * int(w_val)
758
+ current_len += chunk_patches
759
+ max_seqlen = max(max_seqlen, chunk_patches)
760
+ cu_seqlens.append(current_len)
761
+ else:
762
+ # Standard case: add as one chunk
763
+ chunk_patches = t_val * int(h_val) * int(w_val)
764
+ current_len += chunk_patches
765
+ max_seqlen = max(max_seqlen, chunk_patches)
766
+ cu_seqlens.append(current_len)
767
+
768
+ last_len = cu_seqlens[-1]
769
+ if last_len != total_patches:
770
+ raise ValueError(
771
+ "cu_seqlens calculation mismatch:\n"
772
+ f"- total_patches: {total_patches}\n"
773
+ f"- calculated total: {last_len}\n"
774
+ f"- grid_thw: {grid_thw}"
775
+ )
776
+
777
+ return torch.tensor(cu_seqlens, dtype=torch.int32, device=device), max_seqlen
778
+
779
+ def _build_block_attention_mask(
780
+ self,
781
+ grid_thw: torch.Tensor,
782
+ total_patches: int,
783
+ fixed_t: Optional[int] = 4,
784
+ device: Optional[torch.device] = None,
785
+ ) -> Optional[torch.Tensor]:
786
+ if grid_thw is None or grid_thw.numel() == 0:
787
+ return None
788
+
789
+ if device is None:
790
+ device = grid_thw.device
791
+
792
+ lengths = []
793
+ total_entries = grid_thw.shape[0]
794
+
795
+ for idx in range(total_entries):
796
+ t_val = grid_thw[idx, 0].item()
797
+ h_val = grid_thw[idx, 1].item()
798
+ w_val = grid_thw[idx, 2].item()
799
+
800
+ if fixed_t is not None and fixed_t > 0 and t_val > fixed_t:
801
+ # Split large t into chunks of fixed_t
802
+ num_full_windows = t_val // fixed_t
803
+ remainder = t_val % fixed_t
804
+
805
+ # Add full windows
806
+ for _ in range(num_full_windows):
807
+ lengths.append(fixed_t * int(h_val) * int(w_val))
808
+
809
+ # Add remainder if any
810
+ if remainder > 0:
811
+ lengths.append(remainder * int(h_val) * int(w_val))
812
+ else:
813
+ lengths.append(t_val * int(h_val) * int(w_val))
814
+
815
+ total_len = sum(lengths)
816
+ if total_len != total_patches:
817
+ raise ValueError(
818
+ "Block attention mask length mismatch:\n"
819
+ f"- total_patches: {total_patches}\n"
820
+ f"- total_len: {total_len}\n"
821
+ f"- grid_thw: {grid_thw}"
822
+ )
823
+
824
+ attn_mask = torch.ones((total_len, total_len), dtype=torch.bool, device=device)
825
+ start = 0
826
+ for size in lengths:
827
+ end = start + size
828
+ attn_mask[start:end, start:end] = False
829
+ start = end
830
+
831
+ return attn_mask
832
+
833
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=LlavaOnevision2VisionConfig)
def forward(
    self,
    hidden_state: torch.Tensor,
    grid_thw: Optional[torch.Tensor] = None,
    patch_positions: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    skip_merger: Optional[bool] = False,
) -> Union[tuple, BaseModelOutputWithPooling]:
    r"""
    Run the vision tower: patch embedding, rotary frequencies, pre-norm, encoder,
    optional post-norm, then either the pooling head (``skip_merger=True``) or the
    patch merger.

    Args:
        hidden_state: Pre-processed patches that are fed to ``self.embeddings``.
        grid_thw: Tensor whose rows are ``[t, h, w]``; used to build the cumulative
            sequence lengths passed to the encoder.
        patch_positions: Patch positions handed to ``self.video_rope.forward_from_positions``
            and to the merger. A 3-D tensor has its leading dimension squeezed.
        output_attentions: Whether to return attention weights.
        output_hidden_states: Whether to return all hidden states.
        return_dict: Whether to return a ModelOutput instead of tuple (defaults to ``True``).
        skip_merger: If True, skip the patch merger and return the encoder features
            together with the output of ``self.head`` (when a head exists).

    Returns:
        BaseModelOutputWithPooling whose ``last_hidden_state`` holds the merged features,
        or the un-merged encoder features when ``skip_merger`` is set.
    """
    if output_attentions is None:
        output_attentions = getattr(self.config, "output_attentions", False)
    if output_hidden_states is None:
        output_hidden_states = getattr(self.config, "output_hidden_states", False)
    if return_dict is None:
        return_dict = True

    # 1. Embeddings -- a 2-D result gets a leading batch dimension of size 1.
    hidden_states = self.embeddings(hidden_state)
    if hidden_states.dim() == 2:
        hidden_states = hidden_states.unsqueeze(0)
    _, total_patches, _ = hidden_states.shape

    # 2. Rotary frequencies, duplicated along the last dim (D/2 + D/2 -> D).
    if patch_positions is not None and patch_positions.dim() == 3:
        patch_positions = patch_positions.squeeze(0)
    half_freqs = self.video_rope.forward_from_positions(patch_positions)
    freqs_visible = torch.cat([half_freqs, half_freqs], dim=-1)
    if freqs_visible.dim() == 2:
        freqs_visible = freqs_visible.unsqueeze(0)

    # 3. Pre-norm and encoder. Attention is restricted through cu_seqlens rather
    # than a dense mask (attention_mask is passed as None).
    hidden_states = self.layernorm_pre(hidden_states)

    cu_seqlens, max_seqlen = self._build_cu_seqlens(
        grid_thw=grid_thw,
        total_patches=total_patches,
        fixed_t=getattr(self.config, "frame_windows_size", 4),
        device=hidden_states.device,
    )

    encoder_outputs = self.encoder(
        hidden_states,
        attention_mask=None,
        rotary_pos_emb=freqs_visible,
        output_attentions=output_attentions,
        output_hidden_states=True,  # hidden states are always requested from the encoder
        return_dict=True,
        cu_seqlens=cu_seqlens,
        max_seqlen=max_seqlen,
    )

    # NOTE(review): the earlier comments here talked about taking the
    # second-to-last layer, but the index actually used is -1 (the last entry
    # of hidden_states). Behaviour is left untouched -- confirm which one the
    # released weights expect before changing it.
    layer_states = encoder_outputs.hidden_states
    if skip_merger or layer_states is None or len(layer_states) < 2:
        sequence_output = encoder_outputs[0]
    else:
        sequence_output = layer_states[-1]

    # Post-norm (optional module)
    if self.layernorm_post is not None:
        sequence_output = self.layernorm_post(sequence_output)

    returned_states = encoder_outputs.hidden_states if output_hidden_states else None

    # Merger bypass: expose the encoder features plus the pooled head output.
    if skip_merger:
        pooled_output = self.head(sequence_output) if self.head is not None else None

        if not return_dict:
            return (sequence_output, pooled_output, returned_states)
        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=returned_states,
            attentions=encoder_outputs.attentions if output_attentions else None,
        )

    # Patch merger receives the (possibly squeezed) patch positions.
    merged_output = self.merger(sequence_output, patch_positions=patch_positions)

    if not return_dict:
        return (merged_output, returned_states)

    return BaseModelOutputWithPooling(
        last_hidden_state=merged_output,
        pooler_output=None,
        hidden_states=returned_states,
        attentions=encoder_outputs.attentions if output_attentions else None,
    )
952
+
953
+
954
+ @auto_docstring
955
+ class LlavaOnevision2Model(LlavaOnevision2PreTrainedModel):
956
+ base_model_prefix = ""
957
+ # Reference: fix gemma3 grad acc #37208
958
+ accepts_loss_kwargs = False
959
+ config: LlavaOnevision2Config
960
+ _no_split_modules = ["OneVisionEncoderEncoderLayer", "Qwen3DecoderLayer"]
961
+
962
def __init__(self, config: LlavaOnevision2Config):
    """Create the vision tower and the text backbone from the sub-configs of ``config``."""
    super().__init__(config)

    vision_config = config.vision_config
    self.visual = LlavaOnevision2VisionPretrainedModel._from_config(vision_config)

    text_config = config.text_config
    self.language_model = AutoModel.from_config(text_config)

    # Initialize weights and apply final processing
    self.post_init()
969
+
970
def get_input_embeddings(self):
    """Return the input-embedding module of the wrapped language model."""
    text_backbone = self.language_model
    return text_backbone.get_input_embeddings()

def set_input_embeddings(self, value):
    """Replace the input-embedding module of the wrapped language model."""
    text_backbone = self.language_model
    text_backbone.set_input_embeddings(value)

def set_decoder(self, decoder):
    """Use ``decoder`` as the language model."""
    self.language_model = decoder

def get_decoder(self):
    """Return the language model."""
    decoder = self.language_model
    return decoder
981
+
982
def get_video_features(
    self,
    pixel_values_videos: torch.FloatTensor,
    video_grid_thw: Optional[torch.LongTensor] = None,
    patch_positions=None,
):
    """
    Encodes videos into continuous embeddings that can be forwarded to the language model.

    Args:
        pixel_values_videos: Pre-processed video patches. They are cast to the dtype of the
            patch-embedding weight and passed to ``self.visual``.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM. Each video
            receives ``t * h * w // spatial_merge_size**2`` embedding rows.
        patch_positions: Forwarded unchanged to the vision tower.

    Returns:
        A sequence of 2-D tensors, one per video. Without ``video_grid_thw`` (or with a
        single row) a one-element list holding all embeddings is returned.
    """
    # Convert to correct dtype
    pixel_values_videos = pixel_values_videos.type(self.visual.embeddings.patch_embedding.weight.dtype)

    # Forward through vision model with grid_thw
    vision_output = self.visual(pixel_values_videos, grid_thw=video_grid_thw, patch_positions=patch_positions)

    # Extract the actual tensor from BaseModelOutputWithPooling
    if hasattr(vision_output, "last_hidden_state"):
        video_embeds = vision_output.last_hidden_state
    else:
        video_embeds = vision_output[0]  # Fallback for tuple output

    flat_embeds = video_embeds.view(-1, video_embeds.shape[-1])

    if video_grid_thw is None:
        # Without grid information the per-video boundaries are unknown, so treat
        # everything as one video. The previous fallback built one split size per
        # input patch, which torch.split rejects for any multi-patch input.
        return [flat_embeds]

    split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()

    # Split embeddings per video
    if len(split_sizes) > 1:
        return torch.split(flat_embeds, split_sizes)
    return [flat_embeds]
1024
+
1025
def get_image_features(
    self, pixel_values, image_grid_thw: Optional[torch.LongTensor] = None, patch_positions=None
):
    """
    Encodes images into continuous embeddings that can be forwarded to the language model.

    Args:
        pixel_values: Pre-processed patches as a 2-D `torch.FloatTensor` (one row per patch).
            Presumably the flattened layout emitted by the Qwen2VL image processor -- confirm
            against the processor.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM. Each image
            receives ``t * h * w // spatial_merge_size**2`` embedding rows.
        patch_positions: Forwarded unchanged to the vision tower.

    Returns:
        A list of 2-D tensors, one per image (a single entry when ``image_grid_thw`` is
        missing or has one row).

    Raises:
        ValueError: If ``pixel_values`` is not 2-D.
    """
    if pixel_values.dim() != 2:
        # The message used to announce a 4D requirement although only 2D input is accepted.
        raise ValueError(
            f"Unsupported pixel_values shape: expected 2D tensor [total_patches, patch_dim], "
            f"got {pixel_values.shape if hasattr(pixel_values, 'shape') else type(pixel_values)}"
        )

    # Convert to correct dtype
    pixel_values = pixel_values.type(self.visual.embeddings.patch_embedding.weight.dtype)

    # Forward through vision model with grid_thw
    vision_output = self.visual(pixel_values, grid_thw=image_grid_thw, patch_positions=patch_positions)

    # Extract the actual tensor from BaseModelOutputWithPooling
    if hasattr(vision_output, "last_hidden_state"):
        image_embeds = vision_output.last_hidden_state
    else:
        image_embeds = vision_output[0]

    # Compute split sizes from grid_thw
    if image_grid_thw is not None:
        split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
    else:
        # Fallback: assume single image
        split_sizes = [image_embeds.shape[0] if image_embeds.dim() == 2 else image_embeds.shape[1]]

    # Split embeddings per image
    image_embeds_flat = image_embeds.view(-1, image_embeds.shape[-1])
    if len(split_sizes) > 1:
        return list(torch.split(image_embeds_flat, split_sizes))
    return [image_embeds_flat]
1071
+
1072
def get_placeholder_mask(
    self,
    input_ids: torch.LongTensor,
    inputs_embeds: torch.FloatTensor,
    image_features: Optional[torch.FloatTensor] = None,
    video_features: Optional[torch.FloatTensor] = None,
):
    """
    Locate the image and video placeholder positions and return them as boolean masks
    expanded to the shape of `inputs_embeds`.

    Placeholders are found by comparing `input_ids` with the configured token ids, or, when
    `input_ids` is `None`, by comparing `inputs_embeds` with the embedding of each placeholder
    token. When features are supplied, a `ValueError` is raised if the number of masked
    embedding elements differs from the number of feature elements.
    """

    def _positions_from_embeddings(token_id):
        # A position counts as a placeholder only if its whole embedding vector matches.
        token_embedding = self.get_input_embeddings()(
            torch.tensor(token_id, dtype=torch.long, device=inputs_embeds.device)
        )
        return (inputs_embeds == token_embedding).all(-1)

    if input_ids is not None:
        image_positions = input_ids == self.config.image_token_id
        video_positions = input_ids == self.config.video_token_id
    else:
        image_positions = _positions_from_embeddings(self.config.image_token_id)
        video_positions = _positions_from_embeddings(self.config.video_token_id)

    image_token_count = image_positions.sum()
    special_image_mask = image_positions.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    if image_features is not None:
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {image_token_count}, features {image_features.shape[0]}"
            )

    video_token_count = video_positions.sum()
    special_video_mask = video_positions.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    if video_features is not None:
        if inputs_embeds[special_video_mask].numel() != video_features.numel():
            raise ValueError(
                f"Videos features and video tokens do not match: tokens: {video_token_count}, features {video_features.shape[0]}"
            )

    return special_image_mask, special_video_mask
1111
+
1112
@auto_docstring
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    pixel_values: Optional[torch.Tensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    patch_positions: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    **kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, LlavaOnevision2ModelOutputWithPast]:
    r"""
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    patch_positions (`torch.LongTensor` of shape `(total_patches, 3)` or `(1, total_patches, 3)`, *optional*):
        Explicit per-patch `(t, h, w)` position indices used by the vision tower to compute 3D rotary
        position embeddings (and the optional absolute position embedding inside the patch merger).
        `total_patches` is the sum of `t * h * w` across all images and videos in the batch, matching
        the layout produced by the Qwen2VL-style image processor.
    second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
        The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence. Contrarily to
        `position_ids`, this tensor is not affected by padding.

    Note: see the top-level ``LlavaOnevision2ForConditionalGeneration.forward``
    docstring; currently video flows in via the ``image_grid_thw`` / ``pixel_values``
    alias, so ``pixel_values_videos`` / ``video_grid_thw`` /
    ``second_per_grid_ts`` are unused at this layer.
    """
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = True

    if inputs_embeds is None:
        inputs_embeds = self.get_input_embeddings()(input_ids)

    # Scatter image embeddings over the image placeholder positions.
    if pixel_values is not None:
        image_chunks = self.get_image_features(pixel_values, image_grid_thw, patch_positions=patch_positions)
        if image_chunks is not None:
            image_embeds = torch.cat(image_chunks, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            image_mask, _ = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
            )
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

    # Same treatment for the dedicated video inputs.
    if pixel_values_videos is not None:
        video_chunks = self.get_video_features(
            pixel_values_videos, video_grid_thw, patch_positions=patch_positions
        )
        video_embeds = torch.cat(video_chunks, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
        _, video_mask = self.get_placeholder_mask(
            input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
        )
        inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

    # Use simple 1D position_ids
    if position_ids is None:
        batch_size, seq_length, _ = inputs_embeds.shape
        if attention_mask is None:
            position_ids = (
                torch.arange(seq_length, device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1)
            )
        else:
            # Count attended tokens; masked slots get the filler value 1.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)

        # Handle cache_position for generation
        # NOTE(review): the offset is applied only to position ids derived in this
        # branch; caller-supplied position_ids are used as-is -- confirm the nesting.
        if cache_position is not None and cache_position[0] != 0:
            position_ids = position_ids + cache_position[0]

    outputs = self.language_model(
        input_ids=None,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
        cache_position=cache_position,
        **kwargs,
    )

    output = LlavaOnevision2ModelOutputWithPast(
        last_hidden_state=outputs.last_hidden_state,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
    if return_dict:
        return output
    return output.to_tuple()
1221
+
1222
+
1223
+ @auto_docstring
1224
+ class LlavaOnevision2ForConditionalGeneration(LlavaOnevision2PreTrainedModel, GenerationMixin):
1225
+ _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
1226
+ # Reference: fix gemma3 grad acc #37208
1227
+ accepts_loss_kwargs = False
1228
+
1229
def __init__(self, config):
    """Wrap a ``LlavaOnevision2Model`` and add a bias-free linear LM head on top."""
    super().__init__(config)
    self.model = LlavaOnevision2Model(config)

    text_config = config.text_config
    self.lm_head = nn.Linear(text_config.hidden_size, text_config.vocab_size, bias=False)

    self.post_init()
1234
+
1235
def get_input_embeddings(self):
    """Return the input-embedding module, as reported by the wrapped model."""
    inner = self.model
    return inner.get_input_embeddings()

def set_input_embeddings(self, value):
    """Delegate replacing the input-embedding module to the wrapped model."""
    inner = self.model
    inner.set_input_embeddings(value)

def set_decoder(self, decoder):
    """Delegate replacing the decoder to the wrapped model."""
    inner = self.model
    inner.set_decoder(decoder)

def get_decoder(self):
    """Return the decoder, as reported by the wrapped model."""
    inner = self.model
    return inner.get_decoder()
1246
+
1247
def get_video_features(
    self,
    pixel_values_videos: torch.FloatTensor,
    video_grid_thw: Optional[torch.LongTensor] = None,
    patch_positions=None,
):
    """Delegate to ``self.model.get_video_features`` and return its result unchanged."""
    inner = self.model
    video_features = inner.get_video_features(
        pixel_values_videos, video_grid_thw, patch_positions=patch_positions
    )
    return video_features
1254
+
1255
def get_image_features(
    self,
    pixel_values: torch.FloatTensor,
    image_grid_thw: Optional[torch.LongTensor] = None,
    patch_positions=None,
):
    """Delegate to ``self.model.get_image_features`` and return its result unchanged.

    ``patch_positions`` is forwarded as well, so this wrapper accepts the same
    arguments as the inner method and as the sibling ``get_video_features``
    wrapper. It used to be dropped silently. The default of ``None`` keeps
    existing two-argument calls working.
    """
    return self.model.get_image_features(pixel_values, image_grid_thw, patch_positions=patch_positions)
1257
+
1258
+ # Make modules available through conditional class for BC
1259
@property
def language_model(self):
    """The language model of the wrapped ``self.model`` (shortcut kept for backward compatibility)."""
    inner = self.model
    return inner.language_model

@property
def visual(self):
    """The vision tower of the wrapped ``self.model`` (shortcut kept for backward compatibility)."""
    inner = self.model
    return inner.visual
1266
+
1267
@can_return_tuple
@auto_docstring
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    pixel_values: Optional[torch.Tensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    patch_positions: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, LlavaOnevision2CausalLMOutputWithPast]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
        The temporal, height and width of feature shape of each image in LLM.
    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
        The temporal, height and width of feature shape of each video in LLM.
    patch_positions (`torch.LongTensor` of shape `(total_patches, 3)` or `(1, total_patches, 3)`, *optional*):
        Explicit per-patch `(t, h, w)` position indices used by the vision tower to compute 3D rotary
        position embeddings (and the optional absolute position embedding inside the patch merger).
        `total_patches` is the sum of `t * h * w` across all images and videos in the batch, matching
        the layout produced by the Qwen2VL-style image processor.
    second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
        The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence. Contrarily to
        `position_ids`, this tensor is not affected by padding.

    Note (native-video alias):
        The companion ``LlavaOnevision2Processor.__call__(videos=...)`` does NOT
        pass ``pixel_values_videos`` / ``video_grid_thw`` / ``second_per_grid_ts``
        to this forward. Instead it aliases the video patch tensor as
        ``pixel_values=`` and ``image_grid_thw=``, so video inputs share the
        same code path as multi-image inputs (the OneVision encoder is purely
        spatial; temporal information is carried by per-frame ``<X.X seconds>``
        text tags emitted by the processor). The ``*_videos`` and
        ``second_per_grid_ts`` kwargs are kept declared here only for API
        completeness and future use (e.g. 3D mRoPE / ``get_rope_index``); they
        are NOT consumed by the current OneVision encoder.

    Example:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, LlavaOnevision2ForConditionalGeneration

    >>> model = LlavaOnevision2ForConditionalGeneration.from_pretrained("lmms-lab-encoder/LLaVA-OneVision2-8B-Instruct", trust_remote_code=True)
    >>> processor = AutoProcessor.from_pretrained("lmms-lab-encoder/LLaVA-OneVision2-8B-Instruct", trust_remote_code=True)

    >>> messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
    ]
    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")

    >>> # Generate (pass every processor output so the image tensors reach the model)
    >>> generate_ids = model.generate(**inputs, max_new_tokens=30)
    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
    ```"""
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states

    # The inner model is always asked for a ModelOutput; tuple conversion is
    # left to the @can_return_tuple decorator.
    outputs = self.model(
        input_ids=input_ids,
        pixel_values=pixel_values,
        pixel_values_videos=pixel_values_videos,
        image_grid_thw=image_grid_thw,
        patch_positions=patch_positions,
        video_grid_thw=video_grid_thw,
        second_per_grid_ts=second_per_grid_ts,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss.
    # An int keeps the last `logits_to_keep` positions (0 keeps all, since slice(-0, None) is the
    # full range); anything else is used directly as the index.
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    logits = self.lm_head(hidden_states[:, slice_indices, :])

    loss = None
    if labels is not None:
        loss = self.loss_function(
            logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
        )

    return LlavaOnevision2CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
1395
+
1396
def prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    cache_position=None,
    position_ids=None,
    use_cache=True,
    pixel_values=None,
    pixel_values_videos=None,
    image_grid_thw=None,
    patch_positions=None,
    video_grid_thw=None,
    second_per_grid_ts=None,
    is_first_iteration=False,
    **kwargs,
):
    """Assemble the model inputs for one generation step.

    The parent implementation builds the inputs; afterwards the pixel tensors
    are blanked on every step that is not the first iteration while caching is
    enabled.
    """
    # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
    visual_inputs = {
        "pixel_values": pixel_values,
        "pixel_values_videos": pixel_values_videos,
        "image_grid_thw": image_grid_thw,
        "video_grid_thw": video_grid_thw,
        "second_per_grid_ts": second_per_grid_ts,
        "patch_positions": patch_positions,
    }
    model_inputs = super().prepare_inputs_for_generation(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        cache_position=cache_position,
        position_ids=position_ids,
        **visual_inputs,
        use_cache=use_cache,
        is_first_iteration=is_first_iteration,
        **kwargs,
    )

    # After the prefill iteration, drop image inputs so the vision tower
    # isn't re-run on decode steps. Gating on `is_first_iteration` (the
    # Qwen3-VL convention) is the only reliable signal in transformers
    # 5.x: `past_key_values` is non-None even on the first call (an empty
    # DynamicCache is created up-front by `generate`), and `cache_position`
    # may be `None` for remote-code models.
    if use_cache and not is_first_iteration:
        for pixel_key in ("pixel_values", "pixel_values_videos"):
            model_inputs[pixel_key] = None

    return model_inputs
1444
+
1445
+ def _get_image_nums_and_video_nums(
1446
+ self,
1447
+ input_ids: Optional[torch.LongTensor],
1448
+ inputs_embeds: Optional[torch.Tensor] = None,
1449
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1450
+ """
1451
+ Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
1452
+ These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
1453
+
1454
+ Args:
1455
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1456
+ Indices of input sequence tokens in the vocabulary.
1457
+
1458
+ Returns:
1459
+ image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
1460
+ video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
1461
+ """
1462
+ image_token_id = self.config.image_token_id
1463
+ video_token_id = self.config.video_token_id
1464
+ vision_start_token_id = self.config.vision_start_token_id
1465
+
1466
+ if inputs_embeds is not None:
1467
+ vision_start_mask = (
1468
+ inputs_embeds
1469
+ == self.get_input_embeddings()(
1470
+ torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device)
1471
+ )
1472
+ )[..., 0]
1473
+ image_mask = (
1474
+ inputs_embeds
1475
+ == self.get_input_embeddings()(
1476
+ torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
1477
+ )
1478
+ )[..., 0]
1479
+ video_mask = (
1480
+ inputs_embeds
1481
+ == self.get_input_embeddings()(
1482
+ torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device)
1483
+ )
1484
+ )[..., 0]
1485
+ else:
1486
+ vision_start_mask = input_ids == vision_start_token_id
1487
+ image_mask = input_ids == image_token_id
1488
+ video_mask = input_ids == video_token_id
1489
+
1490
+ vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
1491
+ image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
1492
+ video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
1493
+
1494
+ return image_nums, video_nums
1495
+
1496
+ def _expand_inputs_for_generation(
1497
+ self,
1498
+ expand_size: int = 1,
1499
+ is_encoder_decoder: bool = False,
1500
+ input_ids: Optional[torch.LongTensor] = None,
1501
+ **model_kwargs,
1502
+ ) -> tuple[torch.LongTensor, dict[str, Any]]:
1503
+ # Overwritten -- Support for expanding tensors without a batch size dimension
1504
+ # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
1505
+ # pixel_values.shape[0] is sum(seqlen_images for samples)
1506
+ # image_grid_thw.shape[0] is sum(num_images for samples)
1507
+
1508
+ if expand_size == 1:
1509
+ return input_ids, model_kwargs
1510
+
1511
+ visual_keys = [
1512
+ "pixel_values",
1513
+ "image_grid_thw",
1514
+ "pixel_values_videos",
1515
+ "video_grid_thw",
1516
+ "second_per_grid_ts",
1517
+ "patch_positions",
1518
+ ]
1519
+
1520
+ def _expand_dict_for_generation_visual(dict_to_expand):
1521
+ image_grid_thw = model_kwargs.get("image_grid_thw", None)
1522
+ video_grid_thw = model_kwargs.get("video_grid_thw", None)
1523
+ image_nums, video_nums = self._get_image_nums_and_video_nums(
1524
+ input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
1525
+ )
1526
+
1527
+ def _repeat_interleave_samples(x, lengths, repeat_times):
1528
+ samples = torch.split(x, lengths)
1529
+ repeat_args = [repeat_times] + [1] * (x.dim() - 1)
1530
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
1531
+ return result
1532
+
1533
+ for key in dict_to_expand:
1534
+ if key == "pixel_values":
1535
+ # split images into samples
1536
+ samples = torch.split(image_grid_thw, list(image_nums))
1537
+ # compute the sequence length of images for each sample
1538
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1539
+ dict_to_expand[key] = _repeat_interleave_samples(
1540
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1541
+ )
1542
+ elif key == "image_grid_thw":
1543
+ # get the num of images for each sample
1544
+ lengths = list(image_nums)
1545
+ dict_to_expand[key] = _repeat_interleave_samples(
1546
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1547
+ )
1548
+ elif key == "pixel_values_videos":
1549
+ samples = torch.split(video_grid_thw, list(video_nums))
1550
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1551
+ dict_to_expand[key] = _repeat_interleave_samples(
1552
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1553
+ )
1554
+ elif key == "video_grid_thw":
1555
+ lengths = list(video_nums)
1556
+ dict_to_expand[key] = _repeat_interleave_samples(
1557
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1558
+ )
1559
+ elif key == "second_per_grid_ts":
1560
+ dict_to_expand[key] = _repeat_interleave_samples(
1561
+ dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
1562
+ )
1563
+ elif key == "patch_positions":
1564
+ if image_grid_thw is not None and image_grid_thw.numel() > 0 and image_nums.sum() > 0:
1565
+ samples = torch.split(image_grid_thw, list(image_nums))
1566
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1567
+ elif video_grid_thw is not None and video_grid_thw.numel() > 0 and video_nums.sum() > 0:
1568
+ samples = torch.split(video_grid_thw, list(video_nums))
1569
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1570
+ else:
1571
+ continue
1572
+ dict_to_expand[key] = _repeat_interleave_samples(
1573
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1574
+ )
1575
+ return dict_to_expand
1576
+
1577
+ def _expand_dict_for_generation(dict_to_expand):
1578
+ for key in dict_to_expand:
1579
+ if (
1580
+ key != "cache_position"
1581
+ and dict_to_expand[key] is not None
1582
+ and isinstance(dict_to_expand[key], torch.Tensor)
1583
+ and key not in visual_keys
1584
+ ):
1585
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1586
+ return dict_to_expand
1587
+
1588
+ model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
1589
+
1590
+ if input_ids is not None:
1591
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
1592
+
1593
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
1594
+
1595
+ if is_encoder_decoder:
1596
+ if model_kwargs.get("encoder_outputs") is None:
1597
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
1598
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
1599
+
1600
+ return input_ids, model_kwargs
1601
+
1602
+
1603
+ __all__ = [
1604
+ "LlavaOnevision2ForConditionalGeneration",
1605
+ "LlavaOnevision2Model",
1606
+ "LlavaOnevision2PreTrainedModel",
1607
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_format": "channels_first",
3
+ "default_to_square": true,
4
+ "do_convert_rgb": true,
5
+ "do_normalize": true,
6
+ "do_rescale": true,
7
+ "do_resize": true,
8
+ "image_mean": [
9
+ 0.48145466,
10
+ 0.4578275,
11
+ 0.40821073
12
+ ],
13
+ "image_processor_type": "Qwen2VLImageProcessor",
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "max_pixels": 4000000,
20
+ "merge_size": 2,
21
+ "min_pixels": 3136,
22
+ "patch_size": 14,
23
+ "processor_class": "LlavaOnevision2Processor",
24
+ "auto_map": {
25
+ "AutoProcessor": "processing_llava_onevision2.LlavaOnevision2Processor",
26
+ "AutoVideoProcessor": "video_processing_llava_onevision2.LlavaOnevision2VideoProcessor"
27
+ },
28
+ "resample": 3,
29
+ "rescale_factor": 0.00392156862745098,
30
+ "size": {
31
+ "longest_edge": 4000000,
32
+ "shortest_edge": 3136
33
+ },
34
+ "temporal_patch_size": 1
35
+ }
processing_llava_onevision2.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LlavaOnevision2 multi-modal processor.
2
+
3
+ Combines:
4
+ - ``Qwen2VLImageProcessor[Fast]`` (existing in checkpoint preprocessor_config)
5
+ - ``LlavaOnevision2VideoProcessor`` (this checkpoint, video_processing_*)
6
+ - ``AutoTokenizer`` (existing tokenizer.json)
7
+ - ``chat_template.jinja`` (existing, emits <|video_pad|>)
8
+
9
+ Public API:
10
+ proc = LlavaOnevision2Processor(image_processor, tokenizer, video_processor)
11
+ text = proc.apply_chat_template(messages, add_generation_prompt=True)
12
+ inputs = proc(text=[text], videos=[mp4_or_frames], return_tensors="pt")
13
+ out = model.generate(**inputs)
14
+
15
+ Design choices:
16
+ - Video path is "in-processor, transformed to multi-image + per-frame
17
+ timestamps" — model.forward sees the image path only.
18
+ - The chat_template's <|vision_start|><|video_pad|><|vision_end|> placeholder
19
+ is rewritten in __call__ to per-frame blocks:
20
+ <X.X seconds><|vision_start|><|image_pad|>*n<|vision_end|>\n
21
+ - We DO NOT emit `second_per_grid_ts`; see plan §0.5.
22
+ - Backward-compatible: `images=...` / pure-text usage matches the existing
23
+ Qwen2_5_VLProcessor output.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ from typing import List, Optional, Sequence, Union
30
+
31
+ import torch
32
+
33
+ # Special-token strings used by the checkpoint's tokenizer / chat_template.
34
+ VISION_START = "<|vision_start|>"
35
+ VISION_END = "<|vision_end|>"
36
+ IMAGE_PAD = "<|image_pad|>"
37
+ VIDEO_PAD = "<|video_pad|>"
38
+
39
+
40
+ def _format_seconds_tag(seconds: float) -> str:
41
+ """Match training format: ``<X.X seconds>`` (one decimal place)."""
42
+ return f"<{float(seconds):.1f} seconds>"
43
+
44
+
45
def _expand_video_block_for_frames(
    n_per_frame: int,
    frame_seconds: Sequence[float],
) -> str:
    """Build the text that stands in for one video placeholder block.

    For every entry of ``frame_seconds`` one segment is produced:

        ``<X.X seconds><|vision_start|><|image_pad|>*n_per_frame<|vision_end|>``

    Segments are concatenated directly, in order, with no separator between
    them; an empty ``frame_seconds`` yields an empty string.

    NOTE(review): other comments in this file describe a trailing ``\\n``
    after each frame block, but no newline is emitted here -- confirm which
    of the two matches the training format.
    """
    return "".join(
        f"{_format_seconds_tag(sec)}{VISION_START}{IMAGE_PAD * n_per_frame}{VISION_END}"
        for sec in frame_seconds
    )
62
+
63
+
64
class LlavaOnevision2Processor:
    """Native multi-modal processor for LlavaOnevision2.

    Bundles an image processor, a video processor and a tokenizer behind a
    single ``__call__``. Video inputs are rewritten into per-frame image
    blocks, so the returned batch only ever carries ``pixel_values`` /
    ``image_grid_thw`` / ``patch_positions`` for visuals.

    NOTE: We deliberately do NOT inherit ``transformers.ProcessorMixin``.
    This class is registered via ``auto_map`` so
    ``AutoProcessor.from_pretrained(..., trust_remote_code=True)`` returns it.
    """

    attributes = ["image_processor", "video_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # Temporary stand-in for the <|image_pad|> runs generated by the video
    # rewrite. It keeps the image-placeholder expansion from mistaking an
    # already-expanded video frame pad for a not-yet-expanded image
    # placeholder; it is swapped back to <|image_pad|> right before
    # tokenization and never reaches the tokenizer.
    _VIDEO_FRAME_PAD_SENTINEL = "<|llava_ov2_video_frame_pad|>"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        chat_template: Optional[str] = None,
    ):
        """Store the sub-processors and derive the cached settings.

        Args:
            image_processor: object used for ``images=...`` inputs; its
                ``merge_size`` attribute (default 2) is cached as
                ``spatial_merge_size``.
            tokenizer: tokenizer used for text; also the fallback source of
                ``chat_template``.
            video_processor: object used for ``videos=...`` inputs.
            chat_template: explicit chat template; when ``None`` the
                tokenizer's ``chat_template`` attribute (if any) is used.
        """
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.video_processor = video_processor

        # Inherit chat_template from the tokenizer if not given (matches Qwen2_5_VLProcessor).
        if chat_template is None and tokenizer is not None:
            chat_template = getattr(tokenizer, "chat_template", None)
        self.chat_template = chat_template

        # Cache the merge size from image_processor for token-count math.
        self.spatial_merge_size = int(
            getattr(image_processor, "merge_size", 2) if image_processor is not None else 2
        )

    # ------------------------------------------------------------------ utils

    @staticmethod
    def _load_build_patch_positions():
        """Return ``build_patch_positions`` from the bundled video module.

        Uses the same relative-then-top-level import fallback as
        ``from_pretrained`` so the image path also works when this file is
        loaded outside a package.
        """
        try:
            from .video_processing_llava_onevision2 import build_patch_positions
        except ImportError:
            from video_processing_llava_onevision2 import build_patch_positions
        return build_patch_positions

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoProcessor"):
        """No-op stub so ``AutoProcessor.from_pretrained(..., trust_remote_code=True)``
        can call this on the dynamically-loaded class without erroring.
        Real ``ProcessorMixin`` uses this to remember the auto-class for
        ``push_to_hub``; we don't need that for inference-only use."""
        cls._auto_class = auto_class

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """Build a processor from a checkpoint directory or hub id.

        Loads a ``Qwen2VLImageProcessor`` and an ``AutoTokenizer`` from
        ``pretrained_model_name_or_path`` (both receive the remaining
        ``kwargs``), then constructs the bundled
        ``LlavaOnevision2VideoProcessor`` from the image processor's pixel
        budget / patch settings.
        """
        from transformers import AutoTokenizer, Qwen2VLImageProcessor

        # Drop kwargs that AutoProcessor injects but downstream constructors
        # don't accept (e.g. _from_auto / trust_remote_code propagation).
        kwargs.pop("_from_auto", None)
        kwargs.pop("trust_remote_code", None)
        kwargs.pop("code_revision", None)

        # Use the SLOW Qwen2VLImageProcessor: the Fast variant has small
        # normalization rounding differences that change pixel_values bit-for-bit.
        image_processor = Qwen2VLImageProcessor.from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )

        # Use the bundled VideoProcessor. Try a relative import first (when
        # this module is loaded as part of a remote_code package), and fall
        # back to a top-level import (when loaded as a standalone file via
        # ``get_class_from_dynamic_module``, which places sibling files on
        # ``sys.path``).
        try:
            from .video_processing_llava_onevision2 import LlavaOnevision2VideoProcessor
        except ImportError:
            from video_processing_llava_onevision2 import LlavaOnevision2VideoProcessor

        video_processor = LlavaOnevision2VideoProcessor(
            image_processor=image_processor,
            min_pixels=getattr(image_processor, "min_pixels", 256 * 28 * 28),
            max_pixels=getattr(image_processor, "max_pixels", 1605632),
            patch_size=getattr(image_processor, "patch_size", 14),
            spatial_merge_size=getattr(image_processor, "merge_size", 2),
        )
        return cls(
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
        )

    # ------------------------------------------------------------- chat helpers

    def apply_chat_template(self, messages, **kwargs):
        """Delegate to the tokenizer, injecting ``self.chat_template`` unless
        the caller passed an explicit ``chat_template`` keyword."""
        if self.chat_template and "chat_template" not in kwargs:
            kwargs["chat_template"] = self.chat_template
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    # ----------------------------------------------------------- main __call__

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        images=None,
        videos=None,
        return_tensors: Optional[str] = "pt",
        padding: Union[bool, str] = False,
        num_frames: Optional[int] = None,
        max_frames: Optional[int] = None,
        target_fps: Optional[float] = None,
        **kwargs,
    ):
        """Process an aligned (text, images, videos) batch.

        Behaviour:
          * ``videos is not None``: run the VideoProcessor, rewrite each
            ``<|vision_start|><|video_pad|><|vision_end|>`` block in ``text``
            to per-frame ``<X.X seconds>`` blocks, then alias the video
            patches as ``pixel_values`` / ``image_grid_thw`` so the model's
            image path consumes them.
          * ``images is not None``: passed through to the underlying
            ``image_processor``; each ``<|image_pad|>`` placeholder written
            by the caller is expanded to one token per merged patch. May
            coexist with ``videos``: video frame pads are masked while image
            placeholders are expanded, so the two never get confused.
            Visual tensors are concatenated video rows first, then image
            rows (NOTE(review): that matches the prompt only when video
            placeholders precede image placeholders -- confirm with callers).
          * Pure text: tokenize and return.

        Per-call frame-sampling overrides (apply only to ``videos`` path; the
        VideoProcessor's attributes are restored afterwards):
          * ``num_frames``  : force exactly N frames per video
                              (alias of ``fixed_num_frames``).
          * ``max_frames``  : cap on auto-selected frame count (long videos).
          * ``target_fps``  : sample at this FPS (capped by ``max_frames``).

        Only ``max_length``, ``truncation``, ``add_special_tokens``,
        ``return_attention_mask`` and ``return_token_type_ids`` from
        ``kwargs`` are forwarded to the tokenizer.

        Returns:
            A ``BatchFeature`` (or a plain ``dict`` if ``BatchFeature``
            cannot be created) with at minimum ``input_ids`` and
            ``attention_mask``; plus ``pixel_values`` / ``image_grid_thw`` /
            ``patch_positions`` when visuals are present.

        Raises:
            ValueError: if ``text`` is ``None``, if a modality is passed
                without its processor, or if the number of video
                placeholders does not match the number of videos.
        """
        if text is None:
            raise ValueError("`text` is required.")
        if isinstance(text, str):
            text = [text]
        text = list(text)

        out: dict = {}
        sentinel = self._VIDEO_FRAME_PAD_SENTINEL

        # ---------------- VIDEO PATH ----------------
        # Process videos first so we can rewrite their placeholders into the
        # text before tokenization.
        video_outputs = None
        if videos is not None:
            if self.video_processor is None:
                raise ValueError("videos passed but no video_processor configured.")
            # Normalise to a list of videos.
            if isinstance(videos, (str,)):
                videos_list = [videos]
            elif isinstance(videos, list) and len(videos) > 0 and not isinstance(
                videos[0], (list, str)
            ):
                # list[PIL]/[np.ndarray] = single video
                videos_list = [videos]
            else:
                videos_list = list(videos)

            # Per-call sampling overrides: temporarily swap the
            # VideoProcessor's attributes, then restore. Lets users do
            #     processor(videos=[mp4], num_frames=8)
            # without mutating processor.video_processor.
            vp = self.video_processor
            saved = (vp.fixed_num_frames, vp.max_frames, vp.target_fps)
            try:
                if num_frames is not None:
                    vp.fixed_num_frames = int(num_frames)
                if max_frames is not None:
                    vp.max_frames = int(max_frames)
                if target_fps is not None:
                    vp.target_fps = float(target_fps)
                video_outputs = vp(videos=videos_list, return_tensors="pt")
            finally:
                vp.fixed_num_frames, vp.max_frames, vp.target_fps = saved

            # Rewrite each <|video_pad|> in `text` into per-frame blocks.
            video_grid_thw = video_outputs["video_grid_thw"]  # [num_videos, 3]
            frame_timestamps = video_outputs["frame_timestamps"]
            sms = self.spatial_merge_size

            # Compiled once per call; the pattern does not depend on the text.
            video_block_pattern = re.compile(
                re.escape(VISION_START) + r"\s*" + re.escape(VIDEO_PAD) + r"\s*" + re.escape(VISION_END)
            )

            # We iterate placeholders globally across all texts (matching how
            # Qwen2_5_VLProcessor sources `image_grid_thw` rows).
            video_idx = 0

            def _sub(_match):
                nonlocal video_idx
                if video_idx >= video_grid_thw.shape[0]:
                    raise ValueError(
                        "More <|video_pad|> placeholders in text than videos provided."
                    )
                T_eff = int(video_grid_thw[video_idx, 0].item())
                H_p = int(video_grid_thw[video_idx, 1].item())
                W_p = int(video_grid_thw[video_idx, 2].item())
                n_per_frame = (H_p * W_p) // (sms * sms)
                frame_seconds = frame_timestamps[video_idx]
                if len(frame_seconds) != T_eff:
                    # Defensive: pad/truncate so the count matches the grid.
                    if len(frame_seconds) < T_eff:
                        frame_seconds = list(frame_seconds) + [
                            frame_seconds[-1] if frame_seconds else 0.0
                        ] * (T_eff - len(frame_seconds))
                    else:
                        frame_seconds = list(frame_seconds[:T_eff])
                expanded = _expand_video_block_for_frames(
                    n_per_frame, frame_seconds
                )
                video_idx += 1
                # Strip trailing newline so we don't double-newline existing prompts.
                # Mask the generated pads so the image path below only sees
                # the caller's own <|image_pad|> placeholders.
                return expanded.rstrip("\n").replace(IMAGE_PAD, sentinel)

            text = [video_block_pattern.sub(_sub, s) for s in text]

            if video_idx != video_grid_thw.shape[0]:
                raise ValueError(
                    f"Provided {video_grid_thw.shape[0]} videos but only "
                    f"{video_idx} <|video_pad|> placeholders were found in text."
                )

            # Alias video tensors into the image path (NEW model only consumes the image path).
            # Option 1 (multi-image semantics, training-aligned): expand each
            # video_grid_thw row [T, H, W] into T rows of [1, H, W]. The
            # pixel_values rows are already laid out frame-by-frame (T*H*W per
            # video, with temporal_patch_size=1), so this row-expansion of
            # image_grid_thw is the only adjustment needed for the model's
            # forward to treat each frame as a separate image (matching the
            # multi-image inference path).
            out["pixel_values"] = video_outputs["pixel_values_videos"]
            vgthw = video_outputs["video_grid_thw"]
            expanded_rows = []
            for row in vgthw:
                T_v, H_v, W_v = int(row[0]), int(row[1]), int(row[2])
                expanded_rows.extend([[1, H_v, W_v]] * T_v)
            out["image_grid_thw"] = torch.tensor(expanded_rows, dtype=vgthw.dtype)
            out["patch_positions"] = video_outputs["patch_positions"]

        # ---------------- IMAGE PATH ----------------
        if images is not None:
            if self.image_processor is None:
                raise ValueError("images passed but no image_processor configured.")
            image_outputs = self.image_processor(
                images=images, return_tensors="pt"
            )
            image_grid_thw = image_outputs["image_grid_thw"]

            # Expand each <|image_pad|> placeholder to the number of merged tokens.
            sms = self.spatial_merge_size
            merge_factor = sms * sms
            image_token_counts = (
                (image_grid_thw[:, 0] * image_grid_thw[:, 1] * image_grid_thw[:, 2])
                // merge_factor
            ).tolist()
            img_idx = 0

            def _expand_image_pads(s: str) -> str:
                nonlocal img_idx
                # Each pass swaps the first remaining <|image_pad|> for a run
                # of temporary markers, so already-expanded runs are not
                # revisited by the next pass.
                while IMAGE_PAD in s:
                    if img_idx >= len(image_token_counts):
                        break
                    n = int(image_token_counts[img_idx])
                    s = s.replace(IMAGE_PAD, "<|placeholder|>" * n, 1)
                    img_idx += 1
                return s.replace("<|placeholder|>", IMAGE_PAD)

            text = [_expand_image_pads(s) for s in text]

            build_patch_positions = self._load_build_patch_positions()
            image_pp = build_patch_positions(
                image_outputs["image_grid_thw"], spatial_merge_size=sms
            )

            # If videos and images coexist, prefer concatenation of patch tensors.
            if "pixel_values" in out:
                out["pixel_values"] = torch.cat(
                    [out["pixel_values"], image_outputs["pixel_values"]], dim=0
                )
                out["image_grid_thw"] = torch.cat(
                    [out["image_grid_thw"], image_outputs["image_grid_thw"]], dim=0
                )
                out["patch_positions"] = torch.cat(
                    [out["patch_positions"], image_pp], dim=0
                )
            else:
                out["pixel_values"] = image_outputs["pixel_values"]
                out["image_grid_thw"] = image_outputs["image_grid_thw"]
                out["patch_positions"] = image_pp

        # ---------------- VIDEO PATH FINAL EXPANSION ----------------
        # The per-frame rewrite already produced one pad per merged token for
        # every frame; all that is left is to turn the masked pads back into
        # real <|image_pad|> tokens before tokenizing.
        if videos is not None:
            text = [s.replace(sentinel, IMAGE_PAD) for s in text]

        # ---------------- TOKENIZE ----------------
        encoding = self.tokenizer(
            text,
            padding=padding,
            return_tensors=return_tensors,
            **{k: v for k, v in kwargs.items() if k in (
                "max_length", "truncation", "add_special_tokens",
                "return_attention_mask", "return_token_type_ids",
            )},
        )
        out["input_ids"] = encoding["input_ids"]
        out["attention_mask"] = encoding.get(
            "attention_mask",
            torch.ones_like(encoding["input_ids"]),
        )

        # Best-effort wrapping: fall back to the plain dict if BatchFeature
        # is unavailable or cannot be constructed.
        try:
            from transformers.feature_extraction_utils import BatchFeature

            return BatchFeature(data=out)
        except Exception:
            return out

    # ---------------------------------------------------------------- decoding

    def batch_decode(self, *args, **kwargs):
        """Forward to ``tokenizer.batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to ``tokenizer.decode``."""
        return self.tokenizer.decode(*args, **kwargs)
398
+
399
+
400
+ __all__ = [
401
+ "LlavaOnevision2Processor",
402
+ "VISION_START",
403
+ "VISION_END",
404
+ "IMAGE_PAD",
405
+ "VIDEO_PAD",
406
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0c439f7be467bf47d12a7e6f9adc6116201056fc60c67f431c679b7c16afc8
3
+ size 11422064
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null,
208
+ "use_fast": true
209
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "video_processor_type": "LlavaOnevision2VideoProcessor",
3
+ "processor_class": "LlavaOnevision2Processor",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_llava_onevision2.LlavaOnevision2Processor",
6
+ "AutoVideoProcessor": "video_processing_llava_onevision2.LlavaOnevision2VideoProcessor"
7
+ },
8
+ "max_frames": 768,
9
+ "fixed_num_frames": null,
10
+ "target_fps": null,
11
+ "min_pixels": 3136,
12
+ "max_pixels": 12845056,
13
+ "patch_size": 14,
14
+ "spatial_merge_size": 2,
15
+ "temporal_patch_size": 1,
16
+ "resize_frames": true
17
+ }
video_processing_llava_onevision2.py ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Video frame extraction helpers for LlavaOnevision2 native video input.
2
+
3
+ Helpers (decord-first / opencv-fallback decoding) are used by
4
+ ``LlavaOnevision2VideoProcessor`` defined below.
5
+
6
+ The helpers were ported from the training pipeline with minor cleanups:
7
+ - dropped wrapper-only imports
8
+ - consolidated timestamp helpers
9
+ - kept decord-first / opencv-fallback decoding identical
10
+
11
+ Public API:
12
+ - format_timestamp(seconds) -> "MM:SS.ffffff" (six fractional digits)
13
+ - choose_target_frames(duration, max_frames, fixed_num_frames=None,
14
+ target_fps=None) -> int
15
+ - select_frame_indices(frame_count, target_count) -> list[int]
16
+ - smart_resize(h, w, patch_size=14, min_pixels=None, max_pixels=None,
17
+ align_patch_size=None) -> (h, w)
18
+ - extract_video_frames(video_path, ...) -> (frames_np, frame_indices,
19
+ timestamps_dict)
20
+ - extract_video_frames_to_pil(video_path, ...) -> (frames_pil, frame_indices,
21
+ timestamps_dict)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ import math
28
+ from typing import List, Optional, Tuple
29
+
30
+ import numpy as np
31
+ import torch
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # =============================================================================
37
+ # Timestamp helpers
38
+ # =============================================================================
39
+
40
def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as ``MM:SS.ffffff``.

    Minutes are zero-padded to at least two digits and are not wrapped into
    hours; the seconds field is zero-padded to two integer digits with six
    fractional digits.

    If the seconds remainder rounds up to ``60.000000`` at six decimals
    (e.g. ``59.9999996``), the carry is moved into the minutes field so the
    seconds field always stays below 60.
    """
    minutes = int(seconds // 60)
    sec = seconds - minutes * 60
    sec_text = f"{sec:09.6f}"
    # Formatting rounds to microseconds, which can push the remainder up to
    # exactly 60 seconds; carry that into the minutes.
    if float(sec_text) >= 60.0:
        minutes += 1
        sec_text = f"{0.0:09.6f}"
    return f"{minutes:02d}:{sec_text}"
44
+
45
+
46
def time_str_to_seconds(t: str) -> float:
    """Parse a ``MM:SS[.fraction]`` string into seconds.

    The string must contain exactly one ``:``; the part before it is read as
    an integer number of minutes and the part after it as float seconds.
    Inverse of :func:`format_timestamp`.
    """
    minute_part, second_part = t.split(":")
    whole_minutes = int(minute_part)
    return whole_minutes * 60 + float(second_part)
53
+
54
+
55
+ # =============================================================================
56
+ # Frame-count / index selection
57
+ # =============================================================================
58
+
59
def choose_target_frames(
    duration_seconds: float,
    max_frames: int,
    fixed_num_frames: Optional[int] = None,
    target_fps: Optional[float] = None,
) -> int:
    """Decide how many frames to sample from a video of the given duration.

    Rules, in priority order:
      1. A positive ``target_fps`` wins: ``int(duration * fps)``, at least 1
         and at most ``max_frames``.
      2. Otherwise a non-``None`` ``fixed_num_frames`` is returned as-is
         (it is not capped by ``max_frames``).
      3. Otherwise the count depends on duration: under 10 s -> 8 frames,
         under 30 s -> 16 frames, anything longer -> ``max_frames``.
         The 8 / 16 defaults are returned regardless of ``max_frames``.
    """
    fps_requested = target_fps is not None and target_fps > 0
    if fps_requested:
        sampled = max(1, int(duration_seconds * target_fps))
        return min(sampled, max_frames)

    if fixed_num_frames is not None:
        return fixed_num_frames

    # (exclusive duration bound in seconds, frame count) for short clips.
    for upper_bound, count in ((10, 8), (30, 16)):
        if duration_seconds < upper_bound:
            return count
    return max_frames
83
+
84
+
85
def select_frame_indices(frame_count: int, target_count: int) -> List[int]:
    """Pick which frame indices to keep.

    When the video has no more frames than requested, every index
    ``0 .. frame_count - 1`` is returned. Otherwise ``target_count`` evenly
    spaced positions between the first and last frame are computed with
    ``torch.linspace`` and rounded to integer indices.
    """
    if target_count < frame_count:
        positions = torch.linspace(0, frame_count - 1, target_count)
        return positions.round().long().tolist()
    return list(range(frame_count))
89
+
90
+
91
+ # =============================================================================
92
+ # Spatial resize
93
+ # =============================================================================
94
+
95
def smart_resize(height, width, patch_size=14, min_pixels=None, max_pixels=None, align_patch_size=None):
    """Compute a target ``(height, width)`` aligned to a patch multiple.

    Both sides are first rounded to the nearest multiple of the alignment
    unit (``align_patch_size`` if given, else ``patch_size``), never below
    one unit. If the rounded area exceeds ``max_pixels`` the original size
    is scaled down (flooring to the unit); else if it is below
    ``min_pixels`` the original size is scaled up (ceiling to the unit).
    The aspect ratio is approximately preserved.

    Args:
        height: source height in pixels; must be positive.
        width: source width in pixels; must be positive.
        patch_size: default alignment unit.
        min_pixels: lower bound on the output area; falsy values disable it.
        max_pixels: upper bound on the output area; falsy values disable it.
        align_patch_size: overrides ``patch_size`` as the alignment unit.

    Returns:
        ``(h, w)`` as ints, each a positive multiple of the alignment unit.

    Raises:
        ValueError: if ``height`` or ``width`` is not positive.
    """
    if height <= 0 or width <= 0:
        raise ValueError(f"Invalid size: height={height}, width={width}")
    factor = align_patch_size or patch_size
    h_bar = max(factor, int(round(height / factor) * factor))
    w_bar = max(factor, int(round(width / factor) * factor))
    if max_pixels and h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Clamp to one unit: for extreme aspect ratios the floor would
        # otherwise collapse the short side to 0. In that case the result
        # may exceed ``max_pixels``, which is preferable to an empty frame.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif min_pixels and h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return int(h_bar), int(w_bar)
110
+
111
+
112
+ # =============================================================================
113
+ # Frame extraction (decord first, opencv fallback)
114
+ # =============================================================================
115
+
116
def _resize_frame_cv2(frame, patch_size, min_pixels, max_pixels):
    """Resize one RGB frame with OpenCV to the :func:`smart_resize` target size.

    The target is aligned to ``patch_size * 2``. ``INTER_AREA`` is used when
    either side shrinks, ``INTER_LINEAR`` otherwise. The input frame is
    returned untouched when it already has the target size.
    """
    import cv2  # type: ignore

    src_h, src_w = frame.shape[0], frame.shape[1]
    resized_h, resized_w = smart_resize(
        src_h,
        src_w,
        patch_size,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
        align_patch_size=patch_size * 2,
    )
    if (resized_h, resized_w) == (src_h, src_w):
        return frame
    shrinking = resized_h < src_h or resized_w < src_w
    interp = cv2.INTER_AREA if shrinking else cv2.INTER_LINEAR
    return cv2.resize(frame, (resized_w, resized_h), interpolation=interp)


def _decode_with_decord(
    video_path,
    max_frames,
    patch_size,
    min_pixels,
    max_pixels,
    resize_frames,
    fixed_num_frames,
    target_fps,
):
    """Decode the sampled frames with decord; raises on any failure.

    Returns ``(frames, frame_indices, timestamps)`` built from local state
    only, so a failure here cannot leave partial results behind.
    """
    import decord  # type: ignore

    vr = decord.VideoReader(video_path)
    frame_count = len(vr)
    fps = vr.get_avg_fps()
    if not fps or fps <= 0:
        fps = 30.0  # missing / invalid metadata: assume 30 fps

    duration = frame_count / fps
    target_count = choose_target_frames(
        duration, max_frames, fixed_num_frames, target_fps
    )
    selected_indices = select_frame_indices(frame_count, target_count)

    # One-shot batch decode + torchvision BICUBIC+antialias resize.
    # Mirrors qwen_vl_utils.fetch_video, replacing per-frame cv2 INTER_AREA/LINEAR.
    arr = vr.get_batch(selected_indices).asnumpy()  # [N,H,W,3] uint8 RGB
    src_h, src_w = arr.shape[1], arr.shape[2]
    if resize_frames and (min_pixels or max_pixels):
        resized_h, resized_w = smart_resize(
            src_h,
            src_w,
            patch_size,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            align_patch_size=patch_size * 2,
        )
        if (resized_h, resized_w) != (src_h, src_w):
            from torchvision import transforms as _T
            from torchvision.transforms import InterpolationMode as _IM

            # torchvision expects channels-first: [N,H,W,3] -> [N,3,H,W].
            video_t = torch.from_numpy(arr).permute(0, 3, 1, 2).contiguous()
            video_t = _T.functional.resize(
                video_t,
                [resized_h, resized_w],
                interpolation=_IM.BICUBIC,
                antialias=True,
            )
            arr = video_t.permute(0, 2, 3, 1).contiguous().numpy()

    frame_indices = [int(idx) for idx in selected_indices]
    timestamps = {str(idx): format_timestamp(idx / fps) for idx in frame_indices}
    return list(arr), frame_indices, timestamps


def _decode_with_opencv(
    video_path,
    max_frames,
    patch_size,
    min_pixels,
    max_pixels,
    resize_frames,
    fixed_num_frames,
    target_fps,
):
    """Decode the sampled frames with OpenCV.

    Returns ``(frames, frame_indices, timestamps)``; all three are empty when
    the file cannot be opened. The capture handle is always released, even
    when decoding or resizing raises.
    """
    import cv2  # type: ignore

    frames: List[np.ndarray] = []
    frame_indices: List[int] = []
    timestamps: dict = {}

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            logger.warning(f"OpenCV also failed to open video, skipped: {video_path}")
            return frames, frame_indices, timestamps

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            fps = 30.0  # missing / invalid metadata: assume 30 fps

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        do_resize = bool(resize_frames and (min_pixels or max_pixels))

        def _keep(frame_idx, frame):
            # Shared tail of both sampling strategies below.
            if do_resize:
                frame = _resize_frame_cv2(frame, patch_size, min_pixels, max_pixels)
            frames.append(frame)
            # Key with str(int(...)) so lookups agree with the decord path.
            timestamps[str(frame_idx)] = format_timestamp(frame_idx / fps)
            frame_indices.append(frame_idx)

        if frame_count > 0:
            duration = frame_count / fps
            target_count = choose_target_frames(
                duration, max_frames, fixed_num_frames, target_fps
            )
            for raw_idx in select_frame_indices(frame_count, target_count):
                frame_idx = int(raw_idx)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if not ret:
                    # Unreadable frame: skip it, keeping frames/indices aligned.
                    continue
                _keep(frame_idx, cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        else:
            # Unknown frame count: read sequentially then sample.
            # NOTE: this holds every decoded frame in memory until sampling.
            decoded: List[np.ndarray] = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                decoded.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            if decoded:
                duration = len(decoded) / fps
                target_count = choose_target_frames(
                    duration, max_frames, fixed_num_frames, target_fps
                )
                for raw_idx in select_frame_indices(len(decoded), target_count):
                    frame_idx = int(raw_idx)
                    _keep(frame_idx, decoded[frame_idx])
    finally:
        cap.release()

    return frames, frame_indices, timestamps


def extract_video_frames(
    video_path: str,
    max_frames: int = 32,
    patch_size: int = 14,
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
    resize_frames: bool = True,
    fixed_num_frames: Optional[int] = None,
    target_fps: Optional[float] = None,
) -> Tuple[List[np.ndarray], torch.Tensor, dict]:
    """Extract sampled RGB frames from a video file.

    The number of frames is chosen by :func:`choose_target_frames` and the
    positions by :func:`select_frame_indices`. Decoding is attempted with
    decord first; on *any* exception a warning is logged and OpenCV is used
    instead. Each backend builds its own result containers, so a partial
    decord failure never contaminates the OpenCV result.

    Args:
        video_path: path to the input video file.
        max_frames: cap for long videos (forwarded to
            :func:`choose_target_frames`).
        patch_size: vision tower patch size; frames are aligned to
            ``patch_size * 2`` when resized.
        min_pixels: minimum pixel budget for resize.
        max_pixels: maximum pixel budget for resize.
        resize_frames: whether to apply :func:`smart_resize`. Resizing only
            happens when at least one of ``min_pixels`` / ``max_pixels`` is
            truthy.
        fixed_num_frames: see :func:`choose_target_frames`.
        target_fps: see :func:`choose_target_frames`.

    Returns:
        Tuple of:
            - ``frames``        : list of RGB ``np.ndarray`` (H, W, 3).
            - ``frame_indices`` : 1D ``torch.Tensor[int64]`` of selected indices.
            - ``timestamps``    : ``dict`` mapping ``str(frame_idx)`` to the
              string returned by :func:`format_timestamp` for
              ``frame_idx / fps``.
        All three are empty when neither backend can open the file.

    Notes:
        ``decord``, ``cv2`` and ``torchvision`` are imported lazily so the
        module stays importable when they are not installed.
    """
    options = (
        video_path,
        max_frames,
        patch_size,
        min_pixels,
        max_pixels,
        resize_frames,
        fixed_num_frames,
        target_fps,
    )

    # Prefer decord; deliberately best-effort, so any failure triggers the fallback.
    try:
        frames, frame_indices, timestamps = _decode_with_decord(*options)
    except Exception as e:
        logger.warning(
            f"decord failed to open {video_path}: {e}; falling back to OpenCV"
        )
    else:
        return frames, torch.tensor(frame_indices, dtype=torch.int64), timestamps

    # OpenCV fallback.
    frames, frame_indices, timestamps = _decode_with_opencv(*options)
    return frames, torch.tensor(frame_indices, dtype=torch.int64), timestamps
299
+
300
+
301
def extract_video_frames_to_pil(
    video_path: str,
    max_frames: int = 32,
    patch_size: int = 14,
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
    resize_frames: bool = True,
    fixed_num_frames: Optional[int] = None,
    target_fps: Optional[float] = None,
):
    """Decode a video via :func:`extract_video_frames` and return PIL frames.

    All arguments are forwarded unchanged. Returns
    ``(frames_pil, frame_indices, timestamps)`` where ``frames_pil`` is the
    list of decoded arrays converted with ``PIL.Image.fromarray`` and the
    other two items are passed through as returned by
    :func:`extract_video_frames`.
    """
    from PIL import Image  # local import: PIL is mandatory for the processor

    decode_options = dict(
        video_path=video_path,
        max_frames=max_frames,
        patch_size=patch_size,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
        resize_frames=resize_frames,
        fixed_num_frames=fixed_num_frames,
        target_fps=target_fps,
    )
    np_frames, frame_indices, timestamps = extract_video_frames(**decode_options)
    pil_frames = list(map(Image.fromarray, np_frames))
    return pil_frames, frame_indices, timestamps
326
+
327
+
328
+ # =============================================================================
329
+ # patch_positions construction (row-major + 2x2 block-layout reorder)
330
+ # =============================================================================
331
+ # Block-layout reorder mirroring the training pipeline, kept here so the
332
+ # VideoProcessor is self-contained.
333
+
334
+ def _convert_positions_to_block_layout(
335
+ positions: torch.Tensor,
336
+ t: int,
337
+ h: int,
338
+ w: int,
339
+ spatial_merge_size: int = 2,
340
+ ) -> torch.Tensor:
341
+ """Reorder ``[t*h*w, 3]`` row-major positions to 2x2 block layout."""
342
+ sms = spatial_merge_size
343
+ if sms == 1:
344
+ return positions
345
+ device = positions.device
346
+ total = t * h * w
347
+ indices = torch.arange(total, device=device).view(t, h, w)
348
+ h_m, w_m = h // sms, w // sms
349
+ indices = (
350
+ indices.view(t, h_m, sms, w_m, sms)
351
+ .permute(0, 1, 3, 2, 4)
352
+ .contiguous()
353
+ .view(total)
354
+ )
355
+ return positions[indices]
356
+
357
+
358
def build_patch_positions(
    grid_thw: torch.Tensor,
    spatial_merge_size: int = 2,
    frame_indices: Optional[List[Optional[torch.Tensor]]] = None,
) -> torch.Tensor:
    """Build block-layout ``[t, h, w]`` patch positions for one or many samples.

    Args:
        grid_thw: ``[num_samples, 3]`` tensor of ``(T, H_p, W_p)`` per sample.
        spatial_merge_size: vision tower spatial-merge size (default 2);
            forwarded to :func:`_convert_positions_to_block_layout`.
        frame_indices: optional list (one entry per row of ``grid_thw``) of
            frame indices to use as the t-coordinate. Each entry must contain
            exactly ``T`` values for its sample. An entry of ``None``, or a
            list shorter than ``grid_thw``, falls back to dense
            ``arange(T)`` for the affected samples.

    Returns:
        ``[sum(T*H_p*W_p), 3]`` int64 tensor in block layout. An empty
        ``grid_thw`` yields an empty ``[0, 3]`` tensor.

    Raises:
        ValueError: if a ``frame_indices`` entry does not have ``T`` elements.
    """
    out = []
    for sample_idx, row in enumerate(grid_thw):
        t_v, h_v, w_v = int(row[0]), int(row[1]), int(row[2])
        patches_per_frame = h_v * w_v

        # Row-major (t, h, w): h changes every w_v patches, w cycles fastest.
        h_coords = torch.arange(h_v, dtype=torch.int64).repeat_interleave(w_v).repeat(t_v)
        w_coords = torch.arange(w_v, dtype=torch.int64).repeat(h_v).repeat(t_v)

        # t-coords: prefer the supplied frame_indices when given.
        sample_frame_idx = None
        if frame_indices is not None and sample_idx < len(frame_indices):
            sample_frame_idx = frame_indices[sample_idx]

        if sample_frame_idx is None:
            # Dense fallback: frame k gets t == k.
            per_frame_t = torch.arange(t_v, dtype=torch.int64)
        else:
            per_frame_t = torch.as_tensor(sample_frame_idx, dtype=torch.int64).reshape(-1)
            if per_frame_t.numel() != t_v:
                raise ValueError(
                    f"frame_indices[{sample_idx}] has length {per_frame_t.numel()} but "
                    f"grid_thw[{sample_idx}, 0] = {t_v}"
                )
        t_coords = per_frame_t.repeat_interleave(patches_per_frame)

        pp = torch.stack([t_coords, h_coords, w_coords], dim=1)
        pp = _convert_positions_to_block_layout(pp, t_v, h_v, w_v, spatial_merge_size)
        out.append(pp)

    if not out:
        # torch.cat rejects an empty list; return a well-formed empty result.
        return torch.empty((0, 3), dtype=torch.int64)
    return torch.cat(out, dim=0)
405
+
406
+
407
+ # =============================================================================
408
+ # LlavaOnevision2VideoProcessor
409
+ # =============================================================================
410
+ # A thin processor that wraps `Qwen2VLImageProcessor` to convert raw video
411
+ # files (or pre-decoded frame lists) into the tensor bundle needed by the
412
+ # LlavaOnevision2 model.
413
+ #
414
+ # Output (BatchFeature):
415
+ # - pixel_values_videos : [sum(T*H_p*W_p), C, P, P] patch tensor
416
+ # - video_grid_thw : [num_videos, 3] (T_eff, H_p, W_p)
417
+ # - patch_positions : [sum(T*H_p*W_p), 3] block layout
418
+ # - frame_timestamps : list[list[float]] per-video per-frame seconds
419
+ #
420
+ # Aligned with the modeling code, we deliberately
421
+ # DO NOT emit `second_per_grid_ts`.
422
+
423
class LlavaOnevision2VideoProcessor:
    """Decode + sample + patch-ify videos for LlavaOnevision2.

    Designed to be standalone (does not inherit ``transformers.ProcessorMixin``)
    so it can be unit-tested without the full Processor stack.

    Calling an instance returns ``pixel_values_videos``, ``video_grid_thw``,
    ``patch_positions`` and ``frame_timestamps``; ``second_per_grid_ts`` is
    deliberately not emitted.
    """

    # Canonical defaults.
    DEFAULT_MAX_FRAMES = 384
    DEFAULT_PATCH_SIZE = 14
    DEFAULT_SPATIAL_MERGE_SIZE = 2
    DEFAULT_TEMPORAL_PATCH_SIZE = 1  # this checkpoint ships tps=1
    DEFAULT_MIN_PIXELS = 256 * 28 * 28
    DEFAULT_MAX_PIXELS = 1605632  # == 2048 * 28 * 28

    def __init__(
        self,
        image_processor=None,
        max_frames: int = DEFAULT_MAX_FRAMES,
        fixed_num_frames: Optional[int] = None,
        target_fps: Optional[float] = None,
        patch_size: int = DEFAULT_PATCH_SIZE,
        spatial_merge_size: int = DEFAULT_SPATIAL_MERGE_SIZE,
        temporal_patch_size: int = DEFAULT_TEMPORAL_PATCH_SIZE,
        min_pixels: int = DEFAULT_MIN_PIXELS,
        max_pixels: int = DEFAULT_MAX_PIXELS,
        resize_frames: bool = True,
    ):
        """
        Args:
            image_processor: a `Qwen2VLImageProcessor` instance. If ``None`` an
                instance is built from the other kwargs on first access of
                :attr:`image_processor`.
            max_frames / fixed_num_frames / target_fps: see
                :func:`choose_target_frames`.
            patch_size: vision tower patch size (default 14).
            spatial_merge_size: vision tower spatial merge factor (default 2).
            temporal_patch_size: temporal-patch grouping; this checkpoint
                ships ``temporal_patch_size=1`` so each pv row is one single
                patch (3*14*14=588) and ``sum(t*h*w) == total_patches``
                naturally. Override only if loading a non-default processor.
            min_pixels / max_pixels: smart_resize budget.
            resize_frames: whether to resize frames before patching.
        """
        self._image_processor = image_processor
        self.max_frames = max_frames
        self.fixed_num_frames = fixed_num_frames
        self.target_fps = target_fps
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.resize_frames = resize_frames

    # ------------------------------------------------------------------ utils

    @property
    def image_processor(self):
        """Lazy-build the underlying `Qwen2VLImageProcessor`."""
        if self._image_processor is None:
            from transformers import Qwen2VLImageProcessor

            self._image_processor = Qwen2VLImageProcessor(
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
                patch_size=self.patch_size,
                merge_size=self.spatial_merge_size,
                temporal_patch_size=self.temporal_patch_size,
            )
        return self._image_processor

    @staticmethod
    def _coerce_video_input(video):
        """Normalise a single pre-decoded video to ``(frames_pil, None)``.

        Accepts:
          - ``str`` path to a video file: returns ``None`` as a signal that
            the caller must decode it via ``extract_video_frames_to_pil``,
          - non-empty ``list[PIL.Image]`` (returned as a shallow copy),
          - non-empty ``list[np.ndarray]`` (converted with
            ``Image.fromarray``).

        The second tuple item is always ``None`` (no timestamps are known for
        pre-decoded frames).

        Raises:
            TypeError: for any other input, including an empty list.
        """
        from PIL import Image

        if isinstance(video, str):
            return None  # signal: use video path through extract_video_frames_to_pil
        if isinstance(video, list) and len(video) > 0:
            first = video[0]
            if isinstance(first, Image.Image):
                return list(video), None
            if isinstance(first, np.ndarray):
                return [Image.fromarray(f) for f in video], None
        raise TypeError(
            f"Unsupported video input type: {type(video).__name__}. "
            "Expected file path, list[PIL.Image], or list[np.ndarray]."
        )

    @staticmethod
    def _normalize_videos(videos):
        """Turn the ``videos`` argument of ``__call__`` into a batch.

        - a non-list/tuple value (e.g. a path) becomes a one-element batch;
        - a list/tuple whose first element is a ``str`` or a ``list`` is
          already a batch and is returned unchanged;
        - a ``list`` whose first element is a PIL image or ``np.ndarray`` is a
          single pre-decoded video and is wrapped into a one-element batch, so
          it is not mistaken for a batch of single-frame videos;
        - anything else is returned unchanged and validated per element later.

        Raises:
            ValueError: if ``videos`` is an empty list/tuple.
        """
        if not isinstance(videos, (list, tuple)):
            return [videos]
        if len(videos) == 0:
            raise ValueError("`videos` is empty; expected at least one video.")

        first = videos[0]
        if isinstance(first, (str, list)):
            return videos

        from PIL import Image

        if isinstance(videos, list) and isinstance(first, (Image.Image, np.ndarray)):
            return [videos]
        return videos

    # ---------------------------------------------------------------- __call__

    def __call__(
        self,
        videos,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ):
        """Process one or several videos.

        Args:
            videos: a single video or a list of videos. Each video may be a
                path, a list of PIL frames, or a list of np.ndarray RGB frames.
            return_tensors: only ``"pt"`` (or ``None``) is accepted; the
                outputs are torch tensors in both cases.
            **kwargs: ignored / reserved for transformers ProcessorMixin
                compatibility (e.g. ``do_rescale``).

        Returns:
            A ``BatchFeature`` (or a plain ``dict`` when ``BatchFeature``
            cannot be imported) with keys:
                - ``pixel_values_videos`` : per-video ``pixel_values`` from
                  the image processor, concatenated along dim 0
                - ``video_grid_thw``      : Tensor ``[num_videos, 3]`` (T, H_p, W_p)
                - ``patch_positions``     : Tensor ``[N_total_patches, 3]`` block layout
                - ``frame_timestamps``    : ``list[list[float]]`` per video. For
                  path inputs these are the sampled frames' timestamps in
                  seconds; for pre-decoded frames they are simply
                  ``0.0, 1.0, 2.0, ...`` (the frame position).

        Raises:
            ValueError: unsupported ``return_tensors``, an empty ``videos``
                batch, or a video that yields no frames.
            TypeError: a video of an unsupported type.
            RuntimeError: frames of one video produce different patch grids.
        """
        if return_tensors not in (None, "pt"):
            raise ValueError(
                f"return_tensors={return_tensors!r} not supported; only 'pt' is."
            )

        # Normalise to a list of videos.
        videos = self._normalize_videos(videos)

        per_video_pixel_values = []
        per_video_grid_thw = []
        per_video_patch_positions = []
        frame_timestamps_all: List[List[float]] = []

        for video in videos:
            # 1) Decode + sample
            if isinstance(video, str):
                frames_pil, frame_indices, timestamps = extract_video_frames_to_pil(
                    video_path=video,
                    max_frames=self.max_frames,
                    patch_size=self.patch_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                    resize_frames=self.resize_frames,
                    fixed_num_frames=self.fixed_num_frames,
                    target_fps=self.target_fps,
                )
                # Convert each sampled frame's timestamp string back to
                # seconds; a frame without a timestamp entry gets 0.0.
                seconds_seq: List[float] = []
                if len(frames_pil) > 0:
                    for fi in frame_indices.tolist():
                        ts = timestamps.get(str(int(fi)))
                        seconds_seq.append(
                            0.0 if ts is None else time_str_to_seconds(ts)
                        )
                # Real frame indices in the source video (training convention
                # for the t-axis of patch_positions).
                frame_indices_t = frame_indices.to(torch.int64)
            else:
                frames_pil, _ = self._coerce_video_input(video)
                seconds_seq = [float(i) for i in range(len(frames_pil))]
                # Without the original video we have no real indices; fall back
                # to dense ``arange(T)``.
                frame_indices_t = torch.arange(len(frames_pil), dtype=torch.int64)

            if len(frames_pil) == 0:
                raise ValueError(f"No frames decoded from video: {video!r}")

            # 2) Patch-ify via Qwen2VLImageProcessor.
            #    Video frames go through the *image* path, one frame == one
            #    image. The resulting `image_grid_thw` has shape ``[N, 3]``
            #    with each row ``[1, H_p, W_p]``. We then merge into a single
            #    video grid row ``[T=N, H_p, W_p]`` (all frames must share
            #    H_p/W_p, which is checked below).
            #
            #    Important: this checkpoint ships an image processor with
            #    ``temporal_patch_size=1``, so each pv row encodes ONE single
            #    patch (3*14*14 = 588) and ``sum(t*h*w) == total_patches``
            #    holds with the natural per-frame grid. The lazy-built
            #    fallback in ``image_processor`` honors
            #    ``temporal_patch_size=1`` to keep standalone tests aligned
            #    with the checkpoint convention.
            data = self.image_processor(images=frames_pil, return_tensors="pt")
            pixel_values = data["pixel_values"]
            image_grid_thw = data["image_grid_thw"]  # [N, 3]

            same_h = torch.all(image_grid_thw[:, 1] == image_grid_thw[0, 1])
            same_w = torch.all(image_grid_thw[:, 2] == image_grid_thw[0, 2])
            if not same_h or not same_w:
                raise RuntimeError(
                    "Frames yielded inconsistent (H_p, W_p); smart_resize should "
                    f"prevent this. Got grid_thw={image_grid_thw.tolist()}"
                )

            T_eff = int(image_grid_thw[:, 0].sum().item())  # sum of per-frame t (each is 1)
            H_p = int(image_grid_thw[0, 1].item())
            W_p = int(image_grid_thw[0, 2].item())
            video_grid_thw = torch.tensor(
                [[T_eff, H_p, W_p]], dtype=image_grid_thw.dtype
            )

            # 3) patch_positions in block layout (over the merged video grid).
            #    Use REAL frame_indices for the t-axis (training convention).
            patch_positions = build_patch_positions(
                video_grid_thw,
                spatial_merge_size=self.spatial_merge_size,
                frame_indices=[frame_indices_t],
            )

            per_video_pixel_values.append(pixel_values)
            per_video_grid_thw.append(video_grid_thw)
            per_video_patch_positions.append(patch_positions)
            frame_timestamps_all.append(seconds_seq)

        payload = {
            "pixel_values_videos": torch.cat(per_video_pixel_values, dim=0),
            "video_grid_thw": torch.cat(per_video_grid_thw, dim=0),
            "patch_positions": torch.cat(per_video_patch_positions, dim=0),
            "frame_timestamps": frame_timestamps_all,
        }

        # Best effort: wrap in BatchFeature when transformers is usable,
        # otherwise hand back the plain dict.
        try:
            from transformers.feature_extraction_utils import BatchFeature
        except Exception:
            return payload
        return BatchFeature(data=payload)
682
+
683
+
684
+ __all__ = [
685
+ "format_timestamp",
686
+ "time_str_to_seconds",
687
+ "choose_target_frames",
688
+ "select_frame_indices",
689
+ "smart_resize",
690
+ "extract_video_frames",
691
+ "extract_video_frames_to_pil",
692
+ "build_patch_positions",
693
+ "LlavaOnevision2VideoProcessor",
694
+ ]
vocab.json ADDED
The diff for this file is too large to render. See raw diff