AndyZijianZhang commited on
Commit
1182c0e
·
verified ·
1 Parent(s): 70103aa

Upload files with `vila-upload`.

Browse files

Upload tokenizer_config.json
Upload config.json
Upload configuration_vila.py
Upload generation_config.json
Upload special_tokens_map.json
Upload merges.txt
Upload model.safetensors
Upload added_tokens.json
Upload processing_vila.py
Upload vocab.json
Upload processor_config.json
Upload modeling_vila.py
Upload chat_template.json
Upload preprocessor_config.json

added_tokens.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<image>": 151648,
3
+ "<video>": 151649,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "[BOS]": 151646,
8
+ "[PAD]": 151647
9
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{{ '<|im_start|>' + message['role'] + '\\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\\n' }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{{ '<image>' }}{% elif content['type'] == 'video' or 'video' in content %}{{ '<video>' }}{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{ '<|im_end|>\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}\n"
3
+ }
config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "VILAForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_vila.VILAConfig",
7
+ "AutoModel": "modeling_vila.VILAForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_vila.VILAForConditionalGeneration",
9
+ "AutoModelForImageTextToText": "modeling_vila.VILAForConditionalGeneration",
10
+ "AutoModelForVision2Seq": "modeling_vila.VILAForConditionalGeneration"
11
+ },
12
+ "hidden_size": 1536,
13
+ "image_end_token_id": 198,
14
+ "image_token_id": 151648,
15
+ "mm_hidden_size": 1152,
16
+ "mm_projector_type": "mlp_downsample_3x3_fix",
17
+ "mm_vision_select_feature": "cls_patch",
18
+ "mm_vision_select_layer": -2,
19
+ "model_type": "vila",
20
+ "text_config": {
21
+ "architectures": [
22
+ "Qwen2ForCausalLM"
23
+ ],
24
+ "attention_dropout": 0.0,
25
+ "bos_token_id": 151643,
26
+ "eos_token_id": 151645,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 1536,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 8960,
31
+ "max_position_embeddings": 32768,
32
+ "max_window_layers": 28,
33
+ "model_max_length": 4096,
34
+ "model_type": "qwen2",
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 28,
37
+ "num_key_value_heads": 2,
38
+ "rms_norm_eps": 1e-06,
39
+ "rope_scaling": null,
40
+ "rope_theta": 1000000.0,
41
+ "sliding_window": null,
42
+ "tie_word_embeddings": true,
43
+ "tokenizer_model_max_length": 4096,
44
+ "tokenizer_padding_side": "right",
45
+ "torch_dtype": "bfloat16",
46
+ "use_cache": true,
47
+ "use_sliding_window": false,
48
+ "vocab_size": 151648
49
+ },
50
+ "torch_dtype": "bfloat16",
51
+ "transformers_version": "4.51.3",
52
+ "video_token_id": 151649,
53
+ "vision_config": {
54
+ "architectures": [
55
+ "SiglipVisionModel"
56
+ ],
57
+ "attention_dropout": 0.0,
58
+ "hidden_act": "gelu_pytorch_tanh",
59
+ "hidden_size": 1152,
60
+ "image_size": 448,
61
+ "intermediate_size": 4304,
62
+ "layer_norm_eps": 1e-06,
63
+ "model_type": "siglip_vision_model",
64
+ "num_attention_heads": 16,
65
+ "num_channels": 3,
66
+ "num_hidden_layers": 27,
67
+ "num_image_tokens": 256,
68
+ "patch_size": 14,
69
+ "projection_dim": 2048,
70
+ "projector_hidden_act": "gelu_fast",
71
+ "torch_dtype": "bfloat16",
72
+ "vision_use_head": false
73
+ }
74
+ }
configuration_vila.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
5
+ from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
6
+
7
+
8
class VILAConfig(PretrainedConfig):
    """Configuration for the VILA vision-language model.

    Bundles a Qwen2 text config and a SigLIP vision config together with the
    multimodal-projector settings. Any constructor argument left as ``None``
    falls back to an NVILA-Lite default (see ``__init__``).
    """

    # Class attributes.
    model_type: str = "vila"
    # Sub-config registry used by transformers' auto machinery.
    sub_configs: Dict[str, PretrainedConfig] = {
        "text_config": Qwen2Config(),
        "vision_config": SiglipVisionConfig(),
    }
    _auto_class: Optional[str] = "AutoConfig"

    # Configuration for sub-modules.
    text_config: Qwen2Config = Qwen2Config()
    vision_config: SiglipVisionConfig = SiglipVisionConfig()

    # Model configuration.
    hidden_size: int  # LLM embedding width the projector maps into.
    image_token_id: int  # Token id of the <image> placeholder.
    image_end_token_id: int  # Token appended after each image's embeddings.
    mm_hidden_size: int  # Vision-tower feature width fed to the projector.
    mm_projector_type: str  # Architecture key for MultimodalProjector.
    mm_vision_select_feature: str  # Which part of the vision output to keep.
    mm_vision_select_layer: int  # Which vision hidden layer to use (negative = from end).
    video_token_id: int  # Token id of the <video> placeholder.

    def __init__(
        self,
        *,
        text_config: Optional[Dict[str, Any]] = None,
        vision_config: Optional[Dict[str, Any]] = None,
        hidden_size: Optional[int] = None,
        image_token_id: Optional[int] = None,
        image_end_token_id: Optional[int] = None,
        mm_hidden_size: Optional[int] = None,
        mm_projector_type: Optional[str] = None,
        mm_vision_select_feature: Optional[str] = None,
        mm_vision_select_layer: Optional[int] = None,
        video_token_id: Optional[int] = None,
        **kwargs,
    ):
        """Builds the config; ``None`` arguments take NVILA-Lite defaults.

        Args:
            text_config: kwargs dict for the Qwen2 text sub-config.
            vision_config: kwargs dict for the SigLIP vision sub-config.
            hidden_size: LLM hidden size.
            image_token_id: Id of the <image> placeholder token.
            image_end_token_id: Id of the token appended after each image.
            mm_hidden_size: Vision feature size entering the projector.
            mm_projector_type: Projector architecture name.
            mm_vision_select_feature: Vision feature selection mode.
            mm_vision_select_layer: Vision hidden-state layer index.
            video_token_id: Id of the <video> placeholder token.
            **kwargs: Forwarded to PretrainedConfig.
        """
        super().__init__(**kwargs)

        self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
        self.vision_config = SiglipVisionConfig(**vision_config) if vision_config else SiglipVisionConfig()

        # By default, we use settings from NVILA-Lite.
        # NOTE(review): the token-id defaults below (image=151649, video=151650) do not
        # match this checkpoint's tokenizer (added_tokens.json maps <image>->151648 and
        # <video>->151649). The values shipped in config.json override them, so the
        # defaults only matter when a VILAConfig is built from scratch — confirm against
        # NVILA-Lite before relying on them.
        self.hidden_size = hidden_size if hidden_size is not None else 1536
        self.image_token_id = image_token_id if image_token_id is not None else 151649
        self.image_end_token_id = image_end_token_id if image_end_token_id is not None else 198  # "\n"
        self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else 1152
        self.mm_projector_type = mm_projector_type if mm_projector_type is not None else "mlp_downsample_3x3_fix"
        self.mm_vision_select_feature = (
            mm_vision_select_feature if mm_vision_select_feature is not None else "cls_patch"
        )
        self.mm_vision_select_layer = mm_vision_select_layer if mm_vision_select_layer is not None else -2
        self.video_token_id = video_token_id if video_token_id is not None else 151650
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "pad_token_id": 151643,
6
+ "transformers_version": "4.51.3"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6d04890b3c56e2c052a6dd9b769b0a2b686769c3fbcc8250e9b4494b5575e7
3
+ size 4000366736
modeling_vila.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Type
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+ from transformers.configuration_utils import PretrainedConfig
7
+ from transformers.generation.utils import GenerationMixin
8
+ from transformers.modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
11
+ from transformers.models.siglip.modeling_siglip import SiglipVisionModel
12
+
13
+ from .configuration_vila import VILAConfig
14
+
15
+
16
class DownSampleBlock(nn.Module):
    """2x2 spatial downsampler for ViT patch embeddings.

    Folds every 2x2 neighborhood of the patch grid into the channel
    dimension, turning (N, L, C) into (N, L/4, 4C).
    """

    @staticmethod
    def flat_square(x: Tensor) -> Tensor:
        """Rearranges an (N, W, H, C) grid into (N, W/2, H/2, 4C), zero-padding odd sizes."""
        batch, width, height, channels = x.size()

        # Zero-pad so both spatial dimensions are even.
        if width % 2 == 1:
            pad = torch.zeros((batch, 1, height, channels), device=x.device, dtype=x.dtype)
            x = torch.concat([x, pad], dim=1).contiguous()
            batch, width, height, channels = x.size()
        if height % 2 == 1:
            pad = torch.zeros((batch, width, 1, channels), device=x.device, dtype=x.dtype)
            x = torch.concat([x, pad], dim=2).contiguous()
            batch, width, height, channels = x.size()

        # Fold each 2x2 spatial neighborhood into the channel axis.
        x = x.contiguous().view(batch, width, height // 2, channels * 2)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch, height // 2, width // 2, channels * 4)
        return x.permute(0, 2, 1, 3).contiguous()

    def forward(self, x: Tensor) -> Tensor:
        """Downsamples (N, L, C) embeddings to (N, L/4, 4C); L must be a perfect square."""
        side = int(x.shape[1] ** 0.5)
        grid = x.reshape(x.shape[0], side, side, -1)
        grid = self.flat_square(grid)
        return grid.reshape(grid.shape[0], -1, grid.shape[-1])
40
+
41
+
42
class DownSample3x3BlockFix(nn.Module):
    """3x3 spatial downsampler for ViT patch embeddings.

    Folds every 3x3 neighborhood of the patch grid into the channel
    dimension, turning (N, L, C) into (N, L/9, 9C).
    """

    @staticmethod
    def flat_square_3x3(x: Tensor) -> Tensor:
        """Rearranges an (N, W, H, C) grid into (N, W/3, H/3, 9C), zero-padding to multiples of 3."""
        batch, width, height, channels = x.size()

        # Zero-pad so both spatial dimensions are divisible by 3.
        if width % 3 != 0:
            pad = torch.zeros((batch, 3 - (width % 3), height, channels), device=x.device, dtype=x.dtype)
            x = torch.concat([x, pad], dim=1).contiguous()
            batch, width, height, channels = x.size()
        x = x.contiguous()
        if height % 3 != 0:
            pad = torch.zeros((batch, width, 3 - (height % 3), channels), device=x.device, dtype=x.dtype)
            x = torch.concat([x, pad], dim=2).contiguous()
            batch, width, height, channels = x.size()

        # Fold each 3x3 spatial neighborhood into the channel axis.
        x = x.view(batch, width, height // 3, channels * 3)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch, height // 3, width // 3, channels * 9)
        return x.permute(0, 2, 1, 3).contiguous()

    def forward(self, x: Tensor) -> Tensor:
        """Downsamples (N, L, C) embeddings to (N, L/9, 9C); L must be a perfect square."""
        side = int(x.shape[1] ** 0.5)
        grid = x.reshape(x.shape[0], side, side, -1)
        grid = self.flat_square_3x3(grid)
        return grid.reshape(grid.shape[0], -1, grid.shape[-1])
78
+
79
+
80
class MultimodalProjector(nn.Module):
    """Projects vision-tower features into the LLM embedding space.

    The architecture is chosen by ``config.mm_projector_type``; the resulting
    stack is stored in ``self.layers`` and cast to ``config.torch_dtype``.
    """

    layers: nn.Sequential

    def __init__(
        self,
        config: VILAConfig,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        projector_type = config.mm_projector_type
        if projector_type == "linear":
            # Plain linear projection, no spatial downsampling.
            modules = [
                nn.Linear(config.vision_config.hidden_size, config.hidden_size),
            ]
        elif projector_type == "mlp_downsample":
            # 2x2 downsample (C -> 4C) followed by a 2-layer MLP.
            modules = [
                DownSampleBlock(),
                nn.LayerNorm(config.mm_hidden_size * 4),
                nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
                nn.GELU(),
                nn.Linear(config.hidden_size, config.hidden_size),
            ]
        elif projector_type == "mlp_downsample_3x3_fix":
            # 3x3 downsample (C -> 9C) with an intermediate 3C bottleneck.
            modules = [
                DownSample3x3BlockFix(),
                nn.LayerNorm(config.mm_hidden_size * 9),
                nn.Linear(
                    config.mm_hidden_size * 9,
                    config.mm_hidden_size * 3,
                ),
                nn.GELU(),
                nn.LayerNorm(config.vision_config.hidden_size * 3),
                nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
                nn.GELU(),
                nn.Linear(config.hidden_size, config.hidden_size),
            ]
        else:
            raise NotImplementedError(f"mm_projector_type={config.mm_projector_type} not implemented.")

        self.layers = nn.Sequential(*modules)
        self.layers.to(dtype=config.torch_dtype)

    @property
    def device(self) -> torch.device:
        """Device of the projector parameters."""
        return next(self.parameters()).device

    @property
    def dtype(self) -> torch.dtype:
        """Dtype of the projector parameters."""
        return next(self.parameters()).dtype

    def forward(self, x: Tensor) -> Tensor:
        """Applies the projection stack to vision features."""
        return self.layers(x)
133
+
134
+
135
class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
    """VILA vision-language model: SigLIP vision tower + MLP projector + Qwen2 LLM.

    Image embeddings are spliced into the text embedding sequence at the
    positions of <image> placeholder tokens before the LLM forward pass.
    """

    config_class: Type[PretrainedConfig] = VILAConfig
    base_model_prefix: str = "llm"
    _auto_class = "AutoModelForImageTextToText"
    _no_split_modules: List[str] = ["MultimodalProjector"]
    _skip_keys_device_placement: List[str] = ["past_key_values"]
    supports_gradient_checkpointing = True
    _supports_flash_attn_2: bool = True
    _supports_sdpa = True

    config: VILAConfig

    # Sub-modules.
    llm: Qwen2ForCausalLM
    mm_projector: MultimodalProjector
    vision_tower: SiglipVisionModel

    def __init__(
        self,
        config: VILAConfig,
        *args,
        **kwargs,
    ):
        """Builds the three sub-modules from their respective sub-configs."""
        super().__init__(config, *args, **kwargs)

        self.llm = Qwen2ForCausalLM(config.text_config, *args, **kwargs)
        self.mm_projector = MultimodalProjector(config)
        self.vision_tower = SiglipVisionModel(config.vision_config, *args, **kwargs)

        self.post_init()

    def forward(
        self,
        *,
        attention_mask: Optional[Tensor] = None,
        input_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        pixel_values: Optional[Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Runs the multimodal forward pass.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given; when
        ``inputs_embeds`` is supplied, ``pixel_values`` must be None (vision
        features can only be spliced in via ``input_ids``).
        """
        # Vision info is only used for prefilling.
        if kwargs.get("past_key_values", None) is not None:
            pixel_values = None

        inputs_embeds = inputs_embeds.to(dtype=self.dtype) if inputs_embeds is not None else None
        pixel_values = pixel_values.to(dtype=self.dtype) if pixel_values is not None else None

        if inputs_embeds is None:
            assert input_ids is not None

            inputs_embeds = self._embed(input_ids, pixel_values)
        else:
            assert input_ids is None
            assert pixel_values is None

        outputs = self.llm.__call__(
            inputs_embeds=inputs_embeds.to(
                device=self.llm.device,
                dtype=self.llm.dtype,
            ),
            attention_mask=(
                attention_mask.to(
                    device=self.llm.device,
                )
                if attention_mask is not None
                else None
            ),
            **kwargs,
        )

        return outputs

    def get_output_embeddings(self) -> nn.Module:
        """Returns the LLM's output embedding (LM head) module."""
        return self.llm.get_output_embeddings()

    def _embed(
        self,
        input_ids: Tensor,
        pixel_values: Optional[Tensor],
    ) -> Tensor:
        """Gets the embedding of the input ids and pixel values.

        Vision features are projected and scattered into the positions of the
        <image> placeholder tokens; the number of placeholder positions must
        therefore equal the total number of image feature vectors produced.

        Args:
            input_ids: The input ids.
            pixel_values: The pixel values.

        Returns:
            The embedding of the input ids and pixel values.
        """

        # Video tokens should be removed during preprocessing, so there must not be any video
        # tokens in the input ids.
        if torch.any(input_ids == self.config.video_token_id):
            raise ValueError("Video token ids should not be present in the input ids.")

        image_token_mask = input_ids == self.config.image_token_id

        # Zero out <image> positions so the embedding lookup stays in-vocabulary;
        # those positions are overwritten with image embeddings below.
        text_embedding: Tensor = self.llm.get_input_embeddings().__call__(input_ids * ~image_token_mask)

        if pixel_values is None:
            return text_embedding

        image_features: BaseModelOutputWithPooling = self.vision_tower.__call__(
            pixel_values.to(
                device=self.vision_tower.device,
                dtype=self.vision_tower.dtype,
            ),
            output_hidden_states=True,
        )
        assert image_features.hidden_states is not None

        # Select image feature.
        selected_layer_output = image_features.hidden_states[self.config.mm_vision_select_layer]
        match self.config.mm_vision_select_feature:
            case "cls_patch":
                selected_feature = selected_layer_output
            case _:
                raise NotImplementedError(
                    f"mm_vision_select_feature={self.config.mm_vision_select_feature} not implemented."
                )

        # TODO: Support dynamic_s2.

        image_embedding: Tensor = self.mm_projector.__call__(
            selected_feature.to(
                device=self.mm_projector.device,
                dtype=self.mm_projector.dtype,
            )
        )

        # Append image end token to every image embedding.
        image_end_token_embedding: Tensor = self.llm.get_input_embeddings().__call__(
            torch.tensor(
                self.config.image_end_token_id,
                device=next(self.llm.get_input_embeddings().parameters()).device,
                dtype=torch.long,
            ).view(1, -1)
        )  # Shape: (1, 1, dim_feature)
        image_end_token_embedding = image_end_token_embedding.expand(
            image_embedding.shape[0], 1, -1
        )  # Shape: (n_images, 1, dim_feature)
        image_embedding = torch.concat(
            [
                image_embedding,
                image_end_token_embedding.to(device=image_embedding.device),
            ],
            dim=1,
        )

        # Flatten (n_images, n_feature, dim) so it can be scattered into the
        # n_images * n_feature placeholder positions.
        n_images, n_feature, dim_feature = image_embedding.shape
        image_embedding = image_embedding.view(n_images * n_feature, dim_feature)

        text_embedding[image_token_mask.to(device=text_embedding.device)] = image_embedding.to(
            device=text_embedding.device
        )

        return text_embedding
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vila.VILAProcessor"
4
+ },
5
+ "do_convert_rgb": null,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "SiglipImageProcessor",
15
+ "image_std": [
16
+ 0.5,
17
+ 0.5,
18
+ 0.5
19
+ ],
20
+ "processor_class": "VILAProcessor",
21
+ "resample": 3,
22
+ "rescale_factor": 0.00392156862745098,
23
+ "size": {
24
+ "height": 448,
25
+ "width": 448
26
+ }
27
+ }
processing_vila.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, cast
2
+
3
+ import transformers.image_transforms as image_transforms
4
+ import transformers.image_utils as image_utils
5
+ import transformers.utils.logging
6
+ from PIL.Image import Image
7
+ from torch import Tensor
8
+ from transformers.feature_extraction_utils import BatchFeature
9
+ from transformers.image_processing_utils import BaseImageProcessor
10
+ from transformers.image_processing_utils_fast import BaseImageProcessorFast
11
+ from transformers.image_utils import ImageInput, VideoInput
12
+ from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor
13
+ from transformers.models.siglip.image_processing_siglip_fast import SiglipImageProcessorFast
14
+ from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
15
+ from transformers.tokenization_utils import PreTrainedTokenizer
16
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase, TextInput
17
+
18
+ logger = transformers.utils.logging.get_logger(__name__)
19
+
20
+
21
class VILAProcessorProcessingKwargs(ProcessingKwargs, total=False):
    """Processing-kwargs schema for VILAProcessor; no custom defaults are set."""

    _defaults = {}  # type: ignore
23
+
24
+
25
class VILAProcessorOutput(BatchFeature):
    """BatchFeature returned by VILAProcessor.__call__."""

    input_ids: List[List[int]] | Tensor  # Tokenized text with padded <image> tokens.
    attention_mask: List[List[int]] | Tensor  # Mask matching input_ids.
    pixel_values: Optional[List[Tensor] | Tensor]  # Processed image tiles; absent when no images.
29
+
30
+
31
class VILAProcessor(ProcessorMixin):
    """Processor for VILA: pairs an image processor with a tokenizer.

    Videos are expanded into per-frame image tokens, each image is cropped into
    aspect-ratio tiles, and every <image> token is finally repeated
    ``image_pad_len`` times so token count matches the model's per-tile
    embedding count.
    """

    attributes: List[str] = [
        "image_processor",
        "tokenizer",
    ]
    image_processor_class: str = "AutoImageProcessor"
    tokenizer_class: str = "AutoTokenizer"
    _auto_class: str = "AutoProcessor"
    valid_kwargs: List[str] = [
        "chat_template",
        "image_pad_len",
        "max_tiles",
        "min_tiles",
    ]

    # Attributes.
    image_processor: BaseImageProcessor | BaseImageProcessorFast
    tokenizer: PreTrainedTokenizerBase

    # Configuration parameters.
    image_pad_len: int  # Number of <image> tokens emitted per tile.
    max_tiles: int  # Upper bound on tiles per image.
    min_tiles: int  # Lower bound on tiles per image.

    def __init__(
        self,
        image_processor: BaseImageProcessor,
        tokenizer: PreTrainedTokenizer,
        *,
        image_pad_len: Optional[int] = None,
        max_tiles: Optional[int] = None,
        min_tiles: Optional[int] = None,
        **kwargs,
    ):
        """Initializes the processor; None values take the defaults 122/12/1."""
        super().__init__(
            image_processor,
            tokenizer,
            **kwargs,
        )

        self.image_pad_len = image_pad_len if image_pad_len is not None else 122
        self.max_tiles = max_tiles if max_tiles is not None else 12
        self.min_tiles = min_tiles if min_tiles is not None else 1

    def __call__(
        self,
        text: TextInput | List[TextInput],
        images: Optional[ImageInput] = None,
        videos: Optional[VideoInput] = None,
        audio: None = None,
        **kwargs: Unpack[VILAProcessorProcessingKwargs],
    ) -> VILAProcessorOutput:
        """Preprocesses inputs for VILA.

        Args:
            text: The text to be processed.
            images: The images to be processed.
            videos: The videos to be processed.
            audio: Not available.
            **kwargs: Additional arguments for processing.

        Returns:
            The processed inputs that can be fed to the model.
        """

        merged_kwargs = self._merge_kwargs(
            VILAProcessorProcessingKwargs,  # type: ignore
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Normalize text to a list and images/videos to PIL form.
        text, images, videos = self._prepare_inputs(
            text=text,
            images=images,
            videos=videos,
        )

        # Process videos: replace each <video> token by one <image> token per frame.
        text, images, video_flags = self._treat_videos_as_image_seqs(
            text=text,
            images=images,
            videos=videos,
        )

        # Process images: tile each image and run the image processor.
        image_inputs, num_cropped_images = self._process_images(
            images=images,
            **merged_kwargs["images_kwargs"],
        )

        # Process text: one <image> token per tile, then image_pad_len repeats each.
        text = self._pad_image_tokens_by_num_crops(
            text,
            num_cropped_images=num_cropped_images,
            video_flags=video_flags,
        )

        text = self._pad_image_tokens_by_num_embeddings(text)

        text_inputs = self.tokenizer.__call__(
            text,
            **merged_kwargs["text_kwargs"],
        )

        return VILAProcessorOutput(
            data={
                **text_inputs,
                **image_inputs,
            }
        )

    def _crop_image(
        self,
        image: Image,
    ) -> List[Image]:
        """Crops the image into multiple tiles.

        Args:
            image: The image to be cropped.

        Returns:
            The cropped images.
        """

        # TODO: Support more image processors.
        if not isinstance(self.image_processor, (SiglipImageProcessor, SiglipImageProcessorFast)):
            raise NotImplementedError

        # Tiles are square, sized to the image processor's input resolution.
        assert self.image_processor.size["height"] == self.image_processor.size["width"]
        cropped_size = self.image_processor.size["height"]

        cropped_images: List[Image] = dynamic_preprocess(
            image,
            min_num=self.min_tiles,
            max_num=self.max_tiles,
            image_size=cropped_size,
        )

        return cropped_images

    def _pad_image_tokens_by_num_crops(
        self,
        text: List[str],
        *,
        num_cropped_images: List[int],
        video_flags: List[bool],
    ) -> List[str]:
        """Pads each \\<image> to num_cropped_images of "\\<image>\\n" for images and "\\<video>" for videos.

        NOTE: consumes ``num_cropped_images`` and ``video_flags`` in place (via pop).

        Args:
            text: The text to be padded.
            num_cropped_images: The number of cropped images for each image token.
            video_flags: A list of flags indicating whether the num_cropped_images item is a video.

        Returns:
            The padded text.
        """

        assert len(num_cropped_images) == len(
            video_flags
        ), "num_cropped_images and video_flags must have the same length."

        image_token: str = cast(str, self.tokenizer.image_token)

        return_text: List[str] = []

        for text_item in text:
            return_text_item: str = ""

            # Repeatedly find image_token in the text.
            while image_token in text_item:
                image_pos = text_item.find(image_token)

                if image_pos != -1 and len(num_cropped_images) > 0:
                    num_crops = num_cropped_images.pop(0)
                    video_flag = video_flags.pop(0)

                    # Video frames get bare <image> tokens; still images get "<image>\n" per tile.
                    return_text_item += (
                        text_item[:image_pos] + (image_token if video_flag else (image_token + "\n")) * num_crops
                    )
                    text_item = text_item[image_pos + len(image_token) :]

                else:
                    break

            # Must place outside the while loop.
            if image_token in text_item:
                raise ValueError("Too many image tokens in the text.")

            return_text_item += text_item
            text_item = ""

            return_text.append(return_text_item)

        if len(num_cropped_images) != 0:
            raise ValueError("Too many images provided.")

        return return_text

    def _pad_image_tokens_by_num_embeddings(
        self,
        text: List[str],
    ) -> List[str]:
        """Pads each \\<image> to image_pad_len times of "\\<image>".

        Args:
            text: The text to be padded.

        Returns:
            The padded text.
        """

        return [
            text_item.replace(
                cast(str, self.tokenizer.image_token), cast(str, self.tokenizer.image_token) * self.image_pad_len
            )
            for text_item in text
        ]

    @staticmethod
    def _prepare_inputs(
        text: TextInput | List[TextInput],
        images: Optional[ImageInput],
        videos: Optional[VideoInput],
    ) -> Tuple[List[str], List[Image], List[List[Image]]]:
        """Normalizes raw inputs: text to a list, images/video frames to PIL images."""
        # Prepare text.
        text = text if isinstance(text, list) else [text]

        # Prepare images.
        if images is not None:
            image_list = cast(List, image_utils.make_flat_list_of_images(images))
            images = [image_transforms.to_pil_image(image) for image in image_list]
        else:
            images = cast(List[Image], [])

        # Prepare videos.
        if videos is not None:
            video_list = cast(List[List], image_utils.make_batched_videos(videos))
            videos = [[image_transforms.to_pil_image(image) for image in video] for video in video_list]
        else:
            videos = cast(List[List[Image]], [])

        return text, images, videos

    def _process_images(
        self,
        images: List[Image],
        **kwargs: Unpack[ImagesKwargs],
    ) -> Tuple[BatchFeature, List[int]]:
        """Tiles every image and runs the image processor; returns features and per-image tile counts."""
        cropped_images: List[Image] = []
        num_cropped_images: List[int] = []

        for image in images:
            single_cropped_images = self._crop_image(image)

            cropped_images.extend(single_cropped_images)
            num_cropped_images.append(len(single_cropped_images))

        if len(cropped_images) == 0:
            # The image processor may not properly handle empty image lists.
            # This is a workaround to avoid errors.
            return BatchFeature(), num_cropped_images

        image_inputs = self.image_processor.__call__(
            cropped_images,
            **kwargs,
        )

        return image_inputs, num_cropped_images

    def _treat_videos_as_image_seqs(
        self, text: List[str], images: List[Image], videos: List[List[Image]]
    ) -> Tuple[List[str], List[Image], List[bool]]:
        """Treats videos as image sequences.

        This method will replace all video tokens in the text with #frame image tokens,
        and insert the corresponding images into the images list.

        NOTE: consumes ``images`` and ``videos`` in place (via pop) and requires the
        token order in the text to match the order of the provided media.

        Args:
            text: The text to be processed.
            images: The images to be processed.
            videos: The videos to be processed.

        Returns:
            The processed text and images, and a list of flags indicating whether the images are from videos.
        """

        image_token = cast(str, self.tokenizer.image_token)
        video_token = cast(str, self.tokenizer.video_token)

        return_text: List[str] = []
        return_images: List[Image] = []
        return_video_flags: List[bool] = []

        for text_item in text:
            return_text_item: str = ""

            # Repeatedly find image_token or video_token in the text.
            while image_token in text_item or video_token in text_item:
                image_pos = text_item.find(image_token)
                video_pos = text_item.find(video_token)

                # If not found, set position to the end of the text.
                if image_pos == -1:
                    image_pos = len(text_item)
                if video_pos == -1:
                    video_pos = len(text_item)

                if image_pos != len(text_item) and len(images) > 0 and image_pos < video_pos:
                    # Take an image and keep the image token if:
                    # - an image token is found, and
                    # - there are images left, and
                    # - the image token is before the first video token.

                    image = images.pop(0)
                    return_images.append(image)
                    return_video_flags.append(False)

                    return_text_item += text_item[: image_pos + len(image_token)]
                    text_item = text_item[image_pos + len(image_token) :]

                elif video_pos != len(text_item) and len(videos) > 0 and video_pos < image_pos:
                    # Take a video and replace the video token with #frame image tokens if:
                    # - a video token is found, and
                    # - there are videos left, and
                    # - the video token is before the first image token.

                    video = videos.pop(0)
                    return_images.extend(video)
                    return_video_flags.extend([True] * len(video))

                    return_text_item += text_item[:video_pos] + image_token * len(video)
                    text_item = text_item[video_pos + len(video_token) :]
                else:
                    break

            # Must place outside the while loop.
            if image_token in text_item:
                raise ValueError("Too many image tokens in the text.")
            if video_token in text_item:
                raise ValueError("Too many video tokens in the text.")

            return_text_item += text_item
            text_item = ""

            return_text.append(return_text_item)

        if len(images) != 0:
            raise ValueError("Too many images provided.")
        if len(videos) != 0:
            raise ValueError("Too many videos provided.")

        return return_text, return_images, return_video_flags
384
+
385
+
386
def dynamic_preprocess(image: Image, min_num: int, max_num: int, image_size: int, use_thumbnail=True) -> List[Image]:
    """Splits an image into a grid of square tiles matching its aspect ratio.

    Chooses the (cols, rows) grid within [min_num, max_num] tiles whose aspect
    ratio best matches the input, resizes the image to fill that grid, and cuts
    it into image_size x image_size tiles. When more than one tile is produced
    and use_thumbnail is set, a full-image thumbnail tile is appended.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate every (cols, rows) grid whose tile count lies in [min_num, max_num].
    candidate_grids = {
        (cols, rows)
        for total in range(min_num, max_num + 1)
        for cols in range(1, total + 1)
        for rows in range(1, total + 1)
        if min_num <= cols * rows <= max_num
    }
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Pick the grid whose aspect ratio is closest to the input's.
    best_grid = find_closest_aspect_ratio(aspect_ratio, candidate_grids, orig_width, orig_height, image_size)

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    num_tiles = best_grid[0] * best_grid[1]

    # Resize to exactly fill the grid, then cut row-major tiles.
    resized_image = image.resize((target_width, target_height))
    tiles: List[Image] = []
    grid_cols = target_width // image_size
    for index in range(num_tiles):
        left = (index % grid_cols) * image_size
        upper = (index // grid_cols) * image_size
        tiles.append(resized_image.crop((left, upper, left + image_size, upper + image_size)))
    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
426
+
427
+
428
def find_closest_aspect_ratio(
    aspect_ratio: float, target_ratios: List[Tuple[int, int]], width: int, height: int, image_size: int
) -> Tuple[int, int]:
    """Returns the (cols, rows) candidate whose aspect ratio is closest to aspect_ratio.

    On an exact tie, a later candidate wins only when the original image area
    (width * height) exceeds half of that candidate grid's pixel budget, which
    favors more tiles for large images.
    """
    best = (1, 1)
    smallest_diff = float("inf")
    area = width * height

    for candidate in target_ratios:
        diff = abs(aspect_ratio - candidate[0] / candidate[1])
        if diff < smallest_diff:
            smallest_diff = diff
            best = candidate
        elif diff == smallest_diff and area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            best = candidate

    return best
processor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vila.VILAProcessor"
4
+ },
5
+ "image_pad_len": 122,
6
+ "max_tiles": 12,
7
+ "min_tiles": 1,
8
+ "processor_class": "VILAProcessor"
9
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "[BOS]",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "image_token": "<image>",
21
+ "pad_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "video_token": "<video>"
29
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "[BOS]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<image>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<video>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ }
60
+ },
61
+ "additional_special_tokens": [
62
+ "<|im_start|>",
63
+ "<|im_end|>"
64
+ ],
65
+ "auto_map": {
66
+ "AutoProcessor": "processing_vila.VILAProcessor"
67
+ },
68
+ "bos_token": "[BOS]",
69
+ "chat_template": null,
70
+ "clean_up_tokenization_spaces": false,
71
+ "eos_token": "<|im_end|>",
72
+ "errors": "replace",
73
+ "extra_special_tokens": {
74
+ "image_token": "<image>",
75
+ "video_token": "<video>"
76
+ },
77
+ "image_token": "<image>",
78
+ "legacy": false,
79
+ "model_max_length": 4096,
80
+ "pad_token": "<|endoftext|>",
81
+ "padding_side": "left",
82
+ "processor_class": "VILAProcessor",
83
+ "split_special_tokens": false,
84
+ "tokenizer_class": "Qwen2Tokenizer",
85
+ "unk_token": null,
86
+ "video_token": "<video>"
87
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff