AndyZijianZhang committed on
Commit 9e8ab54 · verified · 1 Parent(s): 254a429

Upload files with `vila-upload`.


Upload tokenizer_config.json
Upload config.json
Upload model-00007-of-00007.safetensors
Upload configuration_vila.py
Upload generation_config.json
Upload special_tokens_map.json
Upload model-00006-of-00007.safetensors
Upload added_tokens.json
Upload model.safetensors.index.json
Upload processing_vila.py
Upload processor_config.json
Upload modeling_vila.py
Upload chat_template.json

added_tokens.json CHANGED
@@ -2,6 +2,7 @@
   "</tool_call>": 151658,
   "<image>": 151666,
   "<tool_call>": 151657,
+  "<video>": 151670,
   "<vila/sentinel>": 151665,
   "<vila/video>": 151667,
   "<|box_end|>": 151649,
chat_template.json CHANGED
@@ -1,3 +1,3 @@
 {
-  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<image>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+  "chat_template": "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>\n' }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{{ '<image>' }}{% elif content['type'] == 'video' or 'video' in content %}{{ '<video>' }}{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 }
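
The template change drops the Qwen-style `<|vision_start|><|video_pad|><|vision_end|>` wrapping and the `Picture {n}:` / `Video {n}:` prefixes in favor of bare `<image>` / `<video>` placeholders that the processor expands later. A minimal sketch of rendering the new template directly with jinja2 (nothing from this repo needs to be loaded besides `chat_template.json`):

```python
import json

from jinja2 import Template

with open("chat_template.json") as f:
    template = Template(json.load(f)["chat_template"])

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

print(template.render(messages=messages, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant<|im_end|>
# <|im_start|>user
# <image>Describe the image.<|im_end|>
# <|im_start|>assistant
```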
config.json CHANGED
@@ -10,6 +10,7 @@
     "AutoModelForVision2Seq": "modeling_vila.VILAForConditionalGeneration"
   },
   "hidden_size": 5120,
+  "image_end_token_id": 198,
   "image_token_id": 151666,
   "mm_hidden_size": 1152,
   "mm_projector_type": "mlp_downsample_3x3_fix",
@@ -44,7 +45,8 @@
     "vocab_size": 151670
   },
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.50.0",
+  "transformers_version": "4.51.3",
+  "video_token_id": 151670,
   "vision_config": {
     "architectures": [
       "SiglipVisionModel"
configuration_vila.py CHANGED
@@ -21,10 +21,12 @@ class VILAConfig(PretrainedConfig):
     # Model configuration.
     hidden_size: int
     image_token_id: int
+    image_end_token_id: int
     mm_hidden_size: int
     mm_projector_type: str
     mm_vision_select_feature: str
     mm_vision_select_layer: int
+    video_token_id: int

     def __init__(
         self,
@@ -33,10 +35,12 @@ class VILAConfig(PretrainedConfig):
         vision_config: Optional[Dict[str, Any]] = None,
         hidden_size: Optional[int] = None,
         image_token_id: Optional[int] = None,
+        image_end_token_id: Optional[int] = None,
         mm_hidden_size: Optional[int] = None,
         mm_projector_type: Optional[str] = None,
         mm_vision_select_feature: Optional[str] = None,
         mm_vision_select_layer: Optional[int] = None,
+        video_token_id: Optional[int] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -47,9 +51,11 @@ class VILAConfig(PretrainedConfig):
         # By default, we use settings from NVILA-Lite.
         self.hidden_size = hidden_size if hidden_size is not None else 1536
         self.image_token_id = image_token_id if image_token_id is not None else 151649
+        self.image_end_token_id = image_end_token_id if image_end_token_id is not None else 198  # "\n"
         self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else 1152
         self.mm_projector_type = mm_projector_type if mm_projector_type is not None else "mlp_downsample_3x3_fix"
         self.mm_vision_select_feature = (
             mm_vision_select_feature if mm_vision_select_feature is not None else "cls_patch"
         )
         self.mm_vision_select_layer = mm_vision_select_layer if mm_vision_select_layer is not None else -2
+        self.video_token_id = video_token_id if video_token_id is not None else 151650
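
A sketch of the new fallback defaults, assuming `configuration_vila.py` is importable from the working directory and that the constructor tolerates all-default arguments, as its signature suggests (not verified here). Note that the class defaults differ from this checkpoint's `config.json`, which overrides them:

```python
from configuration_vila import VILAConfig

config = VILAConfig()
print(config.image_end_token_id)  # 198 ("\n") by default
print(config.video_token_id)      # 151650 by default; config.json in this repo sets 151670
```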
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 151643,
   "eos_token_id": 151645,
   "pad_token_id": 151643,
-  "transformers_version": "4.50.0"
+  "transformers_version": "4.51.3"
 }
model-00006-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e67d849dc9b28e68c55ae917164354558a2d2502c1244b3298495bc3147b97e3
-size 4995856896
+oid sha256:303bd0b4eb3e02f493a45d09bde196ec08ee39816dd5de32de0a4f098277e7b3
+size 4995861768
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5fb0c3d9f9175496b72378dd0b25cd153be73d2d1a17aec67acd522354bf9bac
-size 720921232
+oid sha256:dc3965ca8e51390f30c18931fb9df01af0be469cc7c6e9ec263127c99a240726
+size 720916360
model.safetensors.index.json CHANGED
@@ -917,8 +917,8 @@
     "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
     "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00006-of-00007.safetensors",
     "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00006-of-00007.safetensors",
-    "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00007-of-00007.safetensors",
-    "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00007-of-00007.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00006-of-00007.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00006-of-00007.safetensors",
     "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00007-of-00007.safetensors",
     "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00007-of-00007.safetensors",
     "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00007-of-00007.safetensors",
modeling_vila.py CHANGED
@@ -18,10 +18,10 @@ class DownSampleBlock(nn.Module):
     def flat_square(x: Tensor) -> Tensor:
         n, w, h, c = x.size()
         if w % 2 == 1:
-            x = torch.concat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
+            x = torch.concat([x, torch.zeros((n, 1, h, c), device=x.device, dtype=x.dtype)], dim=1).contiguous()
             n, w, h, c = x.size()
         if h % 2 == 1:
-            x = torch.concat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
+            x = torch.concat([x, torch.zeros((n, w, 1, c), device=x.device, dtype=x.dtype)], dim=2).contiguous()
             n, w, h, c = x.size()
         x = x.contiguous()
         x = x.view(n, w, int(h / 2), int(c * 2))
@@ -118,6 +118,16 @@ class MultimodalProjector(nn.Module):
             case _:
                 raise NotImplementedError(f"mm_projector_type={config.mm_projector_type} not implemented.")

+        self.layers.to(dtype=config.torch_dtype)
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
     def forward(self, x: Tensor) -> Tensor:
         return self.layers(x)

@@ -147,7 +157,7 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
         super().__init__(config, *args, **kwargs)

         self.llm = Qwen2ForCausalLM(config.text_config, *args, **kwargs)
-        self.mm_projector = MultimodalProjector(config).to(dtype=self.dtype)
+        self.mm_projector = MultimodalProjector(config)
         self.vision_tower = SiglipVisionModel(config.vision_config, *args, **kwargs)

         self.post_init()
@@ -177,8 +187,17 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
         assert pixel_values is None

         outputs = self.llm.__call__(
-            inputs_embeds=inputs_embeds,
-            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds.to(
+                device=self.llm.device,
+                dtype=self.llm.dtype,
+            ),
+            attention_mask=(
+                attention_mask.to(
+                    device=self.llm.device,
+                )
+                if attention_mask is not None
+                else None
+            ),
             **kwargs,
         )

@@ -202,6 +221,11 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
             The embedding of the input ids and pixel values.
         """

+        # Video tokens should be removed during preprocessing, so there must not be any video
+        # tokens in the input ids.
+        if torch.any(input_ids == self.config.video_token_id):
+            raise ValueError("Video token ids should not be present in the input ids.")
+
         image_token_mask = input_ids == self.config.image_token_id

         text_embedding: Tensor = self.llm.get_input_embeddings().__call__(input_ids * ~image_token_mask)
@@ -210,7 +234,10 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
             return text_embedding

         image_features: BaseModelOutputWithPooling = self.vision_tower.__call__(
-            pixel_values.to(dtype=self.vision_tower.dtype),
+            pixel_values.to(
+                device=self.vision_tower.device,
+                dtype=self.vision_tower.dtype,
+            ),
             output_hidden_states=True,
         )
         assert image_features.hidden_states is not None
@@ -227,13 +254,35 @@ class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):

         # TODO: Support dynamic_s2.

-        image_embedding: Tensor = self.mm_projector.__call__(selected_feature.to(dtype=self.dtype))
+        image_embedding: Tensor = self.mm_projector.__call__(
+            selected_feature.to(
+                device=self.mm_projector.device,
+                dtype=self.mm_projector.dtype,
+            )
+        )
+
+        # Append image end token to every image embedding.
+        image_end_token_embedding: Tensor = self.llm.get_input_embeddings().__call__(
+            torch.tensor(
+                self.config.image_end_token_id,
+                device=text_embedding.device,
+                dtype=torch.long,
+            ).view(1, -1)
+        )  # Shape: (1, 1, dim_feature)
+        image_end_token_embedding = image_end_token_embedding.expand(
+            image_embedding.shape[0], 1, -1
+        )  # Shape: (n_images, 1, dim_feature)
+        image_embedding = torch.concat(
+            [
+                image_embedding.to(device=text_embedding.device),
+                image_end_token_embedding,
+            ],
+            dim=1,
+        )

         n_images, n_feature, dim_feature = image_embedding.shape
         image_embedding = image_embedding.view(n_images * n_feature, dim_feature)

-        text_embedding[image_token_mask.to(device=text_embedding.device)] = image_embedding.to(
-            device=text_embedding.device
-        )
+        text_embedding[image_token_mask.to(device=text_embedding.device)] = image_embedding

         return text_embedding
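
The end-of-image concatenation above is why `image_pad_len` moves from 121 to 122: each crop now contributes its projected features plus one "\n" embedding. A standalone shape check, with random tensors standing in for the projector output and the embedding table (dimensions are illustrative):

```python
import torch

n_images, n_feature, dim = 2, 121, 5120              # projector output per image crop
image_embedding = torch.randn(n_images, n_feature, dim)

end_embedding = torch.randn(1, 1, dim)               # embedding of image_end_token_id (198)
end_embedding = end_embedding.expand(n_images, 1, -1)

image_embedding = torch.concat([image_embedding, end_embedding], dim=1)
print(image_embedding.shape)                         # torch.Size([2, 122, 5120])
# The 122 <image> placeholders per crop in the text are overwritten by exactly these rows.
```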
processing_vila.py CHANGED
@@ -1,9 +1,8 @@
 from typing import List, Optional, Tuple, cast

-import numpy as np
 import transformers.image_transforms as image_transforms
 import transformers.image_utils as image_utils
-from numpy.typing import NDArray
+import transformers.utils.logging
 from PIL.Image import Image
 from torch import Tensor
 from transformers.feature_extraction_utils import BatchFeature
@@ -12,19 +11,21 @@ from transformers.image_processing_utils_fast import BaseImageProcessorFast
 from transformers.image_utils import ImageInput, VideoInput
 from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor
 from transformers.models.siglip.image_processing_siglip_fast import SiglipImageProcessorFast
-from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase, TextInput

+logger = transformers.utils.logging.get_logger(__name__)

-class VILAProcessorKwargs(ProcessingKwargs, total=False):
+
+class VILAProcessorProcessingKwargs(ProcessingKwargs, total=False):
     _defaults = {}  # type: ignore


 class VILAProcessorOutput(BatchFeature):
-    input_ids: List[List[int]] | NDArray[np.int64] | Tensor
-    attention_mask: List[List[int]] | NDArray[np.int64] | Tensor
-    pixel_values: Optional[List[NDArray[np.float32]] | NDArray[np.float32] | Tensor]
+    input_ids: List[List[int]] | Tensor
+    attention_mask: List[List[int]] | Tensor
+    pixel_values: Optional[List[Tensor] | Tensor]


 class VILAProcessor(ProcessorMixin):
@@ -67,56 +68,68 @@ class VILAProcessor(ProcessorMixin):
             **kwargs,
         )

-        self.image_pad_len = image_pad_len if image_pad_len is not None else 121
+        self.image_pad_len = image_pad_len if image_pad_len is not None else 122
         self.max_tiles = max_tiles if max_tiles is not None else 12
         self.min_tiles = min_tiles if min_tiles is not None else 1

     def __call__(
         self,
+        text: TextInput | List[TextInput],
         images: Optional[ImageInput] = None,
-        text: Optional[TextInput | List[TextInput]] = None,
-        audio: None = None,
         videos: Optional[VideoInput] = None,
-        **kwargs: Unpack[VILAProcessorKwargs],
+        audio: None = None,
+        **kwargs: Unpack[VILAProcessorProcessingKwargs],
     ) -> VILAProcessorOutput:
-        # Validate arguments.
-        assert text is not None and text != [], "text must be provided"
-        assert not kwargs.get("is_split_into_words", False), "is_split_into_words=True is not supported"
+        """Preprocesses inputs for VILA.

-        output_kwargs = self._merge_kwargs(
-            VILAProcessorKwargs,  # type: ignore
+        Args:
+            text: The text to be processed.
+            images: The images to be processed.
+            videos: The videos to be processed.
+            audio: Not available.
+            **kwargs: Additional arguments for processing.
+
+        Returns:
+            The processed inputs that can be fed to the model.
+        """
+
+        merged_kwargs = self._merge_kwargs(
+            VILAProcessorProcessingKwargs,  # type: ignore
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )

-        # Process images.
-        if images is not None and images != []:
-            image_inputs, num_cropped_images = self._process_images(
-                images=images,
-                **output_kwargs["images_kwargs"],
-            )
-        else:
-            # If no images are provided, do not define pixel_values.
-            image_inputs = BatchFeature()
-            num_cropped_images = []
+        text, images, videos = self._prepare_inputs(
+            text=text,
+            images=images,
+            videos=videos,
+        )
+
+        # Process videos.
+        text, images, video_flags = self._treat_videos_as_image_seqs(
+            text=text,
+            images=images,
+            videos=videos,
+        )

-        # TODO: video processing.
+        # Process images.
+        image_inputs, num_cropped_images = self._process_images(
+            images=images,
+            **merged_kwargs["images_kwargs"],
+        )

         # Process text.
-        text = text if isinstance(text, list) else [text]
-
         text = self._pad_image_tokens_by_num_crops(
             text,
             num_cropped_images=num_cropped_images,
+            video_flags=video_flags,
         )

-        text = self._pad_image_tokens_by_num_embeddings(
-            text,
-        )
+        text = self._pad_image_tokens_by_num_embeddings(text)

         text_inputs = self.tokenizer.__call__(
             text,
-            **output_kwargs["text_kwargs"],
+            **merged_kwargs["text_kwargs"],
         )

         return VILAProcessorOutput(
@@ -140,7 +153,8 @@ class VILAProcessor(ProcessorMixin):
         """

         # TODO: Support more image processors.
-        assert isinstance(self.image_processor, (SiglipImageProcessor, SiglipImageProcessorFast))
+        if not isinstance(self.image_processor, (SiglipImageProcessor, SiglipImageProcessorFast)):
+            raise NotImplementedError

         assert self.image_processor.size["height"] == self.image_processor.size["width"]
         cropped_size = self.image_processor.size["height"]
@@ -156,61 +170,68 @@ class VILAProcessor(ProcessorMixin):

     def _pad_image_tokens_by_num_crops(
         self,
-        text: List[TextInput],
+        text: List[str],
         *,
         num_cropped_images: List[int],
-    ) -> List[TextInput]:
-        """Pads each <image> to num_cropped_images of "<image>\n\n".
+        video_flags: List[bool],
+    ) -> List[str]:
+        """Pads each \\<image> to num_cropped_images of "\\<image>\\n" for images and "\\<video>" for videos.

         Args:
             text: The text to be padded.
             num_cropped_images: The number of cropped images for each image token.
+            video_flags: A list of flags indicating whether the num_cropped_images item is a video.

         Returns:
             The padded text.
         """
+
+        assert len(num_cropped_images) == len(
+            video_flags
+        ), "num_cropped_images and video_flags must have the same length."
+
         image_token: str = cast(str, self.tokenizer.image_token)

-        # Validate arguments.
-        num_images = len(num_cropped_images)
-        num_image_tokens = sum([item.count(image_token) for item in text])
-        assert num_images == num_image_tokens, (
-            f"Number of image tokens ({num_image_tokens}) in text does not match "
-            f"the number of images ({num_images})."
-        )
+        return_text: List[str] = []

-        assert all(
-            image_pad_len > 0 for image_pad_len in num_cropped_images
-        ), "All image padding lengths should be positive integers."
+        for text_item in text:
+            return_text_item: str = ""

-        # Pad image tokens.
-        image_idx = 0
-        padded_text: List[TextInput] = []
+            # Repeatedly find image_token in the text.
+            while image_token in text_item:
+                image_pos = text_item.find(image_token)

-        for i in range(len(text)):
-            padded_text_item = ""
-            remaining_text = text[i]
+                if image_pos != -1 and len(num_cropped_images) > 0:
+                    num_crops = num_cropped_images.pop(0)
+                    video_flag = video_flags.pop(0)

-            while True:
-                token_pos = remaining_text.find(image_token)
-                if token_pos == -1:
-                    padded_text_item += remaining_text
+                    return_text_item += (
+                        text_item[:image_pos] + (image_token if video_flag else (image_token + "\n")) * num_crops
+                    )
+                    text_item = text_item[image_pos + len(image_token) :]
+
+                else:
                     break

-                padded_text_item += remaining_text[:token_pos] + ((image_token + "\n") * num_cropped_images[image_idx])
+            # Must place outside the while loop.
+            if image_token in text_item:
+                raise ValueError("Too many image tokens in the text.")
+
+            return_text_item += text_item
+            text_item = ""

-                image_idx += 1
-                remaining_text = remaining_text[token_pos + len(image_token) :]
+            return_text.append(return_text_item)

-            padded_text.append(padded_text_item)
+        if len(num_cropped_images) != 0:
+            raise ValueError("Too many images provided.")

-        return padded_text
+        return return_text

     def _pad_image_tokens_by_num_embeddings(
         self,
-        text: List[TextInput],
-    ) -> List[TextInput]:
-        """Pads each <image> to image_pad_len times of "<image>".
+        text: List[str],
+    ) -> List[str]:
+        """Pads each \\<image> to image_pad_len times of "\\<image>".

         Args:
             text: The text to be padded.
@@ -218,56 +239,151 @@ class VILAProcessor(ProcessorMixin):
         Returns:
             The padded text.
         """
-        image_token: str = cast(str, self.tokenizer.image_token)

-        padded_text: List[TextInput] = []
-
-        for i in range(len(text)):
-            padded_text_item = ""
-            remaining_text = text[i]
-
-            while True:
-                token_pos = remaining_text.find(image_token)
-                if token_pos == -1:
-                    padded_text_item += remaining_text
-                    break
-
-                padded_text_item += remaining_text[:token_pos] + (image_token * self.image_pad_len)
-
-                remaining_text = remaining_text[token_pos + len(image_token) :]
-
-            padded_text.append(padded_text_item)
-
-        return padded_text
+        return [
+            text_item.replace(
+                cast(str, self.tokenizer.image_token), cast(str, self.tokenizer.image_token) * self.image_pad_len
+            )
+            for text_item in text
+        ]
+
+    @staticmethod
+    def _prepare_inputs(
+        text: TextInput | List[TextInput],
+        images: Optional[ImageInput],
+        videos: Optional[VideoInput],
+    ) -> Tuple[List[str], List[Image], List[List[Image]]]:
+        # Prepare text.
+        text = text if isinstance(text, list) else [text]
+
+        # Prepare images.
+        if images is not None:
+            image_list = cast(List, image_utils.make_flat_list_of_images(images))
+            images = [image_transforms.to_pil_image(image) for image in image_list]
+        else:
+            images = cast(List[Image], [])
+
+        # Prepare videos.
+        if videos is not None:
+            video_list = cast(List[List], image_utils.make_batched_videos(videos))
+            videos = [[image_transforms.to_pil_image(image) for image in video] for video in video_list]
+        else:
+            videos = cast(List[List[Image]], [])
+
+        return text, images, videos

     def _process_images(
         self,
-        images: ImageInput,
-        **kwargs: Unpack[VILAProcessorKwargs],
+        images: List[Image],
+        **kwargs: Unpack[ImagesKwargs],
     ) -> Tuple[BatchFeature, List[int]]:
-        images_flatten = cast(
-            List[Image] | List[NDArray] | List[Tensor],
-            image_utils.make_flat_list_of_images(images),
-        )
-
         cropped_images: List[Image] = []
         num_cropped_images: List[int] = []
-        for image in images_flatten:
-            pil_image: Image = image_transforms.to_pil_image(image)
-            single_cropped_images = self._crop_image(pil_image)
+
+        for image in images:
+            single_cropped_images = self._crop_image(image)

             cropped_images.extend(single_cropped_images)
             num_cropped_images.append(len(single_cropped_images))

-        image_inputs = self.image_processor(
+        if len(cropped_images) == 0:
+            # The image processor may not properly handle empty image lists.
+            # This is a workaround to avoid errors.
+            return BatchFeature(), num_cropped_images
+
+        image_inputs = self.image_processor.__call__(
            cropped_images,
            **kwargs,
        )

         return image_inputs, num_cropped_images

+    def _treat_videos_as_image_seqs(
+        self, text: List[str], images: List[Image], videos: List[List[Image]]
+    ) -> Tuple[List[str], List[Image], List[bool]]:
+        """Treats videos as image sequences.
+
+        This method will replace all video tokens in the text with #frame image tokens,
+        and insert the corresponding images into the images list.
+
+        Args:
+            text: The text to be processed.
+            images: The images to be processed.
+            videos: The videos to be processed.
+
+        Returns:
+            The processed text and images, and a list of flags indicating whether the images are from videos.
+        """
+
+        image_token = cast(str, self.tokenizer.image_token)
+        video_token = cast(str, self.tokenizer.video_token)
+
+        return_text: List[str] = []
+        return_images: List[Image] = []
+        return_video_flags: List[bool] = []
+
+        for text_item in text:
+            return_text_item: str = ""
+
+            # Repeatedly find image_token or video_token in the text.
+            while image_token in text_item or video_token in text_item:
+                image_pos = text_item.find(image_token)
+                video_pos = text_item.find(video_token)
+
+                # If not found, set position to the end of the text.
+                if image_pos == -1:
+                    image_pos = len(text_item)
+                if video_pos == -1:
+                    video_pos = len(text_item)
+
+                if image_pos != len(text_item) and len(images) > 0 and image_pos < video_pos:
+                    # Take an image and keep the image token if:
+                    # - an image token is found, and
+                    # - there are images left, and
+                    # - the image token is before the first video token.
+
+                    image = images.pop(0)
+                    return_images.append(image)
+                    return_video_flags.append(False)
+
+                    return_text_item += text_item[: image_pos + len(image_token)]
+                    text_item = text_item[image_pos + len(image_token) :]
+
+                elif video_pos != len(text_item) and len(videos) > 0 and video_pos < image_pos:
+                    # Take a video and replace the video token with #frame image tokens if:
+                    # - a video token is found, and
+                    # - there are videos left, and
+                    # - the video token is before the first image token.
+
+                    video = videos.pop(0)
+                    return_images.extend(video)
+                    return_video_flags.extend([True] * len(video))
+
+                    return_text_item += text_item[:video_pos] + image_token * len(video)
+                    text_item = text_item[video_pos + len(video_token) :]
+                else:
+                    break
+
+            # Must place outside the while loop.
+            if image_token in text_item:
+                raise ValueError("Too many image tokens in the text.")
+            if video_token in text_item:
+                raise ValueError("Too many video tokens in the text.")
+
+            return_text_item += text_item
+            text_item = ""
+
+            return_text.append(return_text_item)
+
+        if len(images) != 0:
+            raise ValueError("Too many images provided.")
+        if len(videos) != 0:
+            raise ValueError("Too many videos provided.")
+
+        return return_text, return_images, return_video_flags
+

-def dynamic_preprocess(image, min_num=1, max_num=12, image_size=384, use_thumbnail=True):
+def dynamic_preprocess(image: Image, min_num: int, max_num: int, image_size: int, use_thumbnail=True) -> List[Image]:
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height

@@ -309,7 +425,9 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=384, use_thumbnail=True):
     return processed_images


-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+def find_closest_aspect_ratio(
+    aspect_ratio: float, target_ratios: List[Tuple[int, int]], width: int, height: int, image_size: int
+) -> Tuple[int, int]:
     best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
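
An end-to-end usage sketch of the updated processor, assuming the files in this commit are loadable with `trust_remote_code=True`; `MODEL_DIR` is a placeholder for this repository (or a local clone), and the exact output keys depend on `VILAProcessorOutput`:

```python
from PIL import Image
from transformers import AutoProcessor

MODEL_DIR = "path/to/this/repo"  # placeholder
processor = AutoProcessor.from_pretrained(MODEL_DIR, trust_remote_code=True)

image = Image.new("RGB", (640, 480))
# Videos are now accepted and treated as frame sequences: every <video> placeholder
# is rewritten into one <image> placeholder per frame before tokenization.
video = [Image.new("RGB", (640, 480)) for _ in range(4)]

inputs = processor(
    text="<image> Describe the photo. <video> Then summarize the clip.",
    images=[image],
    videos=[video],
    return_tensors="pt",
)
print({key: getattr(value, "shape", type(value)) for key, value in inputs.items()})
```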
processor_config.json CHANGED
@@ -2,7 +2,7 @@
   "auto_map": {
     "AutoProcessor": "processing_vila.VILAProcessor"
   },
-  "image_pad_len": 121,
+  "image_pad_len": 122,
   "max_tiles": 12,
   "min_tiles": 1,
   "processor_class": "VILAProcessor"
special_tokens_map.json CHANGED
@@ -38,5 +38,6 @@
     "normalized": false,
     "rstrip": false,
     "single_word": false
-  }
+  },
+  "video_token": "<video>"
 }
tokenizer_config.json CHANGED
@@ -217,6 +217,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "151670": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
@@ -246,7 +254,8 @@
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {
-    "image_token": "<image>"
+    "image_token": "<image>",
+    "video_token": "<video>"
   },
   "image_token": "<image>",
   "legacy": false,
@@ -256,5 +265,6 @@
   "processor_class": "VILAProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
-  "unk_token": null
+  "unk_token": null,
+  "video_token": "<video>"
 }
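
With the updated tokenizer files in place, the new special token is addressable the same way `<image>` already is (the processor relies on both attributes); a minimal check, with the repo path as a placeholder:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")
print(tokenizer.image_token, tokenizer.convert_tokens_to_ids("<image>"))  # <image> 151666
print(tokenizer.video_token, tokenizer.convert_tokens_to_ids("<video>"))  # <video> 151670
```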