Upload files with `vila-upload`.

Upload added_tokens.json
Upload processing_nvila_lite.py
Upload generation_config.json
Upload chat_template.jinja
Upload modeling_nvila_lite.py
Upload configuration_nvila_lite.py
Upload merges.txt
Upload special_tokens_map.json
Upload config.json
Upload vocab.json
Upload tokenizer_config.json
Upload processor_config.json
Upload preprocessor_config.json
Upload model.safetensors

Files changed (14) hide show

added_tokens.json +10 -0
chat_template.jinja +7 -0
config.json +100 -0
configuration_nvila_lite.py +31 -0
generation_config.json +6 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_nvila_lite.py +157 -0
preprocessor_config.json +27 -0
processing_nvila_lite.py +389 -0
processor_config.json +6 -0
special_tokens_map.json +30 -0
tokenizer_config.json +96 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "<image>": 151649,
+  "<vila/sentinel>": 151648,
+  "<vila/video>": 151650,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "[BOS]": 151646,
+  "[PAD]": 151647
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful assistant<|im_end|>
+' }}{% endif %}{{ '<|im_start|>' + message['role'] + '
+' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>
+' }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{{ '<image>' }}{% elif content['type'] == 'video' or 'video' in content %}{{ '<vila/video>' }}{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{ '<|im_end|>
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,100 @@

+{
+  "architectures": [
+    "NVILALiteForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_nvila_lite.NVILALiteConfig",
+    "AutoModel": "modeling_nvila_lite.NVILALiteForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_nvila_lite.NVILALiteForConditionalGeneration",
+    "AutoModelForImageTextToText": "modeling_nvila_lite.NVILALiteForConditionalGeneration",
+    "AutoModelForVision2Seq": "modeling_nvila_lite.NVILALiteForConditionalGeneration"
+  },
+  "image_token_id": 151649,
+  "model_type": "nvila_lite",
+  "text_config": {
+    "_attn_implementation_autoset": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 1536,
+    "initializer_range": 0.02,
+    "intermediate_size": 8960,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "model_max_length": 4096,
+    "model_type": "qwen2",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "tokenizer_model_max_length": 4096,
+    "tokenizer_padding_side": "right",
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151651
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.4",
+  "video_token_id": 151650,
+  "vision_config": {
+    "_attn_implementation_autoset": false,
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 448,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_image_tokens": 256,
+    "patch_size": 14,
+    "projection_dim": 2048,
+    "projector_hidden_act": "gelu_fast",
+    "torch_dtype": "bfloat16",
+    "vision_use_head": false
+  }
+}

configuration_nvila_lite.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from typing import Any
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.qwen2 import Qwen2Config
+from transformers.models.siglip import SiglipVisionConfig
+class NVILALiteConfig(PretrainedConfig):
+    model_type = "nvila_lite"
+    sub_configs = {
+        "text_config": Qwen2Config,
+        "vision_config": SiglipVisionConfig,
+    }
+    _auto_class = "AutoConfig"
+    def __init__(
+        self,
+        *,
+        text_config: dict[str, Any] | None = None,
+        vision_config: dict[str, Any] | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.text_config = Qwen2Config(**text_config) if text_config is not None else Qwen2Config()
+        self.vision_config = SiglipVisionConfig(**vision_config) if vision_config is not None else SiglipVisionConfig()
+        self.image_token_id = image_token_id if image_token_id is not None else -1
+        self.video_token_id = video_token_id if video_token_id is not None else -1
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "transformers_version": "4.55.4"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ec45647914f31abc6564289047ef9ec65001d5d2068d91161b25b1790991567
+size 4000375952

modeling_nvila_lite.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import contextlib
+import math
+import einops
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers import Qwen2ForCausalLM, SiglipVisionModel
+from transformers.generation.utils import GenerationMixin
+from transformers.modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from .configuration_nvila_lite import NVILALiteConfig
+# Channel width of the vision features consumed by the projector. It equals
+# `vision_config.hidden_size` (1152) in the config.json shipped with this commit.
+# NOTE(review): hard-coded rather than read from the config; keep the two in sync.
+MM_HIDDEN_SIZE = 1152
+class NVILALiteMultiModalProjectorDownsampleBlock(nn.Module):
+    """Parameter-free 3x3 spatial-to-channel downsampling of a square feature grid."""
+    def forward(self, x: Tensor) -> Tensor:
+        """Fold every 3x3 neighbourhood of the feature grid into the channel axis.
+
+        Args:
+            x: Tensor unpacked as (batch, sequence_length, hidden_size).
+
+        Returns:
+            Tensor of shape (batch, (padded_side // 3) ** 2, 9 * hidden_size).
+        """
+        batch_size, sequence_length, hidden_size = x.shape
+        # isqrt floors, so the reshape below only succeeds when sequence_length
+        # is a perfect square (i.e. the tokens form a square grid).
+        feat_size = math.isqrt(sequence_length)
+        features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
+        # Number of rows/columns to append so the side is a multiple of 3.
+        pad_after = (3 - feat_size % 3) % 3
+        if pad_after > 0:
+            # F.pad pairs run from the last dim backwards: channels untouched,
+            # then `pad_after` appended at the end of each of the two grid axes.
+            features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
+            feat_size = feat_size + pad_after
+        # Split each grid axis into (blocks, 3) ...
+        features = features.reshape(batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size)
+        # ... bring the two block axes together, followed by the two within-block axes ...
+        features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
+        # ... and merge each 3x3 block with the channels into one 9*hidden_size vector.
+        features = features.reshape(batch_size, -1, 9 * hidden_size)
+        return features
+class NVILALiteMultiModalProjector(nn.Module):
+    """Maps vision features to the text model's hidden size.
+
+    The stack is: 3x3 downsample -> LayerNorm -> Linear -> GELU -> LayerNorm ->
+    Linear -> GELU -> Linear, ending at ``config.text_config.hidden_size``.
+    """
+    def __init__(self, config: NVILALiteConfig):
+        super().__init__()
+        # NOTE(review): the positions inside this Sequential presumably determine
+        # the parameter names stored in the checkpoint; do not reorder.
+        self.layers = nn.Sequential(
+            NVILALiteMultiModalProjectorDownsampleBlock(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 9),
+            nn.Linear(MM_HIDDEN_SIZE * 9, MM_HIDDEN_SIZE * 3),
+            nn.GELU(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 3),
+            nn.Linear(MM_HIDDEN_SIZE * 3, config.text_config.hidden_size),
+            nn.GELU(),
+            nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size),
+        )
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply the projector stack to ``x`` and return the result."""
+        return self.layers(x)
+class NVILALiteForConditionalGeneration(PreTrainedModel, GenerationMixin):
+    """NVILA-Lite: a SigLIP vision tower and a projector feeding a Qwen2 causal LM.
+
+    Image/video placeholder tokens in ``input_ids`` are replaced by projected
+    vision features before the sequence is handed to the language model.
+    """
+    config_class = NVILALiteConfig
+    base_model_prefix = "llm"
+    _auto_class = "AutoModel"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    def __init__(self, config: NVILALiteConfig):
+        super().__init__(config)
+        self.config: NVILALiteConfig
+        # Temporarily switches torch's global default dtype and always restores
+        # the previous one, so the submodules below are created in `config.torch_dtype`.
+        @contextlib.contextmanager
+        def default_torch_dtype(dtype):
+            original_dtype = torch.get_default_dtype()
+            torch.set_default_dtype(dtype)
+            try:
+                yield
+            finally:
+                torch.set_default_dtype(original_dtype)
+        with default_torch_dtype(config.torch_dtype):
+            self.vision_tower = SiglipVisionModel(config.vision_config)
+            self.mm_projector = NVILALiteMultiModalProjector(config)
+            self.llm = Qwen2ForCausalLM(config.text_config)
+        self.post_init()
+    def forward(
+        self,
+        *,
+        input_ids: Tensor | None = None,
+        inputs_embeds: Tensor | None = None,
+        pixel_values: Tensor | None = None,
+        pixel_values_videos: Tensor | None = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """Run the language model, splicing in vision features when needed.
+
+        Args:
+            input_ids: Token ids. Mutually exclusive with ``inputs_embeds``.
+            inputs_embeds: Pre-computed input embeddings.
+            pixel_values: Image pixels, only read when ``input_ids`` contains
+                the image or video placeholder id.
+            pixel_values_videos: Video-frame pixels, read under the same condition.
+            **kwargs: Forwarded unchanged to the wrapped ``Qwen2ForCausalLM``.
+
+        Returns:
+            Whatever the wrapped language model returns.
+
+        Raises:
+            AssertionError: If both or neither of ``input_ids`` and
+                ``inputs_embeds`` are given.
+        """
+        # NOTE(review): `assert` is stripped under `python -O`; this validation
+        # then disappears.
+        assert (input_ids is None) != (
+            inputs_embeds is None
+        ), "Exactly one of `input_ids` or `inputs_embeds` must be specified."
+        # True when at least one placeholder id is present. The inner `.any()`
+        # already reduces to a scalar, so the outer `torch.any` is redundant.
+        if input_ids is not None and torch.any(
+            torch.isin(
+                input_ids,
+                torch.tensor(
+                    [self.config.image_token_id, self.config.video_token_id],
+                    device=input_ids.device,
+                ),
+            ).any()
+        ):  # Prefill
+            inputs_embeds = self._embed(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                pixel_values_videos=pixel_values_videos,
+            )
+            # The LM must receive embeddings only from here on.
+            input_ids = None
+        # When no placeholder id is present (or `inputs_embeds` was supplied),
+        # the pixel arguments are not used at all.
+        outputs = self.llm(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+        return outputs
+    def _embed(
+        self,
+        *,
+        input_ids: Tensor,
+        pixel_values: Tensor | None,
+        pixel_values_videos: Tensor | None,
+    ) -> Tensor:
+        """Embed ``input_ids`` and overwrite placeholder positions with vision features."""
+        inputs_embeds: Tensor = self.llm.model.embed_tokens(input_ids)
+        # The loop variable reuses the name `pixel_values`; the list literal is
+        # built before the first iteration, so both arguments are still captured.
+        for pixel_values, media_token_id in [
+            (pixel_values, self.config.image_token_id),
+            (pixel_values_videos, self.config.video_token_id),
+        ]:
+            if pixel_values is None:
+                continue
+            vision_features = self._encode_vision(pixel_values)
+            # Flatten (items, tokens-per-item, dim) into one row per token.
+            vision_features = einops.rearrange(vision_features, "n p d -> (n p) d")
+            # Assumes the number of placeholder positions equals the number of
+            # feature rows -- TODO confirm the processor always guarantees this.
+            inputs_embeds[input_ids == media_token_id] = vision_features
+        return inputs_embeds
+    def _encode_vision(self, pixel_values: Tensor) -> Tensor:
+        """Encode pixels with the vision tower and project them for the LM."""
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower(
+            pixel_values,
+            output_hidden_states=True,
+        )
+        assert vision_tower_output.hidden_states is not None
+        # Use the second-to-last entry of the returned hidden states rather
+        # than the final one.
+        vision_features = vision_tower_output.hidden_states[-2]
+        vision_features = self.mm_projector(vision_features)
+        return vision_features

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_nvila_lite.NVILALiteProcessor"
+  },
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "NVILALiteProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}

processing_nvila_lite.py ADDED Viewed

	@@ -0,0 +1,389 @@

+import re
+from typing import cast
+import numpy as np
+import transformers.image_transforms as image_transforms
+import transformers.image_utils as image_utils
+import transformers.video_utils as video_utils
+from PIL.Image import Image
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.models.qwen2 import Qwen2Tokenizer, Qwen2TokenizerFast
+from transformers.models.siglip import SiglipImageProcessor, SiglipImageProcessorFast
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
+from transformers.tokenization_utils_base import BatchEncoding, TextInput
+from transformers.video_utils import VideoInput, VideoMetadata
+class NVILALiteProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema for ``NVILALiteProcessor``; declares no defaults of its own."""
+    _defaults = {}  # type: ignore
+class NVILALiteProcessor(ProcessorMixin):
+    attributes = [
+        "image_processor",
+        "tokenizer",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    _auto_class = "AutoProcessor"
+    def __init__(
+        self,
+        image_processor: SiglipImageProcessor | SiglipImageProcessorFast,
+        tokenizer: Qwen2Tokenizer | Qwen2TokenizerFast,
+        chat_template: str | None = None,
+        **kwargs,
+    ):
+        super().__init__(
+            image_processor,
+            tokenizer,
+            chat_template=chat_template,
+            **kwargs,
+        )
+        self.image_processor: SiglipImageProcessor | SiglipImageProcessorFast
+        self.tokenizer: Qwen2Tokenizer | Qwen2TokenizerFast
+    def __call__(
+        self,
+        *,
+        text: TextInput | list[TextInput],
+        images: ImageInput | None = None,
+        videos: VideoInput | None = None,
+        **kwargs: Unpack[NVILALiteProcessorKwargs],
+    ) -> BatchFeature:
+        normalized_text, normalized_images, normalized_videos = self._normalize_inputs(
+            text=text,
+            images=images,
+            videos=videos,
+        )
+        images_inputs, image_token_padding_strategy = (
+            self._preprocess_images(
+                normalized_images,
+                **kwargs,
+            )
+            if len(normalized_images) > 0
+            else (BatchFeature(), [])
+        )
+        videos_inputs, video_token_padding_strategy = (
+            self._preprocess_videos(
+                normalized_videos,
+                **kwargs,
+            )
+            if len(normalized_videos) > 0
+            else (BatchFeature(), [])
+        )
+        text_inputs = self._preprocess_text(
+            normalized_text,
+            image_token_padding_strategy=image_token_padding_strategy,
+            video_token_padding_strategy=video_token_padding_strategy,
+            **kwargs,
+        )
+        return BatchFeature(
+            {
+                **text_inputs,
+                **images_inputs,
+                **videos_inputs,
+            }
+        )
+    def batch_decode(self, *args, **kwargs) -> list[str]:
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def _normalize_inputs(
+        self,
+        *,
+        text: TextInput | list[TextInput],
+        images: ImageInput | None,
+        videos: VideoInput | None,
+    ) -> tuple[list[str], list[Image], list[list[Image]]]:
+        if isinstance(text, list):
+            normalized_text = text
+        else:
+            normalized_text = [text]
+        if images is not None and images != []:
+            image_flat_list = cast(list, image_utils.make_flat_list_of_images(images))
+            normalized_images = [cast(Image, image_transforms.to_pil_image(image)) for image in image_flat_list]
+        else:
+            normalized_images = []
+        if videos is not None and videos != []:
+            video_list = cast(list[list], video_utils.make_batched_videos(videos))
+            normalized_videos = [
+                [cast(Image, image_transforms.to_pil_image(image)) for image in video] for video in video_list
+            ]
+        else:
+            normalized_videos = []
+        return normalized_text, normalized_images, normalized_videos
+    def _preprocess_images(
+        self,
+        images: list[Image],
+        **kwargs: Unpack[NVILALiteProcessorKwargs],
+    ) -> tuple[BatchFeature, list[list[int]]]:
+        merged_kwargs = self._merge_kwargs(
+            NVILALiteProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        images = [image.convert("RGB") for image in images]
+        if len(images) == 1:
+            assert self.image_processor.size["height"] == self.image_processor.size["width"]
+            image_tiles = dynamic_preprocess(
+                images[0],
+                min_num=1,
+                max_num=12,
+                image_size=self.image_processor.size["height"],
+            )
+            pixel_values = self.image_processor(
+                image_tiles,
+                **merged_kwargs["images_kwargs"],
+            )["pixel_values"]
+            images_inputs = BatchFeature(
+                {
+                    "pixel_values": pixel_values,
+                }
+            )
+            padding_strategy = [[121] * len(image_tiles)]
+        else:
+            pixel_values = self.image_processor(
+                images,
+                **merged_kwargs["images_kwargs"],
+            )["pixel_values"]
+            images_inputs = BatchFeature(
+                {
+                    "pixel_values": pixel_values,
+                }
+            )
+            padding_strategy = [[121]] * len(images)
+        return images_inputs, padding_strategy
+    def _preprocess_text(
+        self,
+        text: list[str],
+        *,
+        image_token_padding_strategy: list[list[int]],
+        video_token_padding_strategy: list[list[int]],
+        **kwargs: Unpack[NVILALiteProcessorKwargs],
+    ) -> BatchEncoding:
+        # Pad media tokens.
+        assert isinstance(self.tokenizer.image_token, str)
+        assert isinstance(self.tokenizer.video_token, str)
+        for media_token, padding_strategy in (
+            (self.tokenizer.image_token, image_token_padding_strategy),
+            (self.tokenizer.video_token, video_token_padding_strategy),
+        ):
+            assert sum([s.count(media_token) for s in text]) == len(padding_strategy)
+            # Pad to number of tiles.
+            pad_lens = [len(x) for x in padding_strategy]
+            text = [re.sub(rf"({re.escape(media_token)})", lambda _: media_token * pad_lens.pop(0), s) for s in text]
+            # HACK: NVILA mistakenly suffixes line feeds to some media tokens.
+            if len(image_token_padding_strategy) == 1 and media_token == self.tokenizer.image_token:
+                image_token = self.tokenizer.image_token
+                assert isinstance(image_token, str)
+                text = [re.sub(rf"({re.escape(image_token)})", r"\1\n", s) for s in text]
+            # Pad to number of features.
+            pad_lens = [y for x in padding_strategy for y in x]
+            pad_lens = [x + 1 for x in pad_lens]  # Reserve for lf ending.
+            text = [re.sub(rf"({re.escape(media_token)})", lambda _: media_token * pad_lens.pop(0), s) for s in text]
+        merged_kwargs = self._merge_kwargs(
+            NVILALiteProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        text_inputs = self.tokenizer(
+            text=text,
+            **merged_kwargs["text_kwargs"],
+        )
+        # Replace last token id of every image tile with lf token id.
+        lf_token_id = self.tokenizer.encode("\n")[0]
+        assert isinstance(self.tokenizer.image_token_id, int)
+        assert isinstance(self.tokenizer.video_token_id, int)
+        input_ids = text_inputs.input_ids
+        for media_token_id, padding_strategy in [
+            (self.tokenizer.image_token_id, image_token_padding_strategy),
+            (self.tokenizer.video_token_id, video_token_padding_strategy),
+        ]:
+            pad_lens = [y for x in padding_strategy for y in x]
+            for i in range(len(input_ids)):
+                j = 0
+                while j < len(input_ids[i]):
+                    if input_ids[i][j] != media_token_id:
+                        j += 1
+                        continue
+                    j += pad_lens.pop(0)
+                    input_ids[i][j] = lf_token_id
+                    j += 1
+        return text_inputs
+    def _preprocess_videos(
+        self,
+        videos: list[list[Image]],
+        **kwargs: Unpack[NVILALiteProcessorKwargs],
+    ) -> tuple[BatchFeature, list[list[int]]]:
+        merged_kwargs = self._merge_kwargs(
+            NVILALiteProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        # Support sampling frames.
+        if merged_kwargs["videos_kwargs"].get("do_sample_frames"):
+            videos = [
+                self._sample_frames(
+                    video,
+                    **merged_kwargs["videos_kwargs"],
+                )
+                for video in videos
+            ]
+        videos = [[image.convert("RGB") for image in video] for video in videos]
+        frames = [image for video in videos for image in video]
+        pixel_values_videos = self.image_processor(
+            frames,
+            **merged_kwargs["images_kwargs"],
+        )["pixel_values"]
+        videos_inputs = BatchFeature(
+            {
+                "pixel_values_videos": pixel_values_videos,
+            }
+        )
+        padding_strategy = [[121] * len(video) for video in videos]
+        return videos_inputs, padding_strategy
+    def _sample_frames(
+        self,
+        video: list[Image],
+        **kwargs: Unpack[VideosKwargs],
+    ) -> list[Image]:
+        fps = kwargs.get("fps")
+        num_frames = kwargs.get("num_frames")
+        if num_frames is not None and fps is None:
+            indices = np.round(np.linspace(0, len(video) - 1, num_frames)).astype(int)
+            return [video[i] for i in indices]
+        elif num_frames is None and fps is not None:
+            video_metadata = kwargs.get("video_metadata")
+            if isinstance(video_metadata, VideoMetadata):
+                total_num_frames = video_metadata.total_num_frames
+                duration = video_metadata.duration
+            elif isinstance(video_metadata, dict):
+                total_num_frames = video_metadata.get("total_num_frames")
+                duration = video_metadata.get("duration")
+                assert total_num_frames is not None
+                assert duration is not None
+            else:
+                raise NotImplementedError
+            indices = np.round(np.linspace(0, total_num_frames - 1, int(fps * duration))).astype(int)
+            return [video[i] for i in indices]
+        else:
+            raise NotImplementedError
+# NOTE: The following functions are directly copied from VILA codebase.
+def dynamic_preprocess(
+    image: Image, min_num: int, max_num: int, image_size: int, use_thumbnail: bool = True
+) -> list[Image]:
+    """Split ``image`` into square tiles on the grid that best matches its aspect ratio.
+
+    Args:
+        image: Source image.
+        min_num: Minimum number of tiles in a candidate grid.
+        max_num: Maximum number of tiles in a candidate grid.
+        image_size: Side length, in pixels, of every returned tile.
+        use_thumbnail: When true and more than one tile was produced, append a
+            whole-image thumbnail resized to ``image_size`` x ``image_size``.
+
+    Returns:
+        Tiles in row-major order, optionally followed by the thumbnail.
+    """
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+    # enumerate every (columns, rows) grid whose tile count is within [min_num, max_num]
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    }
+    # order candidates by tile count; ties keep the set's iteration order
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        # (left, upper, right, lower) of tile i: column = i % columns, row = i // columns
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+def find_closest_aspect_ratio(
+    aspect_ratio: float, target_ratios: list[tuple[int, int]], width: int, height: int, image_size: int
+) -> tuple[int, int]:
+    """Return the candidate grid whose width/height ratio is closest to ``aspect_ratio``.
+
+    Args:
+        aspect_ratio: Width divided by height of the source image.
+        target_ratios: Candidate ``(columns, rows)`` grids, visited in order.
+        width: Source image width in pixels.
+        height: Source image height in pixels.
+        image_size: Side length of one tile in pixels.
+
+    Returns:
+        The best-matching grid, or ``(1, 1)`` when ``target_ratios`` is empty.
+    """
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            # Exact tie: switch to this candidate only when the source image
+            # covers more than half of the candidate grid's pixel area.
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_nvila_lite.NVILALiteProcessor"
+  },
+  "processor_class": "NVILALiteProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "[BOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image>",
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sentinel_token": "<vila/sentinel>",
+  "video_token": "<vila/video>"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,96 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<vila/sentinel>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<vila/video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "auto_map": {
+    "AutoProcessor": "processing_nvila_lite.NVILALiteProcessor"
+  },
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "image_token": "<image>",
+    "sentinel_token": "<vila/sentinel>",
+    "video_token": "<vila/video>"
+  },
+  "image_token": "<image>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "processor_class": "NVILALiteProcessor",
+  "sentinel_token": "<vila/sentinel>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<vila/video>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff