Refactor code to transformers convention

by AndyZijianZhang - opened Apr 7, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+976

-128

Files changed (12) hide show

.gitattributes +136 -34
.gitignore +178 -0
chat_template.json +3 -0
config.json +5 -4
configuration_vila.py +30 -89
llm/vocab.json +0 -0
modeling_vila_hf.py +175 -0
preprocessor_config.json +23 -0
processing_vila.py +326 -0
processor_config.json +9 -0
tokenizer.json +3 -0
tokenizer_config.json +85 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,137 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+# Common settings that generally should always be used with your language specific settings
+# Auto detect text files and perform LF normalization
+*          text=auto
+#
+# The above will handle all files NOT found below
+#
+# Documents
+*.bibtex   text diff=bibtex
+*.doc      diff=astextplain
+*.DOC      diff=astextplain
+*.docx     diff=astextplain
+*.DOCX     diff=astextplain
+*.dot      diff=astextplain
+*.DOT      diff=astextplain
+*.pdf      diff=astextplain
+*.PDF      diff=astextplain
+*.rtf      diff=astextplain
+*.RTF      diff=astextplain
+*.md       text diff=markdown
+*.mdx      text diff=markdown
+*.tex      text diff=tex
+*.adoc     text
+*.textile  text
+*.mustache text
+*.csv      text eol=crlf
+*.tab      text
+*.tsv      text
+*.txt      text
+*.sql      text
+*.epub     diff=astextplain
+# Graphics
+*.png      binary
+*.jpg      binary
+*.jpeg     binary
+*.gif      binary
+*.tif      binary
+*.tiff     binary
+*.ico      binary
+# SVG treated as text by default.
+*.svg      text
+# If you want to treat it as binary,
+# use the following line instead.
+# *.svg    binary
+*.eps      binary
+# Scripts
+*.bash     text eol=lf
+*.fish     text eol=lf
+*.ksh      text eol=lf
+*.sh       text eol=lf
+*.zsh      text eol=lf
+# These are explicitly windows files and should use crlf
+*.bat      text eol=crlf
+*.cmd      text eol=crlf
+*.ps1      text eol=crlf
+# Serialisation
+*.json     text
+*.toml     text
+*.xml      text
+*.yaml     text
+*.yml      text
+# Archives
+*.7z       binary
+*.bz       binary
+*.bz2      binary
+*.bzip2    binary
+*.gz       binary
+*.lz       binary
+*.lzma     binary
+*.rar      binary
+*.tar      binary
+*.taz      binary
+*.tbz      binary
+*.tbz2     binary
+*.tgz      binary
+*.tlz      binary
+*.txz      binary
+*.xz       binary
+*.Z        binary
+*.zip      binary
+*.zst      binary
+# Text files where line endings should be preserved
+*.patch    -text
+#
+# Exclude files from exporting
+#
+.gitattributes export-ignore
+.gitignore     export-ignore
+.gitkeep       export-ignore
+# Basic .gitattributes for a python repo.
+# Source files
+# ============
+*.pxd    text diff=python
+*.py     text diff=python
+*.py3    text diff=python
+*.pyw    text diff=python
+*.pyx    text diff=python
+*.pyz    text diff=python
+*.pyi    text diff=python
+# Binary files
+# ============
+*.db     binary
+*.p      binary
+*.pkl    binary
+*.pickle binary
+*.pyc    binary export-ignore
+*.pyo    binary export-ignore
+*.pyd    binary
+# Jupyter notebook
+*.ipynb  text eol=lf
+# Note: .db, .p, and .pkl files are associated
+# with the python modules ``pickle``, ``dbm.*``,
+# ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
+# (among others).
 *.safetensors filter=lfs diff=lfs merge=lfs -text
+/llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+/llm/vocab.json filter=lfs diff=lfs merge=lfs -text
+/tokenizer.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,178 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+.vscode/

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<image>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json CHANGED Viewed

@@ -2,13 +2,14 @@
   "_attn_implementation_autoset": true,
   "_name_or_path": "NVILA-Lite-2B-hf-preview",
   "architectures": [
-    "VILAForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_vila.VILAConfig",
-    "AutoModel": "modeling_vila.VILAForCausalLM",
-    "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
-    "AutoProcessor": "auto_processor.VILAProcessor"
   },
   "chat_template": null,
   "drop_path_rate": 0.0,

   "_attn_implementation_autoset": true,
   "_name_or_path": "NVILA-Lite-2B-hf-preview",
   "architectures": [
+    "VILAForConditionalGeneration"
   ],
   "auto_map": {
     "AutoConfig": "configuration_vila.VILAConfig",
+    "AutoModel": "modeling_vila_hf.VILAForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_vila_hf.VILAForConditionalGeneration",
+    "AutoModelForImageTextToText": "modeling_vila_hf.VILAForConditionalGeneration",
+    "AutoModelForVision2Seq": "modeling_vila_hf.VILAForConditionalGeneration"
   },
   "chat_template": null,
   "drop_path_rate": 0.0,

configuration_vila.py CHANGED Viewed

@@ -1,93 +1,34 @@
-import json
-import math
-import os
-import os.path as osp
-from copy import deepcopy
-from threading import Thread
-from typing import List, Optional
-import torch
-import torchvision
-from PIL import Image
-from transformers import (
-    AutoProcessor,
-    PretrainedConfig,
-    PreTrainedModel,
-    Qwen2Config,
-    Qwen2ForCausalLM,
-    Qwen2PreTrainedModel,
-    TextIteratorStreamer,
-)
 class VILAConfig(PretrainedConfig):
-    model_type = "vila"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    def __init__(
-        self,
-        llm_cfg=None,
-        vision_tower_cfg=None,
-        mm_projector_cfg=None,
-        architectures=None,
-        resume_path=None,
-        hidden_size=None,
-        mm_hidden_size=None,
-        image_aspect_ratio=None,
-        num_video_frames=None,
-        fps=None,
-        mm_vision_select_layer=None,
-        mm_vision_select_feature=None,
-        mm_use_im_start_end=False,
-        mm_use_im_patch_token=False,
-        mm_projector_lr=None,
-        vision_tower_lr=None,
-        vision_resolution=None,
-        interpolate_mode=None,
-        s2=None,
-        dynamic_s2=None,
-        s2_scales=None,
-        s2_max_split_size=None,
-        s2_resize_output_to_scale_idx=0,
-        min_tiles: Optional[int] = 1,
-        max_tiles: Optional[int] = 12,
-        num_time_tokens=None,
-        time_token_format=None,
-        image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
-        video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.architectures = architectures
-        self.llm_cfg = llm_cfg
-        self.vision_tower_cfg = vision_tower_cfg
-        self.mm_projector_cfg = mm_projector_cfg
-        self.resume_path = resume_path
-        self.hidden_size = hidden_size
-        self.mm_hidden_size = mm_hidden_size
-        self.image_aspect_ratio = image_aspect_ratio
-        self.num_video_frames = num_video_frames
-        self.fps = fps
-        self.mm_vision_select_layer = mm_vision_select_layer
-        self.mm_vision_select_feature = mm_vision_select_feature
-        self.mm_use_im_start_end = mm_use_im_start_end
-        self.mm_use_im_patch_token = mm_use_im_patch_token
-        self.mm_projector_lr = mm_projector_lr
-        self.vision_tower_lr = vision_tower_lr
-        self.vision_resolution = vision_resolution
-        self.interpolate_mode = interpolate_mode
-        self.s2 = s2
-        self.dynamic_s2 = dynamic_s2
-        self.s2_scales = s2_scales
-        self.s2_max_split_size = s2_max_split_size
-        self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
-        self.min_tiles = min_tiles
-        self.max_tiles = max_tiles
-        self.num_time_tokens = num_time_tokens
-        self.time_token_format = time_token_format
-        self.image_encoder = image_encoder
-        self.video_encoder = video_encoder

+from typing import Any, Dict
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
 class VILAConfig(PretrainedConfig):
+    # Overridden class attributes.
+    model_type: str = "vila"
+    is_composition: bool = True
+    # Common attributes.
+    vocab_size: int
+    hidden_size: int
+    num_attention_heads: int
+    num_hidden_layers: int
+    # Other attributes.
+    llm_cfg: Dict[str, Any]
+    mm_projector_cfg: Dict[str, Any]
+    vision_tower_cfg: Dict[str, Any]
+    @property
+    def text_config(self) -> Qwen2Config:
+        config = Qwen2Config.from_dict(self.llm_cfg)
+        assert isinstance(config, Qwen2Config)
+        return config
+    @property
+    def vision_config(self) -> SiglipVisionConfig:
+        config = SiglipVisionConfig.from_dict(self.mm_projector_cfg)
+        assert isinstance(config, SiglipVisionConfig)
+        return config

llm/vocab.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

modeling_vila_hf.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import os
+from typing import Optional, Type, Union, cast, override
+import transformers.modeling_utils as modeling_utils
+from torch import FloatTensor, LongTensor, Tensor
+from transformers.configuration_utils import PretrainedConfig
+from transformers.generation.utils import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
+from .configuration_vila import VILAConfig
+from .modeling_vila import VILAForCausalLM
+IMAGE_TOKEN_ID = 151649
+class VILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
+    config_class: Type[PretrainedConfig] = VILAConfig
+    base_model_prefix: str = "vila"
+    is_parallelizable: bool = True
+    main_input_name: str = "input_ids"
+    config: PretrainedConfig
+    mm_projector: PreTrainedModel
+    llm: Qwen2ForCausalLM
+    vision_tower: PreTrainedModel
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        model: VILAForCausalLM,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(config, *args, **kwargs)
+        self.mm_projector = cast(PreTrainedModel, model.mm_projector)
+        self.llm = cast(Qwen2ForCausalLM, model.llm)
+        self.vision_tower = cast(PreTrainedModel, model.vision_tower)
+    def forward(
+        self,
+        *,
+        attention_mask: Optional[Tensor] = None,
+        input_ids: Optional[LongTensor] = None,
+        inputs_embeds: Optional[FloatTensor] = None,
+        pixel_values: Optional[Tensor] = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        # Vision info is only used for prefilling.
+        if kwargs.get("past_key_values", None) is not None:
+            pixel_values = None
+        if inputs_embeds is None:
+            assert input_ids is not None
+            inputs_embeds = self._embed(input_ids, pixel_values)
+        else:
+            assert input_ids is None
+            assert pixel_values is None
+        outputs = self.llm.forward(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+        return outputs
+    @override
+    @classmethod
+    @modeling_utils.restore_default_torch_dtype
+    def from_pretrained(
+        cls: Type[modeling_utils.SpecificPreTrainedModelType],
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: Optional[bool] = None,
+        weights_only: bool = True,
+        **kwargs,
+    ) -> modeling_utils.SpecificPreTrainedModelType:
+        state_dict = kwargs.pop("state_dict", None)
+        if pretrained_model_name_or_path is not None:
+            config = VILAConfig.from_pretrained(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                local_files_only=local_files_only,
+                revision=revision,
+                use_safetensors=use_safetensors,
+                **kwargs,
+            )
+        else:
+            assert (
+                config is not None and state_dict is not None
+            ), "Both config and state_dict must be provided if pretrained_model_name_or_path is None."
+        inner_model = VILAForCausalLM.from_pretrained(
+            pretrained_model_name_or_path,  # type: ignore
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+        state_dict = inner_model.state_dict()
+        # Prefix keys with "model.".
+        # state_dict = {f"model.{k}": v for k, v in state_dict.items()}
+        return super().from_pretrained(
+            None,
+            inner_model,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            state_dict=state_dict,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+    def _embed(
+        self,
+        input_ids: LongTensor,
+        pixel_values: Optional[Tensor],
+    ) -> FloatTensor:
+        """Gets the embedding of the input ids and pixel values.
+        Args:
+            input_ids: The input ids.
+            pixel_values: The pixel values.
+        Returns:
+            The embedding of the input ids and pixel values.
+        """
+        text_embedding = self.llm.get_input_embeddings().__call__(input_ids)
+        text_embedding = cast(FloatTensor, text_embedding)
+        if pixel_values is None:
+            return text_embedding
+        image_features: Tensor = self.vision_tower.__call__(pixel_values)
+        image_features: Tensor = self.mm_projector.__call__(image_features)
+        n_images, n_feature, dim_feature = image_features.shape
+        image_features = image_features.view(n_images * n_feature, dim_feature)
+        image_token_mask = input_ids == IMAGE_TOKEN_ID
+        text_embedding[image_token_mask] = image_features
+        return text_embedding

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}

processing_vila.py ADDED Viewed

	@@ -0,0 +1,326 @@

+from typing import List, Optional, Tuple, Unpack, cast
+import numpy as np
+import transformers.image_transforms as image_transforms
+import transformers.image_utils as image_utils
+from numpy.typing import NDArray
+from PIL.Image import Image
+from torch import Tensor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_processing_utils_fast import BaseImageProcessorFast
+from transformers.image_utils import ImageInput, VideoInput
+from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase, TextInput
+# Keyword-argument type used to annotate ``VILAProcessor.__call__`` and passed
+# to ``_merge_kwargs`` there.
+class VILAProcessorKwargs(ProcessingKwargs, total=False):
+    # No processor-specific defaults are declared.
+    _defaults = {}  # type: ignore
+# Return type of ``VILAProcessor.__call__``: the tokenizer outputs merged with
+# the image processor outputs.
+class VILAProcessorOutput(BatchFeature):
+    # Produced by the tokenizer.
+    input_ids: List[List[int]] | NDArray[np.int64] | Tensor
+    attention_mask: List[List[int]] | NDArray[np.int64] | Tensor
+    # Not defined when no images are provided.
+    pixel_values: Optional[List[NDArray[np.float32]] | NDArray[np.float32] | Tensor]
+class VILAProcessor(ProcessorMixin):
+    attributes: List[str] = [
+        "image_processor",
+        "tokenizer",
+    ]
+    image_processor_class: str = "AutoImageProcessor"
+    tokenizer_class: str = "AutoTokenizer"
+    # Attributes.
+    image_processor: BaseImageProcessor | BaseImageProcessorFast
+    tokenizer: PreTrainedTokenizerBase
+    # Configuration parameters.
+    image_pad_len: int
+    image_token: str
+    max_tiles: int
+    min_tiles: int
+    def __init__(
+        self,
+        image_processor: BaseImageProcessor,
+        tokenizer: PreTrainedTokenizer,
+        *,
+        image_pad_len: int,
+        image_token: str,
+        max_tiles: int,
+        min_tiles: int,
+        **kwargs,
+    ):
+        super().__init__(
+            image_processor,
+            tokenizer,
+            **kwargs,
+        )
+        self.image_pad_len = image_pad_len
+        self.image_token = image_token
+        self.max_tiles = max_tiles
+        self.min_tiles = min_tiles
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[TextInput | List[TextInput]] = None,
+        audio: None = None,
+        videos: Optional[VideoInput] = None,
+        **kwargs: Unpack[VILAProcessorKwargs],
+    ) -> VILAProcessorOutput:
+        # Validate arguments.
+        assert text is not None and text != [], "text must be provided"
+        assert not kwargs.get(
+            "is_split_into_words", False
+        ), "is_split_into_words=True is not supported"
+        output_kwargs = self._merge_kwargs(
+            VILAProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        # Process images.
+        if images is not None and images != []:
+            image_inputs, num_cropped_images = self._process_images(
+                images=images,
+                **output_kwargs["images_kwargs"],
+            )
+        else:
+            # If no images are provided, do not define pixel_values.
+            image_inputs = BatchFeature()
+            num_cropped_images = []
+        # TODO: video processing.
+        # Process text.
+        text = text if isinstance(text, list) else [text]
+        text = self._pad_image_tokens_by_num_crops(
+            text,
+            num_cropped_images=num_cropped_images,
+        )
+        text = self._pad_image_tokens_by_num_embeddings(
+            text,
+        )
+        text_inputs = self.tokenizer.__call__(
+            text,
+            **output_kwargs["text_kwargs"],
+        )
+        return VILAProcessorOutput(
+            data={
+                **text_inputs,
+                **image_inputs,
+            }
+        )
+    def _crop_image(
+        self,
+        image: Image,
+    ) -> List[Image]:
+        """Crops the image into multiple tiles.
+        Args:
+            image: The image to be cropped.
+        Returns:
+            The cropped images.
+        """
+        # TODO: Support more image processors.
+        assert isinstance(self.image_processor, SiglipImageProcessor)
+        assert self.image_processor.size["height"] == self.image_processor.size["width"]
+        cropped_size = self.image_processor.size["height"]
+        cropped_images: List[Image] = dynamic_preprocess(
+            image,
+            min_num=self.min_tiles,
+            max_num=self.max_tiles,
+            image_size=cropped_size,
+        )
+        return cropped_images
+    def _pad_image_tokens_by_num_crops(
+        self,
+        text: List[TextInput],
+        *,
+        num_cropped_images: List[int],
+    ) -> List[TextInput]:
+        """Pads each <image> to num_cropped_images of "<image>\n\n".
+        Args:
+            text: The text to be padded.
+            num_cropped_images: The number of cropped images for each image token.
+        Returns:
+            The padded text.
+        """
+        # Validate arguments.
+        num_images = len(num_cropped_images)
+        num_image_tokens = sum([item.count(self.image_token) for item in text])
+        assert num_images == num_image_tokens, (
+            f"Number of image tokens ({num_image_tokens}) in text does not match "
+            f"the number of images ({num_images})."
+        )
+        assert all(
+            image_pad_len > 0 for image_pad_len in num_cropped_images
+        ), "All image padding lengths should be positive integers."
+        # Pad image tokens.
+        image_idx = 0
+        padded_text: List[TextInput] = []
+        for i in range(len(text)):
+            padded_text_item = ""
+            remaining_text = text[i]
+            while True:
+                token_pos = remaining_text.find(self.image_token)
+                if token_pos == -1:
+                    padded_text_item += remaining_text
+                    break
+                padded_text_item += remaining_text[:token_pos] + (
+                    (self.image_token + "\n") * num_cropped_images[image_idx]
+                )
+                image_idx += 1
+                remaining_text = remaining_text[token_pos + len(self.image_token) :]
+            padded_text.append(padded_text_item)
+        return padded_text
+    def _pad_image_tokens_by_num_embeddings(
+        self,
+        text: List[TextInput],
+    ) -> List[TextInput]:
+        """Pads each <image> to image_pad_len times of "<image>".
+        Args:
+            text: The text to be padded.
+        Returns:
+            The padded text.
+        """
+        padded_text: List[TextInput] = []
+        for i in range(len(text)):
+            padded_text_item = ""
+            remaining_text = text[i]
+            while True:
+                token_pos = remaining_text.find(self.image_token)
+                if token_pos == -1:
+                    padded_text_item += remaining_text
+                    break
+                padded_text_item += remaining_text[:token_pos] + (
+                    self.image_token * self.image_pad_len
+                )
+                remaining_text = remaining_text[token_pos + len(self.image_token) :]
+            padded_text.append(padded_text_item)
+        return padded_text
+    def _process_images(
+        self,
+        images: ImageInput,
+        **kwargs: Unpack[VILAProcessorKwargs],
+    ) -> Tuple[BatchFeature, List[int]]:
+        images_flatten = cast(
+            List[Image] | List[NDArray] | List[Tensor],
+            image_utils.make_flat_list_of_images(images),
+        )
+        cropped_images: List[Image] = []
+        num_cropped_images: List[int] = []
+        for image in images_flatten:
+            pil_image: Image = image_transforms.to_pil_image(image)
+            single_cropped_images = self._crop_image(pil_image)
+            cropped_images.extend(single_cropped_images)
+            num_cropped_images.append(len(single_cropped_images))
+        image_inputs = self.image_processor(
+            cropped_images,
+            **kwargs,
+        )
+        return image_inputs, num_cropped_images
+def dynamic_preprocess(
+    image, min_num=1, max_num=12, image_size=384, use_thumbnail=True
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+    # calculate the existing image aspect ratio
+    target_ratios = {
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    }
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio

processor_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "auto_map": {
+        "AutoProcessor": "processing_vila.VILAProcessor"
+    },
+    "max_tiles": 12,
+    "min_tiles": 1,
+    "image_pad_len": 121,
+    "image_token": "<image>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fc37d325d718c91319f527fbe8258c03ac890aba2f252b85af89a625927908a
+size 11419189

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<vila/sentinel>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<vila/video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "[BOS]",
+  "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}