Danny Yin committed on Mar 11

Commit

73b433d

1 Parent(s): d5add18

release

Browse files

Files changed (20) hide show

LICENSE +0 -0
README.md +176 -0
added_tokens.json +10 -0
chat_template.jinja +7 -0
config.json +104 -0
configuration_nvila.py +35 -0
generation_config.json +6 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
modeling_nvila.py +604 -0
preprocessor_config.json +39 -0
processing_nvila.py +1092 -0
processor_config.json +6 -0
pytorch_model.bin.index.json +793 -0
special_tokens_map.json +30 -0
tokenizer_config.json +96 -0
vocab.json +0 -0

LICENSE ADDED Viewed

File without changes

README.md ADDED Viewed

	@@ -0,0 +1,176 @@

+---
+license: cc-by-nc-4.0
+tags:
+- AutoGaze
+- NVILA
+---
+## Model Overview
+### Description: <br>
+NVILA-HD-Video is a Multi-modal Large Language Model with 8B parameters that understands and answers questions about videos with up to 4K resolution and 1K frames.
+Specifically, NVILA-HD-Video uses [AutoGaze](https://huggingface.co/nvidia/AutoGaze) to remove redundant patches from a video before running the ViT or LLM. Empirically, AutoGaze can reduce the number of tokens in a video by up to 100x, reducing the latency of the ViT/LLM by up to 19x/10x. This enables NVILA-HD-Video to efficiently scale to 4K-resolution, 1K-frame videos and achieve improved performance on benchmarks such as VideoMME, as well as state-of-the-art performance on HLVid, a high-resolution long-form video benchmark also proposed in this work.
+This model is for research and development only.
+### Quick Start:
+Note: please first install [AutoGaze](https://github.com/NVlabs/AutoGaze).
+```python
+import torch
+from transformers import AutoModel, AutoProcessor
+model_path = "nvidia/NVILA-8B-HD-Video"
+video_path = "https://huggingface.co/datasets/bfshi/HLVid/resolve/main/example/clip_av_video_5_001.mp4"
+prompt = "Question: What does the white text on the green road sign say?\n \
+A. Hampden St\n \
+B. Hampden Ave\n \
+C. HampdenBlvd\n \
+D. Hampden Rd\n \
+Please answer directly with the letter of the correct answer."
+# ----- Video processing args -----
+num_video_frames = 128           # Total sampled frames for tiles
+num_video_frames_thumbnail = 64 # Total sampled frames for thumbnails
+max_tiles_video = 48             # Max spatial tiles per video (one tile is 392x392)
+# ----- AutoGaze args (tiles) -----
+gazing_ratio_tile = [0.2] + [0.06] * 15  # Per-frame max gazing ratios (single float or list)
+task_loss_requirement_tile = 0.6
+# ----- AutoGaze args (thumbnails) -----
+gazing_ratio_thumbnail = 1       # Set to None to skip gazing on thumbnails
+task_loss_requirement_thumbnail = None
+# ----- Batching -----
+max_batch_size_autogaze = 16
+max_batch_size_siglip = 32
+# Load processor and model
+processor = AutoProcessor.from_pretrained(
+    model_path,
+    num_video_frames=num_video_frames,
+    num_video_frames_thumbnail=num_video_frames_thumbnail,
+    max_tiles_video=max_tiles_video,
+    gazing_ratio_tile=gazing_ratio_tile,
+    gazing_ratio_thumbnail=gazing_ratio_thumbnail,
+    task_loss_requirement_tile=task_loss_requirement_tile,
+    task_loss_requirement_thumbnail=task_loss_requirement_thumbnail,
+    max_batch_size_autogaze=max_batch_size_autogaze,
+    trust_remote_code=True,
+)
+model = AutoModel.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    device_map="auto",
+    max_batch_size_siglip=max_batch_size_siglip,
+)
+model.eval()
+# Run inference
+video_token = processor.tokenizer.video_token
+inputs = processor(text=f"{video_token}\n\n{prompt}", videos=video_path, return_tensors="pt")
+inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+outputs = model.generate(**inputs)
+response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0].strip()
+print(response)
+```
+For more details, see the [VILA github repo](https://github.com/NVlabs/VILA/tree/main/vila_hd/nvila_hd_video).
+### License/Terms of Use: <br>
+Governing Terms:  [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en). Additional Information:  [Apache License 2.0](https://choosealicense.com/licenses/apache-2.0/) for [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
+### Deployment Geography:
+Global
+### Use Case: <br>
+The model is used for understanding high-resolution long-form videos.
+## Reference(s):
+AutoGaze GitHub: https://github.com/NVlabs/AutoGaze <br>
+## Model Architecture:
+**Architecture Type:** Neural Network <br>
+**Network Architecture:** Multi-modal Large Language Model <br>
+**Number of model parameters:** 8B <br>
+**This model was developed based on** [AutoGaze](https://huggingface.co/nvidia/AutoGaze) and [NVILA-Lite-8B](https://huggingface.co/Efficient-Large-Model/NVILA-Lite-8B) <br>
+## Input: <br>
+**Input Type(s):** Video and Text <br>
+**Input Format:** Red, Green, Blue (RGB) and strings <br>
+**Input Parameters:** Three Dimensional (3D) and One Dimensional (1D) <br>
+**Other Properties Related to Input:** Videos with resolution up to 4K and #frames up to 1K and text input up to 20K tokens <br>
+## Output: <br>
+**Output Type(s):** Text <br>
+**Output Format:** Strings <br>
+**Output Parameters:** One Dimensional (1D) <br>
+**Other Properties Related to Output:** Text output up to 20K tokens <br>
+Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions. <br>
+## Software Integration:
+**Runtime Engine(s):**
+Not Applicable (N/A) <br>
+**Supported Hardware Microarchitecture Compatibility:** <br>
+NVIDIA Ampere <br>
+NVIDIA Blackwell <br>
+NVIDIA Hopper <br>
+NVIDIA Jetson  <br>
+**Preferred/Supported Operating System(s):** <br>
+Linux <br>
+Linux 4 Tegra <br>
+QNX  <br>
+Windows <br>
+The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment. <br>
+## Model Version(s):
+v1.0 - Initial release
+## Training Datasets: <br>
+72 datasets. See NVILA paper for more details.
+Dataset partition: Training 100% <br>
+## Training Dataset:
+**Link:**
+See NVILA paper for more details.
+**Data Collection Method by dataset:**  <br>
+[Hybrid: Automated, Human]
+**Labeling Method by dataset:**  <br>
+[Hybrid: Automated, Human]
+**Properties (Quantity, Dataset Descriptions, Sensor(s)):**  <br>
+72 datasets split into 5 stages (Projector Alignment, Vision Encoder Alignment, Pre-Training, Image Instruction-Tuning, and Patch Selection Tuning) <br>
+## Inference:
+**Acceleration Engine:** N/A <br>
+**Test Hardware:** <br>
+The model is tested on NVIDIA A100 GPU.
+### Ethical Considerations:
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications.  When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).

added_tokens.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "<image>": 151649,
+  "<vila/sentinel>": 151648,
+  "<vila/video>": 151650,
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "[BOS]": 151646,
+  "[PAD]": 151647
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful assistant<|im_end|>
+' }}{% endif %}{{ '<|im_start|>' + message['role'] + '
+' }}{% if message['content'] is string %}{{ message['content'] + '<|im_end|>
+' }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{{ '<image>' }}{% elif content['type'] == 'video' or 'video' in content %}{{ '<vila/video>' }}{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{ '<|im_end|>
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,104 @@

+{
+  "architectures": [
+    "NVILAForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_nvila.NVILAConfig",
+    "AutoModel": "modeling_nvila.NVILAForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_nvila.NVILAForConditionalGeneration",
+    "AutoModelForImageTextToText": "modeling_nvila.NVILAForConditionalGeneration",
+    "AutoModelForVision2Seq": "modeling_nvila.NVILAForConditionalGeneration"
+  },
+  "image_token_id": 151649,
+  "model_type": "nvila",
+  "text_config": {
+    "_attn_implementation_autoset": false,
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "model_max_length": 40960,
+    "model_type": "qwen2",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "tokenizer_model_max_length": 40960,
+    "tokenizer_padding_side": "right",
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151651
+  },
+  "max_batch_size_siglip": 128,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.4",
+  "video_token_id": 151650,
+  "vision_config": {
+    "_attn_implementation_autoset": false,
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "attn_implementation": "sdpa",
+    "attn_type": "block_causal",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 448,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "max_embed_batch_size": 16,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_image_tokens": 256,
+    "patch_size": 14,
+    "projection_dim": 2048,
+    "projector_hidden_act": "gelu_fast",
+    "scales": "56+112+196+392",
+    "torch_dtype": "bfloat16",
+    "vision_use_head": false
+  }
+}

configuration_nvila.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import sys
+from pathlib import Path
+from typing import Any
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.qwen2 import Qwen2Config
+from autogaze.vision_encoders.siglip.configuration_siglip import SiglipVisionConfig
+class NVILAConfig(PretrainedConfig):
+    model_type = "nvila"
+    sub_configs = {
+        "text_config": Qwen2Config,
+        "vision_config": SiglipVisionConfig,
+    }
+    _auto_class = "AutoConfig"
+    def __init__(
+        self,
+        *,
+        text_config: dict[str, Any] | None = None,
+        vision_config: dict[str, Any] | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        max_batch_size_siglip: int = 16,
+        **kwargs,
+    ):
+        self.text_config = Qwen2Config(**text_config) if text_config is not None else Qwen2Config()
+        self.vision_config = SiglipVisionConfig(**vision_config) if vision_config is not None else SiglipVisionConfig()
+        self.image_token_id = image_token_id if image_token_id is not None else -1
+        self.video_token_id = video_token_id if video_token_id is not None else -1
+        self.max_batch_size_siglip = max_batch_size_siglip
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "transformers_version": "4.55.4"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fcee82d90b6709f451256bbebfc2cadf7fe55731cac9878d43dd35ce9443272
+size 5242359656

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6650c3b5f2192619c44e63ab4b52b86062162c7106e3d5bc7c336e17d16d1a4
+size 5321808048

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:610da6f8adea1111d364c7acb9abb46abd0123a440caccc50c9c9b6c8c30d7fe
+size 5368631104

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ced3a3efab23c75b5621ac2693c7c989c850a6c8aa61b9c743a43753f3fed37
+size 241471808

modeling_nvila.py ADDED Viewed

	@@ -0,0 +1,604 @@

+import contextlib
+import sys
+from pathlib import Path
+from typing import Optional
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+from transformers import Qwen2ForCausalLM
+from transformers.cache_utils import Cache
+from transformers.generation.utils import GenerationMixin
+from transformers.modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from autogaze.vision_encoders.siglip.modeling_siglip import SiglipVisionModel
+from .configuration_nvila import NVILAConfig
+MM_HIDDEN_SIZE = 1152
+class TokenShuffle(nn.Module):
+    """Token shuffle module that groups tokens and concatenates their features."""
+    def __init__(self, shuffle_num: int):
+        super().__init__()
+        self.shuffle_num = shuffle_num
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: (B, N, C) tensor where B is batch size, N is sequence length, C is hidden size
+        Returns:
+            (B, N', C * shuffle_num) tensor where N' = ceil(N / shuffle_num)
+        """
+        # x: (B, N, C)
+        if x.shape[1] % self.shuffle_num != 0:
+            # Pad with the last token to make sequence length divisible by shuffle_num
+            pad_size = self.shuffle_num - (x.shape[1] % self.shuffle_num)
+            x = torch.cat([x, x[:, -1:].repeat(1, pad_size, 1)], dim=1)
+        # Rearrange: (B, N, C) -> (B, N//k, k*C) where k = shuffle_num
+        return einops.rearrange(x, "b (n k) c -> b n (k c)", k=self.shuffle_num)
+class NVILAMultiModalProjector(nn.Module):
+    """Multi-modal projector using mlp_shuffle_9 architecture."""
+    def __init__(self, config: NVILAConfig):
+        super().__init__()
+        self.layers = nn.Sequential(
+            TokenShuffle(9),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 9),
+            nn.Linear(MM_HIDDEN_SIZE * 9, MM_HIDDEN_SIZE * 3),
+            nn.GELU(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 3),
+            nn.Linear(MM_HIDDEN_SIZE * 3, config.text_config.hidden_size),
+            nn.GELU(),
+            nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size),
+        )
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x)
+class NVILAForConditionalGeneration(PreTrainedModel, GenerationMixin):
+    config_class = NVILAConfig
+    base_model_prefix: str = "llm"
+    _auto_class = "AutoModel"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def __init__(self, config: NVILAConfig):
+        super().__init__(config)
+        self.config: NVILAConfig
+        @contextlib.contextmanager
+        def default_torch_dtype(dtype):
+            original_dtype = torch.get_default_dtype()
+            torch.set_default_dtype(dtype)
+            try:
+                yield
+            finally:
+                torch.set_default_dtype(original_dtype)
+        with default_torch_dtype(config.torch_dtype):
+            self.vision_tower = SiglipVisionModel(config.vision_config)
+            self.mm_projector = NVILAMultiModalProjector(config)
+            self.llm = Qwen2ForCausalLM(config.text_config)
+        self.post_init()
+    def forward(
+        self,
+        *,
+        input_ids: Tensor | None = None,
+        inputs_embeds: Tensor | None = None,
+        pixel_values: Tensor | None = None,
+        pixel_values_images_tiles: list[Tensor] | None = None,
+        pixel_values_images_thumbnails: list[Tensor] | None = None,
+        num_spatial_tiles_each_image: list[int] | None = None,
+        pixel_values_videos_tiles: list[Tensor] | None = None,
+        pixel_values_videos_thumbnails: list[Tensor] | None = None,
+        gazing_info: dict | None = None,
+        num_spatial_tiles_each_video: list[int] | None = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        assert (input_ids is None) != (
+            inputs_embeds is None
+        ), "Exactly one of `input_ids` or `inputs_embeds` must be specified."
+        # Pop processor-only fields that the LLM should not see
+        kwargs.pop("pixel_values_videos_tiles_autogaze", None)
+        kwargs.pop("pixel_values_videos_thumbnails_autogaze", None)
+        kwargs.pop("pixel_values_videos", None)
+        if input_ids is not None and torch.any(
+            torch.isin(
+                input_ids,
+                torch.tensor(
+                    [self.config.image_token_id, self.config.video_token_id],
+                    device=input_ids.device,
+                ),
+            ).any()
+        ):  # Prefill
+            # Extract fields from kwargs if not passed as explicit args
+            if gazing_info is None:
+                gazing_info = kwargs.pop("gazing_info", None)
+            if pixel_values_images_tiles is None:
+                pixel_values_images_tiles = kwargs.pop("pixel_values_images_tiles", None)
+            if pixel_values_images_thumbnails is None:
+                pixel_values_images_thumbnails = kwargs.pop("pixel_values_images_thumbnails", None)
+            if num_spatial_tiles_each_image is None:
+                num_spatial_tiles_each_image = kwargs.pop("num_spatial_tiles_each_image", None)
+            if pixel_values_videos_tiles is None:
+                pixel_values_videos_tiles = kwargs.pop("pixel_values_videos_tiles", None)
+            if pixel_values_videos_thumbnails is None:
+                pixel_values_videos_thumbnails = kwargs.pop("pixel_values_videos_thumbnails", None)
+            if num_spatial_tiles_each_video is None:
+                num_spatial_tiles_each_video = kwargs.pop("num_spatial_tiles_each_video", None)
+            inputs_embeds = self._embed(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                pixel_values_images_tiles=pixel_values_images_tiles,
+                pixel_values_images_thumbnails=pixel_values_images_thumbnails,
+                num_spatial_tiles_each_image=num_spatial_tiles_each_image,
+                pixel_values_videos_tiles=pixel_values_videos_tiles,
+                pixel_values_videos_thumbnails=pixel_values_videos_thumbnails,
+                gazing_info=gazing_info,
+                num_spatial_tiles_each_video=num_spatial_tiles_each_video,
+            )
+            input_ids = None
+        outputs = self.llm(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+        return outputs
+    def _embed(
+        self,
+        *,
+        input_ids: Tensor,
+        pixel_values: Tensor | None,
+        pixel_values_images_tiles: list[Tensor] | None,
+        pixel_values_images_thumbnails: list[Tensor] | None,
+        num_spatial_tiles_each_image: list[int] | None,
+        pixel_values_videos_tiles: list[Tensor] | None,
+        pixel_values_videos_thumbnails: list[Tensor] | None,
+        gazing_info: dict | None = None,
+        num_spatial_tiles_each_video: list[int] | None = None,
+    ) -> Tensor:
+        inputs_embeds: Tensor = self.llm.model.embed_tokens(input_ids)
+        # Handle images
+        if pixel_values_images_tiles is not None and len(pixel_values_images_tiles) > 0:
+            per_image_features = self._encode_images(
+                pixel_values_images_tiles=pixel_values_images_tiles,
+                pixel_values_images_thumbnails=pixel_values_images_thumbnails,
+                num_spatial_tiles_each_image=num_spatial_tiles_each_image,
+            )
+            all_features = torch.cat(per_image_features, dim=0)
+            image_token_mask = input_ids == self.config.image_token_id
+            num_image_tokens = image_token_mask.sum().item()
+            num_image_features = all_features.shape[0]
+            assert num_image_features == num_image_tokens, (
+                f"Number of image features {num_image_features} does not match "
+                f"number of image tokens {num_image_tokens}"
+            )
+            inputs_embeds[image_token_mask] = all_features.to(inputs_embeds.dtype)
+        # Handle videos
+        if pixel_values_videos_tiles is not None:
+            per_video_features = self._encode_vision(
+                pixel_values_videos_tiles=pixel_values_videos_tiles,
+                pixel_values_videos_thumbnails=pixel_values_videos_thumbnails,
+                gazing_info=gazing_info,
+                num_spatial_tiles_each_video=num_spatial_tiles_each_video,
+            )
+            # per_video_features: list of (num_tokens_i, llm_hidden) tensors
+            all_features = torch.cat(per_video_features, dim=0)
+            # Match vision features to video tokens
+            video_token_mask = input_ids == self.config.video_token_id
+            num_video_tokens = video_token_mask.sum().item()
+            num_vision_features = all_features.shape[0]
+            assert num_vision_features == num_video_tokens, (
+                f"Number of vision features {num_vision_features} does not match "
+                f"number of video tokens {num_video_tokens}"
+            )
+            inputs_embeds[video_token_mask] = all_features.to(inputs_embeds.dtype)
+        return inputs_embeds
+    def _make_default_gazing_info(
+        self,
+        total_items: int,
+        T: int,
+        device: torch.device,
+    ) -> dict:
+        """Create gazing_info that gazes at every patch (no reduction).
+        Args:
+            total_items: Number of items (tiles or thumbnails) in the batch.
+            T: Temporal frames per item.
+            device: Target torch device.
+        Returns:
+            gazing_info dict with ``gazing_pos``, ``num_gazing_each_frame``,
+            ``if_padded_gazing``.
+        """
+        image_size = self.vision_tower.config.image_size
+        patch_size = self.vision_tower.config.patch_size
+        scales = sorted(
+            int(s) for s in self.vision_tower.config.scales.split("+")
+        )
+        num_patches_each_scale = [(s // patch_size) ** 2 for s in scales]
+        total_patches_per_frame = sum(num_patches_each_scale)
+        # Gazing positions: all patches for every frame
+        per_item_pos = []
+        for t in range(T):
+            start = t * total_patches_per_frame
+            per_item_pos.append(
+                torch.arange(start, start + total_patches_per_frame, device=device, dtype=torch.long)
+            )
+        per_item_pos = torch.cat(per_item_pos)  # (T * total_patches_per_frame,)
+        gazing_pos = per_item_pos.unsqueeze(0).expand(total_items, -1)  # (B, N)
+        num_gazing_each_frame = torch.full(
+            (T,), total_patches_per_frame, device=device, dtype=torch.long
+        )
+        if_padded_gazing = torch.zeros_like(gazing_pos, dtype=torch.bool)
+        return {
+            "gazing_pos": gazing_pos,
+            "num_gazing_each_frame": num_gazing_each_frame,
+            "if_padded_gazing": if_padded_gazing,
+        }
+    def _encode_images(
+        self,
+        pixel_values_images_tiles: list[Tensor],
+        pixel_values_images_thumbnails: list[Tensor] | None,
+        num_spatial_tiles_each_image: list[int],
+    ) -> list[Tensor]:
+        """Encode image tiles + thumbnails and return projected features per image.
+        Each image is a set of spatial tiles plus one thumbnail (T=1 each).
+        All patches are kept (no gazing reduction).  For each image the
+        spatial tiles are merged into one effective frame, the thumbnail
+        forms a second effective frame, and both are padded to
+        ``shuffle_num`` before projection through the mm_projector.
+        Args:
+            pixel_values_images_tiles: Per-image tile tensors, each
+                ``(num_tiles_i, 1, C, H, W)``.
+            pixel_values_images_thumbnails: Per-image thumbnail tensors,
+                each ``(1, 1, C, H, W)``.  May be ``None``.
+            num_spatial_tiles_each_image: Number of spatial tiles per image.
+        Returns:
+            List of tensors (one per image), each ``(num_tokens_i, llm_hidden)``.
+        """
+        shuffle_num = 9
+        device = self.vision_tower.device
+        # --- Run vision tower on all tiles ---
+        all_tiles = torch.cat(pixel_values_images_tiles, dim=0)  # (total_tiles, 1, C, H, W)
+        total_tiles = all_tiles.shape[0]
+        gi_tiles = self._make_default_gazing_info(total_tiles, 1, device)
+        tiles_features = self._run_vision_tower_batched(all_tiles, gi_tiles)  # (total_tiles, N, H)
+        num_gaze_tiles = gi_tiles["num_gazing_each_frame"]  # (1,)
+        if_padded_tiles = gi_tiles["if_padded_gazing"]      # (total_tiles, N)
+        frame_lens_tiles = num_gaze_tiles.tolist()
+        tile_feats: list[Tensor] = []
+        for idx in range(total_tiles):
+            feats = tiles_features[idx]
+            pad_mask = if_padded_tiles[idx]
+            frame_feats = feats.split(frame_lens_tiles, dim=0)
+            frame_pads = pad_mask.split(frame_lens_tiles, dim=0)
+            tile_feats.append(
+                torch.cat([f[~p] for f, p in zip(frame_feats, frame_pads)], dim=0)
+            )
+        # --- Run vision tower on all thumbnails ---
+        thumb_feats: list[Tensor] | None = None
+        if pixel_values_images_thumbnails is not None and len(pixel_values_images_thumbnails) > 0:
+            all_thumbs = torch.cat(pixel_values_images_thumbnails, dim=0)  # (num_images, 1, C, H, W)
+            total_thumbs = all_thumbs.shape[0]
+            gi_thumbs = self._make_default_gazing_info(total_thumbs, 1, device)
+            thumbs_features = self._run_vision_tower_batched(all_thumbs, gi_thumbs)
+            num_gaze_thumbs = gi_thumbs["num_gazing_each_frame"]
+            if_padded_thumbs = gi_thumbs["if_padded_gazing"]
+            frame_lens_thumbs = num_gaze_thumbs.tolist()
+            thumb_feats = []
+            for idx in range(total_thumbs):
+                feats = thumbs_features[idx]
+                pad_mask = if_padded_thumbs[idx]
+                frame_feats = feats.split(frame_lens_thumbs, dim=0)
+                frame_pads = pad_mask.split(frame_lens_thumbs, dim=0)
+                thumb_feats.append(
+                    torch.cat([f[~p] for f, p in zip(frame_feats, frame_pads)], dim=0)
+                )
+        # --- Build per-image sequences ---
+        tile_offset = 0
+        per_image_sequences: list[Tensor] = []
+        per_image_token_counts: list[int] = []
+        for img_idx, ns in enumerate(num_spatial_tiles_each_image):
+            effective_frames: list[Tensor] = []
+            # Tiles effective frame: merge all spatial tiles
+            spatial_feats = tile_feats[tile_offset : tile_offset + ns]
+            tile_offset += ns
+            effective_frames.append(torch.cat(spatial_feats, dim=0))
+            # Thumbnail effective frame
+            if thumb_feats is not None:
+                effective_frames.append(thumb_feats[img_idx])
+            # Pad each effective frame to divisible by shuffle_num
+            padded_frames: list[Tensor] = []
+            for frame in effective_frames:
+                n = frame.shape[0]
+                pad = (shuffle_num - n % shuffle_num) % shuffle_num
+                if pad > 0:
+                    frame = torch.cat([frame, frame[-1:].expand(pad, -1)], dim=0)
+                padded_frames.append(frame)
+            image_seq = torch.cat(padded_frames, dim=0)
+            per_image_sequences.append(image_seq)
+            per_image_token_counts.append(image_seq.shape[0] // shuffle_num)
+        all_features = torch.cat(per_image_sequences, dim=0).unsqueeze(0)
+        projected = self.mm_projector(
+            all_features.to(device=self.device, dtype=self.dtype)
+        )
+        projected = projected.squeeze(0)
+        return list(projected.split(per_image_token_counts, dim=0))
+    def _run_vision_tower_batched(
+        self,
+        all_pixels: Tensor,
+        gazing_info_batch: dict,
+    ) -> Tensor:
+        """Run the vision tower in minibatches and concatenate features.
+        Args:
+            all_pixels: ``(B, T, C, H, W)`` tensor.
+            gazing_info_batch: Dict with ``gazing_pos`` ``(B, N)``,
+                ``if_padded_gazing`` ``(B, N)``, and
+                ``num_gazing_each_frame`` ``(T,)`` (shared across batch).
+        Returns:
+            ``(B, N, H)`` hidden features from the second-to-last layer.
+        """
+        device = self.vision_tower.device
+        dtype = self.vision_tower.dtype
+        total = all_pixels.shape[0]
+        bs = self.config.max_batch_size_siglip
+        if total <= bs:
+            out: BaseModelOutputWithPooling = self.vision_tower(
+                all_pixels.to(device=device, dtype=dtype),
+                gazing_info=gazing_info_batch,
+                output_hidden_states=True,
+            )
+            assert out.hidden_states is not None
+            return out.hidden_states[-2]
+        num_gaze_shared = gazing_info_batch["num_gazing_each_frame"]
+        all_pos = gazing_info_batch["gazing_pos"]
+        all_pad = gazing_info_batch["if_padded_gazing"]
+        feature_chunks: list[Tensor] = []
+        for start in range(0, total, bs):
+            end = min(start + bs, total)
+            mini_gi = {
+                "gazing_pos": all_pos[start:end],
+                "if_padded_gazing": all_pad[start:end],
+                "num_gazing_each_frame": num_gaze_shared,
+            }
+            out = self.vision_tower(
+                all_pixels[start:end].to(device=device, dtype=dtype),
+                gazing_info=mini_gi,
+                output_hidden_states=True,
+            )
+            assert out.hidden_states is not None
+            feature_chunks.append(out.hidden_states[-2])
+        return torch.cat(feature_chunks, dim=0)
+    def _encode_vision(
+        self,
+        pixel_values_videos_tiles: list[Tensor],
+        pixel_values_videos_thumbnails: list[Tensor],
+        gazing_info: dict | None,
+        num_spatial_tiles_each_video: list[int],
+    ) -> list[Tensor]:
+        """Encode tiles and thumbnails and return projected features per video.
+        Workflow
+        -------
+        1. Batch all tiles / thumbnails across videos and run the vision tower
+           (in minibatches controlled by ``config.max_batch_size_siglip``).
+        2. Remove padded gazing features.
+        3. Re-order per video: for each global temporal frame gather all spatial
+           tiles, then append thumbnail frames.
+        4. Pad each effective frame to be divisible by ``shuffle_num`` (9).
+        5. Concatenate all videos into a single sequence (batch=1), project
+           through ``mm_projector``, then split back per video.
+        Args:
+            pixel_values_videos_tiles: Per-video tile tensors, each
+                ``(num_tiles_i, T_tile, C, H, W)``.
+            pixel_values_videos_thumbnails: Per-video thumbnail tensors, each
+                ``(T_thumb_i, 1, C, H, W)``.
+            gazing_info: Dict produced by the processor containing per-video
+                gazing data for tiles and thumbnails.  ``None`` triggers
+                default "gaze at all patches" behaviour.
+            num_spatial_tiles_each_video: Number of spatial tiles per video.
+        Returns:
+            List of tensors (one per video), each ``(num_tokens_i, llm_hidden)``.
+        Raises:
+            AssertionError: If the per-frame gazing counts differ between
+                videos (checked separately for tiles and for thumbnails).
+        """
+        # NOTE(review): hard-coded; the processor config carries the same value
+        # as ``mm_projector_shuffle_num`` -- the two must be kept in sync by hand.
+        shuffle_num = 9  # must match TokenShuffle in NVILAMultiModalProjector
+        device = self.vision_tower.device
+        # NOTE(review): ``dtype`` is not read again in this method.
+        dtype = self.vision_tower.dtype
+        num_videos = len(pixel_values_videos_tiles)
+        num_tiles_per_video = [t.shape[0] for t in pixel_values_videos_tiles]
+        num_thumbs_per_video = [t.shape[0] for t in pixel_values_videos_thumbnails]
+        # ---- 1. Batch & run vision tower on tiles ----
+        all_tiles = torch.cat(pixel_values_videos_tiles, dim=0)  # (total_tiles, T_tile, C, H, W)
+        T_tile = all_tiles.shape[1]
+        if gazing_info is not None:
+            tiles_nge = gazing_info["num_gazing_each_frame_tiles"]
+            ref = tiles_nge[0][0]
+            # The batched tower call shares one per-frame count vector, so the
+            # counts must agree.  Only row 0 of each video's entry is compared
+            # here; the remaining rows are presumably identical -- TODO confirm.
+            assert all(
+                torch.equal(t[0], ref) for t in tiles_nge
+            ), "num_gazing_each_frame must be identical across all videos for tiles"
+            tiles_gi = {
+                "gazing_pos": torch.cat(gazing_info["gazing_pos_tiles"], dim=0).to(device),
+                "num_gazing_each_frame": gazing_info["num_gazing_each_frame_tiles"][0][0].to(device),
+                "if_padded_gazing": torch.cat(gazing_info["if_padded_gazing_tiles"], dim=0).to(device),
+            }
+        else:
+            tiles_gi = self._make_default_gazing_info(all_tiles.shape[0], T_tile, device)
+        tiles_features = self._run_vision_tower_batched(all_tiles, tiles_gi)  # (total_tiles, N, H)
+        # ---- 2. Batch & run vision tower on thumbnails ----
+        all_thumbs = torch.cat(pixel_values_videos_thumbnails, dim=0)  # (total_thumbs, 1, C, H, W)
+        if gazing_info is not None:
+            thumbs_nge = gazing_info["num_gazing_each_frame_thumbnails"]
+            ref = thumbs_nge[0][0]
+            # Same consistency check as for tiles (row 0 of each video only).
+            assert all(
+                torch.equal(t[0], ref) for t in thumbs_nge
+            ), "num_gazing_each_frame must be identical across all videos for thumbnails"
+            thumbs_gi = {
+                "gazing_pos": torch.cat(gazing_info["gazing_pos_thumbnails"], dim=0).to(device),
+                "num_gazing_each_frame": gazing_info["num_gazing_each_frame_thumbnails"][0][0].to(device),
+                "if_padded_gazing": torch.cat(gazing_info["if_padded_gazing_thumbnails"], dim=0).to(device),
+            }
+        else:
+            # Each thumbnail is a single-frame clip, hence a temporal length of 1.
+            thumbs_gi = self._make_default_gazing_info(all_thumbs.shape[0], 1, device)
+        thumbs_features = self._run_vision_tower_batched(all_thumbs, thumbs_gi)  # (total_thumbs, N', H)
+        # ---- 3. Remove padded features & split by frame ----
+        # For each tile: list of T_tile tensors, each (n_i, hidden)
+        all_tiles_if_padded = tiles_gi["if_padded_gazing"]
+        all_tiles_num_gaze = tiles_gi["num_gazing_each_frame"]  # 1-D (T_tile,)
+        tiles_frame_lens = all_tiles_num_gaze.tolist()
+        all_tiles_frame_feats: list[list[Tensor]] = []
+        for idx in range(tiles_features.shape[0]):
+            feats = tiles_features[idx]                 # (N, hidden)
+            pad_mask = all_tiles_if_padded[idx]         # (N,)
+            # Cut the flat token axis into per-frame segments, then keep only
+            # the tokens whose padding flag is False.
+            frame_feats = feats.split(tiles_frame_lens, dim=0)
+            frame_pads = pad_mask.split(tiles_frame_lens, dim=0)
+            all_tiles_frame_feats.append(
+                [f[~p] for f, p in zip(frame_feats, frame_pads)]
+            )
+        # For each thumbnail: list with 1 tensor (n_i, hidden)
+        all_thumbs_if_padded = thumbs_gi["if_padded_gazing"]
+        all_thumbs_num_gaze = thumbs_gi["num_gazing_each_frame"]  # 1-D (1,)
+        thumbs_frame_lens = all_thumbs_num_gaze.tolist()
+        all_thumbs_frame_feats: list[list[Tensor]] = []
+        for idx in range(thumbs_features.shape[0]):
+            feats = thumbs_features[idx]
+            pad_mask = all_thumbs_if_padded[idx]
+            frame_feats = feats.split(thumbs_frame_lens, dim=0)
+            frame_pads = pad_mask.split(thumbs_frame_lens, dim=0)
+            all_thumbs_frame_feats.append(
+                [f[~p] for f, p in zip(frame_feats, frame_pads)]
+            )
+        # ---- 4. Per-video: reorder, pad frames, build sequences ----
+        # Running offsets into the flat (all-videos) tile / thumbnail lists.
+        tile_offset = 0
+        thumb_offset = 0
+        per_video_sequences: list[Tensor] = []
+        per_video_token_counts: list[int] = []
+        for vid_idx in range(num_videos):
+            ns = num_spatial_tiles_each_video[vid_idx]
+            nt = num_tiles_per_video[vid_idx]
+            tc = nt // ns                       # temporal chunks
+            total_frames = tc * T_tile
+            n_thumbs = num_thumbs_per_video[vid_idx]
+            vid_tile_feats = all_tiles_frame_feats[tile_offset: tile_offset + nt]
+            tile_offset += nt
+            vid_thumb_feats = all_thumbs_frame_feats[thumb_offset: thumb_offset + n_thumbs]
+            thumb_offset += n_thumbs
+            # -- Reorder tile features to frame-first --
+            # Tiles from processor are ordered:
+            #   chunk0: [S0, S1, ..., S_{ns-1}], chunk1: [S0, ...], ...
+            # We want: for each global frame g, cat all spatial tiles.
+            effective_frames: list[Tensor] = []
+            for g in range(total_frames):
+                chunk = g // T_tile
+                f_in_chunk = g % T_tile
+                spatial_feats = [
+                    vid_tile_feats[chunk * ns + s][f_in_chunk]
+                    for s in range(ns)
+                ]
+                effective_frames.append(torch.cat(spatial_feats, dim=0))
+            # -- Append thumbnail frames --
+            for thumb in vid_thumb_feats:
+                effective_frames.append(thumb[0])  # single frame
+            # -- Pad each effective frame to divisible by shuffle_num --
+            padded_frames: list[Tensor] = []
+            for frame in effective_frames:
+                n = frame.shape[0]
+                # Tokens missing to reach the next multiple of shuffle_num
+                # (0 when already aligned, including an empty frame).
+                pad = (shuffle_num - n % shuffle_num) % shuffle_num
+                if pad > 0:
+                    # Padding repeats the frame's last token.
+                    padded_frame = torch.cat(
+                        [frame, frame[-1:].expand(pad, -1)], dim=0
+                    )
+                else:
+                    padded_frame = frame
+                padded_frames.append(padded_frame)
+            video_seq = torch.cat(padded_frames, dim=0)  # (total_padded, hidden)
+            per_video_sequences.append(video_seq)
+            # Every frame is a multiple of shuffle_num, so this division is exact.
+            per_video_token_counts.append(video_seq.shape[0] // shuffle_num)
+        # ---- 5. Concat all videos, project, split back ----
+        all_features = torch.cat(per_video_sequences, dim=0).unsqueeze(0)  # (1, total, hidden)
+        projected = self.mm_projector(
+            all_features.to(device=self.device, dtype=self.dtype)
+        )  # (1, total // shuffle_num, llm_hidden)
+        projected = projected.squeeze(0)  # (total // shuffle_num, llm_hidden)
+        per_video_features = list(projected.split(per_video_token_counts, dim=0))
+        return per_video_features

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_nvila.NVILAProcessor"
+  },
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "NVILAProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 392,
+    "width": 392
+  },
+  "autogaze_model_id": "bfshi/AutoGaze",
+  "gazing_ratio_tile": 0.75,
+  "gazing_ratio_thumbnail": 0.75,
+  "task_loss_requirement_tile": 0.7,
+  "task_loss_requirement_thumbnail": 0.7,
+  "target_scales": [56, 112, 196, 392],
+  "target_patch_size": 16,
+  "num_video_frames": 8,
+  "max_tiles_video": 8,
+  "num_video_frames_thumbnail": 8,
+  "mm_projector_shuffle_num": 9,
+  "max_batch_size_autogaze": 32
+}

processing_nvila.py ADDED Viewed

	@@ -0,0 +1,1092 @@

+import glob
+import os
+import re
+import tempfile
+import urllib.request
+from os import PathLike
+from typing import cast, Optional
+from urllib.parse import urlparse
+import cv2
+import numpy as np
+import torch
+import transformers.image_transforms as image_transforms
+import transformers.image_utils as image_utils
+import transformers.video_utils as video_utils
+from PIL import Image
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.models.qwen2 import Qwen2Tokenizer, Qwen2TokenizerFast
+from transformers.models.siglip import SiglipImageProcessor, SiglipImageProcessorFast
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
+from transformers.tokenization_utils_base import BatchEncoding, TextInput
+from transformers.video_utils import VideoInput, VideoMetadata
+from autogaze.models.autogaze import AutoGaze
+from autogaze.models.autogaze import AutoGazeImageProcessor
+from autogaze.datasets.video_utils import transform_video_for_pytorch
+def _find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Find the closest aspect ratio from a set of target ratios.
+    Referenced from https://github.com/OpenGVLab/InternVL and llava/mm_utils.py
+    """
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+class NVILAProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema passed to ``_merge_kwargs`` by ``NVILAProcessor``."""
+    # No processor-specific defaults are declared.
+    _defaults = {}  # type: ignore
+def _load_video_frames(video_path: str, num_frames: int = 8) -> list[Image]:
+    """
+    Load video frames from a video file path.
+    Similar to _load_video in llava/utils/media.py
+    Args:
+        video_path: Path to the video file or directory of frames
+        num_frames: Number of frames to extract
+    Returns:
+        List of PIL Images representing video frames
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    if not vidcap.isOpened():
+        raise ValueError(f"Failed to open video: {video_path}")
+    frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    while frame_count > 0:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
+        if vidcap.grab():
+            break
+        frame_count -= 1
+    else:
+        vidcap.release()
+        raise ValueError(f"Video '{video_path}' has no frames.")
+    indices = np.round(np.linspace(0, frame_count - 1, num_frames)).astype(int)
+    frames = {}
+    for index in indices:
+        if index in frames:
+            continue
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
+        success, frame = vidcap.read()
+        if not success:
+            continue
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frames[index] = Image.fromarray(frame)
+    vidcap.release()
+    frames_to_return = [frames[index] for index in indices if index in frames]
+    if len(frames_to_return) < num_frames:
+        if frames_to_return:
+            frames_to_return = frames_to_return + [frames_to_return[-1]] * (num_frames - len(frames_to_return))
+        else:
+            raise ValueError(f"Could not extract any frames from video: {video_path}")
+    return frames_to_return
+class NVILAProcessor(ProcessorMixin):
+    """Processor that turns text, images and videos into NVILA model inputs.
+    It wraps an image processor and a tokenizer and, for videos, an AutoGaze
+    model whose output is attached to the batch as ``gazing_info``.
+    """
+    # Names of the wrapped sub-processors handed to ``ProcessorMixin``.
+    attributes = [
+        "image_processor",
+        "tokenizer",
+    ]
+    # Class names for each sub-processor and the auto class this processor is
+    # exposed through (presumably consumed by ``ProcessorMixin`` -- see the
+    # transformers documentation for the exact semantics).
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    _auto_class = "AutoProcessor"
+    def __init__(
+        self,
+        image_processor: SiglipImageProcessor | SiglipImageProcessorFast,
+        tokenizer: Qwen2Tokenizer | Qwen2TokenizerFast,
+        chat_template: str | None = None,
+        autogaze_model_id: str | None = None,
+        gazing_ratio_tile: list[float] | float = 0.75,
+        gazing_ratio_thumbnail: float | None = 0.75,
+        task_loss_requirement_tile: float = 0.7,
+        task_loss_requirement_thumbnail: float | None = 0.7,
+        target_scales: list[int] | None = None,
+        target_patch_size: int | None = None,
+        max_tiles_image: int = 12,
+        num_video_frames: int = 8,
+        max_tiles_video: int = 8,
+        num_video_frames_thumbnail: int = 8,
+        mm_projector_shuffle_num: int = 9,
+        max_batch_size_autogaze: int = 32,
+        **kwargs,
+    ):
+        super().__init__(
+            image_processor,
+            tokenizer,
+            chat_template=chat_template,
+            **kwargs,
+        )
+        self.image_processor: SiglipImageProcessor | SiglipImageProcessorFast
+        self.tokenizer: Qwen2Tokenizer | Qwen2TokenizerFast
+        # AutoGaze configuration
+        self.autogaze_model_id = autogaze_model_id or "bfshi/AutoGaze"
+        self.gazing_ratio_tile = gazing_ratio_tile
+        self.gazing_ratio_thumbnail = gazing_ratio_thumbnail
+        self.task_loss_requirement_tile = task_loss_requirement_tile
+        self.task_loss_requirement_thumbnail = task_loss_requirement_thumbnail
+        self.target_scales = target_scales or [56, 112, 224, 448]
+        self.target_patch_size = target_patch_size or 16
+        # Image / video processing configuration
+        self.max_tiles_image = max_tiles_image
+        self.num_video_frames = num_video_frames
+        self.max_tiles_video = max_tiles_video
+        self.num_video_frames_thumbnail = num_video_frames_thumbnail
+        self.mm_projector_shuffle_num = mm_projector_shuffle_num
+        self.max_batch_size_autogaze = max_batch_size_autogaze
+        # Load AutoGaze if available
+        self._autogaze_model = None
+        self._autogaze_model = AutoGaze.from_pretrained(
+            self.autogaze_model_id,
+            device_map=None,
+        )
+        self._autogaze_model.to("cuda").eval()
+        print("AutoGaze loaded successfully in processor")
+    def __call__(
+        self,
+        *,
+        text: TextInput | list[TextInput],
+        images: ImageInput | None = None,
+        videos: VideoInput | None = None,
+        **kwargs: Unpack[NVILAProcessorKwargs],
+    ) -> BatchFeature:
+        normalized_text, normalized_images, normalized_videos = self._normalize_inputs(
+            text=text,
+            images=images,
+            videos=videos,
+        )
+        images_inputs, image_token_padding_strategy = (
+            self._preprocess_images(
+                normalized_images,
+                **kwargs,
+            )
+            if len(normalized_images) > 0
+            else (BatchFeature(), [])
+        )
+        videos_inputs = (
+            self._preprocess_videos(
+                normalized_videos,
+                **kwargs,
+            )
+            if len(normalized_videos) > 0
+            else (BatchFeature(), [])
+        )
+        # Run AutoGaze on preprocessed tiles/thumbnails and compute padding
+        gazing_info = None
+        video_token_padding_strategy = []
+        skip_tiles_gaze = self._should_gaze_all_patches(self.gazing_ratio_tile, self.task_loss_requirement_tile)
+        skip_thumbs_gaze = self._should_gaze_all_patches(self.gazing_ratio_thumbnail, self.task_loss_requirement_thumbnail)
+        can_construct_without_autogaze = skip_tiles_gaze and skip_thumbs_gaze
+        if len(normalized_videos) > 0 and (self._autogaze_model is not None or can_construct_without_autogaze):
+            gazing_info = self._get_gazing_info_from_videos(videos_inputs)
+            # Compute video padding strategy from gazing results.
+            # Because the mm_projector uses TokenShuffle(9), each
+            # "effective frame" is padded to a multiple of 9 before
+            # projection, then divided by 9.  So total tokens per
+            # video = sum_over_frames(ceil(non_padded_per_frame / 9)).
+            shuffle_num = self.mm_projector_shuffle_num
+            ns_list = videos_inputs["num_spatial_tiles_each_video"]
+            for vid_idx in range(len(gazing_info["if_padded_gazing_tiles"])):
+                tiles_if_pad = gazing_info["if_padded_gazing_tiles"][vid_idx]   # (num_tiles, N)
+                tiles_num_gaze = gazing_info["num_gazing_each_frame_tiles"][vid_idx]  # (num_tiles, T_tile)
+                thumbs_if_pad = gazing_info["if_padded_gazing_thumbnails"][vid_idx]   # (T_thumb, N')
+                thumbs_num_gaze = gazing_info["num_gazing_each_frame_thumbnails"][vid_idx]  # (T_thumb, 1)
+                ns = ns_list[vid_idx]
+                num_tiles = tiles_if_pad.shape[0]
+                T_tile = tiles_num_gaze.shape[1]
+                tc = num_tiles // ns            # temporal chunks
+                total_frames = tc * T_tile
+                # Non-padded count per tile per frame
+                tile_non_padded = []  # tile_non_padded[tile][frame] = int
+                for t_idx in range(num_tiles):
+                    frame_sizes = tiles_num_gaze[t_idx].tolist()
+                    frame_pad_segs = tiles_if_pad[t_idx].split(frame_sizes)
+                    tile_non_padded.append(
+                        [int((~seg).sum().item()) for seg in frame_pad_segs]
+                    )
+                total_tokens = 0
+                # Tile effective frames (all spatial tiles for one temporal frame)
+                for g in range(total_frames):
+                    chunk = g // T_tile
+                    f_in_chunk = g % T_tile
+                    frame_count = sum(
+                        tile_non_padded[chunk * ns + s][f_in_chunk]
+                        for s in range(ns)
+                    )
+                    total_tokens += (frame_count + shuffle_num - 1) // shuffle_num
+                # Thumbnail frames (each is 1 frame)
+                for th_idx in range(thumbs_if_pad.shape[0]):
+                    frame_sizes = thumbs_num_gaze[th_idx].tolist()
+                    frame_pad_segs = thumbs_if_pad[th_idx].split(frame_sizes)
+                    non_pad = sum(int((~seg).sum().item()) for seg in frame_pad_segs)
+                    total_tokens += (non_pad + shuffle_num - 1) // shuffle_num
+                video_token_padding_strategy.append([total_tokens])
+        else:
+            video_token_padding_strategy = [[(self.num_video_frames + self.num_video_frames_thumbnail) * 118] * len(normalized_videos)]
+        # Remove AutoGaze-processed pixel values — they were only needed
+        # for computing gazing_info and should not be sent to the model.
+        if len(normalized_videos) > 0:
+            videos_inputs.pop("pixel_values_videos_tiles_autogaze", None)
+            videos_inputs.pop("pixel_values_videos_thumbnails_autogaze", None)
+        text_inputs = self._preprocess_text(
+            normalized_text,
+            image_token_padding_strategy=image_token_padding_strategy,
+            video_token_padding_strategy=video_token_padding_strategy,
+            **kwargs,
+        )
+        # Combine all inputs
+        batch_feature = BatchFeature(
+            {
+                **text_inputs,
+                **images_inputs,
+                **videos_inputs,
+            }
+        )
+        # Attach gazing_info so the model can use it downstream
+        if gazing_info is not None:
+            batch_feature["gazing_info"] = gazing_info
+        return batch_feature
+    def batch_decode(self, *args, **kwargs) -> list[str]:
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def _normalize_inputs(
+        self,
+        *,
+        text: TextInput | list[TextInput],
+        images: ImageInput | None,
+        videos: VideoInput | None,
+    ) -> tuple[list[str], list[Image], list[list[Image]]]:
+        if isinstance(text, list):
+            normalized_text = text
+        else:
+            normalized_text = [text]
+        if images is not None and images != []:
+            image_flat_list = cast(list, image_utils.make_flat_list_of_images(images))
+            normalized_images = [cast(Image, image_transforms.to_pil_image(image)) for image in image_flat_list]
+        else:
+            normalized_images = []
+        if videos is not None and videos != []:
+            # Handle video inputs - can be file paths (str) or lists of PIL Images
+            # videos can be a single item or a list
+            if not isinstance(videos, (list, tuple)):
+                videos = [videos]
+            normalized_videos = []
+            # Use num_video_frames from processor config
+            num_frames = self.num_video_frames
+            for video_input in videos:
+                if isinstance(video_input, str):
+                    parsed = urlparse(video_input)
+                    if parsed.scheme in ("http", "https"):
+                        suffix = os.path.splitext(parsed.path)[1] or ".mp4"
+                        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+                        try:
+                            urllib.request.urlretrieve(video_input, tmp.name)
+                            video_frames = _load_video_frames(tmp.name, num_frames=num_frames)
+                        finally:
+                            tmp.close()
+                            os.unlink(tmp.name)
+                    else:
+                        video_frames = _load_video_frames(video_input, num_frames=num_frames)
+                    normalized_videos.append(video_frames)
+                elif isinstance(video_input, (list, tuple)):
+                    # If it's already a list of images, convert them to PIL Images
+                    normalized_videos.append([
+                        cast(Image, image_transforms.to_pil_image(image)) for image in video_input
+                    ])
+                else:
+                    # Try to use video_utils for other types
+                    try:
+                        video_list = cast(list[list], video_utils.make_batched_videos([video_input]))
+                        normalized_videos.extend([
+                            [cast(Image, image_transforms.to_pil_image(image)) for image in video]
+                            for video in video_list
+                        ])
+                    except Exception:
+                        raise ValueError(
+                            f"Unsupported video input type: {type(video_input)}. "
+                            "Expected str (file path) or list of PIL Images."
+                        )
+        else:
+            normalized_videos = []
+        return normalized_text, normalized_images, normalized_videos
+    def _preprocess_images(
+        self,
+        images: list[Image],
+        **kwargs: Unpack[NVILAProcessorKwargs],
+    ) -> tuple[BatchFeature, list[list[int]]]:
+        """Preprocess images into spatial tiles plus a thumbnail.
+        Each image is split into a grid of spatial tiles whose count is at
+        most ``max_tiles_image``.  A thumbnail (the whole image resized to
+        ``image_size × image_size``) is appended.  Every tile / thumbnail
+        is a single-frame "video" of shape ``(1, C, H, W)``.  No AutoGaze
+        is applied — all patches are kept.
+        Returns:
+            A tuple ``(images_inputs, padding_strategy)`` where
+            ``images_inputs`` is a ``BatchFeature`` with:
+            - ``"pixel_values_images_tiles"`` – list of tensors, one per
+              image, each ``(num_tiles_i, 1, C, H, W)``.
+            - ``"pixel_values_images_thumbnails"`` – list of tensors, one
+              per image, each ``(1, 1, C, H, W)``.
+            - ``"num_spatial_tiles_each_image"`` – list of ints.
+            ``padding_strategy`` is a list (one per image) of
+            ``[total_tokens]`` used for text-token padding.
+        """
+        merged_kwargs = self._merge_kwargs(
+            NVILAProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if hasattr(self.image_processor, "size"):
+            image_size = self.image_processor.size.get("height", 392)
+        else:
+            image_size = 392
+        shuffle_num = self.mm_projector_shuffle_num
+        num_patches_each_scale = [
+            (s // self.target_patch_size) ** 2 for s in self.target_scales
+        ]
+        total_patches_per_frame = sum(num_patches_each_scale)
+        pixel_values_images_tiles: list[torch.Tensor] = []
+        pixel_values_images_thumbnails: list[torch.Tensor] = []
+        num_spatial_tiles_each_image: list[int] = []
+        padding_strategy: list[list[int]] = []
+        for image in images:
+            image = image.convert("RGB")
+            orig_width, orig_height = image.size
+            max_spatial_tiles = max(self.max_tiles_image, 1)
+            aspect_ratio = orig_width / orig_height
+            target_ratios = {
+                (i, j)
+                for n in range(1, max_spatial_tiles + 1)
+                for i in range(1, n + 1)
+                for j in range(1, n + 1)
+                if 1 <= i * j <= max_spatial_tiles
+            }
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+            target_aspect_ratio = _find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            )
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+            num_tiles = target_aspect_ratio[0] * target_aspect_ratio[1]
+            num_cols = target_aspect_ratio[0]
+            resized = image.resize((target_width, target_height))
+            # Spatial tiles + thumbnail (whole image resized)
+            all_tile_images: list[Image] = []
+            for tile_idx in range(num_tiles):
+                col = tile_idx % num_cols
+                row = tile_idx // num_cols
+                box = (
+                    col * image_size,
+                    row * image_size,
+                    (col + 1) * image_size,
+                    (row + 1) * image_size,
+                )
+                all_tile_images.append(resized.crop(box))
+            thumbnail = image.resize((image_size, image_size))
+            all_images_for_siglip = all_tile_images + [thumbnail]
+            # SigLIP: process tiles + thumbnail at once → (num_tiles+1, C, H, W)
+            siglip_processed = self.image_processor(
+                all_images_for_siglip, **merged_kwargs["images_kwargs"],
+            )["pixel_values"]
+            if not isinstance(siglip_processed, torch.Tensor):
+                siglip_processed = torch.tensor(np.array(siglip_processed))
+            # Split into tiles and thumbnail, add temporal dim
+            tiles_pv = siglip_processed[:num_tiles].unsqueeze(1)   # (num_tiles, 1, C, H, W)
+            thumb_pv = siglip_processed[num_tiles:].unsqueeze(1)   # (1, 1, C, H, W)
+            pixel_values_images_tiles.append(tiles_pv)
+            pixel_values_images_thumbnails.append(thumb_pv)
+            num_spatial_tiles_each_image.append(num_tiles)
+            # Padding: tiles effective frame + thumbnail effective frame
+            tiles_tokens = (num_tiles * total_patches_per_frame + shuffle_num - 1) // shuffle_num
+            thumb_tokens = (total_patches_per_frame + shuffle_num - 1) // shuffle_num
+            padding_strategy.append([tiles_tokens + thumb_tokens])
+        images_inputs = BatchFeature({
+            "pixel_values_images_tiles": pixel_values_images_tiles,
+            "pixel_values_images_thumbnails": pixel_values_images_thumbnails,
+            "num_spatial_tiles_each_image": num_spatial_tiles_each_image,
+        })
+        return images_inputs, padding_strategy
+    def _preprocess_text(
+        self,
+        text: list[str],
+        *,
+        image_token_padding_strategy: list[list[int]],
+        video_token_padding_strategy: list[list[int]],
+        **kwargs: Unpack[NVILAProcessorKwargs],
+    ) -> BatchEncoding:
+        """Apply the chat template, expand media placeholders, and tokenize.
+        Args:
+            text: User prompts, one per conversation.
+            image_token_padding_strategy: One list per image placeholder, in
+                order of appearance; the list's length is the number of
+                placeholders the original expands to, and its entries are the
+                token counts for those placeholders.
+            video_token_padding_strategy: Same structure for video placeholders.
+            **kwargs: Merged via ``_merge_kwargs``; the resulting
+                ``"text_kwargs"`` are passed to the tokenizer.
+        Returns:
+            The tokenizer output for the expanded prompts.
+        Raises:
+            AssertionError: If the number of placeholders in the prompts does
+                not equal the number of entries in the matching strategy.
+        """
+        # Apply chat template to text
+        # Every prompt becomes a two-turn conversation with a fixed system message.
+        messages = [[
+            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
+            {"role": "user", "content": t}
+        ] for t in text]
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Pad media tokens.
+        assert isinstance(self.tokenizer.image_token, str)
+        assert isinstance(self.tokenizer.video_token, str)
+        for media_token, padding_strategy in (
+            (self.tokenizer.image_token, image_token_padding_strategy),
+            (self.tokenizer.video_token, video_token_padding_strategy),
+        ):
+            # Placeholders across all prompts must match the media one-to-one.
+            assert sum([s.count(media_token) for s in text]) == len(padding_strategy)
+            # Pad to number of tiles.
+            # ``pad_lens`` is consumed front-to-back by the substitution
+            # callback, so counts are matched to placeholders in reading order
+            # across all prompts.
+            pad_lens = [len(x) for x in padding_strategy]
+            text = [re.sub(rf"({re.escape(media_token)})", lambda _: media_token * pad_lens.pop(0), s) for s in text]
+            # Pad to number of features.
+            # Second pass: each placeholder produced above is expanded to its
+            # token count, again consumed in reading order.
+            pad_lens = [y for x in padding_strategy for y in x]
+            text = [re.sub(rf"({re.escape(media_token)})", lambda _: media_token * pad_lens.pop(0), s) for s in text]
+        merged_kwargs = self._merge_kwargs(
+            NVILAProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        text_inputs = self.tokenizer(
+            text=text,
+            **merged_kwargs["text_kwargs"],
+        )
+        return text_inputs
+    def _preprocess_videos(
+        self,
+        videos: list[list[Image]],
+        **kwargs: Unpack[NVILAProcessorKwargs],
+    ) -> BatchFeature:
+        """Preprocess videos into spatiotemporal tiles and thumbnails.
+        Each video is split into a grid of spatiotemporal tiles and a set of
+        low-resolution thumbnail frames.  Every tile/thumbnail batch is
+        processed twice: once by ``self.image_processor`` (SigLIP) and once by
+        the AutoGaze transform (``transform_video_for_pytorch``).
+        Spatial tiling
+            Every frame is resized to ``(cols * image_size, rows * image_size)``
+            and cropped into ``cols * rows`` tiles of ``image_size`` pixels,
+            enumerated row-major (``col = idx % cols``, ``row = idx // cols``).
+            Candidate grids are all ``(cols, rows)`` with ``cols * rows <=
+            max(self.max_tiles_video, 1)``; the grid is selected by
+            ``_find_closest_aspect_ratio`` (defined elsewhere in this file;
+            presumably the same aspect-ratio matching as ``dynamic_preprocess``
+            in ``llava/mm_utils.py`` -- TODO confirm).  ``image_size`` is
+            ``self.image_processor.size["height"]`` and falls back to 392.
+        Temporal chunking
+            The T frames are divided into ``T // max_num_frames`` consecutive
+            chunks of ``max_num_frames`` frames each.  ``max_num_frames`` is
+            read from ``self._autogaze_model.config`` and defaults to 16 when
+            no AutoGaze model is attached.  ``T`` must be a positive multiple
+            of ``max_num_frames``.
+        Tile ordering
+            Tiles are ordered **temporal-chunk-first**: all spatial tiles for
+            the first temporal chunk, then all spatial tiles for the second
+            temporal chunk, and so on.
+        Thumbnails
+            Each original frame is also resized to ``image_size × image_size``
+            (aspect ratio is not preserved) to form a thumbnail.  If there are
+            more frames than ``self.num_video_frames_thumbnail``, every
+            ``len(frames) // num_video_frames_thumbnail``-th frame is kept and
+            the result is truncated to that count.  Each thumbnail is treated
+            as a single-frame video (temporal dim = 1).
+        Side effects
+            Prints a one-line tiling summary per video to stdout and loads the
+            AutoGaze image processor via ``from_pretrained`` on every call.
+        Args:
+            videos: List of videos, where each video is a non-empty list of PIL
+                Images (one per frame).  The first frame's size is used as the
+                size of the whole video.
+            **kwargs: Merged through ``self._merge_kwargs``; the resulting
+                ``images_kwargs`` are forwarded to ``self.image_processor``.
+        Returns:
+            A single ``BatchFeature`` (no padding strategy is returned by this
+            method) with the keys:
+            - ``"pixel_values_videos_tiles"`` – list of tensors, one per video.
+              Each tensor has shape ``(num_tiles, T_tile, C, H, W)`` where
+              ``num_tiles = num_spatial_tiles * temporal_chunks`` and
+              ``T_tile = max_num_frames``.
+              Processed by the SigLIP image processor.
+            - ``"pixel_values_videos_thumbnails"`` – list of tensors, one per
+              video.  Each tensor has shape ``(T_thumbnail, 1, C, H, W)``
+              where ``T_thumbnail <= num_video_frames_thumbnail`` whenever
+              subsampling is applied.
+              Processed by the SigLIP image processor.
+            - ``"num_spatial_tiles_each_video"`` – list of ints, the number of
+              spatial tiles (``cols * rows``) chosen for each video.
+            - ``"pixel_values_videos_tiles_autogaze"`` – same structure as
+              ``pixel_values_videos_tiles`` but processed by the AutoGaze
+              ``transform_video_for_pytorch`` transform.  Added only when the
+              list is non-empty, i.e. when at least one video was processed.
+            - ``"pixel_values_videos_thumbnails_autogaze"`` – same structure as
+              ``pixel_values_videos_thumbnails`` but processed by the AutoGaze
+              transform.  Added only when the list is non-empty.
+        Raises:
+            AssertionError: If a video's frame count is not a positive multiple
+                of ``max_num_frames``.
+            IndexError: If a video contains no frames (``video[0]`` is read
+                before the frame-count check).
+        """
+        merged_kwargs = self._merge_kwargs(
+            NVILAProcessorKwargs,  # type: ignore
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        # Get siglip image size (tile spatial resolution); 392 is the fallback.
+        if hasattr(self.image_processor, "size"):
+            image_size = self.image_processor.size.get("height", 392)
+        else:
+            image_size = 392
+        # Get AutoGaze max_num_frames for temporal chunking
+        if self._autogaze_model is not None:
+            autogaze_max_num_frames = self._autogaze_model.config.max_num_frames
+        else:
+            autogaze_max_num_frames = 16  # default
+        # Build the AutoGaze image processor, sized to the largest target scale.
+        # NOTE(review): the ``None`` initialisation below is overwritten
+        # unconditionally, so the later ``is not None`` guards are effectively
+        # always taken.  The processor is also re-loaded on every call.
+        autogaze_transform = None
+        largest_scale = max(self.target_scales)
+        autogaze_transform = AutoGazeImageProcessor.from_pretrained(
+            self.autogaze_model_id,
+            size=(largest_scale, largest_scale),
+        )
+        # Per-video accumulators (one entry appended per video).
+        pixel_values_videos_tiles = []
+        pixel_values_videos_thumbnails = []
+        pixel_values_videos_tiles_autogaze = []
+        pixel_values_videos_thumbnails_autogaze = []
+        num_spatial_tiles_each_video = []
+        for video in videos:
+            video = [img.convert("RGB") for img in video]
+            num_frames = len(video)
+            # The first frame's size stands in for the whole video.
+            orig_width, orig_height = video[0].size
+            # --- Temporal chunking ---
+            temporal_chunks = num_frames // autogaze_max_num_frames
+            assert temporal_chunks >= 1 and num_frames % autogaze_max_num_frames == 0, (
+                f"Number of frames ({num_frames}) must be divisible by "
+                f"AutoGaze max_num_frames ({autogaze_max_num_frames})"
+            )
+            # --- Spatial tiling ---
+            # max_tiles_video directly controls the max number of spatial tiles
+            max_spatial_tiles = max(self.max_tiles_video, 1)
+            # Use dynamic_preprocess-style approach for finding best spatial aspect ratio
+            aspect_ratio = orig_width / orig_height
+            # Every (cols, rows) grid whose tile count fits the budget.
+            target_ratios = {
+                (i, j)
+                for n in range(1, max_spatial_tiles + 1)
+                for i in range(1, n + 1)
+                for j in range(1, n + 1)
+                if 1 <= i * j <= max_spatial_tiles
+            }
+            # Order candidates by total tile count before handing them to the helper.
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+            target_aspect_ratio = _find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            )
+            target_width = image_size * target_aspect_ratio[0]   # cols * image_size
+            target_height = image_size * target_aspect_ratio[1]  # rows * image_size
+            num_spatial_tiles = target_aspect_ratio[0] * target_aspect_ratio[1]
+            num_cols = target_aspect_ratio[0]
+            # --- Build per-frame spatial tiles and thumbnails ---
+            # spatial_tile_frames[spatial_idx] = list of T PIL Images
+            spatial_tile_frames = [[] for _ in range(num_spatial_tiles)]
+            thumbnail_frames = []
+            for frame in video:
+                # Resize frame for spatial tiling
+                resized_frame = frame.resize((target_width, target_height))
+                # Split into spatial tiles (row-major over the cols x rows grid)
+                for tile_idx in range(num_spatial_tiles):
+                    col = tile_idx % num_cols
+                    row = tile_idx // num_cols
+                    # PIL crop box: (left, upper, right, lower)
+                    box = (
+                        col * image_size,
+                        row * image_size,
+                        (col + 1) * image_size,
+                        (row + 1) * image_size,
+                    )
+                    tile = resized_frame.crop(box)
+                    spatial_tile_frames[tile_idx].append(tile)
+                # Thumbnail: resize whole (original) frame to image_size x image_size
+                thumbnail = frame.resize((image_size, image_size))
+                thumbnail_frames.append(thumbnail)
+            # --- Assemble spatiotemporal tiles ---
+            # Collect all tile images in flat order: temporal chunk (outer) ×
+            # spatial tile (inner) × frame-within-chunk (innermost).
+            num_tiles = temporal_chunks * num_spatial_tiles
+            T_tile = autogaze_max_num_frames
+            all_tile_images = []
+            for t_chunk in range(temporal_chunks):
+                for spatial_idx in range(num_spatial_tiles):
+                    start = t_chunk * T_tile
+                    end = start + T_tile
+                    all_tile_images.extend(spatial_tile_frames[spatial_idx][start:end])
+            # SigLIP: process all tile images at once → (num_tiles * T_tile, C, H, W)
+            siglip_processed = self.image_processor(
+                all_tile_images, **merged_kwargs["images_kwargs"],
+            )["pixel_values"]
+            # The processor may return a non-tensor (e.g. list/array); normalise to a tensor.
+            if not isinstance(siglip_processed, torch.Tensor):
+                siglip_processed = torch.tensor(np.array(siglip_processed))
+            # Fold the flat image axis back into (tile, frame-within-tile).
+            video_tiles_siglip = siglip_processed.reshape(num_tiles, T_tile, *siglip_processed.shape[1:])
+            pixel_values_videos_tiles.append(video_tiles_siglip)
+            # AutoGaze transform: process all tile images at once
+            if autogaze_transform is not None:
+                all_tile_np = np.stack([np.array(f) for f in all_tile_images])  # (num_tiles * T_tile, H, W, 3)
+                autogaze_processed = transform_video_for_pytorch(all_tile_np, autogaze_transform)
+                video_tiles_autogaze = autogaze_processed.reshape(num_tiles, T_tile, *autogaze_processed.shape[1:])
+                pixel_values_videos_tiles_autogaze.append(video_tiles_autogaze)
+            # --- Assemble thumbnails ---
+            # Subsample thumbnails if needed (keep every k-th frame, then
+            # truncate so at most num_video_frames_thumbnail remain)
+            if len(thumbnail_frames) > self.num_video_frames_thumbnail:
+                step = len(thumbnail_frames) // self.num_video_frames_thumbnail
+                sampled_thumbnail_frames = thumbnail_frames[::step][: self.num_video_frames_thumbnail]
+            else:
+                sampled_thumbnail_frames = thumbnail_frames
+            # NOTE(review): T_thumb is not read again in this method.
+            T_thumb = len(sampled_thumbnail_frames)
+            # SigLIP: process all thumbnail images at once → (T_thumb, C, H, W)
+            siglip_processed = self.image_processor(
+                sampled_thumbnail_frames, **merged_kwargs["images_kwargs"],
+            )["pixel_values"]
+            if not isinstance(siglip_processed, torch.Tensor):
+                siglip_processed = torch.tensor(np.array(siglip_processed))
+            # Each thumbnail is a single-frame video → (T_thumb, 1, C, H, W)
+            video_thumbnails_siglip = siglip_processed.unsqueeze(1)
+            pixel_values_videos_thumbnails.append(video_thumbnails_siglip)
+            # AutoGaze transform: process all thumbnail images at once
+            if autogaze_transform is not None:
+                all_thumb_np = np.stack([np.array(f) for f in sampled_thumbnail_frames])  # (T_thumb, H, W, 3)
+                autogaze_processed = transform_video_for_pytorch(all_thumb_np, autogaze_transform)
+                video_thumbnails_autogaze = autogaze_processed.unsqueeze(1)  # (T_thumb, 1, C, H, W)
+                pixel_values_videos_thumbnails_autogaze.append(video_thumbnails_autogaze)
+            num_spatial_tiles_each_video.append(num_spatial_tiles)
+            # NOTE(review): unconditional stdout output from library code;
+            # consider routing through a logger.
+            print(
+                f"Video tiling: {num_frames} frames @ {orig_width}x{orig_height} → "
+                f"{num_spatial_tiles} spatial × {temporal_chunks} temporal = "
+                f"{num_spatial_tiles * temporal_chunks} tiles, each "
+                f"{autogaze_max_num_frames}×{image_size}×{image_size}; "
+                f"{len(sampled_thumbnail_frames)} thumbnail frames"
+            )
+        # Build output BatchFeature
+        videos_inputs = BatchFeature(
+            {
+                "pixel_values_videos_tiles": pixel_values_videos_tiles,
+                "pixel_values_videos_thumbnails": pixel_values_videos_thumbnails,
+                "num_spatial_tiles_each_video": num_spatial_tiles_each_video,
+            }
+        )
+        # AutoGaze keys are attached only when something was collected
+        # (the lists stay empty when ``videos`` is empty).
+        if pixel_values_videos_tiles_autogaze:
+            videos_inputs["pixel_values_videos_tiles_autogaze"] = pixel_values_videos_tiles_autogaze
+        if pixel_values_videos_thumbnails_autogaze:
+            videos_inputs["pixel_values_videos_thumbnails_autogaze"] = pixel_values_videos_thumbnails_autogaze
+        return videos_inputs
+    @staticmethod
+    def _should_gaze_all_patches(gazing_ratio, task_loss_requirement) -> bool:
+        """Return True when the gazing config means every patch is kept.
+        This is the case when ``gazing_ratio`` is ``None`` (no gazing at all),
+        or when ``gazing_ratio == 1`` (keep 100 %) **and**
+        ``task_loss_requirement is None`` (no adaptive pruning).
+        """
+        if gazing_ratio is None:
+            return True
+        if task_loss_requirement is not None:
+            return False
+        if isinstance(gazing_ratio, (list, tuple)):
+            return all(r == 1 for r in gazing_ratio)
+        return gazing_ratio == 1
+    @staticmethod
+    def _sort_gazing_pos_per_frame(
+        gazing_pos: torch.Tensor,
+        if_padded: torch.Tensor,
+        num_gazing_each_frame: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sort non-padded gazing positions in ascending order within each frame.
+        Padded positions are left untouched at the end of each frame's segment
+        so that the total count (padded + non-padded) per frame is unchanged.
+        Args:
+            gazing_pos: ``(B, N)`` tensor of gazing patch indices.
+            if_padded: ``(B, N)`` bool tensor (``True`` = padded / dummy).
+            num_gazing_each_frame: ``(B, T)`` tensor giving the number of
+                gazing positions (padded + non-padded) for each frame.
+        Returns:
+            A new ``(B, N)`` tensor with the same values as *gazing_pos*
+            except that the non-padded entries within every frame are sorted.
+        """
+        sorted_pos = gazing_pos.clone()
+        B, _ = gazing_pos.shape
+        T = num_gazing_each_frame.shape[1]
+        for b in range(B):
+            offset = 0
+            for t in range(T):
+                count = int(num_gazing_each_frame[b, t].item())
+                frame_pos = gazing_pos[b, offset : offset + count]
+                frame_pad = if_padded[b, offset : offset + count]
+                # Indices of non-padded (real) positions within the frame segment
+                real_mask = ~frame_pad
+                real_pos = frame_pos[real_mask]
+                # Sort the real positions
+                real_pos_sorted = real_pos.sort()[0]
+                # Write sorted values back at the correct locations
+                real_indices = real_mask.nonzero(as_tuple=True)[0]
+                sorted_pos[b, offset + real_indices] = real_pos_sorted
+                offset += count
+        return sorted_pos
+    def _run_autogaze_batched(
+        self,
+        all_videos: torch.Tensor,
+        autogaze_device: torch.device,
+        cpu_device: torch.device,
+        gazing_ratio,
+        task_loss_requirement,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Run AutoGaze in minibatches and return combined results on CPU.
+        Different minibatches may produce different per-frame gazing counts
+        (e.g. when ``task_loss_requirement`` triggers adaptive pruning).
+        This method pads each frame's segment to the *maximum* count across
+        all minibatches so that the results can be concatenated along the
+        batch dimension.
+        Args:
+            all_videos: ``(B, T, C, H, W)`` tensor of videos to process.
+            autogaze_device: Device where AutoGaze runs (typically CUDA).
+            cpu_device: Device for the returned tensors (typically CPU).
+            gazing_ratio: Gazing ratio to pass to AutoGaze.
+            task_loss_requirement: Task loss requirement to pass to AutoGaze.
+        Returns:
+            A tuple ``(gazing_pos, if_padded, num_gazing)`` where
+            - ``gazing_pos`` is ``(B, N_max)`` on *cpu_device*
+            - ``if_padded`` is ``(B, N_max)`` bool on *cpu_device*
+            - ``num_gazing`` is ``(B, T)`` on *cpu_device*
+            ``N_max = sum(max_per_frame)`` where ``max_per_frame[t]`` is the
+            largest per-frame count across all minibatches.
+        """
+        total = all_videos.shape[0]
+        bs = self.max_batch_size_autogaze
+        batch_results: list[dict] = []
+        with torch.inference_mode():
+            for start in range(0, total, bs):
+                batch = all_videos[start : start + bs]
+                gaze = self._autogaze_model(
+                    {"video": batch.to(autogaze_device)},
+                    gazing_ratio=gazing_ratio,
+                    task_loss_requirement=task_loss_requirement,
+                    target_scales=self.target_scales,
+                    target_patch_size=self.target_patch_size,
+                )
+                ng = gaze["num_gazing_each_frame"]
+                if isinstance(ng, list):
+                    ng = torch.tensor(ng, device=cpu_device, dtype=torch.long)
+                elif not isinstance(ng, torch.Tensor):
+                    ng = torch.tensor(ng, device=cpu_device, dtype=torch.long)
+                else:
+                    ng = ng.to(cpu_device)
+                if ng.dim() == 2:
+                    ng = ng[0]
+                batch_results.append({
+                    "gazing_pos": gaze["gazing_pos"].to(cpu_device),
+                    "if_padded": gaze["if_padded_gazing"].to(cpu_device),
+                    "num_gazing": ng,
+                    "batch_size": batch.shape[0],
+                })
+        # Fast path: single minibatch — no cross-batch padding needed
+        if len(batch_results) == 1:
+            r = batch_results[0]
+            num_gazing = r["num_gazing"].unsqueeze(0).expand(total, -1).contiguous()
+            return r["gazing_pos"], r["if_padded"], num_gazing
+        # Compute the max per-frame count across all minibatches
+        all_ng = torch.stack([r["num_gazing"] for r in batch_results], dim=0)  # (num_minibatches, T)
+        max_per_frame = all_ng.max(dim=0).values  # (T,)
+        max_N = int(max_per_frame.sum().item())
+        T = max_per_frame.shape[0]
+        padded_pos_list = []
+        padded_mask_list = []
+        for r in batch_results:
+            src_pos = r["gazing_pos"]   # (mini_B, N_src)
+            src_pad = r["if_padded"]    # (mini_B, N_src)
+            src_ng = r["num_gazing"]    # (T,)
+            mini_B = r["batch_size"]
+            if int(src_ng.sum().item()) == max_N:
+                padded_pos_list.append(src_pos)
+                padded_mask_list.append(src_pad)
+                continue
+            dst_pos = torch.zeros(mini_B, max_N, device=cpu_device, dtype=src_pos.dtype)
+            dst_pad = torch.ones(mini_B, max_N, device=cpu_device, dtype=torch.bool)
+            src_off = 0
+            dst_off = 0
+            for t in range(T):
+                sc = int(src_ng[t].item())
+                dc = int(max_per_frame[t].item())
+                dst_pos[:, dst_off : dst_off + sc] = src_pos[:, src_off : src_off + sc]
+                dst_pad[:, dst_off : dst_off + sc] = src_pad[:, src_off : src_off + sc]
+                src_off += sc
+                dst_off += dc
+            padded_pos_list.append(dst_pos)
+            padded_mask_list.append(dst_pad)
+        gazing_pos = torch.cat(padded_pos_list, dim=0)
+        if_padded = torch.cat(padded_mask_list, dim=0)
+        num_gazing = max_per_frame.unsqueeze(0).expand(total, -1).contiguous()
+        return gazing_pos, if_padded, num_gazing
+    def _get_gazing_info_from_videos(
+        self,
+        videos_inputs: BatchFeature,
+    ) -> Optional[dict]:
+        """Run AutoGaze on the preprocessed tiles and thumbnails.
+        All tiles from all videos are batched together (they share the same
+        temporal dimension ``T_tile``).  Similarly, all thumbnails are batched
+        together (temporal dim = 1).  AutoGaze is run once on each batch and
+        the results are split back per-video.
+        When a gazing ratio is 1 and the corresponding task_loss_requirement is
+        None (or gazing_ratio is None), all patches are kept and AutoGaze is
+        skipped for that component.  If both tiles and thumbnails meet this
+        condition, AutoGaze is not invoked at all.
+        Args:
+            videos_inputs: The ``BatchFeature`` returned by
+                ``_preprocess_videos``, which must contain the keys
+                ``pixel_values_videos_tiles_autogaze`` and
+                ``pixel_values_videos_thumbnails_autogaze`` (unless the
+                corresponding component can skip AutoGaze).
+        Returns:
+            A dict with the following keys (or ``None`` if AutoGaze is
+            unavailable or the required inputs are missing):
+            - ``"gazing_pos_tiles"`` – list of tensors, one per video, each
+              shaped ``(num_tiles_i, N)``.
+            - ``"num_gazing_each_frame_tiles"`` – list of tensors, one per
+              video, each shaped ``(num_tiles_i, T_tile)``.
+            - ``"if_padded_gazing_tiles"`` – list of bool tensors, one per
+              video, each shaped ``(num_tiles_i, N)``.
+            - ``"gazing_pos_thumbnails"`` – list of tensors, one per video,
+              each shaped ``(T_thumb_i, N')``.
+            - ``"num_gazing_each_frame_thumbnails"`` – list of tensors, one per
+              video, each shaped ``(T_thumb_i, 1)``.
+            - ``"if_padded_gazing_thumbnails"`` – list of bool tensors, one per
+              video, each shaped ``(T_thumb_i, N')``.
+        """
+        skip_tiles = self._should_gaze_all_patches(
+            self.gazing_ratio_tile, self.task_loss_requirement_tile
+        )
+        skip_thumbnails = self._should_gaze_all_patches(
+            self.gazing_ratio_thumbnail, self.task_loss_requirement_thumbnail
+        )
+        need_autogaze = not skip_tiles or not skip_thumbnails
+        if need_autogaze and self._autogaze_model is None:
+            return None
+        # Per-video tile/thumbnail counts from SigLIP tensors (always present)
+        siglip_tiles = videos_inputs["pixel_values_videos_tiles"]
+        siglip_thumbs = videos_inputs["pixel_values_videos_thumbnails"]
+        num_tiles_per_video = [t.shape[0] for t in siglip_tiles]
+        num_thumbs_per_video = [t.shape[0] for t in siglip_thumbs]
+        device = torch.device("cpu")
+        autogaze_device = torch.device("cuda") if torch.cuda.is_available() else device
+        # Total patches per frame across all scales
+        num_patches_each_scale = [
+            (s // self.target_patch_size) ** 2 for s in self.target_scales
+        ]
+        total_patches_per_frame = sum(num_patches_each_scale)
+        # Ensure AutoGaze model is on GPU for inference
+        if need_autogaze:
+            current_device = next(self._autogaze_model.parameters()).device
+            if current_device != autogaze_device:
+                self._autogaze_model = self._autogaze_model.to(autogaze_device)
+        # --- Tiles ---
+        if skip_tiles:
+            total_tiles = sum(num_tiles_per_video)
+            T_tile = siglip_tiles[0].shape[1]
+            per_frame_pos = torch.arange(total_patches_per_frame, device=device, dtype=torch.long)
+            tiles_gazing_pos = per_frame_pos.repeat(T_tile).unsqueeze(0).expand(total_tiles, -1).contiguous()
+            tiles_if_padded = torch.zeros(
+                total_tiles, T_tile * total_patches_per_frame, device=device, dtype=torch.bool
+            )
+            tiles_num_gazing = torch.full(
+                (total_tiles, T_tile), total_patches_per_frame, device=device, dtype=torch.long
+            )
+        else:
+            tiles_autogaze = videos_inputs.get("pixel_values_videos_tiles_autogaze")
+            if tiles_autogaze is None:
+                return None
+            all_tiles = torch.cat(tiles_autogaze, dim=0)
+            tiles_gazing_pos, tiles_if_padded, tiles_num_gazing = self._run_autogaze_batched(
+                all_tiles, autogaze_device, device,
+                self.gazing_ratio_tile, self.task_loss_requirement_tile,
+            )
+            tiles_gazing_pos = self._sort_gazing_pos_per_frame(
+                tiles_gazing_pos, tiles_if_padded, tiles_num_gazing
+            )
+        # --- Thumbnails ---
+        if skip_thumbnails:
+            total_thumbs = sum(num_thumbs_per_video)
+            per_thumb_pos = torch.arange(
+                total_patches_per_frame, device=device, dtype=torch.long
+            )
+            thumbs_gazing_pos = per_thumb_pos.unsqueeze(0).expand(total_thumbs, -1).contiguous()
+            thumbs_if_padded = torch.zeros_like(thumbs_gazing_pos, dtype=torch.bool)
+            thumbs_num_gazing = torch.full(
+                (total_thumbs, 1), total_patches_per_frame,
+                device=device, dtype=torch.long,
+            )
+        else:
+            thumbs_autogaze = videos_inputs.get("pixel_values_videos_thumbnails_autogaze")
+            if thumbs_autogaze is None:
+                return None
+            all_thumbs = torch.cat(thumbs_autogaze, dim=0)
+            thumbs_gazing_pos, thumbs_if_padded, thumbs_num_gazing = self._run_autogaze_batched(
+                all_thumbs, autogaze_device, device,
+                self.gazing_ratio_thumbnail, self.task_loss_requirement_thumbnail,
+            )
+            thumbs_gazing_pos = self._sort_gazing_pos_per_frame(
+                thumbs_gazing_pos, thumbs_if_padded, thumbs_num_gazing
+            )
+        # --- Split results back per video ---
+        tiles_gazing_pos_list = list(torch.split(tiles_gazing_pos, num_tiles_per_video, dim=0))
+        tiles_if_padded_list = list(torch.split(tiles_if_padded, num_tiles_per_video, dim=0))
+        tiles_num_gazing_list = list(torch.split(tiles_num_gazing, num_tiles_per_video, dim=0))
+        thumbs_gazing_pos_list = list(torch.split(thumbs_gazing_pos, num_thumbs_per_video, dim=0))
+        thumbs_if_padded_list = list(torch.split(thumbs_if_padded, num_thumbs_per_video, dim=0))
+        thumbs_num_gazing_list = list(torch.split(thumbs_num_gazing, num_thumbs_per_video, dim=0))
+        return {
+            "gazing_pos_tiles": tiles_gazing_pos_list,
+            "num_gazing_each_frame_tiles": tiles_num_gazing_list,
+            "if_padded_gazing_tiles": tiles_if_padded_list,
+            "gazing_pos_thumbnails": thumbs_gazing_pos_list,
+            "num_gazing_each_frame_thumbnails": thumbs_num_gazing_list,
+            "if_padded_gazing_thumbnails": thumbs_if_padded_list,
+        }

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_nvila.NVILAProcessor"
+  },
+  "processor_class": "NVILAProcessor"
+}

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,793 @@

+{
+  "metadata": {
+    "total_size": 16174169312
+  },
+  "weight_map": {
+    "llm.lm_head.weight": "model-00001-of-00004.safetensors",
+    "llm.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.norm.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors",
+    "vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.1.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.1.weight": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.2.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.2.weight": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.4.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.4.weight": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.5.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.5.weight": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.7.bias": "model-00004-of-00004.safetensors",
+    "mm_projector.layers.7.weight": "model-00004-of-00004.safetensors"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "[BOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image>",
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sentinel_token": "<vila/sentinel>",
+  "video_token": "<vila/video>"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,96 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<vila/sentinel>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<vila/video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "auto_map": {
+    "AutoProcessor": "processing_nvila.NVILAProcessor"
+  },
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "image_token": "<image>",
+    "sentinel_token": "<vila/sentinel>",
+    "video_token": "<vila/video>"
+  },
+  "image_token": "<image>",
+  "legacy": false,
+  "model_max_length": 40960,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "processor_class": "NVILAProcessor",
+  "sentinel_token": "<vila/sentinel>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<vila/video>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff