Upload folder using huggingface_hub
Browse files- .gitattributes +5 -0
- README.md +355 -0
- __pycache__/modeling_moss_vl.cpython-312.pyc +3 -0
- assets/3d-rope.png +3 -0
- assets/logo.png +3 -0
- assets/structure.png +3 -0
- assets/timestamp_input.svg +78 -0
- chat_template.json +3 -0
- config.json +83 -0
- configuration_moss_vl.py +164 -0
- generation_config.json +6 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model.safetensors.index.json +902 -0
- modeling_moss_vl.py +0 -0
- preprocessor_config.json +26 -0
- processing_moss_vl.py +1079 -0
- requirements.txt +15 -0
- tokenizer.json +3 -0
- tokenizer_config.json +258 -0
- video_preprocessor_config.json +30 -0
- video_processing_moss_vl.py +1132 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
__pycache__/modeling_moss_vl.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/3d-rope.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/logo.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/structure.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MOSS-VL-SFT-0408
|
| 3 |
+
date: 2026-04-08
|
| 4 |
+
category: Multimodal-LLM
|
| 5 |
+
status: SFT
|
| 6 |
+
language:
|
| 7 |
+
- en
|
| 8 |
+
library_name: transformers
|
| 9 |
+
pipeline_tag: video-text-to-text
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
base_model: fnlp-vision/moss-video-preview-base
|
| 12 |
+
tags:
|
| 13 |
+
- SFT
|
| 14 |
+
- Video-Understanding
|
| 15 |
+
- Image-Understanding
|
| 16 |
+
- MOSS-VL
|
| 17 |
+
- OpenMOSS
|
| 18 |
+
- multimodal
|
| 19 |
+
- video
|
| 20 |
+
- vision-language
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
<p align="center">
|
| 24 |
+
<img src="assets/logo.png" width="320"/>
|
| 25 |
+
</p>
|
| 26 |
+
|
| 27 |
+
# MOSS-VL-SFT-0408
|
| 28 |
+
|
| 29 |
+
## 📌 Introduction
|
| 30 |
+
|
| 31 |
+
We introduce **MOSS-VL-SFT-0408**, the supervised fine-tuned checkpoint in the **MOSS-VL** series (part of the **OpenMOSS** ecosystem).
|
| 32 |
+
|
| 33 |
+
> [!IMPORTANT]
|
| 34 |
+
> This is an **SFT** checkpoint (instruction-tuned). It is **NOT** the Real-Time SFT streaming checkpoint.
|
| 35 |
+
|
| 36 |
+
This model is designed as a high-performance offline engine for multimodal tasks, bridging the gap between static image understanding and dynamic real-time interaction.
|
| 37 |
+
|
| 38 |
+
### This checkpoint is intended for:
|
| 39 |
+
|
| 40 |
+
- **video/image understanding** with significantly improved instruction following capabilities.
|
| 41 |
+
- Serving as a **strong starting point** for further **Real-Time SFT** or specific domain adaptation.
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
## 🚀 Key Features & Status
|
| 46 |
+
|
| 47 |
+
| Feature | Status | Description |
|
| 48 |
+
| :--- | :---: | :--- |
|
| 49 |
+
| **Model Loading** | ✅ | Standard HF loading with `trust_remote_code=True` |
|
| 50 |
+
| **Image Understanding** | ✅ | Single/Multi-image input support |
|
| 51 |
+
| **Video Understanding** | ✅ | Native video frame sequence processing |
|
| 52 |
+
| **Mixed Inference** | ✅ | Interleaved image and video inputs |
|
| 53 |
+
| **Offline Generation** | ✅ | Optimized `offline_generate` & `offline_batch_generate` |
|
| 54 |
+
| **Benchmarks/Metrics** | ⏳ | Coming in future updates |
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## 🏗 Model Architecture
|
| 59 |
+
|
| 60 |
+
**MOSS-VL-SFT-0408** adopts a decoupled multimodal design, utilizing a cross-attention mechanism to bridge high-resolution visual encoding with advanced language reasoning.
|
| 61 |
+
|
| 62 |
+
<p align="center">
|
| 63 |
+
<img src="assets/structure.png" alt="MOSS-VL Architecture" width="90%"/>
|
| 64 |
+
<br>
|
| 65 |
+
<em>Figure 1: MOSS-VL Core Architecture.</em>
|
| 66 |
+
</p>
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
## Temporal-Aware Prompting
|
| 70 |
+
|
| 71 |
+
At the model-family level, MOSS-VL uses timestamp-aware multimodal prompting for video understanding. This design gives sampled frames explicit temporal anchors, which helps the model reason about order, duration, and event localization more robustly.
|
| 72 |
+
|
| 73 |
+
<p align="center">
|
| 74 |
+
<img src="assets/timestamp_input.svg" alt="Timestamped Sequence Input Illustration" width="90%"/>
|
| 75 |
+
<br>
|
| 76 |
+
<em>Figure 2: Illustration of the timestamped sequence input pipeline.</em>
|
| 77 |
+
</p>
|
| 78 |
+
|
| 79 |
+
## Multimodal RoPE
|
| 80 |
+
|
| 81 |
+
MOSS-VL uses multimodal rotary position encoding to align text tokens and visual features in a shared spatial-temporal coordinate system. At a high level, this improves video-text grounding and helps preserve temporal structure during multimodal reasoning.
|
| 82 |
+
|
| 83 |
+
<p align="center">
|
| 84 |
+
<img src="assets/3d-rope.png" alt="MOSS-VL mRoPE Architecture Illustration" width="80%"/>
|
| 85 |
+
<br>
|
| 86 |
+
<em>Figure 3: 3D-RoPE spatial-temporal alignment.</em>
|
| 87 |
+
</p>
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
## 🚀 Quickstart
|
| 93 |
+
|
| 94 |
+
<details>
|
| 95 |
+
<summary><strong>Queue-based offline inference (Python)</strong></summary>
|
| 96 |
+
|
| 97 |
+
<br>
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
import os
|
| 101 |
+
import queue
|
| 102 |
+
import threading
|
| 103 |
+
|
| 104 |
+
import torch
|
| 105 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 106 |
+
|
| 107 |
+
checkpoint = "path/to/checkpoint"
|
| 108 |
+
video_path = "data/example_video.mp4"
|
| 109 |
+
prompt = "Describe the video."
|
| 110 |
+
|
| 111 |
+
max_new_tokens = 1024
|
| 112 |
+
temperature = 1.0
|
| 113 |
+
top_k = 50
|
| 114 |
+
top_p = 1.0
|
| 115 |
+
repetition_penalty = 1.0
|
| 116 |
+
|
| 117 |
+
video_fps = 1.0
|
| 118 |
+
video_minlen = 8
|
| 119 |
+
video_maxlen = 256
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def load_model(checkpoint: str):
|
| 123 |
+
processor = AutoProcessor.from_pretrained(
|
| 124 |
+
checkpoint,
|
| 125 |
+
trust_remote_code=True,
|
| 126 |
+
frame_extract_num_threads=1,
|
| 127 |
+
)
|
| 128 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 129 |
+
checkpoint,
|
| 130 |
+
trust_remote_code=True,
|
| 131 |
+
device_map="auto",
|
| 132 |
+
torch_dtype=torch.bfloat16,
|
| 133 |
+
attn_implementation="flash_attention_2",
|
| 134 |
+
)
|
| 135 |
+
return model, processor
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if not checkpoint:
|
| 139 |
+
raise ValueError("Missing `checkpoint`.")
|
| 140 |
+
if not video_path:
|
| 141 |
+
raise ValueError("Missing `video_path`.")
|
| 142 |
+
if not os.path.isfile(video_path):
|
| 143 |
+
raise FileNotFoundError(f"Video not found: {video_path}")
|
| 144 |
+
|
| 145 |
+
model, processor = load_model(checkpoint)
|
| 146 |
+
new_queries: "queue.Queue[dict]" = queue.Queue()
|
| 147 |
+
output_text_queue: "queue.Queue[str]" = queue.Queue()
|
| 148 |
+
|
| 149 |
+
query = {
|
| 150 |
+
"prompt": prompt,
|
| 151 |
+
"images": [],
|
| 152 |
+
"videos": [video_path],
|
| 153 |
+
"media_kwargs": {
|
| 154 |
+
"video_fps": video_fps,
|
| 155 |
+
"video_minlen": video_minlen,
|
| 156 |
+
"video_maxlen": video_maxlen,
|
| 157 |
+
},
|
| 158 |
+
"generate_kwargs": {
|
| 159 |
+
"temperature": temperature,
|
| 160 |
+
"top_k": top_k,
|
| 161 |
+
"top_p": top_p,
|
| 162 |
+
"max_new_tokens": max_new_tokens,
|
| 163 |
+
"repetition_penalty": repetition_penalty,
|
| 164 |
+
"do_sample": False,
|
| 165 |
+
},
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def drain_output():
|
| 170 |
+
while True:
|
| 171 |
+
tok = output_text_queue.get()
|
| 172 |
+
if tok == "<|round_end|>":
|
| 173 |
+
break
|
| 174 |
+
print(tok, end="", flush=True)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
worker = threading.Thread(
|
| 178 |
+
target=model.offline_generate,
|
| 179 |
+
args=(processor, new_queries, output_text_queue),
|
| 180 |
+
kwargs={"vision_chunked_length": 64},
|
| 181 |
+
daemon=True,
|
| 182 |
+
)
|
| 183 |
+
worker.start()
|
| 184 |
+
|
| 185 |
+
new_queries.put(query)
|
| 186 |
+
drain_output()
|
| 187 |
+
|
| 188 |
+
new_queries.put({"stop_offline_generate": True})
|
| 189 |
+
worker.join(timeout=5.0)
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
For image-only usage, keep the same template and change:
|
| 193 |
+
|
| 194 |
+
- replace `video_path` with `image_path`
|
| 195 |
+
- validate `image_path` instead of `video_path`
|
| 196 |
+
- set `images` to `[image_path]`
|
| 197 |
+
- set `videos` to `[]`
|
| 198 |
+
- remove `media_kwargs` if you do not need video-specific controls
|
| 199 |
+
|
| 200 |
+
</details>
|
| 201 |
+
|
| 202 |
+
<details>
|
| 203 |
+
<summary><strong>Batched offline inference (Python)</strong></summary>
|
| 204 |
+
|
| 205 |
+
<br>
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
import torch
|
| 209 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 210 |
+
|
| 211 |
+
checkpoint = "path/to/checkpoint"
|
| 212 |
+
|
| 213 |
+
shared_generate_kwargs = {
|
| 214 |
+
"temperature": 1.0,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"max_new_tokens": 256,
|
| 218 |
+
"repetition_penalty": 1.0,
|
| 219 |
+
"do_sample": False,
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
shared_media_kwargs = {
|
| 223 |
+
"video_fps": 1.0,
|
| 224 |
+
"video_minlen": 8,
|
| 225 |
+
"video_maxlen": 256,
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def load_model(checkpoint: str):
|
| 230 |
+
processor = AutoProcessor.from_pretrained(
|
| 231 |
+
checkpoint,
|
| 232 |
+
trust_remote_code=True,
|
| 233 |
+
frame_extract_num_threads=1,
|
| 234 |
+
)
|
| 235 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 236 |
+
checkpoint,
|
| 237 |
+
trust_remote_code=True,
|
| 238 |
+
device_map="auto",
|
| 239 |
+
torch_dtype=torch.bfloat16,
|
| 240 |
+
attn_implementation="flash_attention_2",
|
| 241 |
+
)
|
| 242 |
+
return model, processor
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
model, processor = load_model(checkpoint)
|
| 246 |
+
queries = [
|
| 247 |
+
{
|
| 248 |
+
"prompt": "Describe sample A.",
|
| 249 |
+
"images": [],
|
| 250 |
+
"videos": ["data/sample_a.mp4"],
|
| 251 |
+
"media_kwargs": dict(shared_media_kwargs),
|
| 252 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"prompt": "Describe sample B.",
|
| 256 |
+
"images": [],
|
| 257 |
+
"videos": ["data/sample_b.mp4"],
|
| 258 |
+
"media_kwargs": dict(shared_media_kwargs),
|
| 259 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 260 |
+
},
|
| 261 |
+
]
|
| 262 |
+
|
| 263 |
+
with torch.no_grad():
|
| 264 |
+
result = model.offline_batch_generate(
|
| 265 |
+
processor,
|
| 266 |
+
queries,
|
| 267 |
+
session_states=None,
|
| 268 |
+
vision_chunked_length=64,
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
texts = [item["text"] for item in result["results"]]
|
| 272 |
+
session_states = result["session_states"]
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
```python
|
| 276 |
+
followup_queries = [
|
| 277 |
+
{
|
| 278 |
+
"prompt": "Summarize sample A in one sentence.",
|
| 279 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"prompt": "Restart sample B and answer again.",
|
| 283 |
+
"reset_session": True,
|
| 284 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 285 |
+
},
|
| 286 |
+
]
|
| 287 |
+
|
| 288 |
+
with torch.no_grad():
|
| 289 |
+
followup_result = model.offline_batch_generate(
|
| 290 |
+
processor,
|
| 291 |
+
followup_queries,
|
| 292 |
+
session_states=session_states,
|
| 293 |
+
vision_chunked_length=64,
|
| 294 |
+
)
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
</details>
|
| 298 |
+
|
| 299 |
+
## Intended Use
|
| 300 |
+
|
| 301 |
+
- offline image understanding
|
| 302 |
+
- offline video understanding
|
| 303 |
+
- multimodal prompt experiments for release validation
|
| 304 |
+
- checkpoint-level inference integration and debugging
|
| 305 |
+
|
| 306 |
+
## Requirements
|
| 307 |
+
|
| 308 |
+
Core validated inference dependencies:
|
| 309 |
+
|
| 310 |
+
- `python==3.12.13`
|
| 311 |
+
- `torch==2.8.0+cu128`
|
| 312 |
+
- `torchvision==0.23.0+cu128`
|
| 313 |
+
- `transformers==4.57.1`
|
| 314 |
+
- `accelerate==1.12.0`
|
| 315 |
+
- `flash_attn==2.8.1`
|
| 316 |
+
- `torchcodec==0.7.0`
|
| 317 |
+
- `numpy==2.4.3`
|
| 318 |
+
- `pillow==12.1.1`
|
| 319 |
+
- `joblib==1.5.2`
|
| 320 |
+
- `einops==0.8.2`
|
| 321 |
+
|
| 322 |
+
Installation commands:
|
| 323 |
+
|
| 324 |
+
```bash
|
| 325 |
+
conda create -n moss_vl python=3.12 pip -y
|
| 326 |
+
conda activate moss_vl
|
| 327 |
+
pip install -i https://pypi.org/simple --no-build-isolation -r requirements.txt
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
Validated setup notes:
|
| 331 |
+
|
| 332 |
+
- CUDA runtime used for validation: `12.8`
|
| 333 |
+
- Inference loading uses `trust_remote_code=True` and `attn_implementation="flash_attention_2"`
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
## Limitations and Future Work
|
| 337 |
+
|
| 338 |
+
- realtime usage is not documented here
|
| 339 |
+
- benchmark, metric, and training details are still blank
|
| 340 |
+
- some sections are intentionally placeholders until release information is finalized
|
| 341 |
+
- batch calls currently require shared `generate_kwargs` and shared `media_kwargs` within one call
|
| 342 |
+
- batch streaming and batch cancel / stop protocol are not part of `offline_batch_generate(...)`
|
| 343 |
+
- the queue example is intentionally minimal and does not include production-grade timeout or worker error handling
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
## Citation
|
| 347 |
+
```bibtex
|
| 348 |
+
@misc{moss_vl_2026,
|
| 349 |
+
title = {{MOSS-VL Technical Report}},
|
| 350 |
+
author = {OpenMOSS Team},
|
| 351 |
+
year = {2026},
|
| 352 |
+
howpublished = {\url{https://github.com/fnlp-vision/MOSS-VL}},
|
| 353 |
+
note = {GitHub repository}
|
| 354 |
+
}
|
| 355 |
+
```
|
__pycache__/modeling_moss_vl.cpython-312.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0c2dac0006732b9b43f3257298ad053f74aec76cab84967d3740ad5fdde54e1
|
| 3 |
+
size 126448
|
assets/3d-rope.png
ADDED
|
Git LFS Details
|
assets/logo.png
ADDED
|
Git LFS Details
|
assets/structure.png
ADDED
|
Git LFS Details
|
assets/timestamp_input.svg
ADDED
|
|
chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n 
<|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n 
{%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
|
| 3 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"MossVLForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoConfig": "configuration_moss_vl.MossVLConfig",
|
| 7 |
+
"AutoModel": "modeling_moss_vl.MossVLForConditionalGeneration",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_moss_vl.MossVLForConditionalGeneration"
|
| 9 |
+
},
|
| 10 |
+
"dtype": "bfloat16",
|
| 11 |
+
"image_token_id": 151655,
|
| 12 |
+
"model_type": "moss_vl",
|
| 13 |
+
"text_config": {
|
| 14 |
+
"attention_bias": false,
|
| 15 |
+
"attention_dropout": 0.0,
|
| 16 |
+
"bos_token_id": 151643,
|
| 17 |
+
"cross_attention_layers": [
|
| 18 |
+
2,
|
| 19 |
+
6,
|
| 20 |
+
10,
|
| 21 |
+
14,
|
| 22 |
+
18,
|
| 23 |
+
22,
|
| 24 |
+
26,
|
| 25 |
+
30,
|
| 26 |
+
34,
|
| 27 |
+
38,
|
| 28 |
+
42,
|
| 29 |
+
46
|
| 30 |
+
],
|
| 31 |
+
"dtype": "bfloat16",
|
| 32 |
+
"eos_token_id": 151645,
|
| 33 |
+
"head_dim": 128,
|
| 34 |
+
"hidden_act": "silu",
|
| 35 |
+
"hidden_size": 4096,
|
| 36 |
+
"initializer_range": 0.02,
|
| 37 |
+
"intermediate_size": 12288,
|
| 38 |
+
"max_position_embeddings": 262144,
|
| 39 |
+
"model_type": "moss_vl_text",
|
| 40 |
+
"num_attention_heads": 32,
|
| 41 |
+
"num_hidden_layers": 48,
|
| 42 |
+
"num_key_value_heads": 8,
|
| 43 |
+
"rms_norm_eps": 1e-06,
|
| 44 |
+
"rope_scaling": {
|
| 45 |
+
"mrope_interleaved": true,
|
| 46 |
+
"mrope_section": [
|
| 47 |
+
24,
|
| 48 |
+
20,
|
| 49 |
+
20
|
| 50 |
+
],
|
| 51 |
+
"rope_type": "default"
|
| 52 |
+
},
|
| 53 |
+
"rope_theta": 5000000,
|
| 54 |
+
"use_cache": true,
|
| 55 |
+
"vocab_size": 151936
|
| 56 |
+
},
|
| 57 |
+
"tie_word_embeddings": false,
|
| 58 |
+
"transformers_version": "4.57.1",
|
| 59 |
+
"video_token_id": 151656,
|
| 60 |
+
"vision_config": {
|
| 61 |
+
"deepstack_visual_indexes": [
|
| 62 |
+
8,
|
| 63 |
+
16,
|
| 64 |
+
24
|
| 65 |
+
],
|
| 66 |
+
"depth": 27,
|
| 67 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 68 |
+
"hidden_size": 1152,
|
| 69 |
+
"in_channels": 3,
|
| 70 |
+
"initializer_range": 0.02,
|
| 71 |
+
"intermediate_size": 4304,
|
| 72 |
+
"model_type": "moss_vl_vision",
|
| 73 |
+
"num_heads": 16,
|
| 74 |
+
"num_position_embeddings": 2304,
|
| 75 |
+
"out_hidden_size": 4096,
|
| 76 |
+
"patch_size": 16,
|
| 77 |
+
"spatial_merge_size": 2,
|
| 78 |
+
"temporal_patch_size": 1
|
| 79 |
+
},
|
| 80 |
+
"vision_end_token_id": 151653,
|
| 81 |
+
"vision_seq_pad_multiple": 8,
|
| 82 |
+
"vision_start_token_id": 151652
|
| 83 |
+
}
|
configuration_moss_vl.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""MossVL model configuration"""
|
| 16 |
+
|
| 17 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 18 |
+
from transformers.modeling_rope_utils import rope_config_validation
|
| 19 |
+
from transformers.utils import logging
|
| 20 |
+
|
| 21 |
+
logger = logging.get_logger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MossVLVisionConfig(PretrainedConfig):
|
| 25 |
+
"""
|
| 26 |
+
Configuration for MossVL Vision Model
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
model_type = "moss_vl_vision"
|
| 30 |
+
base_config_key = "vision_config"
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self,
|
| 34 |
+
depth=27,
|
| 35 |
+
hidden_size=1152,
|
| 36 |
+
hidden_act="gelu_pytorch_tanh",
|
| 37 |
+
intermediate_size=4304,
|
| 38 |
+
num_heads=16,
|
| 39 |
+
in_channels=3,
|
| 40 |
+
patch_size=16,
|
| 41 |
+
spatial_merge_size=2,
|
| 42 |
+
temporal_patch_size=1,
|
| 43 |
+
out_hidden_size=3584,
|
| 44 |
+
num_position_embeddings=2304,
|
| 45 |
+
deepstack_visual_indexes=[8, 16, 24],
|
| 46 |
+
initializer_range=0.02,
|
| 47 |
+
**kwargs,
|
| 48 |
+
):
|
| 49 |
+
super().__init__(**kwargs)
|
| 50 |
+
self.depth = depth
|
| 51 |
+
self.hidden_size = hidden_size
|
| 52 |
+
self.hidden_act = hidden_act
|
| 53 |
+
self.intermediate_size = intermediate_size
|
| 54 |
+
self.num_heads = num_heads
|
| 55 |
+
self.in_channels = in_channels
|
| 56 |
+
self.patch_size = patch_size
|
| 57 |
+
self.spatial_merge_size = spatial_merge_size
|
| 58 |
+
self.temporal_patch_size = temporal_patch_size
|
| 59 |
+
self.out_hidden_size = out_hidden_size
|
| 60 |
+
self.num_position_embeddings = num_position_embeddings
|
| 61 |
+
self.initializer_range = initializer_range
|
| 62 |
+
self.deepstack_visual_indexes = deepstack_visual_indexes
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class MossVLTextConfig(PretrainedConfig):
|
| 66 |
+
"""
|
| 67 |
+
Configuration for MossVL Text Model
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
model_type = "moss_vl_text"
|
| 71 |
+
base_config_key = "text_config"
|
| 72 |
+
|
| 73 |
+
def __init__(
|
| 74 |
+
self,
|
| 75 |
+
vocab_size=151936,
|
| 76 |
+
hidden_size=4096,
|
| 77 |
+
intermediate_size=22016,
|
| 78 |
+
num_hidden_layers=32,
|
| 79 |
+
num_attention_heads=32,
|
| 80 |
+
num_key_value_heads=32,
|
| 81 |
+
head_dim=128,
|
| 82 |
+
hidden_act="silu",
|
| 83 |
+
max_position_embeddings=128000,
|
| 84 |
+
initializer_range=0.02,
|
| 85 |
+
rms_norm_eps=1e-6,
|
| 86 |
+
use_cache=True,
|
| 87 |
+
tie_word_embeddings=False,
|
| 88 |
+
rope_theta=5000000.0,
|
| 89 |
+
rope_scaling=None,
|
| 90 |
+
attention_bias=False,
|
| 91 |
+
attention_dropout=0.0,
|
| 92 |
+
# Cross attention specific
|
| 93 |
+
cross_attention_layers=None, # List of layer indices to insert cross attention
|
| 94 |
+
**kwargs,
|
| 95 |
+
):
|
| 96 |
+
|
| 97 |
+
self.vocab_size = vocab_size
|
| 98 |
+
self.max_position_embeddings = max_position_embeddings
|
| 99 |
+
self.hidden_size = hidden_size
|
| 100 |
+
self.intermediate_size = intermediate_size
|
| 101 |
+
self.num_hidden_layers = num_hidden_layers
|
| 102 |
+
self.num_attention_heads = num_attention_heads
|
| 103 |
+
|
| 104 |
+
# for backward compatibility
|
| 105 |
+
if num_key_value_heads is None:
|
| 106 |
+
num_key_value_heads = num_attention_heads
|
| 107 |
+
|
| 108 |
+
self.num_key_value_heads = num_key_value_heads
|
| 109 |
+
self.head_dim = head_dim
|
| 110 |
+
self.hidden_act = hidden_act
|
| 111 |
+
self.initializer_range = initializer_range
|
| 112 |
+
self.rms_norm_eps = rms_norm_eps
|
| 113 |
+
self.use_cache = use_cache
|
| 114 |
+
self.rope_theta = rope_theta
|
| 115 |
+
self.rope_scaling = rope_scaling
|
| 116 |
+
self.attention_bias = attention_bias
|
| 117 |
+
self.attention_dropout = attention_dropout
|
| 118 |
+
|
| 119 |
+
rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
|
| 120 |
+
self.cross_attention_layers = cross_attention_layers or [2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
|
| 121 |
+
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
| 122 |
+
|
| 123 |
+
class MossVLConfig(PretrainedConfig):
|
| 124 |
+
"""
|
| 125 |
+
Configuration for MossVL Model
|
| 126 |
+
"""
|
| 127 |
+
|
| 128 |
+
model_type = "moss_vl"
|
| 129 |
+
sub_configs = {"vision_config": MossVLVisionConfig, "text_config": MossVLTextConfig}
|
| 130 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
| 131 |
+
|
| 132 |
+
def __init__(
|
| 133 |
+
self,
|
| 134 |
+
text_config=None,
|
| 135 |
+
vision_config=None,
|
| 136 |
+
image_token_id=151655,
|
| 137 |
+
video_token_id=151656,
|
| 138 |
+
vision_start_token_id=151652,
|
| 139 |
+
vision_end_token_id=151653,
|
| 140 |
+
vision_seq_pad_multiple=8,
|
| 141 |
+
tie_word_embeddings=False,
|
| 142 |
+
**kwargs,
|
| 143 |
+
):
|
| 144 |
+
if isinstance(vision_config, dict):
|
| 145 |
+
self.vision_config = self.sub_configs["vision_config"](**vision_config)
|
| 146 |
+
elif vision_config is None:
|
| 147 |
+
self.vision_config = self.sub_configs["vision_config"]()
|
| 148 |
+
|
| 149 |
+
if isinstance(text_config, dict):
|
| 150 |
+
self.text_config = self.sub_configs["text_config"](**text_config)
|
| 151 |
+
elif text_config is None:
|
| 152 |
+
self.text_config = self.sub_configs["text_config"]()
|
| 153 |
+
|
| 154 |
+
self.image_token_id = image_token_id
|
| 155 |
+
self.video_token_id = video_token_id
|
| 156 |
+
self.vision_start_token_id = vision_start_token_id
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
self.vision_end_token_id = vision_end_token_id
|
| 160 |
+
self.vision_seq_pad_multiple = vision_seq_pad_multiple
|
| 161 |
+
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
__all__ = ["MossVLConfig", "MossVLTextConfig"]
|
generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 151643,
|
| 4 |
+
"eos_token_id": 151645,
|
| 5 |
+
"transformers_version": "4.57.1"
|
| 6 |
+
}
|
model-00001-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e06b965b124358c760daf4fd0df1f4f96fd3489ec1acf2df07f8cc30228f6470
|
| 3 |
+
size 5274500800
|
model-00002-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b30b6c802724afa309d1b46eec351fa3b935b7e241d4203d02217e29cd42e02
|
| 3 |
+
size 5360568508
|
model-00003-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48cce2c73d9e62b9af51ceaacbb35d59f56e062c4bd5fa006ef53a47e9b6070c
|
| 3 |
+
size 5360577920
|
model-00004-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5a162acab8d8b9dc63c0b06f840db983babe44813701ce8ee937aef4e621269
|
| 3 |
+
size 5366957460
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,902 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 22672742416
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"model.language_model.embed_tokens.weight": "model-00001-of-00005.safetensors",
|
| 7 |
+
"model.separator_token": "model-00001-of-00005.safetensors",
|
| 8 |
+
"model.language_model.norm.weight": "model-00001-of-00005.safetensors",
|
| 9 |
+
"lm_head.weight": "model-00001-of-00005.safetensors",
|
| 10 |
+
"model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 11 |
+
"model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 12 |
+
"model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 13 |
+
"model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 14 |
+
"model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 15 |
+
"model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 16 |
+
"model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 17 |
+
"model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 18 |
+
"model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 19 |
+
"model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 20 |
+
"model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 21 |
+
"model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 22 |
+
"model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 23 |
+
"model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 24 |
+
"model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 25 |
+
"model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 26 |
+
"model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 27 |
+
"model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 28 |
+
"model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 29 |
+
"model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 30 |
+
"model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 31 |
+
"model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 32 |
+
"model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 33 |
+
"model.language_model.layers.2.cross_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 34 |
+
"model.language_model.layers.2.cross_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 35 |
+
"model.language_model.layers.2.cross_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 36 |
+
"model.language_model.layers.2.cross_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 37 |
+
"model.language_model.layers.2.cross_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 38 |
+
"model.language_model.layers.2.cross_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 39 |
+
"model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 40 |
+
"model.language_model.layers.2.cross_attn_attn_gate": "model-00001-of-00005.safetensors",
|
| 41 |
+
"model.language_model.layers.2.cross_attn_mlp_gate": "model-00001-of-00005.safetensors",
|
| 42 |
+
"model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 43 |
+
"model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 44 |
+
"model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 45 |
+
"model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 46 |
+
"model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 47 |
+
"model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 48 |
+
"model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 49 |
+
"model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 50 |
+
"model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 51 |
+
"model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 52 |
+
"model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 53 |
+
"model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 54 |
+
"model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 55 |
+
"model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 56 |
+
"model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 57 |
+
"model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 58 |
+
"model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 59 |
+
"model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 60 |
+
"model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 61 |
+
"model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 62 |
+
"model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 63 |
+
"model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 64 |
+
"model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 65 |
+
"model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 66 |
+
"model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 67 |
+
"model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 68 |
+
"model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 69 |
+
"model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 70 |
+
"model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 71 |
+
"model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 72 |
+
"model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 73 |
+
"model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 74 |
+
"model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 75 |
+
"model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 76 |
+
"model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 77 |
+
"model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 78 |
+
"model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 79 |
+
"model.language_model.layers.6.cross_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 80 |
+
"model.language_model.layers.6.cross_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 81 |
+
"model.language_model.layers.6.cross_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 82 |
+
"model.language_model.layers.6.cross_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 83 |
+
"model.language_model.layers.6.cross_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 84 |
+
"model.language_model.layers.6.cross_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 85 |
+
"model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 86 |
+
"model.language_model.layers.6.cross_attn_attn_gate": "model-00001-of-00005.safetensors",
|
| 87 |
+
"model.language_model.layers.6.cross_attn_mlp_gate": "model-00001-of-00005.safetensors",
|
| 88 |
+
"model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 89 |
+
"model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 90 |
+
"model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 91 |
+
"model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 92 |
+
"model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 93 |
+
"model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 94 |
+
"model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 95 |
+
"model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 96 |
+
"model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 97 |
+
"model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 98 |
+
"model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 99 |
+
"model.language_model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 100 |
+
"model.language_model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 101 |
+
"model.language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 102 |
+
"model.language_model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 103 |
+
"model.language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 104 |
+
"model.language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 105 |
+
"model.language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 106 |
+
"model.language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 107 |
+
"model.language_model.layers.8.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 108 |
+
"model.language_model.layers.8.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 109 |
+
"model.language_model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 110 |
+
"model.language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 111 |
+
"model.language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 112 |
+
"model.language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 113 |
+
"model.language_model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 114 |
+
"model.language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 115 |
+
"model.language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 116 |
+
"model.language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 117 |
+
"model.language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 118 |
+
"model.language_model.layers.9.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 119 |
+
"model.language_model.layers.9.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 120 |
+
"model.language_model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 121 |
+
"model.language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 122 |
+
"model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 123 |
+
"model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 124 |
+
"model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 125 |
+
"model.language_model.layers.10.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 126 |
+
"model.language_model.layers.10.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 127 |
+
"model.language_model.layers.10.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 128 |
+
"model.language_model.layers.10.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 129 |
+
"model.language_model.layers.10.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 130 |
+
"model.language_model.layers.10.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 131 |
+
"model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 132 |
+
"model.language_model.layers.10.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 133 |
+
"model.language_model.layers.10.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 134 |
+
"model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 135 |
+
"model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 136 |
+
"model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 137 |
+
"model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 138 |
+
"model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 139 |
+
"model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 140 |
+
"model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 141 |
+
"model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 142 |
+
"model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 143 |
+
"model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 144 |
+
"model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 145 |
+
"model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 146 |
+
"model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 147 |
+
"model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 148 |
+
"model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 149 |
+
"model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 150 |
+
"model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 151 |
+
"model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 152 |
+
"model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 153 |
+
"model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 154 |
+
"model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 155 |
+
"model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 156 |
+
"model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 157 |
+
"model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 158 |
+
"model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 159 |
+
"model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 160 |
+
"model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 161 |
+
"model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 162 |
+
"model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 163 |
+
"model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 164 |
+
"model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 165 |
+
"model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 166 |
+
"model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 167 |
+
"model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 168 |
+
"model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 169 |
+
"model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 170 |
+
"model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 171 |
+
"model.language_model.layers.14.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 172 |
+
"model.language_model.layers.14.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 173 |
+
"model.language_model.layers.14.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 174 |
+
"model.language_model.layers.14.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 175 |
+
"model.language_model.layers.14.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 176 |
+
"model.language_model.layers.14.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 177 |
+
"model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 178 |
+
"model.language_model.layers.14.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 179 |
+
"model.language_model.layers.14.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 180 |
+
"model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 181 |
+
"model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 182 |
+
"model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 183 |
+
"model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 184 |
+
"model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 185 |
+
"model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 186 |
+
"model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 187 |
+
"model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 188 |
+
"model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 189 |
+
"model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 190 |
+
"model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 191 |
+
"model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 192 |
+
"model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 193 |
+
"model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 194 |
+
"model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 195 |
+
"model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 196 |
+
"model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 197 |
+
"model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 198 |
+
"model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 199 |
+
"model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 200 |
+
"model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 201 |
+
"model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 202 |
+
"model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 203 |
+
"model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 204 |
+
"model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 205 |
+
"model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 206 |
+
"model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 207 |
+
"model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 208 |
+
"model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 209 |
+
"model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 210 |
+
"model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 211 |
+
"model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 212 |
+
"model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 213 |
+
"model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 214 |
+
"model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 215 |
+
"model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 216 |
+
"model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 217 |
+
"model.language_model.layers.18.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 218 |
+
"model.language_model.layers.18.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 219 |
+
"model.language_model.layers.18.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 220 |
+
"model.language_model.layers.18.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 221 |
+
"model.language_model.layers.18.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 222 |
+
"model.language_model.layers.18.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 223 |
+
"model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 224 |
+
"model.language_model.layers.18.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 225 |
+
"model.language_model.layers.18.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 226 |
+
"model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 227 |
+
"model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 228 |
+
"model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 229 |
+
"model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 230 |
+
"model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 231 |
+
"model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 232 |
+
"model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 233 |
+
"model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 234 |
+
"model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 235 |
+
"model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 236 |
+
"model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 237 |
+
"model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 238 |
+
"model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 239 |
+
"model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 240 |
+
"model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 241 |
+
"model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 242 |
+
"model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 243 |
+
"model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 244 |
+
"model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 245 |
+
"model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 246 |
+
"model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 247 |
+
"model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 248 |
+
"model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 249 |
+
"model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 250 |
+
"model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 251 |
+
"model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 252 |
+
"model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 253 |
+
"model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 254 |
+
"model.language_model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 255 |
+
"model.language_model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 256 |
+
"model.language_model.layers.21.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 257 |
+
"model.language_model.layers.21.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 258 |
+
"model.language_model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 259 |
+
"model.language_model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 260 |
+
"model.language_model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 261 |
+
"model.language_model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 262 |
+
"model.language_model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 263 |
+
"model.language_model.layers.22.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 264 |
+
"model.language_model.layers.22.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 265 |
+
"model.language_model.layers.22.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 266 |
+
"model.language_model.layers.22.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 267 |
+
"model.language_model.layers.22.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 268 |
+
"model.language_model.layers.22.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 269 |
+
"model.language_model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 270 |
+
"model.language_model.layers.22.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 271 |
+
"model.language_model.layers.22.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 272 |
+
"model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 273 |
+
"model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 274 |
+
"model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 275 |
+
"model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 276 |
+
"model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 277 |
+
"model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 278 |
+
"model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 279 |
+
"model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 280 |
+
"model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 281 |
+
"model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 282 |
+
"model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 283 |
+
"model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 284 |
+
"model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 285 |
+
"model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 286 |
+
"model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 287 |
+
"model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 288 |
+
"model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 289 |
+
"model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 290 |
+
"model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 291 |
+
"model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 292 |
+
"model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 293 |
+
"model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 294 |
+
"model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 295 |
+
"model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 296 |
+
"model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 297 |
+
"model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 298 |
+
"model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 299 |
+
"model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 300 |
+
"model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 301 |
+
"model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 302 |
+
"model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 303 |
+
"model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 304 |
+
"model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 305 |
+
"model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 306 |
+
"model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 307 |
+
"model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 308 |
+
"model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 309 |
+
"model.language_model.layers.26.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 310 |
+
"model.language_model.layers.26.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 311 |
+
"model.language_model.layers.26.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 312 |
+
"model.language_model.layers.26.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 313 |
+
"model.language_model.layers.26.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 314 |
+
"model.language_model.layers.26.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 315 |
+
"model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 316 |
+
"model.language_model.layers.26.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 317 |
+
"model.language_model.layers.26.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 318 |
+
"model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 319 |
+
"model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 320 |
+
"model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 321 |
+
"model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 322 |
+
"model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 323 |
+
"model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 324 |
+
"model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 325 |
+
"model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 326 |
+
"model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 327 |
+
"model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 328 |
+
"model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 329 |
+
"model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 330 |
+
"model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 331 |
+
"model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 332 |
+
"model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 333 |
+
"model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 334 |
+
"model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 335 |
+
"model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 336 |
+
"model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 337 |
+
"model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 338 |
+
"model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 339 |
+
"model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 340 |
+
"model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 341 |
+
"model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 342 |
+
"model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 343 |
+
"model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 344 |
+
"model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 345 |
+
"model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 346 |
+
"model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 347 |
+
"model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 348 |
+
"model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 349 |
+
"model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 350 |
+
"model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 351 |
+
"model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 352 |
+
"model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 353 |
+
"model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 354 |
+
"model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 355 |
+
"model.language_model.layers.30.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 356 |
+
"model.language_model.layers.30.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 357 |
+
"model.language_model.layers.30.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 358 |
+
"model.language_model.layers.30.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 359 |
+
"model.language_model.layers.30.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 360 |
+
"model.language_model.layers.30.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 361 |
+
"model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 362 |
+
"model.language_model.layers.30.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 363 |
+
"model.language_model.layers.30.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 364 |
+
"model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 365 |
+
"model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 366 |
+
"model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 367 |
+
"model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 368 |
+
"model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 369 |
+
"model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 370 |
+
"model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 371 |
+
"model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 372 |
+
"model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 373 |
+
"model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 374 |
+
"model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 375 |
+
"model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 376 |
+
"model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 377 |
+
"model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 378 |
+
"model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 379 |
+
"model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 380 |
+
"model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 381 |
+
"model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 382 |
+
"model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 383 |
+
"model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 384 |
+
"model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 385 |
+
"model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 386 |
+
"model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 387 |
+
"model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 388 |
+
"model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 389 |
+
"model.language_model.layers.33.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 390 |
+
"model.language_model.layers.33.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 391 |
+
"model.language_model.layers.33.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 392 |
+
"model.language_model.layers.33.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 393 |
+
"model.language_model.layers.33.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 394 |
+
"model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 395 |
+
"model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 396 |
+
"model.language_model.layers.33.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 397 |
+
"model.language_model.layers.33.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 398 |
+
"model.language_model.layers.33.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 399 |
+
"model.language_model.layers.33.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 400 |
+
"model.language_model.layers.34.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 401 |
+
"model.language_model.layers.34.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 402 |
+
"model.language_model.layers.34.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 403 |
+
"model.language_model.layers.34.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 404 |
+
"model.language_model.layers.34.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 405 |
+
"model.language_model.layers.34.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 406 |
+
"model.language_model.layers.34.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 407 |
+
"model.language_model.layers.34.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 408 |
+
"model.language_model.layers.34.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 409 |
+
"model.language_model.layers.34.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 410 |
+
"model.language_model.layers.34.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 411 |
+
"model.language_model.layers.34.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 412 |
+
"model.language_model.layers.34.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 413 |
+
"model.language_model.layers.35.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 414 |
+
"model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 415 |
+
"model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 416 |
+
"model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 417 |
+
"model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 418 |
+
"model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 419 |
+
"model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 420 |
+
"model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 421 |
+
"model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 422 |
+
"model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 423 |
+
"model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 424 |
+
"model.language_model.layers.36.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 425 |
+
"model.language_model.layers.36.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 426 |
+
"model.language_model.layers.36.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 427 |
+
"model.language_model.layers.36.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 428 |
+
"model.language_model.layers.36.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 429 |
+
"model.language_model.layers.36.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 430 |
+
"model.language_model.layers.36.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 431 |
+
"model.language_model.layers.36.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 432 |
+
"model.language_model.layers.36.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 433 |
+
"model.language_model.layers.36.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 434 |
+
"model.language_model.layers.36.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 435 |
+
"model.language_model.layers.37.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 436 |
+
"model.language_model.layers.37.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 437 |
+
"model.language_model.layers.37.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 438 |
+
"model.language_model.layers.37.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 439 |
+
"model.language_model.layers.37.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 440 |
+
"model.language_model.layers.37.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 441 |
+
"model.language_model.layers.37.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 442 |
+
"model.language_model.layers.37.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 443 |
+
"model.language_model.layers.37.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 444 |
+
"model.language_model.layers.37.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 445 |
+
"model.language_model.layers.37.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 446 |
+
"model.language_model.layers.38.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 447 |
+
"model.language_model.layers.38.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 448 |
+
"model.language_model.layers.38.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 449 |
+
"model.language_model.layers.38.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 450 |
+
"model.language_model.layers.38.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 451 |
+
"model.language_model.layers.38.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 452 |
+
"model.language_model.layers.38.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 453 |
+
"model.language_model.layers.38.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 454 |
+
"model.language_model.layers.38.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 455 |
+
"model.language_model.layers.38.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 456 |
+
"model.language_model.layers.38.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 457 |
+
"model.language_model.layers.38.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 458 |
+
"model.language_model.layers.38.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 459 |
+
"model.language_model.layers.39.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 460 |
+
"model.language_model.layers.39.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 461 |
+
"model.language_model.layers.39.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 462 |
+
"model.language_model.layers.39.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 463 |
+
"model.language_model.layers.39.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 464 |
+
"model.language_model.layers.39.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 465 |
+
"model.language_model.layers.39.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 466 |
+
"model.language_model.layers.39.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 467 |
+
"model.language_model.layers.39.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 468 |
+
"model.language_model.layers.39.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 469 |
+
"model.language_model.layers.39.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 470 |
+
"model.language_model.layers.40.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 471 |
+
"model.language_model.layers.40.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 472 |
+
"model.language_model.layers.40.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 473 |
+
"model.language_model.layers.40.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 474 |
+
"model.language_model.layers.40.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 475 |
+
"model.language_model.layers.40.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 476 |
+
"model.language_model.layers.40.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 477 |
+
"model.language_model.layers.40.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 478 |
+
"model.language_model.layers.40.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 479 |
+
"model.language_model.layers.40.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 480 |
+
"model.language_model.layers.40.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 481 |
+
"model.language_model.layers.41.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 482 |
+
"model.language_model.layers.41.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 483 |
+
"model.language_model.layers.41.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 484 |
+
"model.language_model.layers.41.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 485 |
+
"model.language_model.layers.41.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 486 |
+
"model.language_model.layers.41.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 487 |
+
"model.language_model.layers.41.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 488 |
+
"model.language_model.layers.41.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 489 |
+
"model.language_model.layers.41.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 490 |
+
"model.language_model.layers.41.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 491 |
+
"model.language_model.layers.41.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 492 |
+
"model.language_model.layers.42.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 493 |
+
"model.language_model.layers.42.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 494 |
+
"model.language_model.layers.42.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 495 |
+
"model.language_model.layers.42.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 496 |
+
"model.language_model.layers.42.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 497 |
+
"model.language_model.layers.42.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 498 |
+
"model.language_model.layers.42.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 499 |
+
"model.language_model.layers.42.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 500 |
+
"model.language_model.layers.42.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 501 |
+
"model.language_model.layers.42.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 502 |
+
"model.language_model.layers.42.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 503 |
+
"model.language_model.layers.42.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 504 |
+
"model.language_model.layers.42.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 505 |
+
"model.language_model.layers.43.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 506 |
+
"model.language_model.layers.43.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 507 |
+
"model.language_model.layers.43.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 508 |
+
"model.language_model.layers.43.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 509 |
+
"model.language_model.layers.43.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 510 |
+
"model.language_model.layers.43.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 511 |
+
"model.language_model.layers.43.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 512 |
+
"model.language_model.layers.43.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 513 |
+
"model.language_model.layers.43.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 514 |
+
"model.language_model.layers.43.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 515 |
+
"model.language_model.layers.43.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 516 |
+
"model.language_model.layers.44.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 517 |
+
"model.language_model.layers.44.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 518 |
+
"model.language_model.layers.44.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 519 |
+
"model.language_model.layers.44.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 520 |
+
"model.language_model.layers.44.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 521 |
+
"model.language_model.layers.44.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 522 |
+
"model.language_model.layers.44.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 523 |
+
"model.language_model.layers.44.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 524 |
+
"model.language_model.layers.44.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 525 |
+
"model.language_model.layers.44.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 526 |
+
"model.language_model.layers.44.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 527 |
+
"model.language_model.layers.45.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 528 |
+
"model.language_model.layers.45.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 529 |
+
"model.language_model.layers.45.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 530 |
+
"model.language_model.layers.45.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 531 |
+
"model.language_model.layers.45.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 532 |
+
"model.language_model.layers.45.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 533 |
+
"model.language_model.layers.45.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 534 |
+
"model.language_model.layers.45.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 535 |
+
"model.language_model.layers.45.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 536 |
+
"model.language_model.layers.45.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 537 |
+
"model.language_model.layers.45.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 538 |
+
"model.language_model.layers.46.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 539 |
+
"model.language_model.layers.46.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 540 |
+
"model.language_model.layers.46.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 541 |
+
"model.language_model.layers.46.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 542 |
+
"model.language_model.layers.46.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 543 |
+
"model.language_model.layers.46.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 544 |
+
"model.language_model.layers.46.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 545 |
+
"model.language_model.layers.46.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 546 |
+
"model.language_model.layers.46.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 547 |
+
"model.language_model.layers.46.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 548 |
+
"model.language_model.layers.46.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 549 |
+
"model.language_model.layers.46.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 550 |
+
"model.language_model.layers.46.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 551 |
+
"model.language_model.layers.47.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 552 |
+
"model.language_model.layers.47.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 553 |
+
"model.language_model.layers.47.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 554 |
+
"model.language_model.layers.47.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 555 |
+
"model.language_model.layers.47.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 556 |
+
"model.language_model.layers.47.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 557 |
+
"model.language_model.layers.47.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 558 |
+
"model.language_model.layers.47.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 559 |
+
"model.language_model.layers.47.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 560 |
+
"model.language_model.layers.47.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 561 |
+
"model.language_model.layers.47.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 562 |
+
"model.visual.patch_embed.proj.weight": "model-00004-of-00005.safetensors",
|
| 563 |
+
"model.visual.patch_embed.proj.bias": "model-00004-of-00005.safetensors",
|
| 564 |
+
"model.visual.pos_embed.weight": "model-00004-of-00005.safetensors",
|
| 565 |
+
"model.visual.blocks.0.norm1.weight": "model-00004-of-00005.safetensors",
|
| 566 |
+
"model.visual.blocks.0.norm1.bias": "model-00004-of-00005.safetensors",
|
| 567 |
+
"model.visual.blocks.0.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 568 |
+
"model.visual.blocks.0.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 569 |
+
"model.visual.blocks.0.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 570 |
+
"model.visual.blocks.0.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 571 |
+
"model.visual.blocks.0.norm2.weight": "model-00004-of-00005.safetensors",
|
| 572 |
+
"model.visual.blocks.0.norm2.bias": "model-00004-of-00005.safetensors",
|
| 573 |
+
"model.visual.blocks.0.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 574 |
+
"model.visual.blocks.0.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 575 |
+
"model.visual.blocks.0.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 576 |
+
"model.visual.blocks.0.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 577 |
+
"model.visual.blocks.1.norm1.weight": "model-00004-of-00005.safetensors",
|
| 578 |
+
"model.visual.blocks.1.norm1.bias": "model-00004-of-00005.safetensors",
|
| 579 |
+
"model.visual.blocks.1.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 580 |
+
"model.visual.blocks.1.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 581 |
+
"model.visual.blocks.1.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 582 |
+
"model.visual.blocks.1.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 583 |
+
"model.visual.blocks.1.norm2.weight": "model-00004-of-00005.safetensors",
|
| 584 |
+
"model.visual.blocks.1.norm2.bias": "model-00004-of-00005.safetensors",
|
| 585 |
+
"model.visual.blocks.1.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 586 |
+
"model.visual.blocks.1.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 587 |
+
"model.visual.blocks.1.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 588 |
+
"model.visual.blocks.1.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 589 |
+
"model.visual.blocks.2.norm1.weight": "model-00004-of-00005.safetensors",
|
| 590 |
+
"model.visual.blocks.2.norm1.bias": "model-00004-of-00005.safetensors",
|
| 591 |
+
"model.visual.blocks.2.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 592 |
+
"model.visual.blocks.2.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 593 |
+
"model.visual.blocks.2.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 594 |
+
"model.visual.blocks.2.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 595 |
+
"model.visual.blocks.2.norm2.weight": "model-00004-of-00005.safetensors",
|
| 596 |
+
"model.visual.blocks.2.norm2.bias": "model-00004-of-00005.safetensors",
|
| 597 |
+
"model.visual.blocks.2.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 598 |
+
"model.visual.blocks.2.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 599 |
+
"model.visual.blocks.2.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 600 |
+
"model.visual.blocks.2.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 601 |
+
"model.visual.blocks.3.norm1.weight": "model-00004-of-00005.safetensors",
|
| 602 |
+
"model.visual.blocks.3.norm1.bias": "model-00004-of-00005.safetensors",
|
| 603 |
+
"model.visual.blocks.3.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 604 |
+
"model.visual.blocks.3.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 605 |
+
"model.visual.blocks.3.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 606 |
+
"model.visual.blocks.3.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 607 |
+
"model.visual.blocks.3.norm2.weight": "model-00004-of-00005.safetensors",
|
| 608 |
+
"model.visual.blocks.3.norm2.bias": "model-00004-of-00005.safetensors",
|
| 609 |
+
"model.visual.blocks.3.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 610 |
+
"model.visual.blocks.3.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 611 |
+
"model.visual.blocks.3.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 612 |
+
"model.visual.blocks.3.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 613 |
+
"model.visual.blocks.4.norm1.weight": "model-00004-of-00005.safetensors",
|
| 614 |
+
"model.visual.blocks.4.norm1.bias": "model-00004-of-00005.safetensors",
|
| 615 |
+
"model.visual.blocks.4.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 616 |
+
"model.visual.blocks.4.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 617 |
+
"model.visual.blocks.4.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 618 |
+
"model.visual.blocks.4.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 619 |
+
"model.visual.blocks.4.norm2.weight": "model-00004-of-00005.safetensors",
|
| 620 |
+
"model.visual.blocks.4.norm2.bias": "model-00004-of-00005.safetensors",
|
| 621 |
+
"model.visual.blocks.4.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 622 |
+
"model.visual.blocks.4.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 623 |
+
"model.visual.blocks.4.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 624 |
+
"model.visual.blocks.4.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 625 |
+
"model.visual.blocks.5.norm1.weight": "model-00004-of-00005.safetensors",
|
| 626 |
+
"model.visual.blocks.5.norm1.bias": "model-00004-of-00005.safetensors",
|
| 627 |
+
"model.visual.blocks.5.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 628 |
+
"model.visual.blocks.5.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 629 |
+
"model.visual.blocks.5.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 630 |
+
"model.visual.blocks.5.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 631 |
+
"model.visual.blocks.5.norm2.weight": "model-00004-of-00005.safetensors",
|
| 632 |
+
"model.visual.blocks.5.norm2.bias": "model-00004-of-00005.safetensors",
|
| 633 |
+
"model.visual.blocks.5.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 634 |
+
"model.visual.blocks.5.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 635 |
+
"model.visual.blocks.5.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 636 |
+
"model.visual.blocks.5.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 637 |
+
"model.visual.blocks.6.norm1.weight": "model-00004-of-00005.safetensors",
|
| 638 |
+
"model.visual.blocks.6.norm1.bias": "model-00004-of-00005.safetensors",
|
| 639 |
+
"model.visual.blocks.6.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 640 |
+
"model.visual.blocks.6.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 641 |
+
"model.visual.blocks.6.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 642 |
+
"model.visual.blocks.6.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 643 |
+
"model.visual.blocks.6.norm2.weight": "model-00004-of-00005.safetensors",
|
| 644 |
+
"model.visual.blocks.6.norm2.bias": "model-00004-of-00005.safetensors",
|
| 645 |
+
"model.visual.blocks.6.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 646 |
+
"model.visual.blocks.6.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 647 |
+
"model.visual.blocks.6.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 648 |
+
"model.visual.blocks.6.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 649 |
+
"model.visual.blocks.7.norm1.weight": "model-00004-of-00005.safetensors",
|
| 650 |
+
"model.visual.blocks.7.norm1.bias": "model-00004-of-00005.safetensors",
|
| 651 |
+
"model.visual.blocks.7.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 652 |
+
"model.visual.blocks.7.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 653 |
+
"model.visual.blocks.7.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 654 |
+
"model.visual.blocks.7.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 655 |
+
"model.visual.blocks.7.norm2.weight": "model-00004-of-00005.safetensors",
|
| 656 |
+
"model.visual.blocks.7.norm2.bias": "model-00004-of-00005.safetensors",
|
| 657 |
+
"model.visual.blocks.7.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 658 |
+
"model.visual.blocks.7.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 659 |
+
"model.visual.blocks.7.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 660 |
+
"model.visual.blocks.7.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 661 |
+
"model.visual.blocks.8.norm1.weight": "model-00004-of-00005.safetensors",
|
| 662 |
+
"model.visual.blocks.8.norm1.bias": "model-00004-of-00005.safetensors",
|
| 663 |
+
"model.visual.blocks.8.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 664 |
+
"model.visual.blocks.8.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 665 |
+
"model.visual.blocks.8.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 666 |
+
"model.visual.blocks.8.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 667 |
+
"model.visual.blocks.8.norm2.weight": "model-00004-of-00005.safetensors",
|
| 668 |
+
"model.visual.blocks.8.norm2.bias": "model-00004-of-00005.safetensors",
|
| 669 |
+
"model.visual.blocks.8.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 670 |
+
"model.visual.blocks.8.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 671 |
+
"model.visual.blocks.8.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 672 |
+
"model.visual.blocks.8.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 673 |
+
"model.visual.blocks.9.norm1.weight": "model-00004-of-00005.safetensors",
|
| 674 |
+
"model.visual.blocks.9.norm1.bias": "model-00004-of-00005.safetensors",
|
| 675 |
+
"model.visual.blocks.9.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 676 |
+
"model.visual.blocks.9.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 677 |
+
"model.visual.blocks.9.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 678 |
+
"model.visual.blocks.9.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 679 |
+
"model.visual.blocks.9.norm2.weight": "model-00004-of-00005.safetensors",
|
| 680 |
+
"model.visual.blocks.9.norm2.bias": "model-00004-of-00005.safetensors",
|
| 681 |
+
"model.visual.blocks.9.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 682 |
+
"model.visual.blocks.9.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 683 |
+
"model.visual.blocks.9.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 684 |
+
"model.visual.blocks.9.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 685 |
+
"model.visual.blocks.10.norm1.weight": "model-00004-of-00005.safetensors",
|
| 686 |
+
"model.visual.blocks.10.norm1.bias": "model-00004-of-00005.safetensors",
|
| 687 |
+
"model.visual.blocks.10.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 688 |
+
"model.visual.blocks.10.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 689 |
+
"model.visual.blocks.10.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 690 |
+
"model.visual.blocks.10.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 691 |
+
"model.visual.blocks.10.norm2.weight": "model-00004-of-00005.safetensors",
|
| 692 |
+
"model.visual.blocks.10.norm2.bias": "model-00004-of-00005.safetensors",
|
| 693 |
+
"model.visual.blocks.10.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 694 |
+
"model.visual.blocks.10.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 695 |
+
"model.visual.blocks.10.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 696 |
+
"model.visual.blocks.10.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 697 |
+
"model.visual.blocks.11.norm1.weight": "model-00004-of-00005.safetensors",
|
| 698 |
+
"model.visual.blocks.11.norm1.bias": "model-00004-of-00005.safetensors",
|
| 699 |
+
"model.visual.blocks.11.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 700 |
+
"model.visual.blocks.11.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 701 |
+
"model.visual.blocks.11.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 702 |
+
"model.visual.blocks.11.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 703 |
+
"model.visual.blocks.11.norm2.weight": "model-00005-of-00005.safetensors",
|
| 704 |
+
"model.visual.blocks.11.norm2.bias": "model-00005-of-00005.safetensors",
|
| 705 |
+
"model.visual.blocks.11.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 706 |
+
"model.visual.blocks.11.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 707 |
+
"model.visual.blocks.11.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 708 |
+
"model.visual.blocks.11.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 709 |
+
"model.visual.blocks.12.norm1.weight": "model-00005-of-00005.safetensors",
|
| 710 |
+
"model.visual.blocks.12.norm1.bias": "model-00005-of-00005.safetensors",
|
| 711 |
+
"model.visual.blocks.12.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 712 |
+
"model.visual.blocks.12.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 713 |
+
"model.visual.blocks.12.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 714 |
+
"model.visual.blocks.12.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 715 |
+
"model.visual.blocks.12.norm2.weight": "model-00005-of-00005.safetensors",
|
| 716 |
+
"model.visual.blocks.12.norm2.bias": "model-00005-of-00005.safetensors",
|
| 717 |
+
"model.visual.blocks.12.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 718 |
+
"model.visual.blocks.12.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 719 |
+
"model.visual.blocks.12.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 720 |
+
"model.visual.blocks.12.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 721 |
+
"model.visual.blocks.13.norm1.weight": "model-00005-of-00005.safetensors",
|
| 722 |
+
"model.visual.blocks.13.norm1.bias": "model-00005-of-00005.safetensors",
|
| 723 |
+
"model.visual.blocks.13.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 724 |
+
"model.visual.blocks.13.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 725 |
+
"model.visual.blocks.13.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 726 |
+
"model.visual.blocks.13.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 727 |
+
"model.visual.blocks.13.norm2.weight": "model-00005-of-00005.safetensors",
|
| 728 |
+
"model.visual.blocks.13.norm2.bias": "model-00005-of-00005.safetensors",
|
| 729 |
+
"model.visual.blocks.13.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 730 |
+
"model.visual.blocks.13.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 731 |
+
"model.visual.blocks.13.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 732 |
+
"model.visual.blocks.13.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 733 |
+
"model.visual.blocks.14.norm1.weight": "model-00005-of-00005.safetensors",
|
| 734 |
+
"model.visual.blocks.14.norm1.bias": "model-00005-of-00005.safetensors",
|
| 735 |
+
"model.visual.blocks.14.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 736 |
+
"model.visual.blocks.14.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 737 |
+
"model.visual.blocks.14.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 738 |
+
"model.visual.blocks.14.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 739 |
+
"model.visual.blocks.14.norm2.weight": "model-00005-of-00005.safetensors",
|
| 740 |
+
"model.visual.blocks.14.norm2.bias": "model-00005-of-00005.safetensors",
|
| 741 |
+
"model.visual.blocks.14.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 742 |
+
"model.visual.blocks.14.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 743 |
+
"model.visual.blocks.14.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 744 |
+
"model.visual.blocks.14.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 745 |
+
"model.visual.blocks.15.norm1.weight": "model-00005-of-00005.safetensors",
|
| 746 |
+
"model.visual.blocks.15.norm1.bias": "model-00005-of-00005.safetensors",
|
| 747 |
+
"model.visual.blocks.15.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 748 |
+
"model.visual.blocks.15.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 749 |
+
"model.visual.blocks.15.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 750 |
+
"model.visual.blocks.15.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 751 |
+
"model.visual.blocks.15.norm2.weight": "model-00005-of-00005.safetensors",
|
| 752 |
+
"model.visual.blocks.15.norm2.bias": "model-00005-of-00005.safetensors",
|
| 753 |
+
"model.visual.blocks.15.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 754 |
+
"model.visual.blocks.15.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 755 |
+
"model.visual.blocks.15.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 756 |
+
"model.visual.blocks.15.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 757 |
+
"model.visual.blocks.16.norm1.weight": "model-00005-of-00005.safetensors",
|
| 758 |
+
"model.visual.blocks.16.norm1.bias": "model-00005-of-00005.safetensors",
|
| 759 |
+
"model.visual.blocks.16.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 760 |
+
"model.visual.blocks.16.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 761 |
+
"model.visual.blocks.16.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 762 |
+
"model.visual.blocks.16.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 763 |
+
"model.visual.blocks.16.norm2.weight": "model-00005-of-00005.safetensors",
|
| 764 |
+
"model.visual.blocks.16.norm2.bias": "model-00005-of-00005.safetensors",
|
| 765 |
+
"model.visual.blocks.16.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 766 |
+
"model.visual.blocks.16.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 767 |
+
"model.visual.blocks.16.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 768 |
+
"model.visual.blocks.16.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 769 |
+
"model.visual.blocks.17.norm1.weight": "model-00005-of-00005.safetensors",
|
| 770 |
+
"model.visual.blocks.17.norm1.bias": "model-00005-of-00005.safetensors",
|
| 771 |
+
"model.visual.blocks.17.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 772 |
+
"model.visual.blocks.17.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 773 |
+
"model.visual.blocks.17.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 774 |
+
"model.visual.blocks.17.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 775 |
+
"model.visual.blocks.17.norm2.weight": "model-00005-of-00005.safetensors",
|
| 776 |
+
"model.visual.blocks.17.norm2.bias": "model-00005-of-00005.safetensors",
|
| 777 |
+
"model.visual.blocks.17.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 778 |
+
"model.visual.blocks.17.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 779 |
+
"model.visual.blocks.17.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 780 |
+
"model.visual.blocks.17.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 781 |
+
"model.visual.blocks.18.norm1.weight": "model-00005-of-00005.safetensors",
|
| 782 |
+
"model.visual.blocks.18.norm1.bias": "model-00005-of-00005.safetensors",
|
| 783 |
+
"model.visual.blocks.18.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 784 |
+
"model.visual.blocks.18.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 785 |
+
"model.visual.blocks.18.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 786 |
+
"model.visual.blocks.18.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 787 |
+
"model.visual.blocks.18.norm2.weight": "model-00005-of-00005.safetensors",
|
| 788 |
+
"model.visual.blocks.18.norm2.bias": "model-00005-of-00005.safetensors",
|
| 789 |
+
"model.visual.blocks.18.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 790 |
+
"model.visual.blocks.18.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 791 |
+
"model.visual.blocks.18.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 792 |
+
"model.visual.blocks.18.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 793 |
+
"model.visual.blocks.19.norm1.weight": "model-00005-of-00005.safetensors",
|
| 794 |
+
"model.visual.blocks.19.norm1.bias": "model-00005-of-00005.safetensors",
|
| 795 |
+
"model.visual.blocks.19.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 796 |
+
"model.visual.blocks.19.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 797 |
+
"model.visual.blocks.19.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 798 |
+
"model.visual.blocks.19.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 799 |
+
"model.visual.blocks.19.norm2.weight": "model-00005-of-00005.safetensors",
|
| 800 |
+
"model.visual.blocks.19.norm2.bias": "model-00005-of-00005.safetensors",
|
| 801 |
+
"model.visual.blocks.19.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 802 |
+
"model.visual.blocks.19.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 803 |
+
"model.visual.blocks.19.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 804 |
+
"model.visual.blocks.19.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 805 |
+
"model.visual.blocks.20.norm1.weight": "model-00005-of-00005.safetensors",
|
| 806 |
+
"model.visual.blocks.20.norm1.bias": "model-00005-of-00005.safetensors",
|
| 807 |
+
"model.visual.blocks.20.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 808 |
+
"model.visual.blocks.20.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 809 |
+
"model.visual.blocks.20.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 810 |
+
"model.visual.blocks.20.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 811 |
+
"model.visual.blocks.20.norm2.weight": "model-00005-of-00005.safetensors",
|
| 812 |
+
"model.visual.blocks.20.norm2.bias": "model-00005-of-00005.safetensors",
|
| 813 |
+
"model.visual.blocks.20.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 814 |
+
"model.visual.blocks.20.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 815 |
+
"model.visual.blocks.20.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 816 |
+
"model.visual.blocks.20.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 817 |
+
"model.visual.blocks.21.norm1.weight": "model-00005-of-00005.safetensors",
|
| 818 |
+
"model.visual.blocks.21.norm1.bias": "model-00005-of-00005.safetensors",
|
| 819 |
+
"model.visual.blocks.21.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 820 |
+
"model.visual.blocks.21.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 821 |
+
"model.visual.blocks.21.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 822 |
+
"model.visual.blocks.21.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 823 |
+
"model.visual.blocks.21.norm2.weight": "model-00005-of-00005.safetensors",
|
| 824 |
+
"model.visual.blocks.21.norm2.bias": "model-00005-of-00005.safetensors",
|
| 825 |
+
"model.visual.blocks.21.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 826 |
+
"model.visual.blocks.21.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 827 |
+
"model.visual.blocks.21.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 828 |
+
"model.visual.blocks.21.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 829 |
+
"model.visual.blocks.22.norm1.weight": "model-00005-of-00005.safetensors",
|
| 830 |
+
"model.visual.blocks.22.norm1.bias": "model-00005-of-00005.safetensors",
|
| 831 |
+
"model.visual.blocks.22.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 832 |
+
"model.visual.blocks.22.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 833 |
+
"model.visual.blocks.22.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 834 |
+
"model.visual.blocks.22.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 835 |
+
"model.visual.blocks.22.norm2.weight": "model-00005-of-00005.safetensors",
|
| 836 |
+
"model.visual.blocks.22.norm2.bias": "model-00005-of-00005.safetensors",
|
| 837 |
+
"model.visual.blocks.22.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 838 |
+
"model.visual.blocks.22.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 839 |
+
"model.visual.blocks.22.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 840 |
+
"model.visual.blocks.22.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 841 |
+
"model.visual.blocks.23.norm1.weight": "model-00005-of-00005.safetensors",
|
| 842 |
+
"model.visual.blocks.23.norm1.bias": "model-00005-of-00005.safetensors",
|
| 843 |
+
"model.visual.blocks.23.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 844 |
+
"model.visual.blocks.23.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 845 |
+
"model.visual.blocks.23.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 846 |
+
"model.visual.blocks.23.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 847 |
+
"model.visual.blocks.23.norm2.weight": "model-00005-of-00005.safetensors",
|
| 848 |
+
"model.visual.blocks.23.norm2.bias": "model-00005-of-00005.safetensors",
|
| 849 |
+
"model.visual.blocks.23.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 850 |
+
"model.visual.blocks.23.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 851 |
+
"model.visual.blocks.23.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 852 |
+
"model.visual.blocks.23.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 853 |
+
"model.visual.blocks.24.norm1.weight": "model-00005-of-00005.safetensors",
|
| 854 |
+
"model.visual.blocks.24.norm1.bias": "model-00005-of-00005.safetensors",
|
| 855 |
+
"model.visual.blocks.24.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 856 |
+
"model.visual.blocks.24.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 857 |
+
"model.visual.blocks.24.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 858 |
+
"model.visual.blocks.24.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 859 |
+
"model.visual.blocks.24.norm2.weight": "model-00005-of-00005.safetensors",
|
| 860 |
+
"model.visual.blocks.24.norm2.bias": "model-00005-of-00005.safetensors",
|
| 861 |
+
"model.visual.blocks.24.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 862 |
+
"model.visual.blocks.24.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 863 |
+
"model.visual.blocks.24.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 864 |
+
"model.visual.blocks.24.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 865 |
+
"model.visual.blocks.25.norm1.weight": "model-00005-of-00005.safetensors",
|
| 866 |
+
"model.visual.blocks.25.norm1.bias": "model-00005-of-00005.safetensors",
|
| 867 |
+
"model.visual.blocks.25.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 868 |
+
"model.visual.blocks.25.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 869 |
+
"model.visual.blocks.25.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 870 |
+
"model.visual.blocks.25.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 871 |
+
"model.visual.blocks.25.norm2.weight": "model-00005-of-00005.safetensors",
|
| 872 |
+
"model.visual.blocks.25.norm2.bias": "model-00005-of-00005.safetensors",
|
| 873 |
+
"model.visual.blocks.25.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 874 |
+
"model.visual.blocks.25.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 875 |
+
"model.visual.blocks.25.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 876 |
+
"model.visual.blocks.25.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 877 |
+
"model.visual.blocks.26.norm1.weight": "model-00005-of-00005.safetensors",
|
| 878 |
+
"model.visual.blocks.26.norm1.bias": "model-00005-of-00005.safetensors",
|
| 879 |
+
"model.visual.blocks.26.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 880 |
+
"model.visual.blocks.26.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 881 |
+
"model.visual.blocks.26.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 882 |
+
"model.visual.blocks.26.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 883 |
+
"model.visual.blocks.26.norm2.weight": "model-00005-of-00005.safetensors",
|
| 884 |
+
"model.visual.blocks.26.norm2.bias": "model-00005-of-00005.safetensors",
|
| 885 |
+
"model.visual.blocks.26.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 886 |
+
"model.visual.blocks.26.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 887 |
+
"model.visual.blocks.26.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 888 |
+
"model.visual.blocks.26.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 889 |
+
"model.visual.merger.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 890 |
+
"model.visual.merger.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 891 |
+
"model.visual.merger.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 892 |
+
"model.visual.merger.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 893 |
+
"model.visual.merger.norms.0.weight": "model-00005-of-00005.safetensors",
|
| 894 |
+
"model.visual.merger.norms.0.bias": "model-00005-of-00005.safetensors",
|
| 895 |
+
"model.visual.merger.norms.1.weight": "model-00005-of-00005.safetensors",
|
| 896 |
+
"model.visual.merger.norms.1.bias": "model-00005-of-00005.safetensors",
|
| 897 |
+
"model.visual.merger.norms.2.weight": "model-00005-of-00005.safetensors",
|
| 898 |
+
"model.visual.merger.norms.2.bias": "model-00005-of-00005.safetensors",
|
| 899 |
+
"model.visual.merger.norms.3.weight": "model-00005-of-00005.safetensors",
|
| 900 |
+
"model.visual.merger.norms.3.bias": "model-00005-of-00005.safetensors"
|
| 901 |
+
}
|
| 902 |
+
}
|
modeling_moss_vl.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "processing_moss_vl.MossVLProcessor",
|
| 4 |
+
"AutoImageProcessor": "processing_moss_vl.MossVLImageProcessorFast"
|
| 5 |
+
},
|
| 6 |
+
"size": {
|
| 7 |
+
"longest_edge": 16777216,
|
| 8 |
+
"shortest_edge": 4096
|
| 9 |
+
},
|
| 10 |
+
"multi_image_max_pixels": 943718400,
|
| 11 |
+
"patch_size": 16,
|
| 12 |
+
"temporal_patch_size": 1,
|
| 13 |
+
"merge_size": 2,
|
| 14 |
+
"image_mean": [
|
| 15 |
+
0.5,
|
| 16 |
+
0.5,
|
| 17 |
+
0.5
|
| 18 |
+
],
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.5,
|
| 21 |
+
0.5,
|
| 22 |
+
0.5
|
| 23 |
+
],
|
| 24 |
+
"processor_class": "MossVLProcessor",
|
| 25 |
+
"image_processor_type": "MossVLImageProcessorFast"
|
| 26 |
+
}
|
processing_moss_vl.py
ADDED
|
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The FNLP Vision Team and The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
Processor class for Moss-VL.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from typing import Any, Dict, List, Optional, Union
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
from torchvision.transforms.v2 import functional as F
|
| 24 |
+
from PIL import Image
|
| 25 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 26 |
+
from transformers.image_utils import ImageInput, SizeDict
|
| 27 |
+
from transformers.image_processing_utils_fast import group_images_by_shape, reorder_images
|
| 28 |
+
from transformers.utils import TensorType
|
| 29 |
+
from transformers.processing_utils import (
|
| 30 |
+
ImagesKwargs,
|
| 31 |
+
ProcessingKwargs,
|
| 32 |
+
ProcessorMixin,
|
| 33 |
+
Unpack,
|
| 34 |
+
VideosKwargs,
|
| 35 |
+
)
|
| 36 |
+
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 37 |
+
from transformers.utils import logging
|
| 38 |
+
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import Qwen2VLImageProcessorFast
|
| 39 |
+
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
logger = logging.get_logger(__name__)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class MossVLImageProcessorFast(Qwen2VLImageProcessorFast):
    """
    Custom image processor that overrides ``_preprocess`` to support ``multi_image_max_pixels``.

    Inherits from :class:`Qwen2VLImageProcessorFast`. The only behavioral difference is that
    the per-batch pixel budget (``multi_image_max_pixels``) is split proportionally across the
    images of a batch before calling ``smart_resize``.
    """

    # Multi-image batch total pixels limit (read from config; e.g. the
    # "multi_image_max_pixels" key in preprocessor_config.json). ``None`` means
    # "fall back to the per-image longest_edge limit" inside ``_preprocess``.
    multi_image_max_pixels = None

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        patch_size: int,
        temporal_patch_size: int,
        merge_size: int,
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,
    ):
        """Override _preprocess to use custom smart_resize with batch-level max_pixels.

        multi_image_max_pixels is treated as a batch-level total budget, proportionally allocated
        to each image based on its original pixel count. min_pixels remains a per-image
        constraint. multi_image_max_pixels can be configured separately from longest_edge.

        Args:
            images: List of image tensors; height/width are read from the last two dims,
                so images are assumed channels-first (..., C, H, W) — TODO confirm upstream.
            do_resize: Whether to apply the smart-resize step at all.
            size: Size dict; ``shortest_edge`` is the per-image min pixel count and
                ``longest_edge`` the per-image max pixel count.
            interpolation: Interpolation mode forwarded to ``self.resize``.
            do_rescale / rescale_factor / do_normalize / image_mean / image_std:
                Standard rescale + normalize parameters (fused in one pass).
            patch_size: Spatial patch edge length in pixels.
            temporal_patch_size: Number of frames folded into one temporal patch.
            merge_size: Spatial merge factor; resize snaps to ``patch_size * merge_size``.
            disable_grouping: Forwarded to ``group_images_by_shape``.
            return_tensors: Tensor type of the returned ``BatchFeature``.

        Returns:
            BatchFeature with ``pixel_values`` (flattened patches, concatenated over all
            images) and ``image_grid_thw`` (one ``[t, h, w]`` grid per image).
        """
        min_pixels = size["shortest_edge"]
        max_pixels = size["longest_edge"]  # Per-image upper limit
        # Use multi_image_max_pixels if configured, otherwise fall back to longest_edge
        multi_image_max_pixels = getattr(self, "multi_image_max_pixels", None) or max_pixels

        # Calculate total original pixels across all images in the batch
        # This is used to proportionally allocate max_pixels to each image
        total_original_pixels = sum(img.shape[-2] * img.shape[-1] for img in images)

        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            height, width = stacked_images.shape[-2:]
            if do_resize:
                # Calculate proportional max_pixels for images with this shape
                # Each image's max_pixels is allocated based on its proportion of total pixels
                original_pixels = height * width
                if total_original_pixels > 0:
                    proportion = original_pixels / total_original_pixels
                    proportional_max_pixels = int(multi_image_max_pixels * proportion)
                else:
                    proportional_max_pixels = multi_image_max_pixels

                # Ensure proportional max_pixels is within [min_pixels, max_pixels] range
                # min_pixels: per-image lower limit (shortest_edge)
                # max_pixels: per-image upper limit (longest_edge)
                proportional_max_pixels = max(proportional_max_pixels, min_pixels)
                proportional_max_pixels = min(proportional_max_pixels, max_pixels)

                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=patch_size * merge_size,
                    min_pixels=min_pixels,
                    max_pixels=proportional_max_pixels,
                )
                stacked_images = self.resize(
                    image=stacked_images,
                    size=SizeDict(height=resized_height, width=resized_width),
                    interpolation=interpolation,
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Warn if multi-image batch exceeds multi_image_max_pixels due to min_pixels constraint
        if len(images) > 1:
            total_resized_pixels = sum(img.shape[-2] * img.shape[-1] for img in resized_images)
            if total_resized_pixels > multi_image_max_pixels:
                logger.warning_once(
                    f"Multi-image batch total pixels ({total_resized_pixels}) exceeds multi_image_max_pixels ({multi_image_max_pixels}). "
                    f"This may happen when image_count * min_pixels > multi_image_max_pixels."
                )

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
            patches = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            if patches.ndim == 4:
                # add a temporal dimension if we have images
                patches = patches.unsqueeze(1)
            if patches.shape[1] % temporal_patch_size != 0:
                # Pad the temporal axis by repeating the last frame so it divides
                # temporal_patch_size. NOTE(review): this assumes at most one extra
                # repeat is needed (true for still images where shape[1] == 1) —
                # same behavior as the upstream Qwen2-VL processor.
                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
                patches = torch.cat([patches, repeats], dim=1)
            batch_size, grid_t, channel = patches.shape[:3]
            grid_t = grid_t // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

            # Carve the (B, T, C, H, W) block into merge-grouped spatial patches.
            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            # Reorder dimensions to group grid and patch information for subsequent flattening.
            # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

            processed_images_grouped[shape] = flatten_patches
            # Every image of this shape group shares the same (t, h, w) grid.
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_grids = reorder_images(processed_grids, grouped_images_index)
        # Concatenate all per-image patch tensors along the patch axis.
        pixel_values = torch.cat(processed_images, dim=0)
        image_grid_thw = torch.tensor(processed_grids)

        return BatchFeature(
            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
        )
|
| 185 |
+
|
| 186 |
+
def _to_numpy(x):
|
| 187 |
+
"""
|
| 188 |
+
Convert various tensor types to numpy array.
|
| 189 |
+
Supports torch.Tensor, tf.Tensor, jax.Array, np.ndarray, lists, and primitives.
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
x: Input value that can be a tensor from various frameworks or a Python primitive
|
| 193 |
+
|
| 194 |
+
Returns:
|
| 195 |
+
np.ndarray: NumPy array representation of the input
|
| 196 |
+
"""
|
| 197 |
+
# Already numpy
|
| 198 |
+
if isinstance(x, np.ndarray):
|
| 199 |
+
return x
|
| 200 |
+
|
| 201 |
+
# Torch tensor or TensorFlow tensor (both have .numpy() method)
|
| 202 |
+
if hasattr(x, 'numpy'):
|
| 203 |
+
# For torch tensors on CUDA, need to move to CPU first
|
| 204 |
+
if hasattr(x, 'cpu'):
|
| 205 |
+
return x.cpu().numpy()
|
| 206 |
+
# For TensorFlow or already on CPU
|
| 207 |
+
return x.numpy()
|
| 208 |
+
|
| 209 |
+
# JAX arrays and other array-like objects that support __array__ protocol
|
| 210 |
+
if hasattr(x, '__array__'):
|
| 211 |
+
return np.asarray(x)
|
| 212 |
+
|
| 213 |
+
# Python primitives (list, tuple, int, float)
|
| 214 |
+
return np.array(x)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class MossVLImagesKwargs(ImagesKwargs):
    # Extra image-processing kwargs accepted by MossVLProcessor.__call__ and
    # forwarded to the image processor.
    # Per-image pixel-count bounds used by smart_resize — presumably mapped to
    # size["shortest_edge"] / size["longest_edge"]; confirm against the image processor.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    # Spatial patch edge length in pixels.
    patch_size: Optional[int]
    # Number of frames folded into one temporal patch.
    temporal_patch_size: Optional[int]
    # Spatial merge factor applied when collapsing patches into tokens.
    merge_size: Optional[int]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
class MossVLVideosKwargs(VideosKwargs, total=False):
    # Extra video-processing kwargs accepted by MossVLProcessor.__call__ and
    # forwarded to the video processor.
    # Target sampling rate in frames per second — TODO confirm semantics in the
    # video processor.
    video_fps: Optional[Union[int, float]]
    # Lower / upper bounds on the number of sampled frames per video.
    min_frames: Optional[int]
    max_frames: Optional[int]
    # Number of worker threads used for frame extraction — presumably consumed
    # by the video decoder; verify in video_processing_moss_vl.py.
    num_extract_threads: Optional[int]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class MossVLProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for ``MossVLProcessor.__call__``.

    Bundles the per-modality kwargs classes and declares the processor-level
    defaults: no text padding, no (mm) token type ids, and video metadata kept
    in the outputs so frame timestamps can later be injected into the prompt.
    """

    # Per-modality kwargs schemas.
    images_kwargs: MossVLImagesKwargs
    videos_kwargs: MossVLVideosKwargs

    # Removed a stale commented-out alternative `_defaults` block (left-padding
    # to a multiple of 8) that was dead code.
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_token_type_ids": False,
            "return_mm_token_type_ids": False,
        },
        # Metadata (fps, timestamps) is required later to expand video tokens.
        "videos_kwargs": {"return_metadata": True},
    }
|
| 254 |
+
|
| 255 |
+
class MossVLProcessor(ProcessorMixin):
|
| 256 |
+
r"""
|
| 257 |
+
Constructs a Moss-VL processor which wraps a Qwen2VL image processor, Moss-VL video processor and a Qwen2 tokenizer
|
| 258 |
+
into a single processor.
|
| 259 |
+
|
| 260 |
+
[`MossVLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`], [`MossVLVideoProcessor`] and [`Qwen2TokenizerFast`].
|
| 261 |
+
See the [`~MossVLProcessor.__call__`] and [`~MossVLProcessor.decode`] for more information.
|
| 262 |
+
|
| 263 |
+
Args:
|
| 264 |
+
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
| 265 |
+
The image processor is a required input.
|
| 266 |
+
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
| 267 |
+
The tokenizer is a required input.
|
| 268 |
+
video_processor ([`MossVLVideoProcessor`], *optional*):
|
| 269 |
+
The video processor is a required input.
|
| 270 |
+
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
| 271 |
+
in a chat into a tokenizable string.
|
| 272 |
+
"""
|
| 273 |
+
|
| 274 |
+
attributes = ["image_processor", "tokenizer", "video_processor"]
|
| 275 |
+
image_processor_class = "AutoImageProcessor"
|
| 276 |
+
video_processor_class = "AutoVideoProcessor"
|
| 277 |
+
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
| 278 |
+
|
| 279 |
+
def __init__(
|
| 280 |
+
self,
|
| 281 |
+
image_processor=None,
|
| 282 |
+
tokenizer=None,
|
| 283 |
+
video_processor=None,
|
| 284 |
+
chat_template=None,
|
| 285 |
+
**kwargs
|
| 286 |
+
):
|
| 287 |
+
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 291 |
+
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
self.image_token_id = (
|
| 295 |
+
tokenizer.image_token_id
|
| 296 |
+
if getattr(tokenizer, "image_token_id", None)
|
| 297 |
+
else tokenizer.convert_tokens_to_ids(self.image_token)
|
| 298 |
+
)
|
| 299 |
+
self.video_token_id = (
|
| 300 |
+
tokenizer.video_token_id
|
| 301 |
+
if getattr(tokenizer, "video_token_id", None)
|
| 302 |
+
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
self.vision_start_token = (
|
| 306 |
+
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
| 307 |
+
)
|
| 308 |
+
self.vision_end_token = (
|
| 309 |
+
"<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# Placeholders used in input text
|
| 313 |
+
self.image_placeholder = "<|image|>"
|
| 314 |
+
self.video_placeholder = "<|video|>"
|
| 315 |
+
|
| 316 |
+
self.time_start_token = "<|time_start|>"
|
| 317 |
+
self.time_end_token = "<|time_end|>"
|
| 318 |
+
|
| 319 |
+
# EOS token for labels generation (assistant's response should end with this)
|
| 320 |
+
self.im_end_token = "<|im_end|>"
|
| 321 |
+
self.im_end_token_id = tokenizer.convert_tokens_to_ids(self.im_end_token)
|
| 322 |
+
|
| 323 |
+
# Vision-related token ids (all should be masked in labels)
|
| 324 |
+
self.vision_start_token_id = tokenizer.convert_tokens_to_ids(self.vision_start_token)
|
| 325 |
+
self.vision_end_token_id = tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 326 |
+
|
| 327 |
+
# Token ids that should always be masked in labels (e.g. <|image_pad|>)
|
| 328 |
+
self.mask_token_ids = {self.image_token_id}
|
| 329 |
+
|
| 330 |
+
def __call__(
|
| 331 |
+
self,
|
| 332 |
+
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
| 333 |
+
images: ImageInput = None,
|
| 334 |
+
videos: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]] = None,
|
| 335 |
+
labels_spans: Optional[Union[List[tuple], List[List[tuple]]]] = None,
|
| 336 |
+
ignore_index: int = -100,
|
| 337 |
+
**kwargs: Unpack[MossVLProcessorKwargs],
|
| 338 |
+
) -> BatchFeature:
|
| 339 |
+
"""
|
| 340 |
+
Main method to prepare for the model one or several sequences(s) and image(s)/video(s).
|
| 341 |
+
|
| 342 |
+
Args:
|
| 343 |
+
text (`str`, `list[str]`, `list[list[str]]`):
|
| 344 |
+
The sequence or batch of sequences to be encoded.
|
| 345 |
+
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
| 346 |
+
The image or batch of images to be prepared.
|
| 347 |
+
videos (`str`, `Dict`, `list[str]`, `list[Dict]`):
|
| 348 |
+
The video or batch of videos to be prepared. Each video can be:
|
| 349 |
+
- A string path to a video file
|
| 350 |
+
- A dict with keys:
|
| 351 |
+
- "video_path": str, path to the video file
|
| 352 |
+
- "segments": list of segments, where each segment is:
|
| 353 |
+
- [start, end]: a time segment (left-closed, right-open interval in seconds)
|
| 354 |
+
- [time]: a single frame at the specified time (in seconds)
|
| 355 |
+
The number of segments should match the number of video placeholders in the text.
|
| 356 |
+
labels_spans (`list[list[int]]`, `list[list[list[int]]]`, *optional*):
|
| 357 |
+
Character-level spans indicating assistant regions in original text.
|
| 358 |
+
Each span is a [start, end] list with inclusive start and exclusive end.
|
| 359 |
+
Example: [[10, 50], [100, 150]] means characters [10:50) and [100:150) are assistant.
|
| 360 |
+
Note: Use list (not tuple) for spans as they will be modified in place during processing.
|
| 361 |
+
When provided, the processor will generate `labels` in the output, where:
|
| 362 |
+
- Non-assistant tokens have value `ignore_index` (-100 by default)
|
| 363 |
+
- Image tokens always have value `ignore_index` even in assistant part
|
| 364 |
+
- Other assistant tokens have their token id as label
|
| 365 |
+
ignore_index (`int`, *optional*, defaults to -100):
|
| 366 |
+
Value for masked positions in labels.
|
| 367 |
+
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
| 368 |
+
If set, will return tensors of a particular framework. Acceptable values are:
|
| 369 |
+
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
| 370 |
+
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
| 371 |
+
- `'np'`: Return NumPy `np.ndarray` objects.
|
| 372 |
+
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
Returns:
|
| 376 |
+
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
| 377 |
+
- **input_ids** -- List of token ids to be fed to a model.
|
| 378 |
+
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
|
| 379 |
+
- **pixel_values** -- Pixel values to be fed to a model (concatenation of images and videos).
|
| 380 |
+
- **grid_thw** -- List of grid sizes (t, h, w) for each media item.
|
| 381 |
+
- **media_nums_per_sample** -- List of number of media items per sample.
|
| 382 |
+
- **labels** -- (Optional) Labels for training, only present when `labels_spans` is provided.
|
| 383 |
+
"""
|
| 384 |
+
# Merge kwargs with defaults
|
| 385 |
+
output_kwargs = self._merge_kwargs(
|
| 386 |
+
MossVLProcessorKwargs,
|
| 387 |
+
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
| 388 |
+
**kwargs,
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
# Step 1: Process images if provided
|
| 392 |
+
if images is not None:
|
| 393 |
+
images_kwargs = output_kwargs["images_kwargs"].copy()
|
| 394 |
+
images_kwargs["return_tensors"] = None
|
| 395 |
+
image_inputs = self.image_processor(images=images, **images_kwargs)
|
| 396 |
+
image_grid_thw = image_inputs["image_grid_thw"]
|
| 397 |
+
else:
|
| 398 |
+
image_inputs = {}
|
| 399 |
+
image_grid_thw = None
|
| 400 |
+
|
| 401 |
+
# Step 2: Process videos if provided
|
| 402 |
+
if videos is not None:
|
| 403 |
+
videos_kwargs = output_kwargs["videos_kwargs"].copy()
|
| 404 |
+
videos_kwargs["return_tensors"] = None
|
| 405 |
+
videos_inputs = self.video_processor(videos=videos, **videos_kwargs)
|
| 406 |
+
video_grid_thw = videos_inputs["video_grid_thw"]
|
| 407 |
+
# If user has not requested video metadata, pop it
|
| 408 |
+
if "return_metadata" not in kwargs:
|
| 409 |
+
video_metadata = videos_inputs.pop("video_metadata")
|
| 410 |
+
else:
|
| 411 |
+
video_metadata = videos_inputs["video_metadata"]
|
| 412 |
+
else:
|
| 413 |
+
videos_inputs = {}
|
| 414 |
+
video_grid_thw = None
|
| 415 |
+
video_metadata = None
|
| 416 |
+
|
| 417 |
+
# Step 3: Process text with placeholder replacement
|
| 418 |
+
if text is None or (isinstance(text, str) and len(text.strip()) == 0):
|
| 419 |
+
raise ValueError("Text input is required for MossVL processor and cannot be empty.")
|
| 420 |
+
|
| 421 |
+
if not isinstance(text, list):
|
| 422 |
+
text = [text]
|
| 423 |
+
|
| 424 |
+
text = text.copy() # Copy to avoid in-place modifications
|
| 425 |
+
|
| 426 |
+
# Prepare labels_spans if provided
|
| 427 |
+
# labels_spans format: List[List[List[int]]] - batch of samples, each sample has multiple spans
|
| 428 |
+
# Each span is [start, end] (list, not tuple) so it can be modified in place
|
| 429 |
+
should_create_labels = labels_spans is not None
|
| 430 |
+
if should_create_labels:
|
| 431 |
+
# Ensure batch format: convert single sample spans to batch format
|
| 432 |
+
# Single sample: [[start, end], [start, end], ...]
|
| 433 |
+
# Batch: [[[start, end], ...], [[start, end], ...], ...]
|
| 434 |
+
if labels_spans and isinstance(labels_spans[0], list) and len(labels_spans[0]) == 2 and isinstance(labels_spans[0][0], int):
|
| 435 |
+
labels_spans = [labels_spans]
|
| 436 |
+
|
| 437 |
+
# Step 3.0-pre: Check if we need to reorder (when both images and videos exist)
|
| 438 |
+
# If only one media type exists, we can skip the expensive split+reorder+concat
|
| 439 |
+
has_images = images is not None and "pixel_values" in image_inputs
|
| 440 |
+
has_videos = videos is not None and "pixel_values_videos" in videos_inputs
|
| 441 |
+
needs_reorder = has_images and has_videos
|
| 442 |
+
|
| 443 |
+
image_pixel_values_list = []
|
| 444 |
+
video_pixel_values_list = []
|
| 445 |
+
|
| 446 |
+
# Step 3.0: Record the order of media in original text (before replacement)
|
| 447 |
+
# This will be used later to correctly order pixel_values and grid_thw
|
| 448 |
+
media_order_per_sample = []
|
| 449 |
+
for i in range(len(text)):
|
| 450 |
+
media_order = []
|
| 451 |
+
temp_text = text[i]
|
| 452 |
+
pos = 0
|
| 453 |
+
while pos < len(temp_text):
|
| 454 |
+
img_pos = temp_text.find(self.image_placeholder, pos)
|
| 455 |
+
vid_pos = temp_text.find(self.video_placeholder, pos)
|
| 456 |
+
|
| 457 |
+
if img_pos == -1 and vid_pos == -1:
|
| 458 |
+
break
|
| 459 |
+
|
| 460 |
+
if img_pos != -1 and (vid_pos == -1 or img_pos < vid_pos):
|
| 461 |
+
media_order.append(("image", img_pos))
|
| 462 |
+
pos = img_pos + len(self.image_placeholder)
|
| 463 |
+
elif vid_pos != -1:
|
| 464 |
+
media_order.append(("video", vid_pos))
|
| 465 |
+
pos = vid_pos + len(self.video_placeholder)
|
| 466 |
+
|
| 467 |
+
media_order_per_sample.append(media_order)
|
| 468 |
+
|
| 469 |
+
# Step 3.0.1: Check if any sample has no media (empty samples need blank image)
|
| 470 |
+
# If there are empty samples, we need to enter slow path to handle them properly
|
| 471 |
+
has_empty_samples = any(len(order) == 0 for order in media_order_per_sample)
|
| 472 |
+
if has_empty_samples:
|
| 473 |
+
needs_reorder = True
|
| 474 |
+
|
| 475 |
+
# Split pixel values for reordering if needed
|
| 476 |
+
if needs_reorder:
|
| 477 |
+
if has_images:
|
| 478 |
+
flat_pixel_values = image_inputs["pixel_values"]
|
| 479 |
+
flat_grid_thw = image_inputs["image_grid_thw"]
|
| 480 |
+
# grid_thw is (t, h, w), num_patches = t * h * w
|
| 481 |
+
patch_counts = [int(np.prod(_to_numpy(grid))) for grid in flat_grid_thw]
|
| 482 |
+
if len(patch_counts) == 1:
|
| 483 |
+
# Single image case: no need to split
|
| 484 |
+
image_pixel_values_list = [flat_pixel_values]
|
| 485 |
+
elif len(patch_counts) > 1:
|
| 486 |
+
# Multiple images: split by cumulative counts
|
| 487 |
+
split_indices = np.cumsum(patch_counts)[:-1]
|
| 488 |
+
image_pixel_values_list = np.split(flat_pixel_values, split_indices)
|
| 489 |
+
|
| 490 |
+
if has_videos:
|
| 491 |
+
flat_video_values = videos_inputs["pixel_values_videos"]
|
| 492 |
+
flat_video_grid = videos_inputs["video_grid_thw"]
|
| 493 |
+
video_patch_counts = [int(np.prod(_to_numpy(grid))) for grid in flat_video_grid]
|
| 494 |
+
if len(video_patch_counts) == 1:
|
| 495 |
+
# Single video case: no need to split
|
| 496 |
+
video_pixel_values_list = [flat_video_values]
|
| 497 |
+
elif len(video_patch_counts) > 1:
|
| 498 |
+
# Multiple videos: split by cumulative counts
|
| 499 |
+
split_indices = np.cumsum(video_patch_counts)[:-1]
|
| 500 |
+
video_pixel_values_list = np.split(flat_video_values, split_indices)
|
| 501 |
+
|
| 502 |
+
# Step 3.1: Replace placeholders (simple replacement, no expansion yet)
|
| 503 |
+
# In MossVL, one image placeholder = one image token
|
| 504 |
+
# One video placeholder = one video token (will be expanded later)
|
| 505 |
+
for i in range(len(text)):
|
| 506 |
+
if should_create_labels:
|
| 507 |
+
# Replace and update spans for image placeholders
|
| 508 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 509 |
+
text[i], self.image_placeholder, self.image_token, labels_spans[i]
|
| 510 |
+
)
|
| 511 |
+
# Replace and update spans for video placeholders
|
| 512 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 513 |
+
text[i], self.video_placeholder, self.video_token, labels_spans[i]
|
| 514 |
+
)
|
| 515 |
+
else:
|
| 516 |
+
text[i] = text[i].replace(self.image_placeholder, self.image_token)
|
| 517 |
+
text[i] = text[i].replace(self.video_placeholder, self.video_token)
|
| 518 |
+
|
| 519 |
+
# Step 3.2: Validate token counts
|
| 520 |
+
n_images_in_text = [t.count(self.image_token) for t in text]
|
| 521 |
+
n_videos_in_text = [t.count(self.video_token) for t in text]
|
| 522 |
+
|
| 523 |
+
# Count placeholders in text
|
| 524 |
+
total_images_in_text = sum(n_images_in_text)
|
| 525 |
+
total_videos_in_text = sum(n_videos_in_text)
|
| 526 |
+
|
| 527 |
+
# Count actual images and videos provided
|
| 528 |
+
total_images_provided = len(image_grid_thw) if image_grid_thw is not None else 0
|
| 529 |
+
total_videos_provided = len(video_grid_thw) if video_grid_thw is not None else 0
|
| 530 |
+
|
| 531 |
+
# Validate image counts
|
| 532 |
+
if total_images_in_text != total_images_provided:
|
| 533 |
+
raise ValueError(
|
| 534 |
+
"Number of image tokens does not match number of images provided. "
|
| 535 |
+
f"Found {total_images_in_text} image tokens in text and {total_images_provided} images."
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
# Validate video counts
|
| 539 |
+
if total_videos_in_text != total_videos_provided:
|
| 540 |
+
raise ValueError(
|
| 541 |
+
"Number of video tokens does not match number of videos provided. "
|
| 542 |
+
f"Found {total_videos_in_text} video tokens in text and {total_videos_provided} videos."
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
# Step 3.3: Expand video tokens with timestamps
|
| 546 |
+
# Now expand each video token to multiple tokens (one per frame) with timestamps
|
| 547 |
+
if video_grid_thw is not None:
|
| 548 |
+
index = 0
|
| 549 |
+
for i in range(len(text)):
|
| 550 |
+
while self.video_token in text[i]:
|
| 551 |
+
metadata = video_metadata[index]
|
| 552 |
+
if metadata.fps is None:
|
| 553 |
+
logger.warning_once(
|
| 554 |
+
"MossVL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
|
| 555 |
+
"Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
|
| 556 |
+
"Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
|
| 557 |
+
)
|
| 558 |
+
metadata.fps = 24 if metadata.fps is None else metadata.fps
|
| 559 |
+
|
| 560 |
+
# Calculate timestamps
|
| 561 |
+
# Use actual_timestamps if available (for segments), otherwise use frames_indices
|
| 562 |
+
actual_timestamps = getattr(metadata, 'actual_timestamps', None)
|
| 563 |
+
curr_timestamp = self._calculate_timestamps(
|
| 564 |
+
metadata.frames_indices,
|
| 565 |
+
metadata.total_num_frames,
|
| 566 |
+
metadata.fps,
|
| 567 |
+
metadata.duration,
|
| 568 |
+
self.video_processor.temporal_patch_size,
|
| 569 |
+
actual_timestamps=actual_timestamps,
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
# Build video placeholder: one video token per frame with timestamp
|
| 573 |
+
# video_grid_thw[index][0] is the temporal dimension (number of frames after merging)
|
| 574 |
+
|
| 575 |
+
video_tokens = []
|
| 576 |
+
for frame_idx in range(video_grid_thw[index][0]):
|
| 577 |
+
curr_time = curr_timestamp[frame_idx]
|
| 578 |
+
# Format: <|time_start|>X.X seconds<|time_end|><|image_pad|>
|
| 579 |
+
video_tokens.append(
|
| 580 |
+
f"{self.time_start_token}{curr_time:.1f} seconds{self.time_end_token}{self.image_token}"
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# Wrap the entire video sequence with vision_start and vision_end tokens
|
| 584 |
+
video_placeholder = f"{self.vision_start_token}{''.join(video_tokens)}{self.vision_end_token}"
|
| 585 |
+
|
| 586 |
+
# Replace the video token with expanded sequence and update spans if needed
|
| 587 |
+
if should_create_labels:
|
| 588 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 589 |
+
text[i], self.video_token, video_placeholder, labels_spans[i], replace_count=1
|
| 590 |
+
)
|
| 591 |
+
else:
|
| 592 |
+
text[i] = text[i].replace(self.video_token, video_placeholder, 1)
|
| 593 |
+
index += 1
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
# Step 4: Tokenize text
|
| 598 |
+
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
| 599 |
+
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 600 |
+
|
| 601 |
+
# Request offset_mapping if we need to create labels
|
| 602 |
+
if should_create_labels:
|
| 603 |
+
output_kwargs["text_kwargs"]["return_offsets_mapping"] = True
|
| 604 |
+
|
| 605 |
+
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
| 606 |
+
|
| 607 |
+
# ignore check_special_mm_tokens nums in test and input ids.
|
| 608 |
+
# self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
| 609 |
+
|
| 610 |
+
# Create labels if labels_spans was provided
|
| 611 |
+
if should_create_labels:
|
| 612 |
+
offset_mapping = text_inputs.pop("offset_mapping")
|
| 613 |
+
labels = self._create_labels_from_spans(
|
| 614 |
+
text_inputs["input_ids"],
|
| 615 |
+
offset_mapping,
|
| 616 |
+
labels_spans,
|
| 617 |
+
ignore_index
|
| 618 |
+
)
|
| 619 |
+
|
| 620 |
+
if return_mm_token_type_ids:
|
| 621 |
+
array_ids = np.array(text_inputs["input_ids"])
|
| 622 |
+
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
| 623 |
+
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 624 |
+
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 625 |
+
|
| 626 |
+
# Step 5: Concatenate pixel_values and grid_thw in sequence order
|
| 627 |
+
# Prepare output
|
| 628 |
+
output_data = {**text_inputs}
|
| 629 |
+
|
| 630 |
+
if not needs_reorder:
|
| 631 |
+
# Fast path: only one media type, no reordering needed
|
| 632 |
+
final_pixel_values = []
|
| 633 |
+
final_grid_thw = []
|
| 634 |
+
|
| 635 |
+
if has_images:
|
| 636 |
+
final_pixel_values.append(image_inputs["pixel_values"])
|
| 637 |
+
final_grid_thw.extend(image_grid_thw)
|
| 638 |
+
|
| 639 |
+
if has_videos:
|
| 640 |
+
final_pixel_values.append(videos_inputs["pixel_values_videos"])
|
| 641 |
+
final_grid_thw.extend(video_grid_thw)
|
| 642 |
+
|
| 643 |
+
if final_pixel_values:
|
| 644 |
+
output_data["pixel_values"] = np.concatenate(final_pixel_values, axis=0) if len(final_pixel_values) > 1 else final_pixel_values[0]
|
| 645 |
+
|
| 646 |
+
if final_grid_thw:
|
| 647 |
+
output_data["grid_thw"] = np.stack(final_grid_thw, axis=0)
|
| 648 |
+
|
| 649 |
+
# Calculate media_nums_per_sample
|
| 650 |
+
media_nums_per_sample = []
|
| 651 |
+
for batch_idx in range(len(text)):
|
| 652 |
+
media_order = media_order_per_sample[batch_idx]
|
| 653 |
+
media_nums_per_sample.append(len(media_order) if len(media_order) > 0 else 1)
|
| 654 |
+
|
| 655 |
+
# Don't add media_nums_per_sample to output_data yet
|
| 656 |
+
# Will add it after BatchFeature to keep it as list
|
| 657 |
+
|
| 658 |
+
else:
|
| 659 |
+
# Slow path: both images and videos exist, need reordering
|
| 660 |
+
final_pixel_values = []
|
| 661 |
+
final_grid_thw = []
|
| 662 |
+
media_nums_per_sample = []
|
| 663 |
+
|
| 664 |
+
# Global indices to track position in flattened image/video arrays
|
| 665 |
+
global_image_idx = 0
|
| 666 |
+
global_video_idx = 0
|
| 667 |
+
|
| 668 |
+
for batch_idx in range(len(text)):
|
| 669 |
+
# Use the recorded media order from Step 3.0
|
| 670 |
+
media_order = media_order_per_sample[batch_idx]
|
| 671 |
+
|
| 672 |
+
if len(media_order) == 0:
|
| 673 |
+
# If no media provided for this sample, add a blank image
|
| 674 |
+
media_nums_per_sample.append(1)
|
| 675 |
+
min_pixels = 128 * 128
|
| 676 |
+
patch_size = getattr(self.image_processor, "patch_size", None) or 16
|
| 677 |
+
temporal_patch_size = getattr(self.image_processor, "temporal_patch_size", None) or 1
|
| 678 |
+
merge_size = getattr(self.image_processor, "merge_size", None) or 2
|
| 679 |
+
|
| 680 |
+
factor = patch_size * merge_size
|
| 681 |
+
side = int(np.ceil(np.sqrt(min_pixels) / factor) * factor)
|
| 682 |
+
grid_h = side // patch_size
|
| 683 |
+
grid_w = side // patch_size
|
| 684 |
+
grid_t = 1
|
| 685 |
+
|
| 686 |
+
# Channel = 3 (RGB)
|
| 687 |
+
channel = 3
|
| 688 |
+
dim = channel * temporal_patch_size * patch_size * patch_size
|
| 689 |
+
num_patches = grid_t * grid_h * grid_w
|
| 690 |
+
|
| 691 |
+
blank_pixel_values = np.zeros((num_patches, dim), dtype=np.float32)
|
| 692 |
+
blank_grid_thw = np.array([grid_t, grid_h, grid_w], dtype=np.int64)
|
| 693 |
+
|
| 694 |
+
final_pixel_values.append(blank_pixel_values)
|
| 695 |
+
final_grid_thw.append(blank_grid_thw)
|
| 696 |
+
else:
|
| 697 |
+
media_nums_per_sample.append(len(media_order))
|
| 698 |
+
|
| 699 |
+
# Collect media data according to the recorded order
|
| 700 |
+
for media_type, _ in media_order:
|
| 701 |
+
if media_type == "image" and image_grid_thw is not None:
|
| 702 |
+
# Get image data
|
| 703 |
+
if image_pixel_values_list:
|
| 704 |
+
final_pixel_values.append(image_pixel_values_list[global_image_idx])
|
| 705 |
+
final_grid_thw.append(image_grid_thw[global_image_idx])
|
| 706 |
+
global_image_idx += 1
|
| 707 |
+
elif media_type == "video" and video_grid_thw is not None:
|
| 708 |
+
# Get video data
|
| 709 |
+
if video_pixel_values_list:
|
| 710 |
+
final_pixel_values.append(video_pixel_values_list[global_video_idx])
|
| 711 |
+
final_grid_thw.append(video_grid_thw[global_video_idx])
|
| 712 |
+
global_video_idx += 1
|
| 713 |
+
|
| 714 |
+
# Concatenate/stack to unified format
|
| 715 |
+
if final_pixel_values:
|
| 716 |
+
output_data["pixel_values"] = np.concatenate(final_pixel_values, axis=0)
|
| 717 |
+
|
| 718 |
+
if final_grid_thw:
|
| 719 |
+
output_data["grid_thw"] = np.stack(final_grid_thw, axis=0)
|
| 720 |
+
|
| 721 |
+
# Don't add media_nums_per_sample to output_data yet
|
| 722 |
+
# Will add it after BatchFeature to keep it as list
|
| 723 |
+
|
| 724 |
+
# Create cross_attention_mask using media_nums_per_sample
|
| 725 |
+
if "input_ids" in output_data and "grid_thw" in output_data and media_nums_per_sample:
|
| 726 |
+
cross_attention_mask = self._create_cross_attention_mask(
|
| 727 |
+
output_data["input_ids"],
|
| 728 |
+
output_data["grid_thw"],
|
| 729 |
+
media_nums_per_sample,
|
| 730 |
+
output_data.get("attention_mask", None)
|
| 731 |
+
)
|
| 732 |
+
output_data["cross_attention_mask"] = cross_attention_mask
|
| 733 |
+
|
| 734 |
+
# Add labels to output if created
|
| 735 |
+
if should_create_labels:
|
| 736 |
+
output_data["labels"] = labels
|
| 737 |
+
|
| 738 |
+
# BatchFeature will handle conversion to pt/tf/jax/np based on tensor_type
|
| 739 |
+
batch_feature = BatchFeature(data=output_data, tensor_type=return_tensors)
|
| 740 |
+
|
| 741 |
+
# Add media_nums_per_sample after BatchFeature to keep it as list (not tensor)
|
| 742 |
+
if media_nums_per_sample:
|
| 743 |
+
batch_feature["media_nums_per_sample"] = media_nums_per_sample
|
| 744 |
+
|
| 745 |
+
return batch_feature
|
| 746 |
+
|
| 747 |
+
def _create_cross_attention_mask(self, input_ids, grid_thw, media_nums_per_sample, attention_mask=None):
    """
    Create cross_attention_mask of shape (batch_size, 1, text_len, num_images).
    Video frames are treated as individual images.
    Mask values: True for masked, False for visible.
    Causal masking: text can see images that appear at or before the text position.

    Args:
        input_ids: List of token ids (one list per batch sample; rows may be ragged)
        grid_thw: Grid sizes for each media item, indexable as (N, 3) with t (frames) first
        media_nums_per_sample: Number of media items per sample
        attention_mask: Optional attention mask to filter out padding positions

    Returns:
        Boolean mask of shape (batch_size, 1, max_text_len, max_num_frames),
        or None when no sample contains any frames.
    """
    batch_size = len(input_ids)
    max_text_len = max(len(ids) for ids in input_ids)

    # Calculate total frames per sample to find max_num_frames
    total_frames_per_sample = []
    # Running index into the flattened grid_thw list across the whole batch.
    media_idx = 0
    for b in range(batch_size):
        num_media = media_nums_per_sample[b]
        if num_media == 0:
            total_frames_per_sample.append(0)
            continue

        sample_frames = 0
        for _ in range(num_media):
            # grid_thw is (N, 3) where first dim is t (num_frames)
            # NOTE(review): t may be a numpy/torch scalar rather than int — summation works either way.
            t = grid_thw[media_idx][0]
            sample_frames += t
            media_idx += 1
        total_frames_per_sample.append(sample_frames)

    max_num_frames = max(total_frames_per_sample) if total_frames_per_sample else 0

    if max_num_frames == 0:
        # No media at all — caller treats None as "no cross-attention mask needed".
        return None

    # Vectorized implementation for speed

    # 1. Pad input_ids to create a tensor
    # We use -1 as pad value since token ids are positive
    input_ids_tensor = torch.full((batch_size, max_text_len), -1, dtype=torch.long)
    for b, ids in enumerate(input_ids):
        l = len(ids)
        input_ids_tensor[b, :l] = torch.tensor(ids, dtype=torch.long)

    # 2. Identify image tokens
    is_image_token = (input_ids_tensor == self.image_token_id)

    # 3. Compute cumulative image tokens (how many image tokens appeared up to position t)
    # shape: (batch_size, text_len)
    cum_image_tokens = is_image_token.cumsum(dim=1)

    # 4. Create frame indices
    # shape: (1, 1, max_num_frames)
    frame_indices = torch.arange(max_num_frames).reshape(1, 1, -1)

    # 5. Determine visibility based on causal relationship
    # Text at `t` sees frame `i` if `cum_image_tokens[t] > i`
    # Because if frame `i` is the (i+1)-th image token, it becomes visible when count reaches i+1
    # shape: (batch_size, text_len, max_num_frames)
    visible_mask = cum_image_tokens.unsqueeze(-1) > frame_indices

    # 6. Apply attention_mask if provided
    if attention_mask is not None:
        # Convert to tensor if needed
        if isinstance(attention_mask, torch.Tensor):
            attn_mask_tensor = attention_mask
        else:
            # List of lists
            attn_mask_tensor = torch.zeros((batch_size, max_text_len), dtype=torch.long)
            for b, mask_row in enumerate(attention_mask):
                l = len(mask_row)
                attn_mask_tensor[b, :l] = torch.tensor(mask_row, dtype=torch.long)

        # shape: (batch_size, text_len, 1) — padded text positions see nothing.
        valid_text = (attn_mask_tensor.unsqueeze(-1) == 1)
        visible_mask = visible_mask & valid_text

    # 7. Mask out frames that don't exist for a sample
    # shape: (batch_size, 1, 1)
    total_frames_tensor = torch.tensor(total_frames_per_sample).reshape(batch_size, 1, 1)
    # shape: (batch_size, 1, max_num_frames)
    valid_frames = frame_indices < total_frames_tensor

    visible_mask = visible_mask & valid_frames

    # 8. Create final mask (True for masked, False for visible)
    mask = ~visible_mask

    # 9. Add channel dimension: (batch_size, 1, text_len, max_num_frames)
    mask = mask.unsqueeze(1)

    return mask
|
| 842 |
+
|
| 843 |
+
def _replace_and_update_spans(
|
| 844 |
+
self,
|
| 845 |
+
text: str,
|
| 846 |
+
old_str: str,
|
| 847 |
+
new_str: str,
|
| 848 |
+
spans: List[List[int]],
|
| 849 |
+
replace_count: int = -1
|
| 850 |
+
) -> tuple:
|
| 851 |
+
"""
|
| 852 |
+
Replace occurrences of old_str with new_str and update spans accordingly.
|
| 853 |
+
|
| 854 |
+
Args:
|
| 855 |
+
text: The text to perform replacement on
|
| 856 |
+
old_str: String to be replaced
|
| 857 |
+
new_str: String to replace with
|
| 858 |
+
spans: List of [start, end] spans to update (modified in place)
|
| 859 |
+
replace_count: Maximum number of replacements (-1 for all)
|
| 860 |
+
|
| 861 |
+
Returns:
|
| 862 |
+
Tuple of (new_text, updated_spans)
|
| 863 |
+
"""
|
| 864 |
+
delta = len(new_str) - len(old_str)
|
| 865 |
+
result_text = text
|
| 866 |
+
count = 0
|
| 867 |
+
search_start = 0
|
| 868 |
+
|
| 869 |
+
while True:
|
| 870 |
+
pos = result_text.find(old_str, search_start)
|
| 871 |
+
if pos == -1:
|
| 872 |
+
break
|
| 873 |
+
if replace_count != -1 and count >= replace_count:
|
| 874 |
+
break
|
| 875 |
+
|
| 876 |
+
# Update all spans that come after this position
|
| 877 |
+
for span in spans:
|
| 878 |
+
if span[0] > pos:
|
| 879 |
+
# Span starts after replacement point
|
| 880 |
+
span[0] += delta
|
| 881 |
+
span[1] += delta
|
| 882 |
+
elif span[1] > pos:
|
| 883 |
+
# Span ends after replacement point (spans the replacement)
|
| 884 |
+
span[1] += delta
|
| 885 |
+
|
| 886 |
+
# Perform the replacement
|
| 887 |
+
result_text = result_text[:pos] + new_str + result_text[pos + len(old_str):]
|
| 888 |
+
search_start = pos + len(new_str)
|
| 889 |
+
count += 1
|
| 890 |
+
|
| 891 |
+
return result_text, spans
|
| 892 |
+
|
| 893 |
+
def _create_labels_from_spans(
|
| 894 |
+
self,
|
| 895 |
+
input_ids: List[List[int]],
|
| 896 |
+
offset_mapping: List[List[tuple]],
|
| 897 |
+
labels_spans: List[List[List[int]]],
|
| 898 |
+
ignore_index: int = -100,
|
| 899 |
+
mask_token_ids: Optional[set] = None
|
| 900 |
+
) -> List[List[int]]:
|
| 901 |
+
"""
|
| 902 |
+
Create labels from spans and offset_mapping.
|
| 903 |
+
|
| 904 |
+
Args:
|
| 905 |
+
input_ids: Tokenized input ids
|
| 906 |
+
offset_mapping: Character offsets for each token from tokenizer (special tokens included)
|
| 907 |
+
labels_spans: Updated spans indicating assistant regions (after text transformations)
|
| 908 |
+
ignore_index: Value for masked positions
|
| 909 |
+
mask_token_ids: Set of token ids that should always be masked (set to ignore_index)
|
| 910 |
+
in labels, regardless of whether they fall inside a span.
|
| 911 |
+
Defaults to self.mask_token_ids if not provided.
|
| 912 |
+
|
| 913 |
+
Returns:
|
| 914 |
+
labels: List of label ids, same shape as input_ids
|
| 915 |
+
|
| 916 |
+
Note:
|
| 917 |
+
- Tokenizer's offset_mapping already includes correct offsets for special tokens in text
|
| 918 |
+
- Only need to mask tokens inside <|vision_start|>...<|vision_end|>
|
| 919 |
+
- Tokens whose id is in mask_token_ids are always masked
|
| 920 |
+
- All other tokens in spans (including special tokens like <|im_end|>) get labels
|
| 921 |
+
"""
|
| 922 |
+
if mask_token_ids is None:
|
| 923 |
+
mask_token_ids = self.mask_token_ids
|
| 924 |
+
|
| 925 |
+
batch_labels = []
|
| 926 |
+
|
| 927 |
+
for batch_idx in range(len(input_ids)):
|
| 928 |
+
ids = input_ids[batch_idx]
|
| 929 |
+
offsets = offset_mapping[batch_idx]
|
| 930 |
+
spans = labels_spans[batch_idx]
|
| 931 |
+
|
| 932 |
+
labels = [ignore_index] * len(ids)
|
| 933 |
+
|
| 934 |
+
# Process each span: find token range and set labels
|
| 935 |
+
for span_start, span_end in spans:
|
| 936 |
+
in_vision = False
|
| 937 |
+
|
| 938 |
+
# Find tokens that overlap with this span
|
| 939 |
+
for token_idx, (token_id, (char_start, char_end)) in enumerate(zip(ids, offsets)):
|
| 940 |
+
# Skip tokens completely before this span
|
| 941 |
+
if char_end <= span_start:
|
| 942 |
+
continue
|
| 943 |
+
# Stop when tokens are completely after this span
|
| 944 |
+
if char_start >= span_end:
|
| 945 |
+
break
|
| 946 |
+
|
| 947 |
+
# Token overlaps with span, process it
|
| 948 |
+
# Track vision region: <|vision_start|> ... <|vision_end|>
|
| 949 |
+
if token_id == self.vision_start_token_id:
|
| 950 |
+
in_vision = True
|
| 951 |
+
continue
|
| 952 |
+
if token_id == self.vision_end_token_id:
|
| 953 |
+
in_vision = False
|
| 954 |
+
continue
|
| 955 |
+
|
| 956 |
+
# Skip tokens inside vision region
|
| 957 |
+
if in_vision:
|
| 958 |
+
continue
|
| 959 |
+
|
| 960 |
+
# Always mask special tokens that should never have labels
|
| 961 |
+
if token_id in mask_token_ids:
|
| 962 |
+
continue
|
| 963 |
+
|
| 964 |
+
# Set label for this token
|
| 965 |
+
labels[token_idx] = token_id
|
| 966 |
+
|
| 967 |
+
batch_labels.append(labels)
|
| 968 |
+
|
| 969 |
+
return batch_labels
|
| 970 |
+
|
| 971 |
+
def _calculate_timestamps(
|
| 972 |
+
self,
|
| 973 |
+
frames_indices: Optional[Union[List[int], np.ndarray]],
|
| 974 |
+
total_num_frames: int,
|
| 975 |
+
video_fps: float,
|
| 976 |
+
duration: float,
|
| 977 |
+
merge_size: int = 1,
|
| 978 |
+
actual_timestamps: Optional[List[float]] = None
|
| 979 |
+
):
|
| 980 |
+
"""
|
| 981 |
+
Calculate timestamps for video frames.
|
| 982 |
+
|
| 983 |
+
Args:
|
| 984 |
+
frames_indices: Actual frame indices extracted (if available)
|
| 985 |
+
total_num_frames: Total number of sampled frames
|
| 986 |
+
video_fps: Video frames per second
|
| 987 |
+
duration: Video duration in seconds
|
| 988 |
+
merge_size: Temporal merge size
|
| 989 |
+
actual_timestamps: Pre-calculated actual timestamps (for segments)
|
| 990 |
+
|
| 991 |
+
Returns:
|
| 992 |
+
List of timestamps (one per merged temporal patch)
|
| 993 |
+
"""
|
| 994 |
+
# If actual timestamps are provided (from segment), use them directly
|
| 995 |
+
if actual_timestamps is not None:
|
| 996 |
+
timestamps = list(actual_timestamps)
|
| 997 |
+
|
| 998 |
+
# Pad timestamps to be multiple of merge_size
|
| 999 |
+
if len(timestamps) % merge_size != 0:
|
| 1000 |
+
timestamps.extend([timestamps[-1]] * (merge_size - len(timestamps) % merge_size))
|
| 1001 |
+
|
| 1002 |
+
# Frames are merged by merge_size, so we average the timestamps within each temporal patch
|
| 1003 |
+
timestamps = [
|
| 1004 |
+
(timestamps[i] + timestamps[i + merge_size - 1]) / 2
|
| 1005 |
+
for i in range(0, len(timestamps), merge_size)
|
| 1006 |
+
]
|
| 1007 |
+
return timestamps
|
| 1008 |
+
|
| 1009 |
+
# Use frames_indices if available, otherwise generate uniformly sampled indices
|
| 1010 |
+
if frames_indices is not None:
|
| 1011 |
+
if isinstance(frames_indices, np.ndarray):
|
| 1012 |
+
indices = frames_indices.tolist()
|
| 1013 |
+
else:
|
| 1014 |
+
indices = list(frames_indices)
|
| 1015 |
+
else:
|
| 1016 |
+
# Generate uniformly sampled frame indices
|
| 1017 |
+
if total_num_frames <= 1:
|
| 1018 |
+
indices = [0]
|
| 1019 |
+
else:
|
| 1020 |
+
# Uniformly sample frames across the video duration
|
| 1021 |
+
indices = np.linspace(0, duration * video_fps - 1, total_num_frames).astype(np.int32).tolist()
|
| 1022 |
+
|
| 1023 |
+
# Pad indices to be multiple of merge_size
|
| 1024 |
+
if len(indices) % merge_size != 0:
|
| 1025 |
+
indices.extend([indices[-1]] * (merge_size - len(indices) % merge_size))
|
| 1026 |
+
|
| 1027 |
+
# Convert frame indices to timestamps
|
| 1028 |
+
timestamps = [idx / video_fps for idx in indices]
|
| 1029 |
+
|
| 1030 |
+
# Frames are merged by merge_size, so we average the timestamps within each temporal patch
|
| 1031 |
+
timestamps = [
|
| 1032 |
+
(timestamps[i] + timestamps[i + merge_size - 1]) / 2
|
| 1033 |
+
for i in range(0, len(timestamps), merge_size)
|
| 1034 |
+
]
|
| 1035 |
+
return timestamps
|
| 1036 |
+
|
| 1037 |
+
def batch_decode(self, *args, **kwargs):
    """
    Delegate to the underlying tokenizer's `batch_decode`.
    Refer to that method's docstring for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
|
| 1043 |
+
|
| 1044 |
+
def decode(self, *args, **kwargs):
    """
    Delegate to the underlying tokenizer's `decode`.
    Refer to that method's docstring for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
|
| 1050 |
+
|
| 1051 |
+
def post_process_image_text_to_text(
    self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
):
    """
    Decode the model's generated token ids back into text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            Output of the model's `generate` call, shaped
            `(batch_size, sequence_length)` or `(sequence_length,)`.
        skip_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to remove special tokens in the output.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up the tokenization spaces.
        **kwargs:
            Extra arguments forwarded to the tokenizer's `batch_decode` method.

    Returns:
        `list[str]`: The decoded text.
    """
    # Collect all decode options in one mapping before forwarding.
    decode_kwargs = dict(kwargs)
    decode_kwargs["skip_special_tokens"] = skip_special_tokens
    decode_kwargs["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
    return self.tokenizer.batch_decode(generated_outputs, **decode_kwargs)
|
| 1077 |
+
|
| 1078 |
+
|
| 1079 |
+
# Public API surface re-exported by this module.
__all__ = ["MossVLProcessor", "MossVLImageProcessorFast"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cu128
|
| 2 |
+
--extra-index-url https://pypi.nvidia.com
|
| 3 |
+
|
| 4 |
+
torch==2.8.0+cu128
|
| 5 |
+
torchvision==0.23.0+cu128
|
| 6 |
+
transformers==4.57.1
|
| 7 |
+
accelerate==1.12.0
|
| 8 |
+
flash-attn==2.8.1
|
| 9 |
+
torchcodec==0.7.0
|
| 10 |
+
numpy==2.4.3
|
| 11 |
+
pillow==12.1.1
|
| 12 |
+
joblib==1.5.2
|
| 13 |
+
einops==0.8.2
|
| 14 |
+
ninja==1.13.0
|
| 15 |
+
packaging==26.0
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52d44d7e09e05fb10f9ec5dc913bf1d62ff37ac249cb9ec47d891935149f5e3e
|
| 3 |
+
size 11423034
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<|time_start|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<|time_end|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"additional_special_tokens": [
|
| 231 |
+
"<|im_start|>",
|
| 232 |
+
"<|im_end|>",
|
| 233 |
+
"<|object_ref_start|>",
|
| 234 |
+
"<|object_ref_end|>",
|
| 235 |
+
"<|box_start|>",
|
| 236 |
+
"<|box_end|>",
|
| 237 |
+
"<|quad_start|>",
|
| 238 |
+
"<|quad_end|>",
|
| 239 |
+
"<|vision_start|>",
|
| 240 |
+
"<|vision_end|>",
|
| 241 |
+
"<|vision_pad|>",
|
| 242 |
+
"<|image_pad|>",
|
| 243 |
+
"<|video_pad|>",
|
| 244 |
+
"<|time_start|>",
|
| 245 |
+
"<|time_end|>"
|
| 246 |
+
],
|
| 247 |
+
"bos_token": null,
|
| 248 |
+
"clean_up_tokenization_spaces": false,
|
| 249 |
+
"eos_token": "<|im_end|>",
|
| 250 |
+
"errors": "replace",
|
| 251 |
+
"extra_special_tokens": {},
|
| 252 |
+
"model_max_length": 262144,
|
| 253 |
+
"pad_token": "<|endoftext|>",
|
| 254 |
+
"split_special_tokens": false,
|
| 255 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 256 |
+
"unk_token": null,
|
| 257 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n 
<|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n 
{%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
|
| 258 |
+
}
|
video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "processing_moss_vl.MossVLProcessor",
|
| 4 |
+
"AutoVideoProcessor": "video_processing_moss_vl.MossVLVideoProcessor"
|
| 5 |
+
},
|
| 6 |
+
"size": {
|
| 7 |
+
"longest_edge": 16777216,
|
| 8 |
+
"shortest_edge": 4096
|
| 9 |
+
},
|
| 10 |
+
"video_max_pixels": 943718400,
|
| 11 |
+
"patch_size": 16,
|
| 12 |
+
"temporal_patch_size": 1,
|
| 13 |
+
"merge_size": 2,
|
| 14 |
+
"video_fps": 1.0,
|
| 15 |
+
"min_frames": 1,
|
| 16 |
+
"max_frames": 256,
|
| 17 |
+
"num_extract_threads": 4,
|
| 18 |
+
"image_mean": [
|
| 19 |
+
0.5,
|
| 20 |
+
0.5,
|
| 21 |
+
0.5
|
| 22 |
+
],
|
| 23 |
+
"image_std": [
|
| 24 |
+
0.5,
|
| 25 |
+
0.5,
|
| 26 |
+
0.5
|
| 27 |
+
],
|
| 28 |
+
"processor_class": "MossVLProcessor",
|
| 29 |
+
"video_processor_type": "MossVLVideoProcessor"
|
| 30 |
+
}
|
video_processing_moss_vl.py
ADDED
|
@@ -0,0 +1,1132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The FNLP Vision Team and The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""video processor class for Moss-VL."""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import logging as system_logging
|
| 19 |
+
import math
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import subprocess
|
| 23 |
+
import traceback
|
| 24 |
+
from typing import Any, Dict, List, Optional, Union
|
| 25 |
+
|
| 26 |
+
import numpy as np
|
| 27 |
+
import torch
|
| 28 |
+
from joblib import Parallel, delayed
|
| 29 |
+
from torchcodec.decoders import VideoDecoder
|
| 30 |
+
|
| 31 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 32 |
+
from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size, validate_kwargs
|
| 33 |
+
from transformers.processing_utils import Unpack, VideosKwargs
|
| 34 |
+
from transformers.utils import TensorType, add_start_docstrings, logging
|
| 35 |
+
from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
|
| 36 |
+
from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
logger = logging.get_logger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# -----------------------------------------------------------------------------
|
| 43 |
+
# Torchcodec video frame extraction utilities
|
| 44 |
+
# -----------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
def check_video_for_extra_streams_and_errors(video_path: str) -> dict:
    """
    Inspect a video file with ffprobe and report abnormal streams or errors.

    Args:
        video_path: Path to the video file.

    Returns:
        A dictionary containing:
        - 'has_extra_streams': bool, whether there are streams other than video and audio.
        - 'unsupported_codec_errors': list, all "Unsupported codec" error messages.
        - 'ffprobe_output_error': str, other errors/warnings from ffprobe stderr.
        - 'ffprobe_successful': bool, whether ffprobe exited with return code 0.
        - 'stream_details': list, codec_type and index for each stream.
        - 'num_streams': int, total number of streams identified in the video file.
    """
    report = {
        'has_extra_streams': False,
        'unsupported_codec_errors': [],
        'ffprobe_output_error': '',
        'ffprobe_successful': False,
        'stream_details': [],
        'num_streams': 0
    }

    ffprobe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_streams",
        "-show_format",
        "-of", "json",
        video_path
    ]

    try:
        proc = subprocess.run(ffprobe_cmd, capture_output=True, text=True, check=False)
        report['ffprobe_successful'] = proc.returncode == 0

        if proc.stderr:
            report['ffprobe_output_error'] = proc.stderr
            codec_err_re = re.compile(r"Unsupported codec with id \d+ for input stream \d+")
            report['unsupported_codec_errors'] = codec_err_re.findall(proc.stderr)

        if proc.stdout:
            probe_info = json.loads(proc.stdout)
            if 'streams' in probe_info:
                report['num_streams'] = len(probe_info['streams'])
                for entry in probe_info['streams']:
                    entry_type = entry.get('codec_type')
                    report['stream_details'].append({'index': entry.get('index'), 'codec_type': entry_type})
                    if entry_type not in ('video', 'audio'):
                        report['has_extra_streams'] = True

            fmt_section = probe_info.get('format')
            if fmt_section is not None and 'nb_streams' in fmt_section:
                if report['num_streams'] == 0:
                    report['num_streams'] = fmt_section['nb_streams']
                elif report['num_streams'] != fmt_section['nb_streams']:
                    logger.warning(
                        f"Number of streams in 'streams' list ({report['num_streams']}) "
                        f"differs from 'nb_streams' in 'format' ({fmt_section['nb_streams']})."
                    )
    except FileNotFoundError:
        report['ffprobe_output_error'] = "ffprobe command not found. Please ensure FFmpeg is installed and in your PATH."
        report['ffprobe_successful'] = False
    except json.JSONDecodeError:
        report['ffprobe_output_error'] = "Failed to parse ffprobe JSON output. Check ffprobe installation or video file."
        report['ffprobe_successful'] = False
    except Exception as e:
        report['ffprobe_output_error'] = f"An unexpected error occurred: {e}"
        report['ffprobe_successful'] = False

    return report
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def remove_video_extra_stream_ffmpeg(input_video: str, output_video: str) -> bool:
    """
    Strip all non-primary streams from a video using ffmpeg.

    Keeps only the first video stream, dropping audio, subtitle and data
    streams as well as metadata and chapters; the stream itself is copied
    without re-encoding.

    Args:
        input_video: Path to input video.
        output_video: Path to output video.

    Returns:
        bool: True if successful, False otherwise.
    """
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-i", input_video,
        "-map", "0:v:0",        # keep only the first video stream
        "-c", "copy",           # stream copy, no re-encoding
        "-an",                  # drop audio
        "-sn",                  # drop subtitles
        "-dn",                  # drop data streams
        "-map_metadata", "-1",
        "-map_chapters", "-1",
        "-movflags", "faststart",
        output_video,
    ]

    try:
        subprocess.run(ffmpeg_cmd, shell=False, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        system_logging.error(f"Command execution failed with return code: {e.returncode}, video: {input_video}")
        system_logging.error(f"Error output:\n{e.stderr}")
        return False
    except FileNotFoundError:
        system_logging.error("Error: ffmpeg command not found. Please ensure ffmpeg is installed and in PATH.")
        return False
    except Exception as e:
        system_logging.error(f"Unexpected error executing command: {e}, video: {input_video}", exc_info=True)
        return False
    return True
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def clean_video_streams(video_path: str) -> str:
    """
    Clean a video file's streams when extra streams are detected.

    Probes the file with ffprobe; when streams other than video/audio are
    present, writes a cleaned copy next to the original (with a "_fix"
    suffix) and returns its path. If probing finds nothing abnormal, or the
    ffmpeg cleanup fails, the original path is returned unchanged.

    Args:
        video_path: Path to the video file.

    Returns:
        str: Path to cleaned video (or original if no cleaning needed).
    """
    probe_report = check_video_for_extra_streams_and_errors(video_path)
    if not probe_report['has_extra_streams']:
        return video_path

    directory, base_name = os.path.split(video_path)
    stem, ext = os.path.splitext(base_name)
    cleaned_path = os.path.join(directory, f"{stem}_fix{ext}")

    if remove_video_extra_stream_ffmpeg(video_path, cleaned_path):
        return cleaned_path
    logger.warning("Failed to remove extra streams with ffmpeg")
    return video_path
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def split_indices(indices: List[Union[int, float]], num_chunks: int) -> List[List[Union[int, float]]]:
    """
    Split an index list into evenly balanced, order-preserving chunks.

    Chunk sizes differ by at most one (the first ``len(indices) % num_chunks``
    chunks receive the extra element), so parallel decode workers get
    balanced workloads. At most ``len(indices)`` chunks are produced, so no
    chunk is ever empty. (The previous implementation dumped the whole
    remainder on the last chunk and produced empty chunks when
    ``num_chunks > len(indices)``.)

    Args:
        indices: List of indices to split.
        num_chunks: Desired number of chunks (must be >= 1).

    Returns:
        List of non-empty index chunks whose concatenation equals ``indices``
        (an empty list when ``indices`` is empty).

    Raises:
        ValueError: If ``num_chunks`` is less than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")
    if not indices:
        return []

    # Never create more chunks than there are elements.
    effective_chunks = min(num_chunks, len(indices))
    base_size, remainder = divmod(len(indices), effective_chunks)

    chunks = []
    start = 0
    for chunk_idx in range(effective_chunks):
        # The first `remainder` chunks take one extra element each.
        end = start + base_size + (1 if chunk_idx < remainder else 0)
        chunks.append(indices[start:end])
        start = end
    return chunks
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def decode_sequentially(indices: List[int], video_path: str, ffmpeg_threads: int = 0):
    """
    Decode the frames at the given indices from a video in one decoder pass.

    Args:
        indices: List of frame indices to decode.
        video_path: Path to the video file.
        ffmpeg_threads: Number of ffmpeg threads to use (0 lets ffmpeg decide).

    Returns:
        FrameBatch from torchcodec.
    """
    video_decoder = VideoDecoder(video_path, num_ffmpeg_threads=ffmpeg_threads)
    try:
        frame_batch = video_decoder.get_frames_at(indices)
    finally:
        # Drop the decoder reference promptly so decoder resources are freed.
        del video_decoder
    return frame_batch
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def decode_with_multithreading(indices: List[int], num_threads: int, video_path: str) -> dict:
    """
    Decode frames in parallel by splitting the index list across threads.

    Each thread opens its own decoder via ``decode_sequentially``; the
    per-thread frame batches are concatenated back in order.

    Args:
        indices: List of frame indices to decode.
        num_threads: Number of threads to use.
        video_path: Path to the video file.

    Returns:
        dict: Contains 'data', 'duration_seconds', 'pts_seconds' tensors.
    """
    index_chunks = split_indices(indices, num_chunks=num_threads)
    frame_batches = Parallel(n_jobs=num_threads, prefer="threads", verbose=0)(
        delayed(decode_sequentially)(index_chunk, video_path) for index_chunk in index_chunks
    )

    merged = {}
    for field in ("data", "duration_seconds", "pts_seconds"):
        merged[field] = torch.cat([getattr(batch, field) for batch in frame_batches], dim=0)
    return merged
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def decode_sequentially_timestamp(timestamp_list: List[float], video_path: str, ffmpeg_threads: int = 0):
    """
    Decode the frames played at the given timestamps in one decoder pass.

    Requested timestamps are clamped into the stream's decodable PTS range
    (derived from the decoder metadata) before decoding, so out-of-range
    requests do not fail.

    Args:
        timestamp_list: List of timestamps (in seconds) to decode.
        video_path: Path to the video file.
        ffmpeg_threads: Number of ffmpeg threads to use (0 lets ffmpeg decide).

    Returns:
        FrameBatch from torchcodec.
    """
    video_decoder = VideoDecoder(video_path, num_ffmpeg_threads=ffmpeg_threads)
    try:
        meta = video_decoder.metadata

        earliest_pts = meta.begin_stream_seconds_from_content
        if earliest_pts is None:
            earliest_pts = 0.0

        # Prefer deriving the latest decodable PTS from frame count and fps;
        # fall back to the stream end, then the container duration.
        if meta.num_frames_from_content and meta.average_fps:
            latest_pts = (meta.num_frames_from_content - 1) / meta.average_fps + earliest_pts
        elif meta.end_stream_seconds_from_content is not None:
            latest_pts = meta.end_stream_seconds_from_content
        else:
            latest_pts = meta.duration_seconds

        if latest_pts is not None and latest_pts > 0:
            timestamp_list = [max(earliest_pts, min(ts, latest_pts)) for ts in timestamp_list]
        elif earliest_pts > 0:
            timestamp_list = [max(earliest_pts, ts) for ts in timestamp_list]

        return video_decoder.get_frames_played_at(timestamp_list)
    finally:
        del video_decoder
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def timestamp_decode_with_multithreading(timestamp_list: List[float], num_threads: int, video_path: str) -> dict:
    """
    Decode frames in parallel from timestamps split across threads.

    Each thread opens its own decoder via ``decode_sequentially_timestamp``;
    the per-thread frame batches are concatenated back in order.

    Args:
        timestamp_list: List of timestamps (in seconds) to decode.
        num_threads: Number of threads to use.
        video_path: Path to the video file.

    Returns:
        dict: Contains 'data', 'duration_seconds', 'pts_seconds' tensors.
    """
    timestamp_chunks = split_indices(timestamp_list, num_chunks=num_threads)
    frame_batches = Parallel(n_jobs=num_threads, prefer="threads", verbose=0)(
        delayed(decode_sequentially_timestamp)(chunk, video_path) for chunk in timestamp_chunks
    )

    if not frame_batches:
        logger.warning("No frames were successfully decoded.")
        return {"data": torch.empty(0), "duration_seconds": torch.empty(0), "pts_seconds": torch.empty(0)}

    return {
        field: torch.cat([getattr(batch, field) for batch in frame_batches], dim=0)
        for field in ("data", "duration_seconds", "pts_seconds")
    }
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def extract_frames_with_torchcodec(
    video_path: str,
    sample_frames_count: int,
    num_threads: int = 4,
) -> Optional[dict]:
    """
    Extract uniformly sampled frames from a video using torchcodec with multithreading.

    Args:
        video_path: Path to the video file.
        sample_frames_count: Number of frames to sample.
        num_threads: Number of threads to use for extraction.

    Returns:
        dict: Contains 'data' (N, C, H, W), 'duration_seconds' (N,), 'pts_seconds' (N,)
        tensors, plus 'frame_indices' (the sampled frame indices, np.ndarray).
        Returns None if extraction fails.
    """
    try:
        video_path = clean_video_streams(video_path)

        # Only the metadata is needed here; release the decoder right away
        # (each worker thread opens its own decoder).
        decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
        total_frames_in_video = decoder.metadata.num_frames_from_content
        del decoder

        # Guard explicitly: metadata may not expose a frame count, which
        # previously surfaced only as a swallowed TypeError from min().
        if total_frames_in_video is None:
            logger.error("Cannot extract frames: video frame count is unavailable in metadata")
            return None

        effective_sample_count = min(sample_frames_count, total_frames_in_video)
        if effective_sample_count == 0:
            logger.error("Cannot extract frames: video has 0 frames or specified frame count is 0")
            return None

        # Generate uniform frame indices
        frame_indices = np.linspace(0, total_frames_in_video - 1, effective_sample_count).astype(np.int32)
        # Ensure indices are valid and remove duplicates
        frame_indices = np.unique(np.clip(frame_indices, 0, total_frames_in_video - 1))

        result = decode_with_multithreading(frame_indices.tolist(), num_threads=num_threads, video_path=video_path)
        # Add frame_indices to the result for later use (timestamp bookkeeping downstream)
        result["frame_indices"] = frame_indices
        return result

    except Exception:
        traceback.print_exc()
        return None
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def smart_resize(
    num_frames: int,
    height: int,
    width: int,
    temporal_factor: int = 1,
    factor: int = 32,
    min_pixels: int = 128 * 128,
    max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
    per_frame_min_pixels: int = None,
    per_frame_max_pixels: int = None,
):
    """
    Choose an output (height, width), each a multiple of `factor`, for a clip of
    `num_frames` frames.

    Constraints applied in order:
      1. per-frame upper bound (`per_frame_max_pixels`), if given;
      2. total 3D volume bounds (`min_pixels` <= t*h*w <= `max_pixels`);
      3. per-frame lower bound (`per_frame_min_pixels`), if given.

    Returns:
        Tuple[int, int]: the resized (height, width).

    Raises:
        ValueError: if `num_frames < temporal_factor`, a side is smaller than
            `factor`, or the aspect ratio exceeds 200.
    """
    if num_frames < temporal_factor:
        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
    if height < factor or width < factor:
        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )

    def _shrunk(dim: int, scale: float) -> int:
        # Round down to a multiple of `factor`, never below `factor` itself.
        return max(factor, math.floor(dim / scale / factor) * factor)

    def _grown(dim: int, scale: float) -> int:
        # Round up to a multiple of `factor`.
        return math.ceil(dim * scale / factor) * factor

    best_h = round(height / factor) * factor
    best_w = round(width / factor) * factor
    best_t = round(num_frames / temporal_factor) * temporal_factor

    # Step 1: per-frame upper limit.
    if per_frame_max_pixels is not None and best_h * best_w > per_frame_max_pixels:
        scale = math.sqrt((height * width) / per_frame_max_pixels)
        best_h, best_w = _shrunk(height, scale), _shrunk(width, scale)

    # Step 2: 3D volume bounds (frames * height * width).
    volume = best_t * best_h * best_w
    if volume > max_pixels:
        scale = math.sqrt((num_frames * height * width) / max_pixels)
        best_h, best_w = _shrunk(height, scale), _shrunk(width, scale)
    elif volume < min_pixels:
        scale = math.sqrt(min_pixels / (num_frames * height * width))
        best_h, best_w = _grown(height, scale), _grown(width, scale)

    # Step 3: per-frame lower limit, re-checked after the volume adjustment so a
    # single frame stays within [per_frame_min_pixels, per_frame_max_pixels].
    if per_frame_min_pixels is not None and best_h * best_w < per_frame_min_pixels:
        scale = math.sqrt(per_frame_min_pixels / (height * width))
        best_h, best_w = _grown(height, scale), _grown(width, scale)

    return best_h, best_w
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
class MossVLVideoProcessorInitKwargs(VideosKwargs):
    """Keyword arguments accepted when constructing `MossVLVideoProcessor`."""

    # Spatial patch size of the vision encoder.
    patch_size: Optional[int]
    # Temporal patch size of the vision encoder.
    temporal_patch_size: Optional[int]
    # Spatial merge size between the vision encoder and the LLM.
    merge_size: Optional[int]
    # Minimum number of frames to sample from a video.
    min_frames: Optional[int]
    # Maximum number of frames to sample from a video.
    max_frames: Optional[int]
    # Target sampling rate in frames per second.
    video_fps: Optional[Union[int, float]]
    # Number of threads used for frame extraction.
    num_extract_threads: Optional[int]
    # Total 3D volume budget across all videos; distributed proportionally per video by T*H*W
    video_max_pixels: Optional[int]
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
@add_start_docstrings(
    "Constructs a fast Moss-VL video processor that dynamically resizes videos based on the original videos.",
    BASE_VIDEO_PROCESSOR_DOCSTRING,
    """
    patch_size (`int`, *optional*, defaults to 16):
        The spacial patch size of the vision encoder.
    temporal_patch_size (`int`, *optional*, defaults to 1):
        The temporal patch size of the vision encoder.
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    video_fps (`float`, *optional*, defaults to 1.0):
        Target frames per second for video sampling.
    min_frames (`int`, *optional*, defaults to 1):
        Minimum number of frames to sample from a video.
    max_frames (`int`, *optional*, defaults to 256):
        Maximum number of frames to sample from a video.
    num_extract_threads (`int`, *optional*, defaults to 4):
        Number of threads to use for frame extraction.
    """,
)
class MossVLVideoProcessor(BaseVideoProcessor):
    # Class-level defaults; each can be overridden at construction time via
    # `MossVLVideoProcessorInitKwargs`.
    resample = PILImageResampling.BICUBIC
    # Per-frame pixel bounds used by smart_resize: shortest_edge acts as the
    # per-frame minimum pixel count, longest_edge as the per-frame maximum.
    size = {"shortest_edge": 128 * 32 * 32, "longest_edge": 32 * 32 * 768}
    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    # Vision-encoder patching geometry.
    patch_size = 16
    temporal_patch_size = 1
    merge_size = 2
    # Frame-sampling defaults: sample at `video_fps`, clamped to
    # [min_frames, max_frames].
    video_fps = 1.0
    min_frames = 1
    max_frames = 256
    num_extract_threads = 4
    do_sample_frames = True
    # Total 3D volume budget across all videos; distributed proportionally per video by T*H*W
    video_max_pixels = None  # read from config
    valid_kwargs = MossVLVideoProcessorInitKwargs
    model_input_names = ["pixel_values_videos", "video_grid_thw"]
|
| 475 |
+
|
| 476 |
+
def __init__(self, **kwargs: Unpack[MossVLVideoProcessorInitKwargs]):
|
| 477 |
+
super().__init__(**kwargs)
|
| 478 |
+
if self.size is not None and (
|
| 479 |
+
self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
|
| 480 |
+
):
|
| 481 |
+
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
| 482 |
+
|
| 483 |
+
def _further_process_kwargs(
|
| 484 |
+
self,
|
| 485 |
+
size: Optional[SizeDict] = None,
|
| 486 |
+
**kwargs,
|
| 487 |
+
) -> dict:
|
| 488 |
+
"""
|
| 489 |
+
Update kwargs that need further processing before being validated
|
| 490 |
+
Can be overridden by subclasses to customize the processing of kwargs.
|
| 491 |
+
"""
|
| 492 |
+
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
| 493 |
+
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
| 494 |
+
|
| 495 |
+
return super()._further_process_kwargs(size=size, **kwargs)
|
| 496 |
+
|
| 497 |
+
def _get_video_path_from_input(self, video_input: Union[str, Dict[str, Any]]) -> str:
|
| 498 |
+
"""Normalize a video input into a video path."""
|
| 499 |
+
if isinstance(video_input, dict):
|
| 500 |
+
return video_input["video_path"]
|
| 501 |
+
return video_input
|
| 502 |
+
|
| 503 |
+
def _get_video_duration_seconds(self, video_input: Union[str, Dict[str, Any]]) -> float:
|
| 504 |
+
"""Get video duration in seconds for weighted frame-budget allocation."""
|
| 505 |
+
video_path = clean_video_streams(self._get_video_path_from_input(video_input))
|
| 506 |
+
decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
|
| 507 |
+
try:
|
| 508 |
+
metadata = decoder.metadata
|
| 509 |
+
duration = None
|
| 510 |
+
if (
|
| 511 |
+
metadata.end_stream_seconds_from_content is not None
|
| 512 |
+
and metadata.begin_stream_seconds_from_content is not None
|
| 513 |
+
):
|
| 514 |
+
duration = metadata.end_stream_seconds_from_content - metadata.begin_stream_seconds_from_content
|
| 515 |
+
if duration is None or duration <= 0:
|
| 516 |
+
duration = metadata.duration_seconds
|
| 517 |
+
return max(0.0, float(duration or 0.0))
|
| 518 |
+
finally:
|
| 519 |
+
del decoder
|
| 520 |
+
|
| 521 |
+
def _allocate_max_frames_for_multiple_videos(
|
| 522 |
+
self,
|
| 523 |
+
video_inputs: List[Union[str, Dict[str, Any]]],
|
| 524 |
+
total_max_frames: Optional[int],
|
| 525 |
+
) -> List[Optional[int]]:
|
| 526 |
+
"""
|
| 527 |
+
Treat max_frames as a total budget for multi-video input and allocate it by duration.
|
| 528 |
+
|
| 529 |
+
The returned values are per-video max_frames. Segment dict inputs still keep their
|
| 530 |
+
existing per-segment weighting logic after receiving the video-level allocation.
|
| 531 |
+
"""
|
| 532 |
+
if not video_inputs:
|
| 533 |
+
return []
|
| 534 |
+
if total_max_frames is None or len(video_inputs) == 1:
|
| 535 |
+
return [total_max_frames] * len(video_inputs)
|
| 536 |
+
|
| 537 |
+
total_max_frames = int(total_max_frames)
|
| 538 |
+
num_videos = len(video_inputs)
|
| 539 |
+
if total_max_frames < num_videos:
|
| 540 |
+
logger.warning(
|
| 541 |
+
"Received max_frames=%s for %s videos. At least one frame per video is required, "
|
| 542 |
+
"so falling back to 1 frame per video.",
|
| 543 |
+
total_max_frames,
|
| 544 |
+
num_videos,
|
| 545 |
+
)
|
| 546 |
+
return [1] * num_videos
|
| 547 |
+
|
| 548 |
+
video_durations = [self._get_video_duration_seconds(video_input) for video_input in video_inputs]
|
| 549 |
+
total_duration = sum(video_durations)
|
| 550 |
+
|
| 551 |
+
# Reserve one frame per video first, then distribute the remaining budget by duration.
|
| 552 |
+
allocations = [1] * num_videos
|
| 553 |
+
remaining_budget = total_max_frames - num_videos
|
| 554 |
+
if remaining_budget == 0:
|
| 555 |
+
return allocations
|
| 556 |
+
|
| 557 |
+
if total_duration <= 0:
|
| 558 |
+
raw_extra_allocations = [remaining_budget / num_videos] * num_videos
|
| 559 |
+
else:
|
| 560 |
+
raw_extra_allocations = [
|
| 561 |
+
remaining_budget * (duration / total_duration) for duration in video_durations
|
| 562 |
+
]
|
| 563 |
+
|
| 564 |
+
base_extra_allocations = [int(math.floor(value)) for value in raw_extra_allocations]
|
| 565 |
+
allocations = [base + extra for base, extra in zip(allocations, base_extra_allocations)]
|
| 566 |
+
|
| 567 |
+
remainder = remaining_budget - sum(base_extra_allocations)
|
| 568 |
+
if remainder > 0:
|
| 569 |
+
fractional_parts = [
|
| 570 |
+
(raw_value - base_value, index)
|
| 571 |
+
for index, (raw_value, base_value) in enumerate(zip(raw_extra_allocations, base_extra_allocations))
|
| 572 |
+
]
|
| 573 |
+
fractional_parts.sort(key=lambda item: (-item[0], item[1]))
|
| 574 |
+
for _, index in fractional_parts[:remainder]:
|
| 575 |
+
allocations[index] += 1
|
| 576 |
+
|
| 577 |
+
return allocations
|
| 578 |
+
|
| 579 |
+
def calculate_num_frames(
|
| 580 |
+
self,
|
| 581 |
+
metadata: VideoMetadata,
|
| 582 |
+
num_frames: Optional[int] = None,
|
| 583 |
+
fps: Optional[Union[int, float]] = None,
|
| 584 |
+
min_frames: Optional[int] = None,
|
| 585 |
+
max_frames: Optional[int] = None,
|
| 586 |
+
**kwargs,
|
| 587 |
+
) -> int:
|
| 588 |
+
"""
|
| 589 |
+
Calculate the number of frames to sample using fps-based logic with min/max constraints.
|
| 590 |
+
|
| 591 |
+
Logic:
|
| 592 |
+
1. Calculate target_frames based on fps and video duration
|
| 593 |
+
2. Apply min_frames and max_frames constraints
|
| 594 |
+
3. Apply max_allowed_frames protection (rough cap from total video_max_pixels budget)
|
| 595 |
+
4. Return the number of frames to sample
|
| 596 |
+
|
| 597 |
+
Args:
|
| 598 |
+
metadata (`VideoMetadata`):
|
| 599 |
+
Metadata of the video containing information about total duration, fps and total number of frames.
|
| 600 |
+
num_frames (`int`, *optional*):
|
| 601 |
+
Maximum number of frames to sample. If provided, overrides fps-based calculation.
|
| 602 |
+
fps (`int` or `float`, *optional*):
|
| 603 |
+
Target frames to sample per second. Defaults to `self.video_fps`.
|
| 604 |
+
min_frames (`int`, *optional*):
|
| 605 |
+
Minimum number of frames to sample. If None, uses self.min_frames.
|
| 606 |
+
max_frames (`int`, *optional*):
|
| 607 |
+
Maximum number of frames to sample. If None, uses self.max_frames.
|
| 608 |
+
Returns:
|
| 609 |
+
int:
|
| 610 |
+
Number of frames to sample.
|
| 611 |
+
"""
|
| 612 |
+
if fps is not None and num_frames is not None:
|
| 613 |
+
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
|
| 614 |
+
|
| 615 |
+
total_num_frames = metadata.total_num_frames
|
| 616 |
+
|
| 617 |
+
# Use provided min/max or fall back to defaults
|
| 618 |
+
effective_min_frames = min_frames if min_frames is not None else self.min_frames
|
| 619 |
+
effective_max_frames = max_frames if max_frames is not None else self.max_frames
|
| 620 |
+
|
| 621 |
+
# Rough per-video frame cap derived from the multi-video total budget
|
| 622 |
+
# (exact allocation happens later in _preprocess via weighted distribution)
|
| 623 |
+
per_frame_min_pixels = self.size.get("shortest_edge", None) if self.size else None
|
| 624 |
+
video_max_pixels = getattr(self, "video_max_pixels", None)
|
| 625 |
+
if per_frame_min_pixels is not None and video_max_pixels is not None and per_frame_min_pixels > 0:
|
| 626 |
+
max_allowed_frames = video_max_pixels // per_frame_min_pixels
|
| 627 |
+
effective_max_frames = min(effective_max_frames, max_allowed_frames)
|
| 628 |
+
|
| 629 |
+
# Get video duration
|
| 630 |
+
if hasattr(metadata, 'duration') and metadata.duration is not None:
|
| 631 |
+
duration = metadata.duration
|
| 632 |
+
else:
|
| 633 |
+
video_fps = metadata.fps
|
| 634 |
+
if video_fps is not None and video_fps > 0:
|
| 635 |
+
duration = total_num_frames / video_fps
|
| 636 |
+
else:
|
| 637 |
+
# Fallback: assume 24 fps
|
| 638 |
+
video_fps = 24.0
|
| 639 |
+
duration = total_num_frames / video_fps
|
| 640 |
+
logger.warning_once(
|
| 641 |
+
"Could not determine video fps from metadata, defaulting to 24 fps for duration calculation."
|
| 642 |
+
)
|
| 643 |
+
|
| 644 |
+
# Use provided fps or default
|
| 645 |
+
target_fps = fps if fps is not None else self.video_fps
|
| 646 |
+
|
| 647 |
+
# Calculate target frames based on fps and duration
|
| 648 |
+
if num_frames is None:
|
| 649 |
+
# Calculate how many frames we should sample based on target fps
|
| 650 |
+
target_total_frames = int(math.ceil(duration * target_fps - 1e-6))
|
| 651 |
+
|
| 652 |
+
# Apply min/max constraints
|
| 653 |
+
sample_frames = max(target_total_frames, effective_min_frames)
|
| 654 |
+
sample_frames = min(sample_frames, effective_max_frames, total_num_frames)
|
| 655 |
+
else:
|
| 656 |
+
# If num_frames is explicitly provided, use it directly with constraints
|
| 657 |
+
sample_frames = min(max(num_frames, effective_min_frames), effective_max_frames, total_num_frames)
|
| 658 |
+
|
| 659 |
+
return sample_frames
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
    def _fetch_video_segment(
        self,
        video_path: str,
        segment: List[float],
        min_frames: Optional[int] = None,
        max_frames: Optional[int] = None,
        video_fps: Optional[float] = None,
    ):
        """
        Fetch video frames for a specific segment.

        Args:
            video_path: Path to the video file
            segment: [start, end] for a segment (left-closed, right-open) or [time] for a single frame
            min_frames: Minimum frames for this segment (weighted). Defaults to self.min_frames. Must be >= 1.
            max_frames: Maximum frames for this segment (weighted). Defaults to self.max_frames. Must be >= 1.
            video_fps: Target frames per second for video sampling. If None, uses self.video_fps.

        Returns:
            Tuple of (video_tensor, video_metadata)
        """
        # Use provided min/max or fall back to defaults, ensure >= 1
        min_frames = max(1, min_frames if min_frames is not None else self.min_frames)
        max_frames = max(1, max_frames if max_frames is not None else self.max_frames)
        # Use provided video_fps or fall back to self.video_fps
        target_video_fps = video_fps if video_fps is not None else self.video_fps

        video_path = clean_video_streams(video_path)
        decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
        try:
            torchcodec_metadata = decoder.metadata

            # NOTE: `video_fps` is rebound here to the source stream's average
            # fps (used only for the metadata below); the sampling rate used
            # for timestamp generation is `target_video_fps`.
            video_fps = torchcodec_metadata.average_fps

            # Calculate duration: prefer content-derived stream bounds, fall
            # back to the container-level duration.
            duration = None
            if torchcodec_metadata.end_stream_seconds_from_content is not None and torchcodec_metadata.begin_stream_seconds_from_content is not None:
                duration = torchcodec_metadata.end_stream_seconds_from_content - torchcodec_metadata.begin_stream_seconds_from_content
            if duration is None or duration <= 0:
                duration = torchcodec_metadata.duration_seconds

            if len(segment) == 1:
                # Single frame at specified time
                timestamp = segment[0]
                frame_batch = decoder.get_frames_played_at([timestamp])
                video_tensor = frame_batch.data
                actual_timestamps = [timestamp]
                sample_count = 1
            else:
                # Segment [start, end) - left-closed, right-open interval
                start_time, end_time = segment
                segment_duration = end_time - start_time

                # Calculate number of frames to sample for this segment,
                # clamped to the (already weighted) [min_frames, max_frames].
                target_frames = int(math.ceil(segment_duration * target_video_fps))
                target_frames = max(target_frames, min_frames)
                target_frames = min(target_frames, max_frames)

                # Generate timestamps for uniform sampling within segment
                if target_frames == 1:
                    actual_timestamps = [start_time]  # Use start_time for single frame
                else:
                    # Sample uniformly within [start, end), endpoint=False for left-closed right-open
                    actual_timestamps = np.linspace(start_time, end_time, target_frames, endpoint=False).tolist()

                # Use multithreading for extraction
                result = timestamp_decode_with_multithreading(actual_timestamps, self.num_extract_threads, video_path)
                video_tensor = result["data"]
                sample_count = len(actual_timestamps)

            # Create VideoMetadata describing the sampled frames (not the
            # whole source video): total_num_frames is the sampled count.
            video_metadata = VideoMetadata(
                total_num_frames=sample_count,
                fps=video_fps,
                duration=duration,
                video_backend="torchcodec",
                height=torchcodec_metadata.height,
                width=torchcodec_metadata.width,
                frames_indices=None
            )

            # Store actual timestamps as a custom attribute for _calculate_timestamps to use
            video_metadata.actual_timestamps = actual_timestamps

            return video_tensor, video_metadata
        finally:
            # Release decoder resources promptly, even on failure.
            del decoder
|
| 749 |
+
|
| 750 |
+
def fetch_videos(
|
| 751 |
+
self,
|
| 752 |
+
video_url_or_urls: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]],
|
| 753 |
+
sample_indices_fn=None,
|
| 754 |
+
video_fps: Optional[float] = None,
|
| 755 |
+
min_frames: Optional[int] = None,
|
| 756 |
+
max_frames: Optional[int] = None,
|
| 757 |
+
):
|
| 758 |
+
"""
|
| 759 |
+
Override fetch_videos to use torchcodec for frame extraction.
|
| 760 |
+
|
| 761 |
+
This method uses torchcodec with multithreading for efficient frame extraction.
|
| 762 |
+
Frame count is calculated by the calculate_num_frames method
|
| 763 |
+
(fps-based with min/max constraints).
|
| 764 |
+
|
| 765 |
+
Args:
|
| 766 |
+
video_url_or_urls: Can be one of:
|
| 767 |
+
- str: Single video path
|
| 768 |
+
- Dict: Video with segments {"video_path": str, "segments": List[List[float]]}
|
| 769 |
+
- List[Union[str, Dict]]: List of video paths or segment dicts
|
| 770 |
+
sample_indices_fn: (Not used) Kept for compatibility with base class signature.
|
| 771 |
+
video_fps: Target frames per second for video sampling. If None, uses self.video_fps.
|
| 772 |
+
min_frames: Minimum number of frames to sample. If None, uses self.min_frames.
|
| 773 |
+
max_frames: Maximum number of frames to sample. If None, uses self.max_frames.
|
| 774 |
+
|
| 775 |
+
Returns:
|
| 776 |
+
Tuple of (videos, metadata) where videos are torch.Tensors and metadata are VideoMetadata objects.
|
| 777 |
+
"""
|
| 778 |
+
# Use provided values or fall back to self defaults
|
| 779 |
+
effective_video_fps = video_fps if video_fps is not None else self.video_fps
|
| 780 |
+
effective_min_frames = min_frames if min_frames is not None else self.min_frames
|
| 781 |
+
effective_max_frames = max_frames if max_frames is not None else self.max_frames
|
| 782 |
+
# Handle recursive calls for lists
|
| 783 |
+
if isinstance(video_url_or_urls, list):
|
| 784 |
+
all_videos = []
|
| 785 |
+
all_metadata = []
|
| 786 |
+
if len(video_url_or_urls) == 1:
|
| 787 |
+
per_video_max_frames = [effective_max_frames]
|
| 788 |
+
else:
|
| 789 |
+
per_video_max_frames = self._allocate_max_frames_for_multiple_videos(
|
| 790 |
+
video_url_or_urls,
|
| 791 |
+
effective_max_frames,
|
| 792 |
+
)
|
| 793 |
+
for x, allocated_max_frames in zip(video_url_or_urls, per_video_max_frames):
|
| 794 |
+
result = self.fetch_videos(
|
| 795 |
+
x,
|
| 796 |
+
video_fps=effective_video_fps,
|
| 797 |
+
min_frames=effective_min_frames,
|
| 798 |
+
max_frames=allocated_max_frames,
|
| 799 |
+
)
|
| 800 |
+
# Check if result is from segment expansion (returns lists) or single item
|
| 801 |
+
if isinstance(result[0], list):
|
| 802 |
+
all_videos.extend(result[0])
|
| 803 |
+
all_metadata.extend(result[1])
|
| 804 |
+
else:
|
| 805 |
+
all_videos.append(result[0])
|
| 806 |
+
all_metadata.append(result[1])
|
| 807 |
+
return all_videos, all_metadata
|
| 808 |
+
|
| 809 |
+
# Handle dict with segments - returns lists (one per segment)
|
| 810 |
+
if isinstance(video_url_or_urls, dict):
|
| 811 |
+
video_path = video_url_or_urls["video_path"]
|
| 812 |
+
segments = video_url_or_urls["segments"]
|
| 813 |
+
|
| 814 |
+
# Calculate total duration of all time-range segments (len == 2) for weighted min/max frames
|
| 815 |
+
# Single-frame segments (len == 1) are excluded from weighting
|
| 816 |
+
segment_durations = []
|
| 817 |
+
for seg in segments:
|
| 818 |
+
if len(seg) == 2:
|
| 819 |
+
segment_durations.append(seg[1] - seg[0])
|
| 820 |
+
else:
|
| 821 |
+
segment_durations.append(None) # Single frame, no weighting
|
| 822 |
+
|
| 823 |
+
total_segment_duration = sum(d for d in segment_durations if d is not None)
|
| 824 |
+
|
| 825 |
+
videos = []
|
| 826 |
+
metadata = []
|
| 827 |
+
for i, segment in enumerate(segments):
|
| 828 |
+
if len(segment) == 1:
|
| 829 |
+
# Single frame - no weighted min/max, just extract directly
|
| 830 |
+
video, meta = self._fetch_video_segment(video_path, segment, video_fps=effective_video_fps)
|
| 831 |
+
else:
|
| 832 |
+
# Time-range segment - apply weighted min/max frames
|
| 833 |
+
if total_segment_duration > 0:
|
| 834 |
+
weight = segment_durations[i] / total_segment_duration
|
| 835 |
+
else:
|
| 836 |
+
# Fallback: equal weight among time-range segments
|
| 837 |
+
num_range_segments = sum(1 for d in segment_durations if d is not None)
|
| 838 |
+
weight = 1.0 / num_range_segments if num_range_segments > 0 else 1.0
|
| 839 |
+
|
| 840 |
+
# Calculate weighted min/max frames (ensure >= 1)
|
| 841 |
+
weighted_min_frames = max(1, int(round(effective_min_frames * weight)))
|
| 842 |
+
weighted_max_frames = max(1, int(round(effective_max_frames * weight)))
|
| 843 |
+
|
| 844 |
+
video, meta = self._fetch_video_segment(
|
| 845 |
+
video_path, segment,
|
| 846 |
+
min_frames=weighted_min_frames,
|
| 847 |
+
max_frames=weighted_max_frames,
|
| 848 |
+
video_fps=effective_video_fps,
|
| 849 |
+
)
|
| 850 |
+
videos.append(video)
|
| 851 |
+
metadata.append(meta)
|
| 852 |
+
return videos, metadata
|
| 853 |
+
|
| 854 |
+
# Single video path
|
| 855 |
+
video_path = video_url_or_urls
|
| 856 |
+
|
| 857 |
+
# Clean video streams first (remove extra streams if needed)
|
| 858 |
+
video_path = clean_video_streams(video_path)
|
| 859 |
+
|
| 860 |
+
decoder = None
|
| 861 |
+
try:
|
| 862 |
+
# Create VideoDecoder only once for both metadata and frame extraction
|
| 863 |
+
decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
|
| 864 |
+
torchcodec_metadata = decoder.metadata
|
| 865 |
+
|
| 866 |
+
duration = None
|
| 867 |
+
if torchcodec_metadata.end_stream_seconds_from_content is not None and torchcodec_metadata.begin_stream_seconds_from_content is not None:
|
| 868 |
+
duration = torchcodec_metadata.end_stream_seconds_from_content - torchcodec_metadata.begin_stream_seconds_from_content
|
| 869 |
+
|
| 870 |
+
if duration is None or duration <= 0:
|
| 871 |
+
duration = torchcodec_metadata.duration_seconds
|
| 872 |
+
|
| 873 |
+
# Use num_frames_from_content for accurate frame count (consistent with extraction)
|
| 874 |
+
total_frames_in_video = torchcodec_metadata.num_frames_from_content
|
| 875 |
+
|
| 876 |
+
# Create VideoMetadata object for sample_frames method
|
| 877 |
+
temp_metadata = VideoMetadata(
|
| 878 |
+
total_num_frames=total_frames_in_video,
|
| 879 |
+
fps=torchcodec_metadata.average_fps,
|
| 880 |
+
duration=duration,
|
| 881 |
+
video_backend="torchcodec",
|
| 882 |
+
height=torchcodec_metadata.height,
|
| 883 |
+
width=torchcodec_metadata.width,
|
| 884 |
+
frames_indices=None
|
| 885 |
+
)
|
| 886 |
+
|
| 887 |
+
# Use calculate_num_frames method to get the number of frames to sample
|
| 888 |
+
sample_frames_count = self.calculate_num_frames(
|
| 889 |
+
temp_metadata,
|
| 890 |
+
fps=effective_video_fps,
|
| 891 |
+
min_frames=effective_min_frames,
|
| 892 |
+
max_frames=effective_max_frames,
|
| 893 |
+
)
|
| 894 |
+
|
| 895 |
+
# Ensure sample count is valid
|
| 896 |
+
effective_sample_count = min(sample_frames_count, total_frames_in_video)
|
| 897 |
+
if effective_sample_count == 0:
|
| 898 |
+
raise ValueError(f"Cannot extract frames: video has 0 frames or specified frame count is 0")
|
| 899 |
+
|
| 900 |
+
# Generate uniform frame indices
|
| 901 |
+
frame_indices = np.linspace(0, total_frames_in_video - 1, effective_sample_count).astype(np.int32)
|
| 902 |
+
# Ensure indices are valid and remove duplicates
|
| 903 |
+
frame_indices = np.unique(np.clip(frame_indices, 0, total_frames_in_video - 1))
|
| 904 |
+
|
| 905 |
+
# Extract frames using multithreading (decoder is created inside each thread for thread safety)
|
| 906 |
+
result = decode_with_multithreading(frame_indices.tolist(), num_threads=self.num_extract_threads, video_path=video_path)
|
| 907 |
+
|
| 908 |
+
# Extract frame tensor (N, C, H, W)
|
| 909 |
+
frames_tensor = result["data"]
|
| 910 |
+
|
| 911 |
+
# Create final VideoMetadata object
|
| 912 |
+
video_metadata = VideoMetadata(
|
| 913 |
+
total_num_frames=len(frame_indices),
|
| 914 |
+
fps=torchcodec_metadata.average_fps,
|
| 915 |
+
duration=duration,
|
| 916 |
+
video_backend="torchcodec",
|
| 917 |
+
height=torchcodec_metadata.height,
|
| 918 |
+
width=torchcodec_metadata.width,
|
| 919 |
+
frames_indices=frame_indices
|
| 920 |
+
)
|
| 921 |
+
|
| 922 |
+
# Ensure frames are in (T, C, H, W) format
|
| 923 |
+
if frames_tensor.dim() == 4: # (N, C, H, W)
|
| 924 |
+
video_tensor = frames_tensor
|
| 925 |
+
else:
|
| 926 |
+
raise ValueError(f"Unexpected frame tensor shape: {frames_tensor.shape}")
|
| 927 |
+
|
| 928 |
+
return video_tensor, video_metadata
|
| 929 |
+
|
| 930 |
+
except Exception as e:
|
| 931 |
+
logger.error(f"Error loading video {video_path}: {e}")
|
| 932 |
+
traceback.print_exc()
|
| 933 |
+
raise ValueError(f"Failed to load video {video_path}: {e}")
|
| 934 |
+
finally:
|
| 935 |
+
if decoder is not None:
|
| 936 |
+
del decoder
|
| 937 |
+
|
| 938 |
+
def _preprocess(
|
| 939 |
+
self,
|
| 940 |
+
videos: list[torch.Tensor],
|
| 941 |
+
do_convert_rgb: bool = True,
|
| 942 |
+
do_resize: bool = True,
|
| 943 |
+
size: Optional[SizeDict] = None,
|
| 944 |
+
interpolation: PILImageResampling = PILImageResampling.BICUBIC,
|
| 945 |
+
do_rescale: bool = True,
|
| 946 |
+
rescale_factor: float = 1 / 255.0,
|
| 947 |
+
do_normalize: bool = True,
|
| 948 |
+
image_mean: Optional[Union[float, list[float]]] = None,
|
| 949 |
+
image_std: Optional[Union[float, list[float]]] = None,
|
| 950 |
+
patch_size: Optional[int] = None,
|
| 951 |
+
temporal_patch_size: Optional[int] = None,
|
| 952 |
+
merge_size: Optional[int] = None,
|
| 953 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
| 954 |
+
**kwargs,
|
| 955 |
+
):
|
| 956 |
+
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
|
| 957 |
+
resized_videos_grouped = {}
|
| 958 |
+
|
| 959 |
+
video_max_pixels = getattr(self, "video_max_pixels", None)
|
| 960 |
+
if video_max_pixels is not None:
|
| 961 |
+
total_volume = sum(
|
| 962 |
+
sv.shape[0] * sv.shape[1] * sv.shape[3] * sv.shape[4]
|
| 963 |
+
for sv in grouped_videos.values()
|
| 964 |
+
)
|
| 965 |
+
else:
|
| 966 |
+
total_volume = 0
|
| 967 |
+
|
| 968 |
+
for shape, stacked_videos in grouped_videos.items():
|
| 969 |
+
B, T, C, H, W = stacked_videos.shape
|
| 970 |
+
num_frames, height, width = T, H, W
|
| 971 |
+
# Convert to RGB if needed (reuse from base class)
|
| 972 |
+
if do_convert_rgb:
|
| 973 |
+
stacked_videos = self.convert_to_rgb(stacked_videos)
|
| 974 |
+
if do_resize:
|
| 975 |
+
if video_max_pixels is not None and total_volume > 0:
|
| 976 |
+
allocated_max_pixels = int(video_max_pixels * (T * H * W) / total_volume)
|
| 977 |
+
else:
|
| 978 |
+
allocated_max_pixels = size.longest_edge
|
| 979 |
+
resized_height, resized_width = smart_resize(
|
| 980 |
+
num_frames=num_frames,
|
| 981 |
+
height=height,
|
| 982 |
+
width=width,
|
| 983 |
+
temporal_factor=temporal_patch_size,
|
| 984 |
+
factor=patch_size * merge_size,
|
| 985 |
+
min_pixels=size.shortest_edge,
|
| 986 |
+
max_pixels=allocated_max_pixels,
|
| 987 |
+
per_frame_min_pixels=size.shortest_edge,
|
| 988 |
+
per_frame_max_pixels=size.longest_edge,
|
| 989 |
+
)
|
| 990 |
+
stacked_videos = stacked_videos.view(B * T, C, H, W)
|
| 991 |
+
stacked_videos = self.resize(
|
| 992 |
+
stacked_videos,
|
| 993 |
+
size=SizeDict(height=resized_height, width=resized_width),
|
| 994 |
+
interpolation=interpolation,
|
| 995 |
+
)
|
| 996 |
+
stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
|
| 997 |
+
resized_videos_grouped[shape] = stacked_videos
|
| 998 |
+
resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
|
| 999 |
+
|
| 1000 |
+
# Group videos by size for further processing
|
| 1001 |
+
# Needed in case do_resize is False, or resize returns videos with different sizes
|
| 1002 |
+
grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
|
| 1003 |
+
processed_videos_grouped = {}
|
| 1004 |
+
processed_grids = {}
|
| 1005 |
+
for shape, stacked_videos in grouped_videos.items():
|
| 1006 |
+
resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
|
| 1007 |
+
|
| 1008 |
+
# Fused rescale and normalize
|
| 1009 |
+
stacked_videos = self.rescale_and_normalize(
|
| 1010 |
+
stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
| 1011 |
+
)
|
| 1012 |
+
patches = stacked_videos
|
| 1013 |
+
|
| 1014 |
+
# Check that videos have `num_frames` divisible by `temporal_patch_size`
|
| 1015 |
+
if patches.shape[1] % temporal_patch_size != 0:
|
| 1016 |
+
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
| 1017 |
+
patches = torch.cat([patches, repeats], dim=1)
|
| 1018 |
+
batch_size, grid_t, channel = patches.shape[:3]
|
| 1019 |
+
grid_t = grid_t // temporal_patch_size
|
| 1020 |
+
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
| 1021 |
+
|
| 1022 |
+
patches = patches.view(
|
| 1023 |
+
batch_size,
|
| 1024 |
+
grid_t,
|
| 1025 |
+
temporal_patch_size,
|
| 1026 |
+
channel,
|
| 1027 |
+
grid_h // merge_size,
|
| 1028 |
+
merge_size,
|
| 1029 |
+
patch_size,
|
| 1030 |
+
grid_w // merge_size,
|
| 1031 |
+
merge_size,
|
| 1032 |
+
patch_size,
|
| 1033 |
+
)
|
| 1034 |
+
patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
|
| 1035 |
+
flatten_patches = patches.reshape(
|
| 1036 |
+
batch_size,
|
| 1037 |
+
grid_t * grid_h * grid_w,
|
| 1038 |
+
channel * temporal_patch_size * patch_size * patch_size,
|
| 1039 |
+
)
|
| 1040 |
+
|
| 1041 |
+
processed_videos_grouped[shape] = flatten_patches
|
| 1042 |
+
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
| 1043 |
+
|
| 1044 |
+
processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
|
| 1045 |
+
processed_grids = reorder_videos(processed_grids, grouped_videos_index)
|
| 1046 |
+
pixel_values_videos = torch.cat(processed_videos, dim=0)
|
| 1047 |
+
video_grid_thw = torch.tensor(processed_grids)
|
| 1048 |
+
data = {
|
| 1049 |
+
"pixel_values_videos": pixel_values_videos,
|
| 1050 |
+
"video_grid_thw": video_grid_thw,
|
| 1051 |
+
}
|
| 1052 |
+
|
| 1053 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
| 1054 |
+
|
| 1055 |
+
def preprocess(
    self,
    videos: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]],
    **kwargs,
) -> BatchFeature:
    """Preprocess videos for the model.

    Overrides the base class so that two video input formats are accepted:

    1. A plain path string: ``"path/to/video.mp4"``.
    2. A dict with segments:
       ``{"video_path": "...", "segment": [[start, end], [time], ...]}``.

    Args:
        videos: Video input(s) in one of the supported formats.
        **kwargs: Extra options forwarded to ``_preprocess``.

    Returns:
        BatchFeature with ``pixel_values_videos``, ``video_grid_thw`` and,
        when requested via ``return_metadata``, ``video_metadata``.
    """
    # Reject unknown keyword arguments up front.
    validate_kwargs(
        captured_kwargs=kwargs.keys(),
        valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
    )

    # Fill in any option the caller omitted from the processor's own attributes.
    for option in self.valid_kwargs.__annotations__:
        kwargs.setdefault(option, getattr(self, option, None))

    # Options consumed here rather than handed down to _preprocess.
    return_tensors = kwargs.pop("return_tensors", None)
    return_metadata = kwargs.pop("return_metadata", False)
    input_data_format = kwargs.pop("input_data_format", None)
    device = kwargs.pop("device", None)
    # Metadata generation and frame sampling are handled internally;
    # "data_format" is unused — drop them silently if supplied.
    for consumed in ("video_metadata", "do_sample_frames", "data_format"):
        kwargs.pop(consumed, None)

    # Always operate on a list of videos.
    videos = videos if isinstance(videos, list) else [videos]

    # Per-batch frame-sampling configuration (may be passed explicitly).
    sampling = {
        "video_fps": kwargs.pop("video_fps", None),
        "min_frames": kwargs.pop("min_frames", None),
        "max_frames": kwargs.pop("max_frames", None),
    }

    # fetch_videos understands both string paths and segment dicts.
    video_tensors, video_metadata = self.fetch_videos(videos, **sampling)

    # Convert the decoded frames into model-ready tensors.
    prepared = self._prepare_input_videos(
        videos=video_tensors,
        input_data_format=input_data_format,
        device=device,
    )

    # Normalize and sanity-check the remaining options before delegating.
    kwargs = self._further_process_kwargs(**kwargs)
    self._validate_preprocess_kwargs(**kwargs)

    result = self._preprocess(videos=prepared, return_tensors=return_tensors, **kwargs)

    if return_metadata:
        result["video_metadata"] = video_metadata

    return result
| 1129 |
+
|
| 1130 |
+
|
| 1131 |
+
__all__ = ["MossVLVideoProcessor"]
|
| 1132 |
+
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|