Upload folder using huggingface_hub
Browse files- .gitattributes +5 -0
- README.md +355 -0
- __pycache__/modeling_moss_vl.cpython-312.pyc +3 -0
- assets/3d-rope.png +3 -0
- assets/logo.png +3 -0
- assets/structure.png +3 -0
- assets/timestamp_input.svg +78 -0
- chat_template.json +3 -0
- config.json +83 -0
- configuration_moss_vl.py +164 -0
- generation_config.json +6 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model.safetensors.index.json +902 -0
- modeling_moss_vl.py +0 -0
- preprocessor_config.json +26 -0
- processing_moss_vl.py +1079 -0
- requirements.txt +15 -0
- tokenizer.json +3 -0
- tokenizer_config.json +258 -0
- video_preprocessor_config.json +30 -0
- video_processing_moss_vl.py +1132 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
__pycache__/modeling_moss_vl.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/3d-rope.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/logo.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/structure.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MOSS-VL-SFT-0408
|
| 3 |
+
date: 2026-04-08
|
| 4 |
+
category: Multimodal-LLM
|
| 5 |
+
status: SFT
|
| 6 |
+
language:
|
| 7 |
+
- en
|
| 8 |
+
library_name: transformers
|
| 9 |
+
pipeline_tag: video-text-to-text
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
base_model: fnlp-vision/moss-video-preview-base
|
| 12 |
+
tags:
|
| 13 |
+
- SFT
|
| 14 |
+
- Video-Understanding
|
| 15 |
+
- Image-Understanding
|
| 16 |
+
- MOSS-VL
|
| 17 |
+
- OpenMOSS
|
| 18 |
+
- multimodal
|
| 19 |
+
- video
|
| 20 |
+
- vision-language
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
<p align="center">
|
| 24 |
+
<img src="assets/logo.png" width="320"/>
|
| 25 |
+
</p>
|
| 26 |
+
|
| 27 |
+
# MOSS-VL-SFT-0408
|
| 28 |
+
|
| 29 |
+
## 📌 Introduction
|
| 30 |
+
|
| 31 |
+
We introduce **MOSS-VL-SFT-0408**, the supervised fine-tuned checkpoint in the **MOSS-VL** series (part of the **OpenMOSS** ecosystem).
|
| 32 |
+
|
| 33 |
+
> [!IMPORTANT]
|
| 34 |
+
> This is an **SFT** checkpoint (instruction-tuned). It is **NOT** the Real-Time SFT streaming checkpoint.
|
| 35 |
+
|
| 36 |
+
This model is designed as a high-performance offline engine for multimodal tasks, bridging the gap between static image understanding and dynamic real-time interaction.
|
| 37 |
+
|
| 38 |
+
### This checkpoint is intended for:
|
| 39 |
+
|
| 40 |
+
- **video/image understanding** with significantly improved instruction following capabilities.
|
| 41 |
+
- Serving as a **strong starting point** for further **Real-Time SFT** or specific domain adaptation.
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
## 🚀 Key Features & Status
|
| 46 |
+
|
| 47 |
+
| Feature | Status | Description |
|
| 48 |
+
| :--- | :---: | :--- |
|
| 49 |
+
| **Model Loading** | ✅ | Standard HF loading with `trust_remote_code=True` |
|
| 50 |
+
| **Image Understanding** | ✅ | Single/Multi-image input support |
|
| 51 |
+
| **Video Understanding** | ✅ | Native video frame sequence processing |
|
| 52 |
+
| **Mixed Inference** | ✅ | Interleaved image and video inputs |
|
| 53 |
+
| **Offline Generation** | ✅ | Optimized `offline_generate` & `offline_batch_generate` |
|
| 54 |
+
| **Benchmarks/Metrics** | ⏳ | Coming in future updates |
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## 🏗 Model Architecture
|
| 59 |
+
|
| 60 |
+
**MOSS-VL-SFT-0408** adopts a decoupled multimodal design, utilizing a cross-attention mechanism to bridge high-resolution visual encoding with advanced language reasoning.
|
| 61 |
+
|
| 62 |
+
<p align="center">
|
| 63 |
+
<img src="assets/structure.png" alt="MOSS-VL Architecture" width="90%"/>
|
| 64 |
+
<br>
|
| 65 |
+
<em>Figure 1: MOSS-VL Core Architecture.</em>
|
| 66 |
+
</p>
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
## Temporal-Aware Prompting
|
| 70 |
+
|
| 71 |
+
At the model-family level, MOSS-VL uses timestamp-aware multimodal prompting for video understanding. This design gives sampled frames explicit temporal anchors, which helps the model reason about order, duration, and event localization more robustly.
|
| 72 |
+
|
| 73 |
+
<p align="center">
|
| 74 |
+
<img src="assets/timestamp_input.svg" alt="Timestamped Sequence Input Illustration" width="90%"/>
|
| 75 |
+
<br>
|
| 76 |
+
<em>Figure 2: Illustration of the timestamped sequence input pipeline.</em>
|
| 77 |
+
</p>
|
| 78 |
+
|
| 79 |
+
## Multimodal RoPE
|
| 80 |
+
|
| 81 |
+
MOSS-VL uses multimodal rotary position encoding to align text tokens and visual features in a shared spatial-temporal coordinate system. At a high level, this improves video-text grounding and helps preserve temporal structure during multimodal reasoning.
|
| 82 |
+
|
| 83 |
+
<p align="center">
|
| 84 |
+
<img src="assets/3d-rope.png" alt="MOSS-VL mRoPE Architecture Illustration" width="80%"/>
|
| 85 |
+
<br>
|
| 86 |
+
<em>Figure 3: 3D-RoPE spatial-temporal alignment.</em>
|
| 87 |
+
</p>
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
## 🚀 Quickstart
|
| 93 |
+
|
| 94 |
+
<details>
|
| 95 |
+
<summary><strong>Queue-based offline inference (Python)</strong></summary>
|
| 96 |
+
|
| 97 |
+
<br>
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
import os
|
| 101 |
+
import queue
|
| 102 |
+
import threading
|
| 103 |
+
|
| 104 |
+
import torch
|
| 105 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 106 |
+
|
| 107 |
+
checkpoint = "path/to/checkpoint"
|
| 108 |
+
video_path = "data/example_video.mp4"
|
| 109 |
+
prompt = "Describe the video."
|
| 110 |
+
|
| 111 |
+
max_new_tokens = 1024
|
| 112 |
+
temperature = 1.0
|
| 113 |
+
top_k = 50
|
| 114 |
+
top_p = 1.0
|
| 115 |
+
repetition_penalty = 1.0
|
| 116 |
+
|
| 117 |
+
video_fps = 1.0
|
| 118 |
+
video_minlen = 8
|
| 119 |
+
video_maxlen = 256
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def load_model(checkpoint: str):
|
| 123 |
+
processor = AutoProcessor.from_pretrained(
|
| 124 |
+
checkpoint,
|
| 125 |
+
trust_remote_code=True,
|
| 126 |
+
frame_extract_num_threads=1,
|
| 127 |
+
)
|
| 128 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 129 |
+
checkpoint,
|
| 130 |
+
trust_remote_code=True,
|
| 131 |
+
device_map="auto",
|
| 132 |
+
torch_dtype=torch.bfloat16,
|
| 133 |
+
attn_implementation="flash_attention_2",
|
| 134 |
+
)
|
| 135 |
+
return model, processor
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if not checkpoint:
|
| 139 |
+
raise ValueError("Missing `checkpoint`.")
|
| 140 |
+
if not video_path:
|
| 141 |
+
raise ValueError("Missing `video_path`.")
|
| 142 |
+
if not os.path.isfile(video_path):
|
| 143 |
+
raise FileNotFoundError(f"Video not found: {video_path}")
|
| 144 |
+
|
| 145 |
+
model, processor = load_model(checkpoint)
|
| 146 |
+
new_queries: "queue.Queue[dict]" = queue.Queue()
|
| 147 |
+
output_text_queue: "queue.Queue[str]" = queue.Queue()
|
| 148 |
+
|
| 149 |
+
query = {
|
| 150 |
+
"prompt": prompt,
|
| 151 |
+
"images": [],
|
| 152 |
+
"videos": [video_path],
|
| 153 |
+
"media_kwargs": {
|
| 154 |
+
"video_fps": video_fps,
|
| 155 |
+
"video_minlen": video_minlen,
|
| 156 |
+
"video_maxlen": video_maxlen,
|
| 157 |
+
},
|
| 158 |
+
"generate_kwargs": {
|
| 159 |
+
"temperature": temperature,
|
| 160 |
+
"top_k": top_k,
|
| 161 |
+
"top_p": top_p,
|
| 162 |
+
"max_new_tokens": max_new_tokens,
|
| 163 |
+
"repetition_penalty": repetition_penalty,
|
| 164 |
+
"do_sample": False,
|
| 165 |
+
},
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def drain_output():
|
| 170 |
+
while True:
|
| 171 |
+
tok = output_text_queue.get()
|
| 172 |
+
if tok == "<|round_end|>":
|
| 173 |
+
break
|
| 174 |
+
print(tok, end="", flush=True)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
worker = threading.Thread(
|
| 178 |
+
target=model.offline_generate,
|
| 179 |
+
args=(processor, new_queries, output_text_queue),
|
| 180 |
+
kwargs={"vision_chunked_length": 64},
|
| 181 |
+
daemon=True,
|
| 182 |
+
)
|
| 183 |
+
worker.start()
|
| 184 |
+
|
| 185 |
+
new_queries.put(query)
|
| 186 |
+
drain_output()
|
| 187 |
+
|
| 188 |
+
new_queries.put({"stop_offline_generate": True})
|
| 189 |
+
worker.join(timeout=5.0)
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
For image-only usage, keep the same template and change:
|
| 193 |
+
|
| 194 |
+
- replace `video_path` with `image_path`
|
| 195 |
+
- validate `image_path` instead of `video_path`
|
| 196 |
+
- set `images` to `[image_path]`
|
| 197 |
+
- set `videos` to `[]`
|
| 198 |
+
- remove `media_kwargs` if you do not need video-specific controls
|
| 199 |
+
|
| 200 |
+
</details>
|
| 201 |
+
|
| 202 |
+
<details>
|
| 203 |
+
<summary><strong>Batched offline inference (Python)</strong></summary>
|
| 204 |
+
|
| 205 |
+
<br>
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
import torch
|
| 209 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 210 |
+
|
| 211 |
+
checkpoint = "path/to/checkpoint"
|
| 212 |
+
|
| 213 |
+
shared_generate_kwargs = {
|
| 214 |
+
"temperature": 1.0,
|
| 215 |
+
"top_k": 50,
|
| 216 |
+
"top_p": 1.0,
|
| 217 |
+
"max_new_tokens": 256,
|
| 218 |
+
"repetition_penalty": 1.0,
|
| 219 |
+
"do_sample": False,
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
shared_media_kwargs = {
|
| 223 |
+
"video_fps": 1.0,
|
| 224 |
+
"video_minlen": 8,
|
| 225 |
+
"video_maxlen": 256,
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def load_model(checkpoint: str):
|
| 230 |
+
processor = AutoProcessor.from_pretrained(
|
| 231 |
+
checkpoint,
|
| 232 |
+
trust_remote_code=True,
|
| 233 |
+
frame_extract_num_threads=1,
|
| 234 |
+
)
|
| 235 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 236 |
+
checkpoint,
|
| 237 |
+
trust_remote_code=True,
|
| 238 |
+
device_map="auto",
|
| 239 |
+
torch_dtype=torch.bfloat16,
|
| 240 |
+
attn_implementation="flash_attention_2",
|
| 241 |
+
)
|
| 242 |
+
return model, processor
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
model, processor = load_model(checkpoint)
|
| 246 |
+
queries = [
|
| 247 |
+
{
|
| 248 |
+
"prompt": "Describe sample A.",
|
| 249 |
+
"images": [],
|
| 250 |
+
"videos": ["data/sample_a.mp4"],
|
| 251 |
+
"media_kwargs": dict(shared_media_kwargs),
|
| 252 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"prompt": "Describe sample B.",
|
| 256 |
+
"images": [],
|
| 257 |
+
"videos": ["data/sample_b.mp4"],
|
| 258 |
+
"media_kwargs": dict(shared_media_kwargs),
|
| 259 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 260 |
+
},
|
| 261 |
+
]
|
| 262 |
+
|
| 263 |
+
with torch.no_grad():
|
| 264 |
+
result = model.offline_batch_generate(
|
| 265 |
+
processor,
|
| 266 |
+
queries,
|
| 267 |
+
session_states=None,
|
| 268 |
+
vision_chunked_length=64,
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
texts = [item["text"] for item in result["results"]]
|
| 272 |
+
session_states = result["session_states"]
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
```python
|
| 276 |
+
followup_queries = [
|
| 277 |
+
{
|
| 278 |
+
"prompt": "Summarize sample A in one sentence.",
|
| 279 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"prompt": "Restart sample B and answer again.",
|
| 283 |
+
"reset_session": True,
|
| 284 |
+
"generate_kwargs": dict(shared_generate_kwargs),
|
| 285 |
+
},
|
| 286 |
+
]
|
| 287 |
+
|
| 288 |
+
with torch.no_grad():
|
| 289 |
+
followup_result = model.offline_batch_generate(
|
| 290 |
+
processor,
|
| 291 |
+
followup_queries,
|
| 292 |
+
session_states=session_states,
|
| 293 |
+
vision_chunked_length=64,
|
| 294 |
+
)
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
</details>
|
| 298 |
+
|
| 299 |
+
## Intended Use
|
| 300 |
+
|
| 301 |
+
- offline image understanding
|
| 302 |
+
- offline video understanding
|
| 303 |
+
- multimodal prompt experiments for release validation
|
| 304 |
+
- checkpoint-level inference integration and debugging
|
| 305 |
+
|
| 306 |
+
## Requirements
|
| 307 |
+
|
| 308 |
+
Core validated inference dependencies:
|
| 309 |
+
|
| 310 |
+
- `python==3.12.13`
|
| 311 |
+
- `torch==2.8.0+cu128`
|
| 312 |
+
- `torchvision==0.23.0+cu128`
|
| 313 |
+
- `transformers==4.57.1`
|
| 314 |
+
- `accelerate==1.12.0`
|
| 315 |
+
- `flash_attn==2.8.1`
|
| 316 |
+
- `torchcodec==0.7.0`
|
| 317 |
+
- `numpy==2.4.3`
|
| 318 |
+
- `pillow==12.1.1`
|
| 319 |
+
- `joblib==1.5.2`
|
| 320 |
+
- `einops==0.8.2`
|
| 321 |
+
|
| 322 |
+
Installation commands:
|
| 323 |
+
|
| 324 |
+
```bash
|
| 325 |
+
conda create -n moss_vl python=3.12 pip -y
|
| 326 |
+
conda activate moss_vl
|
| 327 |
+
pip install -i https://pypi.org/simple --no-build-isolation -r requirements.txt
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
Validated setup notes:
|
| 331 |
+
|
| 332 |
+
- CUDA runtime used for validation: `12.8`
|
| 333 |
+
- Inference loading uses `trust_remote_code=True` and `attn_implementation="flash_attention_2"`
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
## Limitations and Future Work
|
| 337 |
+
|
| 338 |
+
- realtime usage is not documented here
|
| 339 |
+
- benchmark, metric, and training details are still blank
|
| 340 |
+
- some sections are intentionally placeholders until release information is finalized
|
| 341 |
+
- batch calls currently require shared `generate_kwargs` and shared `media_kwargs` within one call
|
| 342 |
+
- batch streaming and batch cancel / stop protocol are not part of `offline_batch_generate(...)`
|
| 343 |
+
- the queue example is intentionally minimal and does not include production-grade timeout or worker error handling
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
## Citation
|
| 347 |
+
```bibtex
|
| 348 |
+
@misc{moss_vl_2026,
|
| 349 |
+
title = {{MOSS-VL Technical Report}},
|
| 350 |
+
author = {OpenMOSS Team},
|
| 351 |
+
year = {2026},
|
| 352 |
+
howpublished = {\url{https://github.com/fnlp-vision/MOSS-VL}},
|
| 353 |
+
note = {GitHub repository}
|
| 354 |
+
}
|
| 355 |
+
```
|
__pycache__/modeling_moss_vl.cpython-312.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0c2dac0006732b9b43f3257298ad053f74aec76cab84967d3740ad5fdde54e1
|
| 3 |
+
size 126448
|
assets/3d-rope.png
ADDED
|
Git LFS Details
|
assets/logo.png
ADDED
|
Git LFS Details
|
assets/structure.png
ADDED
|
Git LFS Details
|
assets/timestamp_input.svg
ADDED
|
|
chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n 
<|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n 
{%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
|
| 3 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"MossVLForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoConfig": "configuration_moss_vl.MossVLConfig",
|
| 7 |
+
"AutoModel": "modeling_moss_vl.MossVLForConditionalGeneration",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_moss_vl.MossVLForConditionalGeneration"
|
| 9 |
+
},
|
| 10 |
+
"dtype": "bfloat16",
|
| 11 |
+
"image_token_id": 151655,
|
| 12 |
+
"model_type": "moss_vl",
|
| 13 |
+
"text_config": {
|
| 14 |
+
"attention_bias": false,
|
| 15 |
+
"attention_dropout": 0.0,
|
| 16 |
+
"bos_token_id": 151643,
|
| 17 |
+
"cross_attention_layers": [
|
| 18 |
+
2,
|
| 19 |
+
6,
|
| 20 |
+
10,
|
| 21 |
+
14,
|
| 22 |
+
18,
|
| 23 |
+
22,
|
| 24 |
+
26,
|
| 25 |
+
30,
|
| 26 |
+
34,
|
| 27 |
+
38,
|
| 28 |
+
42,
|
| 29 |
+
46
|
| 30 |
+
],
|
| 31 |
+
"dtype": "bfloat16",
|
| 32 |
+
"eos_token_id": 151645,
|
| 33 |
+
"head_dim": 128,
|
| 34 |
+
"hidden_act": "silu",
|
| 35 |
+
"hidden_size": 4096,
|
| 36 |
+
"initializer_range": 0.02,
|
| 37 |
+
"intermediate_size": 12288,
|
| 38 |
+
"max_position_embeddings": 262144,
|
| 39 |
+
"model_type": "moss_vl_text",
|
| 40 |
+
"num_attention_heads": 32,
|
| 41 |
+
"num_hidden_layers": 48,
|
| 42 |
+
"num_key_value_heads": 8,
|
| 43 |
+
"rms_norm_eps": 1e-06,
|
| 44 |
+
"rope_scaling": {
|
| 45 |
+
"mrope_interleaved": true,
|
| 46 |
+
"mrope_section": [
|
| 47 |
+
24,
|
| 48 |
+
20,
|
| 49 |
+
20
|
| 50 |
+
],
|
| 51 |
+
"rope_type": "default"
|
| 52 |
+
},
|
| 53 |
+
"rope_theta": 5000000,
|
| 54 |
+
"use_cache": true,
|
| 55 |
+
"vocab_size": 151936
|
| 56 |
+
},
|
| 57 |
+
"tie_word_embeddings": false,
|
| 58 |
+
"transformers_version": "4.57.1",
|
| 59 |
+
"video_token_id": 151656,
|
| 60 |
+
"vision_config": {
|
| 61 |
+
"deepstack_visual_indexes": [
|
| 62 |
+
8,
|
| 63 |
+
16,
|
| 64 |
+
24
|
| 65 |
+
],
|
| 66 |
+
"depth": 27,
|
| 67 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 68 |
+
"hidden_size": 1152,
|
| 69 |
+
"in_channels": 3,
|
| 70 |
+
"initializer_range": 0.02,
|
| 71 |
+
"intermediate_size": 4304,
|
| 72 |
+
"model_type": "moss_vl_vision",
|
| 73 |
+
"num_heads": 16,
|
| 74 |
+
"num_position_embeddings": 2304,
|
| 75 |
+
"out_hidden_size": 4096,
|
| 76 |
+
"patch_size": 16,
|
| 77 |
+
"spatial_merge_size": 2,
|
| 78 |
+
"temporal_patch_size": 1
|
| 79 |
+
},
|
| 80 |
+
"vision_end_token_id": 151653,
|
| 81 |
+
"vision_seq_pad_multiple": 8,
|
| 82 |
+
"vision_start_token_id": 151652
|
| 83 |
+
}
|
configuration_moss_vl.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""MossVL model configuration"""
|
| 16 |
+
|
| 17 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 18 |
+
from transformers.modeling_rope_utils import rope_config_validation
|
| 19 |
+
from transformers.utils import logging
|
| 20 |
+
|
| 21 |
+
logger = logging.get_logger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MossVLVisionConfig(PretrainedConfig):
|
| 25 |
+
"""
|
| 26 |
+
Configuration for MossVL Vision Model
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
model_type = "moss_vl_vision"
|
| 30 |
+
base_config_key = "vision_config"
|
| 31 |
+
|
| 32 |
+
def __init__(
|
| 33 |
+
self,
|
| 34 |
+
depth=27,
|
| 35 |
+
hidden_size=1152,
|
| 36 |
+
hidden_act="gelu_pytorch_tanh",
|
| 37 |
+
intermediate_size=4304,
|
| 38 |
+
num_heads=16,
|
| 39 |
+
in_channels=3,
|
| 40 |
+
patch_size=16,
|
| 41 |
+
spatial_merge_size=2,
|
| 42 |
+
temporal_patch_size=1,
|
| 43 |
+
out_hidden_size=3584,
|
| 44 |
+
num_position_embeddings=2304,
|
| 45 |
+
deepstack_visual_indexes=[8, 16, 24],
|
| 46 |
+
initializer_range=0.02,
|
| 47 |
+
**kwargs,
|
| 48 |
+
):
|
| 49 |
+
super().__init__(**kwargs)
|
| 50 |
+
self.depth = depth
|
| 51 |
+
self.hidden_size = hidden_size
|
| 52 |
+
self.hidden_act = hidden_act
|
| 53 |
+
self.intermediate_size = intermediate_size
|
| 54 |
+
self.num_heads = num_heads
|
| 55 |
+
self.in_channels = in_channels
|
| 56 |
+
self.patch_size = patch_size
|
| 57 |
+
self.spatial_merge_size = spatial_merge_size
|
| 58 |
+
self.temporal_patch_size = temporal_patch_size
|
| 59 |
+
self.out_hidden_size = out_hidden_size
|
| 60 |
+
self.num_position_embeddings = num_position_embeddings
|
| 61 |
+
self.initializer_range = initializer_range
|
| 62 |
+
self.deepstack_visual_indexes = deepstack_visual_indexes
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class MossVLTextConfig(PretrainedConfig):
|
| 66 |
+
"""
|
| 67 |
+
Configuration for MossVL Text Model
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
model_type = "moss_vl_text"
|
| 71 |
+
base_config_key = "text_config"
|
| 72 |
+
|
| 73 |
+
def __init__(
|
| 74 |
+
self,
|
| 75 |
+
vocab_size=151936,
|
| 76 |
+
hidden_size=4096,
|
| 77 |
+
intermediate_size=22016,
|
| 78 |
+
num_hidden_layers=32,
|
| 79 |
+
num_attention_heads=32,
|
| 80 |
+
num_key_value_heads=32,
|
| 81 |
+
head_dim=128,
|
| 82 |
+
hidden_act="silu",
|
| 83 |
+
max_position_embeddings=128000,
|
| 84 |
+
initializer_range=0.02,
|
| 85 |
+
rms_norm_eps=1e-6,
|
| 86 |
+
use_cache=True,
|
| 87 |
+
tie_word_embeddings=False,
|
| 88 |
+
rope_theta=5000000.0,
|
| 89 |
+
rope_scaling=None,
|
| 90 |
+
attention_bias=False,
|
| 91 |
+
attention_dropout=0.0,
|
| 92 |
+
# Cross attention specific
|
| 93 |
+
cross_attention_layers=None, # List of layer indices to insert cross attention
|
| 94 |
+
**kwargs,
|
| 95 |
+
):
|
| 96 |
+
|
| 97 |
+
self.vocab_size = vocab_size
|
| 98 |
+
self.max_position_embeddings = max_position_embeddings
|
| 99 |
+
self.hidden_size = hidden_size
|
| 100 |
+
self.intermediate_size = intermediate_size
|
| 101 |
+
self.num_hidden_layers = num_hidden_layers
|
| 102 |
+
self.num_attention_heads = num_attention_heads
|
| 103 |
+
|
| 104 |
+
# for backward compatibility
|
| 105 |
+
if num_key_value_heads is None:
|
| 106 |
+
num_key_value_heads = num_attention_heads
|
| 107 |
+
|
| 108 |
+
self.num_key_value_heads = num_key_value_heads
|
| 109 |
+
self.head_dim = head_dim
|
| 110 |
+
self.hidden_act = hidden_act
|
| 111 |
+
self.initializer_range = initializer_range
|
| 112 |
+
self.rms_norm_eps = rms_norm_eps
|
| 113 |
+
self.use_cache = use_cache
|
| 114 |
+
self.rope_theta = rope_theta
|
| 115 |
+
self.rope_scaling = rope_scaling
|
| 116 |
+
self.attention_bias = attention_bias
|
| 117 |
+
self.attention_dropout = attention_dropout
|
| 118 |
+
|
| 119 |
+
rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
|
| 120 |
+
self.cross_attention_layers = cross_attention_layers or [2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
|
| 121 |
+
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
| 122 |
+
|
| 123 |
+
class MossVLConfig(PretrainedConfig):
|
| 124 |
+
"""
|
| 125 |
+
Configuration for MossVL Model
|
| 126 |
+
"""
|
| 127 |
+
|
| 128 |
+
model_type = "moss_vl"
|
| 129 |
+
sub_configs = {"vision_config": MossVLVisionConfig, "text_config": MossVLTextConfig}
|
| 130 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
| 131 |
+
|
| 132 |
+
def __init__(
|
| 133 |
+
self,
|
| 134 |
+
text_config=None,
|
| 135 |
+
vision_config=None,
|
| 136 |
+
image_token_id=151655,
|
| 137 |
+
video_token_id=151656,
|
| 138 |
+
vision_start_token_id=151652,
|
| 139 |
+
vision_end_token_id=151653,
|
| 140 |
+
vision_seq_pad_multiple=8,
|
| 141 |
+
tie_word_embeddings=False,
|
| 142 |
+
**kwargs,
|
| 143 |
+
):
|
| 144 |
+
if isinstance(vision_config, dict):
|
| 145 |
+
self.vision_config = self.sub_configs["vision_config"](**vision_config)
|
| 146 |
+
elif vision_config is None:
|
| 147 |
+
self.vision_config = self.sub_configs["vision_config"]()
|
| 148 |
+
|
| 149 |
+
if isinstance(text_config, dict):
|
| 150 |
+
self.text_config = self.sub_configs["text_config"](**text_config)
|
| 151 |
+
elif text_config is None:
|
| 152 |
+
self.text_config = self.sub_configs["text_config"]()
|
| 153 |
+
|
| 154 |
+
self.image_token_id = image_token_id
|
| 155 |
+
self.video_token_id = video_token_id
|
| 156 |
+
self.vision_start_token_id = vision_start_token_id
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
self.vision_end_token_id = vision_end_token_id
|
| 160 |
+
self.vision_seq_pad_multiple = vision_seq_pad_multiple
|
| 161 |
+
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
__all__ = ["MossVLConfig", "MossVLTextConfig"]
|
generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 151643,
|
| 4 |
+
"eos_token_id": 151645,
|
| 5 |
+
"transformers_version": "4.57.1"
|
| 6 |
+
}
|
model-00001-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e06b965b124358c760daf4fd0df1f4f96fd3489ec1acf2df07f8cc30228f6470
|
| 3 |
+
size 5274500800
|
model-00002-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b30b6c802724afa309d1b46eec351fa3b935b7e241d4203d02217e29cd42e02
|
| 3 |
+
size 5360568508
|
model-00003-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48cce2c73d9e62b9af51ceaacbb35d59f56e062c4bd5fa006ef53a47e9b6070c
|
| 3 |
+
size 5360577920
|
model-00004-of-00005.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5a162acab8d8b9dc63c0b06f840db983babe44813701ce8ee937aef4e621269
|
| 3 |
+
size 5366957460
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,902 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 22672742416
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"model.language_model.embed_tokens.weight": "model-00001-of-00005.safetensors",
|
| 7 |
+
"model.separator_token": "model-00001-of-00005.safetensors",
|
| 8 |
+
"model.language_model.norm.weight": "model-00001-of-00005.safetensors",
|
| 9 |
+
"lm_head.weight": "model-00001-of-00005.safetensors",
|
| 10 |
+
"model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 11 |
+
"model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 12 |
+
"model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 13 |
+
"model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 14 |
+
"model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 15 |
+
"model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 16 |
+
"model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 17 |
+
"model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 18 |
+
"model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 19 |
+
"model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 20 |
+
"model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 21 |
+
"model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 22 |
+
"model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 23 |
+
"model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 24 |
+
"model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 25 |
+
"model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 26 |
+
"model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 27 |
+
"model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 28 |
+
"model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 29 |
+
"model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 30 |
+
"model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 31 |
+
"model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 32 |
+
"model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 33 |
+
"model.language_model.layers.2.cross_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 34 |
+
"model.language_model.layers.2.cross_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 35 |
+
"model.language_model.layers.2.cross_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 36 |
+
"model.language_model.layers.2.cross_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 37 |
+
"model.language_model.layers.2.cross_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 38 |
+
"model.language_model.layers.2.cross_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 39 |
+
"model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 40 |
+
"model.language_model.layers.2.cross_attn_attn_gate": "model-00001-of-00005.safetensors",
|
| 41 |
+
"model.language_model.layers.2.cross_attn_mlp_gate": "model-00001-of-00005.safetensors",
|
| 42 |
+
"model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 43 |
+
"model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 44 |
+
"model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 45 |
+
"model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 46 |
+
"model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 47 |
+
"model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 48 |
+
"model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 49 |
+
"model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 50 |
+
"model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 51 |
+
"model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 52 |
+
"model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 53 |
+
"model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 54 |
+
"model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 55 |
+
"model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 56 |
+
"model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 57 |
+
"model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 58 |
+
"model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 59 |
+
"model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 60 |
+
"model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 61 |
+
"model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 62 |
+
"model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 63 |
+
"model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 64 |
+
"model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 65 |
+
"model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 66 |
+
"model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 67 |
+
"model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 68 |
+
"model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 69 |
+
"model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 70 |
+
"model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 71 |
+
"model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 72 |
+
"model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 73 |
+
"model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 74 |
+
"model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 75 |
+
"model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 76 |
+
"model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 77 |
+
"model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 78 |
+
"model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 79 |
+
"model.language_model.layers.6.cross_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 80 |
+
"model.language_model.layers.6.cross_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 81 |
+
"model.language_model.layers.6.cross_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 82 |
+
"model.language_model.layers.6.cross_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 83 |
+
"model.language_model.layers.6.cross_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 84 |
+
"model.language_model.layers.6.cross_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 85 |
+
"model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 86 |
+
"model.language_model.layers.6.cross_attn_attn_gate": "model-00001-of-00005.safetensors",
|
| 87 |
+
"model.language_model.layers.6.cross_attn_mlp_gate": "model-00001-of-00005.safetensors",
|
| 88 |
+
"model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 89 |
+
"model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 90 |
+
"model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 91 |
+
"model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 92 |
+
"model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 93 |
+
"model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 94 |
+
"model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 95 |
+
"model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 96 |
+
"model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00005.safetensors",
|
| 97 |
+
"model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00005.safetensors",
|
| 98 |
+
"model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 99 |
+
"model.language_model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 100 |
+
"model.language_model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 101 |
+
"model.language_model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 102 |
+
"model.language_model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 103 |
+
"model.language_model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 104 |
+
"model.language_model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 105 |
+
"model.language_model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 106 |
+
"model.language_model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 107 |
+
"model.language_model.layers.8.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 108 |
+
"model.language_model.layers.8.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 109 |
+
"model.language_model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 110 |
+
"model.language_model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 111 |
+
"model.language_model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 112 |
+
"model.language_model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 113 |
+
"model.language_model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 114 |
+
"model.language_model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 115 |
+
"model.language_model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 116 |
+
"model.language_model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 117 |
+
"model.language_model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 118 |
+
"model.language_model.layers.9.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 119 |
+
"model.language_model.layers.9.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 120 |
+
"model.language_model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 121 |
+
"model.language_model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 122 |
+
"model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 123 |
+
"model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 124 |
+
"model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 125 |
+
"model.language_model.layers.10.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 126 |
+
"model.language_model.layers.10.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 127 |
+
"model.language_model.layers.10.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 128 |
+
"model.language_model.layers.10.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 129 |
+
"model.language_model.layers.10.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 130 |
+
"model.language_model.layers.10.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 131 |
+
"model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 132 |
+
"model.language_model.layers.10.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 133 |
+
"model.language_model.layers.10.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 134 |
+
"model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 135 |
+
"model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 136 |
+
"model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 137 |
+
"model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 138 |
+
"model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 139 |
+
"model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 140 |
+
"model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 141 |
+
"model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 142 |
+
"model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 143 |
+
"model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 144 |
+
"model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 145 |
+
"model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 146 |
+
"model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 147 |
+
"model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 148 |
+
"model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 149 |
+
"model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 150 |
+
"model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 151 |
+
"model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 152 |
+
"model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 153 |
+
"model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 154 |
+
"model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 155 |
+
"model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 156 |
+
"model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 157 |
+
"model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 158 |
+
"model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 159 |
+
"model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 160 |
+
"model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 161 |
+
"model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 162 |
+
"model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 163 |
+
"model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 164 |
+
"model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 165 |
+
"model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 166 |
+
"model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 167 |
+
"model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 168 |
+
"model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 169 |
+
"model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 170 |
+
"model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 171 |
+
"model.language_model.layers.14.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 172 |
+
"model.language_model.layers.14.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 173 |
+
"model.language_model.layers.14.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 174 |
+
"model.language_model.layers.14.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 175 |
+
"model.language_model.layers.14.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 176 |
+
"model.language_model.layers.14.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 177 |
+
"model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 178 |
+
"model.language_model.layers.14.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 179 |
+
"model.language_model.layers.14.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 180 |
+
"model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 181 |
+
"model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 182 |
+
"model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 183 |
+
"model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 184 |
+
"model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 185 |
+
"model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 186 |
+
"model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 187 |
+
"model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 188 |
+
"model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 189 |
+
"model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 190 |
+
"model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 191 |
+
"model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 192 |
+
"model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 193 |
+
"model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 194 |
+
"model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 195 |
+
"model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 196 |
+
"model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 197 |
+
"model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 198 |
+
"model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 199 |
+
"model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 200 |
+
"model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 201 |
+
"model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 202 |
+
"model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 203 |
+
"model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 204 |
+
"model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 205 |
+
"model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 206 |
+
"model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 207 |
+
"model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 208 |
+
"model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 209 |
+
"model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 210 |
+
"model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 211 |
+
"model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 212 |
+
"model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 213 |
+
"model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 214 |
+
"model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 215 |
+
"model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 216 |
+
"model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 217 |
+
"model.language_model.layers.18.cross_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 218 |
+
"model.language_model.layers.18.cross_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 219 |
+
"model.language_model.layers.18.cross_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 220 |
+
"model.language_model.layers.18.cross_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 221 |
+
"model.language_model.layers.18.cross_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 222 |
+
"model.language_model.layers.18.cross_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 223 |
+
"model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 224 |
+
"model.language_model.layers.18.cross_attn_attn_gate": "model-00002-of-00005.safetensors",
|
| 225 |
+
"model.language_model.layers.18.cross_attn_mlp_gate": "model-00002-of-00005.safetensors",
|
| 226 |
+
"model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 227 |
+
"model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 228 |
+
"model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 229 |
+
"model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 230 |
+
"model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 231 |
+
"model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 232 |
+
"model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 233 |
+
"model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 234 |
+
"model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 235 |
+
"model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 236 |
+
"model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 237 |
+
"model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 238 |
+
"model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 239 |
+
"model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 240 |
+
"model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 241 |
+
"model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 242 |
+
"model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 243 |
+
"model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 244 |
+
"model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 245 |
+
"model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00005.safetensors",
|
| 246 |
+
"model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00005.safetensors",
|
| 247 |
+
"model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 248 |
+
"model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 249 |
+
"model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 250 |
+
"model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 251 |
+
"model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 252 |
+
"model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 253 |
+
"model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 254 |
+
"model.language_model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 255 |
+
"model.language_model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 256 |
+
"model.language_model.layers.21.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 257 |
+
"model.language_model.layers.21.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 258 |
+
"model.language_model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 259 |
+
"model.language_model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 260 |
+
"model.language_model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 261 |
+
"model.language_model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 262 |
+
"model.language_model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 263 |
+
"model.language_model.layers.22.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 264 |
+
"model.language_model.layers.22.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 265 |
+
"model.language_model.layers.22.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 266 |
+
"model.language_model.layers.22.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 267 |
+
"model.language_model.layers.22.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 268 |
+
"model.language_model.layers.22.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 269 |
+
"model.language_model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 270 |
+
"model.language_model.layers.22.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 271 |
+
"model.language_model.layers.22.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 272 |
+
"model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 273 |
+
"model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 274 |
+
"model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 275 |
+
"model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 276 |
+
"model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 277 |
+
"model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 278 |
+
"model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 279 |
+
"model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 280 |
+
"model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 281 |
+
"model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 282 |
+
"model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 283 |
+
"model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 284 |
+
"model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 285 |
+
"model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 286 |
+
"model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 287 |
+
"model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 288 |
+
"model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 289 |
+
"model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 290 |
+
"model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 291 |
+
"model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 292 |
+
"model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 293 |
+
"model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 294 |
+
"model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 295 |
+
"model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 296 |
+
"model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 297 |
+
"model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 298 |
+
"model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 299 |
+
"model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 300 |
+
"model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 301 |
+
"model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 302 |
+
"model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 303 |
+
"model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 304 |
+
"model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 305 |
+
"model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 306 |
+
"model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 307 |
+
"model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 308 |
+
"model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 309 |
+
"model.language_model.layers.26.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 310 |
+
"model.language_model.layers.26.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 311 |
+
"model.language_model.layers.26.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 312 |
+
"model.language_model.layers.26.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 313 |
+
"model.language_model.layers.26.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 314 |
+
"model.language_model.layers.26.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 315 |
+
"model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 316 |
+
"model.language_model.layers.26.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 317 |
+
"model.language_model.layers.26.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 318 |
+
"model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 319 |
+
"model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 320 |
+
"model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 321 |
+
"model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 322 |
+
"model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 323 |
+
"model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 324 |
+
"model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 325 |
+
"model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 326 |
+
"model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 327 |
+
"model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 328 |
+
"model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 329 |
+
"model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 330 |
+
"model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 331 |
+
"model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 332 |
+
"model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 333 |
+
"model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 334 |
+
"model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 335 |
+
"model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 336 |
+
"model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 337 |
+
"model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 338 |
+
"model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 339 |
+
"model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 340 |
+
"model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 341 |
+
"model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 342 |
+
"model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 343 |
+
"model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 344 |
+
"model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 345 |
+
"model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 346 |
+
"model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 347 |
+
"model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 348 |
+
"model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 349 |
+
"model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 350 |
+
"model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 351 |
+
"model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 352 |
+
"model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 353 |
+
"model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 354 |
+
"model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 355 |
+
"model.language_model.layers.30.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 356 |
+
"model.language_model.layers.30.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 357 |
+
"model.language_model.layers.30.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 358 |
+
"model.language_model.layers.30.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 359 |
+
"model.language_model.layers.30.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 360 |
+
"model.language_model.layers.30.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 361 |
+
"model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 362 |
+
"model.language_model.layers.30.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 363 |
+
"model.language_model.layers.30.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 364 |
+
"model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 365 |
+
"model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 366 |
+
"model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 367 |
+
"model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 368 |
+
"model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 369 |
+
"model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 370 |
+
"model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 371 |
+
"model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 372 |
+
"model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 373 |
+
"model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 374 |
+
"model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 375 |
+
"model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 376 |
+
"model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 377 |
+
"model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 378 |
+
"model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 379 |
+
"model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 380 |
+
"model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 381 |
+
"model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 382 |
+
"model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 383 |
+
"model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 384 |
+
"model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 385 |
+
"model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 386 |
+
"model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 387 |
+
"model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 388 |
+
"model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 389 |
+
"model.language_model.layers.33.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 390 |
+
"model.language_model.layers.33.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 391 |
+
"model.language_model.layers.33.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 392 |
+
"model.language_model.layers.33.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 393 |
+
"model.language_model.layers.33.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 394 |
+
"model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 395 |
+
"model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 396 |
+
"model.language_model.layers.33.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 397 |
+
"model.language_model.layers.33.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 398 |
+
"model.language_model.layers.33.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 399 |
+
"model.language_model.layers.33.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 400 |
+
"model.language_model.layers.34.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 401 |
+
"model.language_model.layers.34.cross_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 402 |
+
"model.language_model.layers.34.cross_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 403 |
+
"model.language_model.layers.34.cross_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 404 |
+
"model.language_model.layers.34.cross_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 405 |
+
"model.language_model.layers.34.cross_attn.q_norm.weight": "model-00003-of-00005.safetensors",
|
| 406 |
+
"model.language_model.layers.34.cross_attn.k_norm.weight": "model-00003-of-00005.safetensors",
|
| 407 |
+
"model.language_model.layers.34.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 408 |
+
"model.language_model.layers.34.cross_attn_attn_gate": "model-00003-of-00005.safetensors",
|
| 409 |
+
"model.language_model.layers.34.cross_attn_mlp_gate": "model-00003-of-00005.safetensors",
|
| 410 |
+
"model.language_model.layers.34.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 411 |
+
"model.language_model.layers.34.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 412 |
+
"model.language_model.layers.34.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 413 |
+
"model.language_model.layers.35.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 414 |
+
"model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 415 |
+
"model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 416 |
+
"model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 417 |
+
"model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 418 |
+
"model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 419 |
+
"model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 420 |
+
"model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 421 |
+
"model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 422 |
+
"model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 423 |
+
"model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 424 |
+
"model.language_model.layers.36.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 425 |
+
"model.language_model.layers.36.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 426 |
+
"model.language_model.layers.36.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 427 |
+
"model.language_model.layers.36.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 428 |
+
"model.language_model.layers.36.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 429 |
+
"model.language_model.layers.36.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 430 |
+
"model.language_model.layers.36.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 431 |
+
"model.language_model.layers.36.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 432 |
+
"model.language_model.layers.36.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 433 |
+
"model.language_model.layers.36.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 434 |
+
"model.language_model.layers.36.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 435 |
+
"model.language_model.layers.37.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 436 |
+
"model.language_model.layers.37.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 437 |
+
"model.language_model.layers.37.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 438 |
+
"model.language_model.layers.37.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 439 |
+
"model.language_model.layers.37.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 440 |
+
"model.language_model.layers.37.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 441 |
+
"model.language_model.layers.37.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 442 |
+
"model.language_model.layers.37.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 443 |
+
"model.language_model.layers.37.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 444 |
+
"model.language_model.layers.37.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 445 |
+
"model.language_model.layers.37.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 446 |
+
"model.language_model.layers.38.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 447 |
+
"model.language_model.layers.38.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 448 |
+
"model.language_model.layers.38.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 449 |
+
"model.language_model.layers.38.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 450 |
+
"model.language_model.layers.38.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 451 |
+
"model.language_model.layers.38.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 452 |
+
"model.language_model.layers.38.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 453 |
+
"model.language_model.layers.38.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 454 |
+
"model.language_model.layers.38.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 455 |
+
"model.language_model.layers.38.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 456 |
+
"model.language_model.layers.38.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 457 |
+
"model.language_model.layers.38.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 458 |
+
"model.language_model.layers.38.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 459 |
+
"model.language_model.layers.39.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 460 |
+
"model.language_model.layers.39.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 461 |
+
"model.language_model.layers.39.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 462 |
+
"model.language_model.layers.39.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 463 |
+
"model.language_model.layers.39.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 464 |
+
"model.language_model.layers.39.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 465 |
+
"model.language_model.layers.39.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 466 |
+
"model.language_model.layers.39.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 467 |
+
"model.language_model.layers.39.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 468 |
+
"model.language_model.layers.39.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 469 |
+
"model.language_model.layers.39.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 470 |
+
"model.language_model.layers.40.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 471 |
+
"model.language_model.layers.40.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 472 |
+
"model.language_model.layers.40.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 473 |
+
"model.language_model.layers.40.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 474 |
+
"model.language_model.layers.40.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 475 |
+
"model.language_model.layers.40.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 476 |
+
"model.language_model.layers.40.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 477 |
+
"model.language_model.layers.40.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 478 |
+
"model.language_model.layers.40.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 479 |
+
"model.language_model.layers.40.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 480 |
+
"model.language_model.layers.40.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 481 |
+
"model.language_model.layers.41.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 482 |
+
"model.language_model.layers.41.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 483 |
+
"model.language_model.layers.41.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 484 |
+
"model.language_model.layers.41.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 485 |
+
"model.language_model.layers.41.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 486 |
+
"model.language_model.layers.41.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 487 |
+
"model.language_model.layers.41.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 488 |
+
"model.language_model.layers.41.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 489 |
+
"model.language_model.layers.41.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 490 |
+
"model.language_model.layers.41.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 491 |
+
"model.language_model.layers.41.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 492 |
+
"model.language_model.layers.42.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 493 |
+
"model.language_model.layers.42.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 494 |
+
"model.language_model.layers.42.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 495 |
+
"model.language_model.layers.42.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 496 |
+
"model.language_model.layers.42.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 497 |
+
"model.language_model.layers.42.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 498 |
+
"model.language_model.layers.42.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 499 |
+
"model.language_model.layers.42.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 500 |
+
"model.language_model.layers.42.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 501 |
+
"model.language_model.layers.42.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 502 |
+
"model.language_model.layers.42.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 503 |
+
"model.language_model.layers.42.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 504 |
+
"model.language_model.layers.42.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 505 |
+
"model.language_model.layers.43.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 506 |
+
"model.language_model.layers.43.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 507 |
+
"model.language_model.layers.43.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 508 |
+
"model.language_model.layers.43.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 509 |
+
"model.language_model.layers.43.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 510 |
+
"model.language_model.layers.43.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 511 |
+
"model.language_model.layers.43.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 512 |
+
"model.language_model.layers.43.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 513 |
+
"model.language_model.layers.43.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 514 |
+
"model.language_model.layers.43.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 515 |
+
"model.language_model.layers.43.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 516 |
+
"model.language_model.layers.44.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 517 |
+
"model.language_model.layers.44.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 518 |
+
"model.language_model.layers.44.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 519 |
+
"model.language_model.layers.44.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 520 |
+
"model.language_model.layers.44.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 521 |
+
"model.language_model.layers.44.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 522 |
+
"model.language_model.layers.44.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 523 |
+
"model.language_model.layers.44.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 524 |
+
"model.language_model.layers.44.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 525 |
+
"model.language_model.layers.44.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 526 |
+
"model.language_model.layers.44.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 527 |
+
"model.language_model.layers.45.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 528 |
+
"model.language_model.layers.45.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 529 |
+
"model.language_model.layers.45.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 530 |
+
"model.language_model.layers.45.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 531 |
+
"model.language_model.layers.45.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 532 |
+
"model.language_model.layers.45.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 533 |
+
"model.language_model.layers.45.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 534 |
+
"model.language_model.layers.45.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 535 |
+
"model.language_model.layers.45.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 536 |
+
"model.language_model.layers.45.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 537 |
+
"model.language_model.layers.45.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 538 |
+
"model.language_model.layers.46.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 539 |
+
"model.language_model.layers.46.cross_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 540 |
+
"model.language_model.layers.46.cross_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 541 |
+
"model.language_model.layers.46.cross_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 542 |
+
"model.language_model.layers.46.cross_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 543 |
+
"model.language_model.layers.46.cross_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 544 |
+
"model.language_model.layers.46.cross_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 545 |
+
"model.language_model.layers.46.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 546 |
+
"model.language_model.layers.46.cross_attn_attn_gate": "model-00004-of-00005.safetensors",
|
| 547 |
+
"model.language_model.layers.46.cross_attn_mlp_gate": "model-00004-of-00005.safetensors",
|
| 548 |
+
"model.language_model.layers.46.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 549 |
+
"model.language_model.layers.46.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 550 |
+
"model.language_model.layers.46.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 551 |
+
"model.language_model.layers.47.input_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 552 |
+
"model.language_model.layers.47.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
|
| 553 |
+
"model.language_model.layers.47.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
|
| 554 |
+
"model.language_model.layers.47.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
|
| 555 |
+
"model.language_model.layers.47.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
|
| 556 |
+
"model.language_model.layers.47.self_attn.q_norm.weight": "model-00004-of-00005.safetensors",
|
| 557 |
+
"model.language_model.layers.47.self_attn.k_norm.weight": "model-00004-of-00005.safetensors",
|
| 558 |
+
"model.language_model.layers.47.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
|
| 559 |
+
"model.language_model.layers.47.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
|
| 560 |
+
"model.language_model.layers.47.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
|
| 561 |
+
"model.language_model.layers.47.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
|
| 562 |
+
"model.visual.patch_embed.proj.weight": "model-00004-of-00005.safetensors",
|
| 563 |
+
"model.visual.patch_embed.proj.bias": "model-00004-of-00005.safetensors",
|
| 564 |
+
"model.visual.pos_embed.weight": "model-00004-of-00005.safetensors",
|
| 565 |
+
"model.visual.blocks.0.norm1.weight": "model-00004-of-00005.safetensors",
|
| 566 |
+
"model.visual.blocks.0.norm1.bias": "model-00004-of-00005.safetensors",
|
| 567 |
+
"model.visual.blocks.0.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 568 |
+
"model.visual.blocks.0.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 569 |
+
"model.visual.blocks.0.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 570 |
+
"model.visual.blocks.0.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 571 |
+
"model.visual.blocks.0.norm2.weight": "model-00004-of-00005.safetensors",
|
| 572 |
+
"model.visual.blocks.0.norm2.bias": "model-00004-of-00005.safetensors",
|
| 573 |
+
"model.visual.blocks.0.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 574 |
+
"model.visual.blocks.0.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 575 |
+
"model.visual.blocks.0.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 576 |
+
"model.visual.blocks.0.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 577 |
+
"model.visual.blocks.1.norm1.weight": "model-00004-of-00005.safetensors",
|
| 578 |
+
"model.visual.blocks.1.norm1.bias": "model-00004-of-00005.safetensors",
|
| 579 |
+
"model.visual.blocks.1.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 580 |
+
"model.visual.blocks.1.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 581 |
+
"model.visual.blocks.1.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 582 |
+
"model.visual.blocks.1.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 583 |
+
"model.visual.blocks.1.norm2.weight": "model-00004-of-00005.safetensors",
|
| 584 |
+
"model.visual.blocks.1.norm2.bias": "model-00004-of-00005.safetensors",
|
| 585 |
+
"model.visual.blocks.1.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 586 |
+
"model.visual.blocks.1.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 587 |
+
"model.visual.blocks.1.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 588 |
+
"model.visual.blocks.1.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 589 |
+
"model.visual.blocks.2.norm1.weight": "model-00004-of-00005.safetensors",
|
| 590 |
+
"model.visual.blocks.2.norm1.bias": "model-00004-of-00005.safetensors",
|
| 591 |
+
"model.visual.blocks.2.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 592 |
+
"model.visual.blocks.2.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 593 |
+
"model.visual.blocks.2.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 594 |
+
"model.visual.blocks.2.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 595 |
+
"model.visual.blocks.2.norm2.weight": "model-00004-of-00005.safetensors",
|
| 596 |
+
"model.visual.blocks.2.norm2.bias": "model-00004-of-00005.safetensors",
|
| 597 |
+
"model.visual.blocks.2.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 598 |
+
"model.visual.blocks.2.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 599 |
+
"model.visual.blocks.2.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 600 |
+
"model.visual.blocks.2.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 601 |
+
"model.visual.blocks.3.norm1.weight": "model-00004-of-00005.safetensors",
|
| 602 |
+
"model.visual.blocks.3.norm1.bias": "model-00004-of-00005.safetensors",
|
| 603 |
+
"model.visual.blocks.3.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 604 |
+
"model.visual.blocks.3.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 605 |
+
"model.visual.blocks.3.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 606 |
+
"model.visual.blocks.3.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 607 |
+
"model.visual.blocks.3.norm2.weight": "model-00004-of-00005.safetensors",
|
| 608 |
+
"model.visual.blocks.3.norm2.bias": "model-00004-of-00005.safetensors",
|
| 609 |
+
"model.visual.blocks.3.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 610 |
+
"model.visual.blocks.3.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 611 |
+
"model.visual.blocks.3.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 612 |
+
"model.visual.blocks.3.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 613 |
+
"model.visual.blocks.4.norm1.weight": "model-00004-of-00005.safetensors",
|
| 614 |
+
"model.visual.blocks.4.norm1.bias": "model-00004-of-00005.safetensors",
|
| 615 |
+
"model.visual.blocks.4.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 616 |
+
"model.visual.blocks.4.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 617 |
+
"model.visual.blocks.4.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 618 |
+
"model.visual.blocks.4.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 619 |
+
"model.visual.blocks.4.norm2.weight": "model-00004-of-00005.safetensors",
|
| 620 |
+
"model.visual.blocks.4.norm2.bias": "model-00004-of-00005.safetensors",
|
| 621 |
+
"model.visual.blocks.4.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 622 |
+
"model.visual.blocks.4.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 623 |
+
"model.visual.blocks.4.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 624 |
+
"model.visual.blocks.4.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 625 |
+
"model.visual.blocks.5.norm1.weight": "model-00004-of-00005.safetensors",
|
| 626 |
+
"model.visual.blocks.5.norm1.bias": "model-00004-of-00005.safetensors",
|
| 627 |
+
"model.visual.blocks.5.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 628 |
+
"model.visual.blocks.5.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 629 |
+
"model.visual.blocks.5.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 630 |
+
"model.visual.blocks.5.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 631 |
+
"model.visual.blocks.5.norm2.weight": "model-00004-of-00005.safetensors",
|
| 632 |
+
"model.visual.blocks.5.norm2.bias": "model-00004-of-00005.safetensors",
|
| 633 |
+
"model.visual.blocks.5.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 634 |
+
"model.visual.blocks.5.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 635 |
+
"model.visual.blocks.5.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 636 |
+
"model.visual.blocks.5.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 637 |
+
"model.visual.blocks.6.norm1.weight": "model-00004-of-00005.safetensors",
|
| 638 |
+
"model.visual.blocks.6.norm1.bias": "model-00004-of-00005.safetensors",
|
| 639 |
+
"model.visual.blocks.6.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 640 |
+
"model.visual.blocks.6.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 641 |
+
"model.visual.blocks.6.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 642 |
+
"model.visual.blocks.6.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 643 |
+
"model.visual.blocks.6.norm2.weight": "model-00004-of-00005.safetensors",
|
| 644 |
+
"model.visual.blocks.6.norm2.bias": "model-00004-of-00005.safetensors",
|
| 645 |
+
"model.visual.blocks.6.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 646 |
+
"model.visual.blocks.6.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 647 |
+
"model.visual.blocks.6.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 648 |
+
"model.visual.blocks.6.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 649 |
+
"model.visual.blocks.7.norm1.weight": "model-00004-of-00005.safetensors",
|
| 650 |
+
"model.visual.blocks.7.norm1.bias": "model-00004-of-00005.safetensors",
|
| 651 |
+
"model.visual.blocks.7.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 652 |
+
"model.visual.blocks.7.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 653 |
+
"model.visual.blocks.7.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 654 |
+
"model.visual.blocks.7.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 655 |
+
"model.visual.blocks.7.norm2.weight": "model-00004-of-00005.safetensors",
|
| 656 |
+
"model.visual.blocks.7.norm2.bias": "model-00004-of-00005.safetensors",
|
| 657 |
+
"model.visual.blocks.7.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 658 |
+
"model.visual.blocks.7.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 659 |
+
"model.visual.blocks.7.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 660 |
+
"model.visual.blocks.7.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 661 |
+
"model.visual.blocks.8.norm1.weight": "model-00004-of-00005.safetensors",
|
| 662 |
+
"model.visual.blocks.8.norm1.bias": "model-00004-of-00005.safetensors",
|
| 663 |
+
"model.visual.blocks.8.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 664 |
+
"model.visual.blocks.8.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 665 |
+
"model.visual.blocks.8.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 666 |
+
"model.visual.blocks.8.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 667 |
+
"model.visual.blocks.8.norm2.weight": "model-00004-of-00005.safetensors",
|
| 668 |
+
"model.visual.blocks.8.norm2.bias": "model-00004-of-00005.safetensors",
|
| 669 |
+
"model.visual.blocks.8.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 670 |
+
"model.visual.blocks.8.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 671 |
+
"model.visual.blocks.8.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 672 |
+
"model.visual.blocks.8.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 673 |
+
"model.visual.blocks.9.norm1.weight": "model-00004-of-00005.safetensors",
|
| 674 |
+
"model.visual.blocks.9.norm1.bias": "model-00004-of-00005.safetensors",
|
| 675 |
+
"model.visual.blocks.9.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 676 |
+
"model.visual.blocks.9.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 677 |
+
"model.visual.blocks.9.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 678 |
+
"model.visual.blocks.9.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 679 |
+
"model.visual.blocks.9.norm2.weight": "model-00004-of-00005.safetensors",
|
| 680 |
+
"model.visual.blocks.9.norm2.bias": "model-00004-of-00005.safetensors",
|
| 681 |
+
"model.visual.blocks.9.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 682 |
+
"model.visual.blocks.9.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 683 |
+
"model.visual.blocks.9.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 684 |
+
"model.visual.blocks.9.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 685 |
+
"model.visual.blocks.10.norm1.weight": "model-00004-of-00005.safetensors",
|
| 686 |
+
"model.visual.blocks.10.norm1.bias": "model-00004-of-00005.safetensors",
|
| 687 |
+
"model.visual.blocks.10.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 688 |
+
"model.visual.blocks.10.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 689 |
+
"model.visual.blocks.10.attn.proj.weight": "model-00004-of-00005.safetensors",
|
| 690 |
+
"model.visual.blocks.10.attn.proj.bias": "model-00004-of-00005.safetensors",
|
| 691 |
+
"model.visual.blocks.10.norm2.weight": "model-00004-of-00005.safetensors",
|
| 692 |
+
"model.visual.blocks.10.norm2.bias": "model-00004-of-00005.safetensors",
|
| 693 |
+
"model.visual.blocks.10.mlp.linear_fc1.weight": "model-00004-of-00005.safetensors",
|
| 694 |
+
"model.visual.blocks.10.mlp.linear_fc1.bias": "model-00004-of-00005.safetensors",
|
| 695 |
+
"model.visual.blocks.10.mlp.linear_fc2.weight": "model-00004-of-00005.safetensors",
|
| 696 |
+
"model.visual.blocks.10.mlp.linear_fc2.bias": "model-00004-of-00005.safetensors",
|
| 697 |
+
"model.visual.blocks.11.norm1.weight": "model-00004-of-00005.safetensors",
|
| 698 |
+
"model.visual.blocks.11.norm1.bias": "model-00004-of-00005.safetensors",
|
| 699 |
+
"model.visual.blocks.11.attn.qkv.weight": "model-00004-of-00005.safetensors",
|
| 700 |
+
"model.visual.blocks.11.attn.qkv.bias": "model-00004-of-00005.safetensors",
|
| 701 |
+
"model.visual.blocks.11.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 702 |
+
"model.visual.blocks.11.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 703 |
+
"model.visual.blocks.11.norm2.weight": "model-00005-of-00005.safetensors",
|
| 704 |
+
"model.visual.blocks.11.norm2.bias": "model-00005-of-00005.safetensors",
|
| 705 |
+
"model.visual.blocks.11.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 706 |
+
"model.visual.blocks.11.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 707 |
+
"model.visual.blocks.11.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 708 |
+
"model.visual.blocks.11.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 709 |
+
"model.visual.blocks.12.norm1.weight": "model-00005-of-00005.safetensors",
|
| 710 |
+
"model.visual.blocks.12.norm1.bias": "model-00005-of-00005.safetensors",
|
| 711 |
+
"model.visual.blocks.12.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 712 |
+
"model.visual.blocks.12.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 713 |
+
"model.visual.blocks.12.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 714 |
+
"model.visual.blocks.12.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 715 |
+
"model.visual.blocks.12.norm2.weight": "model-00005-of-00005.safetensors",
|
| 716 |
+
"model.visual.blocks.12.norm2.bias": "model-00005-of-00005.safetensors",
|
| 717 |
+
"model.visual.blocks.12.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 718 |
+
"model.visual.blocks.12.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 719 |
+
"model.visual.blocks.12.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 720 |
+
"model.visual.blocks.12.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 721 |
+
"model.visual.blocks.13.norm1.weight": "model-00005-of-00005.safetensors",
|
| 722 |
+
"model.visual.blocks.13.norm1.bias": "model-00005-of-00005.safetensors",
|
| 723 |
+
"model.visual.blocks.13.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 724 |
+
"model.visual.blocks.13.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 725 |
+
"model.visual.blocks.13.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 726 |
+
"model.visual.blocks.13.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 727 |
+
"model.visual.blocks.13.norm2.weight": "model-00005-of-00005.safetensors",
|
| 728 |
+
"model.visual.blocks.13.norm2.bias": "model-00005-of-00005.safetensors",
|
| 729 |
+
"model.visual.blocks.13.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 730 |
+
"model.visual.blocks.13.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 731 |
+
"model.visual.blocks.13.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 732 |
+
"model.visual.blocks.13.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 733 |
+
"model.visual.blocks.14.norm1.weight": "model-00005-of-00005.safetensors",
|
| 734 |
+
"model.visual.blocks.14.norm1.bias": "model-00005-of-00005.safetensors",
|
| 735 |
+
"model.visual.blocks.14.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 736 |
+
"model.visual.blocks.14.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 737 |
+
"model.visual.blocks.14.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 738 |
+
"model.visual.blocks.14.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 739 |
+
"model.visual.blocks.14.norm2.weight": "model-00005-of-00005.safetensors",
|
| 740 |
+
"model.visual.blocks.14.norm2.bias": "model-00005-of-00005.safetensors",
|
| 741 |
+
"model.visual.blocks.14.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 742 |
+
"model.visual.blocks.14.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 743 |
+
"model.visual.blocks.14.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 744 |
+
"model.visual.blocks.14.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 745 |
+
"model.visual.blocks.15.norm1.weight": "model-00005-of-00005.safetensors",
|
| 746 |
+
"model.visual.blocks.15.norm1.bias": "model-00005-of-00005.safetensors",
|
| 747 |
+
"model.visual.blocks.15.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 748 |
+
"model.visual.blocks.15.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 749 |
+
"model.visual.blocks.15.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 750 |
+
"model.visual.blocks.15.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 751 |
+
"model.visual.blocks.15.norm2.weight": "model-00005-of-00005.safetensors",
|
| 752 |
+
"model.visual.blocks.15.norm2.bias": "model-00005-of-00005.safetensors",
|
| 753 |
+
"model.visual.blocks.15.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 754 |
+
"model.visual.blocks.15.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 755 |
+
"model.visual.blocks.15.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 756 |
+
"model.visual.blocks.15.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 757 |
+
"model.visual.blocks.16.norm1.weight": "model-00005-of-00005.safetensors",
|
| 758 |
+
"model.visual.blocks.16.norm1.bias": "model-00005-of-00005.safetensors",
|
| 759 |
+
"model.visual.blocks.16.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 760 |
+
"model.visual.blocks.16.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 761 |
+
"model.visual.blocks.16.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 762 |
+
"model.visual.blocks.16.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 763 |
+
"model.visual.blocks.16.norm2.weight": "model-00005-of-00005.safetensors",
|
| 764 |
+
"model.visual.blocks.16.norm2.bias": "model-00005-of-00005.safetensors",
|
| 765 |
+
"model.visual.blocks.16.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 766 |
+
"model.visual.blocks.16.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 767 |
+
"model.visual.blocks.16.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 768 |
+
"model.visual.blocks.16.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 769 |
+
"model.visual.blocks.17.norm1.weight": "model-00005-of-00005.safetensors",
|
| 770 |
+
"model.visual.blocks.17.norm1.bias": "model-00005-of-00005.safetensors",
|
| 771 |
+
"model.visual.blocks.17.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 772 |
+
"model.visual.blocks.17.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 773 |
+
"model.visual.blocks.17.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 774 |
+
"model.visual.blocks.17.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 775 |
+
"model.visual.blocks.17.norm2.weight": "model-00005-of-00005.safetensors",
|
| 776 |
+
"model.visual.blocks.17.norm2.bias": "model-00005-of-00005.safetensors",
|
| 777 |
+
"model.visual.blocks.17.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 778 |
+
"model.visual.blocks.17.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 779 |
+
"model.visual.blocks.17.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 780 |
+
"model.visual.blocks.17.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 781 |
+
"model.visual.blocks.18.norm1.weight": "model-00005-of-00005.safetensors",
|
| 782 |
+
"model.visual.blocks.18.norm1.bias": "model-00005-of-00005.safetensors",
|
| 783 |
+
"model.visual.blocks.18.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 784 |
+
"model.visual.blocks.18.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 785 |
+
"model.visual.blocks.18.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 786 |
+
"model.visual.blocks.18.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 787 |
+
"model.visual.blocks.18.norm2.weight": "model-00005-of-00005.safetensors",
|
| 788 |
+
"model.visual.blocks.18.norm2.bias": "model-00005-of-00005.safetensors",
|
| 789 |
+
"model.visual.blocks.18.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 790 |
+
"model.visual.blocks.18.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 791 |
+
"model.visual.blocks.18.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 792 |
+
"model.visual.blocks.18.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 793 |
+
"model.visual.blocks.19.norm1.weight": "model-00005-of-00005.safetensors",
|
| 794 |
+
"model.visual.blocks.19.norm1.bias": "model-00005-of-00005.safetensors",
|
| 795 |
+
"model.visual.blocks.19.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 796 |
+
"model.visual.blocks.19.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 797 |
+
"model.visual.blocks.19.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 798 |
+
"model.visual.blocks.19.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 799 |
+
"model.visual.blocks.19.norm2.weight": "model-00005-of-00005.safetensors",
|
| 800 |
+
"model.visual.blocks.19.norm2.bias": "model-00005-of-00005.safetensors",
|
| 801 |
+
"model.visual.blocks.19.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 802 |
+
"model.visual.blocks.19.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 803 |
+
"model.visual.blocks.19.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 804 |
+
"model.visual.blocks.19.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 805 |
+
"model.visual.blocks.20.norm1.weight": "model-00005-of-00005.safetensors",
|
| 806 |
+
"model.visual.blocks.20.norm1.bias": "model-00005-of-00005.safetensors",
|
| 807 |
+
"model.visual.blocks.20.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 808 |
+
"model.visual.blocks.20.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 809 |
+
"model.visual.blocks.20.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 810 |
+
"model.visual.blocks.20.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 811 |
+
"model.visual.blocks.20.norm2.weight": "model-00005-of-00005.safetensors",
|
| 812 |
+
"model.visual.blocks.20.norm2.bias": "model-00005-of-00005.safetensors",
|
| 813 |
+
"model.visual.blocks.20.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 814 |
+
"model.visual.blocks.20.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 815 |
+
"model.visual.blocks.20.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 816 |
+
"model.visual.blocks.20.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 817 |
+
"model.visual.blocks.21.norm1.weight": "model-00005-of-00005.safetensors",
|
| 818 |
+
"model.visual.blocks.21.norm1.bias": "model-00005-of-00005.safetensors",
|
| 819 |
+
"model.visual.blocks.21.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 820 |
+
"model.visual.blocks.21.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 821 |
+
"model.visual.blocks.21.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 822 |
+
"model.visual.blocks.21.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 823 |
+
"model.visual.blocks.21.norm2.weight": "model-00005-of-00005.safetensors",
|
| 824 |
+
"model.visual.blocks.21.norm2.bias": "model-00005-of-00005.safetensors",
|
| 825 |
+
"model.visual.blocks.21.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 826 |
+
"model.visual.blocks.21.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 827 |
+
"model.visual.blocks.21.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 828 |
+
"model.visual.blocks.21.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 829 |
+
"model.visual.blocks.22.norm1.weight": "model-00005-of-00005.safetensors",
|
| 830 |
+
"model.visual.blocks.22.norm1.bias": "model-00005-of-00005.safetensors",
|
| 831 |
+
"model.visual.blocks.22.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 832 |
+
"model.visual.blocks.22.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 833 |
+
"model.visual.blocks.22.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 834 |
+
"model.visual.blocks.22.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 835 |
+
"model.visual.blocks.22.norm2.weight": "model-00005-of-00005.safetensors",
|
| 836 |
+
"model.visual.blocks.22.norm2.bias": "model-00005-of-00005.safetensors",
|
| 837 |
+
"model.visual.blocks.22.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 838 |
+
"model.visual.blocks.22.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 839 |
+
"model.visual.blocks.22.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 840 |
+
"model.visual.blocks.22.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 841 |
+
"model.visual.blocks.23.norm1.weight": "model-00005-of-00005.safetensors",
|
| 842 |
+
"model.visual.blocks.23.norm1.bias": "model-00005-of-00005.safetensors",
|
| 843 |
+
"model.visual.blocks.23.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 844 |
+
"model.visual.blocks.23.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 845 |
+
"model.visual.blocks.23.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 846 |
+
"model.visual.blocks.23.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 847 |
+
"model.visual.blocks.23.norm2.weight": "model-00005-of-00005.safetensors",
|
| 848 |
+
"model.visual.blocks.23.norm2.bias": "model-00005-of-00005.safetensors",
|
| 849 |
+
"model.visual.blocks.23.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 850 |
+
"model.visual.blocks.23.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 851 |
+
"model.visual.blocks.23.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 852 |
+
"model.visual.blocks.23.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 853 |
+
"model.visual.blocks.24.norm1.weight": "model-00005-of-00005.safetensors",
|
| 854 |
+
"model.visual.blocks.24.norm1.bias": "model-00005-of-00005.safetensors",
|
| 855 |
+
"model.visual.blocks.24.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 856 |
+
"model.visual.blocks.24.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 857 |
+
"model.visual.blocks.24.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 858 |
+
"model.visual.blocks.24.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 859 |
+
"model.visual.blocks.24.norm2.weight": "model-00005-of-00005.safetensors",
|
| 860 |
+
"model.visual.blocks.24.norm2.bias": "model-00005-of-00005.safetensors",
|
| 861 |
+
"model.visual.blocks.24.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 862 |
+
"model.visual.blocks.24.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 863 |
+
"model.visual.blocks.24.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 864 |
+
"model.visual.blocks.24.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 865 |
+
"model.visual.blocks.25.norm1.weight": "model-00005-of-00005.safetensors",
|
| 866 |
+
"model.visual.blocks.25.norm1.bias": "model-00005-of-00005.safetensors",
|
| 867 |
+
"model.visual.blocks.25.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 868 |
+
"model.visual.blocks.25.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 869 |
+
"model.visual.blocks.25.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 870 |
+
"model.visual.blocks.25.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 871 |
+
"model.visual.blocks.25.norm2.weight": "model-00005-of-00005.safetensors",
|
| 872 |
+
"model.visual.blocks.25.norm2.bias": "model-00005-of-00005.safetensors",
|
| 873 |
+
"model.visual.blocks.25.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 874 |
+
"model.visual.blocks.25.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 875 |
+
"model.visual.blocks.25.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 876 |
+
"model.visual.blocks.25.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 877 |
+
"model.visual.blocks.26.norm1.weight": "model-00005-of-00005.safetensors",
|
| 878 |
+
"model.visual.blocks.26.norm1.bias": "model-00005-of-00005.safetensors",
|
| 879 |
+
"model.visual.blocks.26.attn.qkv.weight": "model-00005-of-00005.safetensors",
|
| 880 |
+
"model.visual.blocks.26.attn.qkv.bias": "model-00005-of-00005.safetensors",
|
| 881 |
+
"model.visual.blocks.26.attn.proj.weight": "model-00005-of-00005.safetensors",
|
| 882 |
+
"model.visual.blocks.26.attn.proj.bias": "model-00005-of-00005.safetensors",
|
| 883 |
+
"model.visual.blocks.26.norm2.weight": "model-00005-of-00005.safetensors",
|
| 884 |
+
"model.visual.blocks.26.norm2.bias": "model-00005-of-00005.safetensors",
|
| 885 |
+
"model.visual.blocks.26.mlp.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 886 |
+
"model.visual.blocks.26.mlp.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 887 |
+
"model.visual.blocks.26.mlp.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 888 |
+
"model.visual.blocks.26.mlp.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 889 |
+
"model.visual.merger.linear_fc1.weight": "model-00005-of-00005.safetensors",
|
| 890 |
+
"model.visual.merger.linear_fc1.bias": "model-00005-of-00005.safetensors",
|
| 891 |
+
"model.visual.merger.linear_fc2.weight": "model-00005-of-00005.safetensors",
|
| 892 |
+
"model.visual.merger.linear_fc2.bias": "model-00005-of-00005.safetensors",
|
| 893 |
+
"model.visual.merger.norms.0.weight": "model-00005-of-00005.safetensors",
|
| 894 |
+
"model.visual.merger.norms.0.bias": "model-00005-of-00005.safetensors",
|
| 895 |
+
"model.visual.merger.norms.1.weight": "model-00005-of-00005.safetensors",
|
| 896 |
+
"model.visual.merger.norms.1.bias": "model-00005-of-00005.safetensors",
|
| 897 |
+
"model.visual.merger.norms.2.weight": "model-00005-of-00005.safetensors",
|
| 898 |
+
"model.visual.merger.norms.2.bias": "model-00005-of-00005.safetensors",
|
| 899 |
+
"model.visual.merger.norms.3.weight": "model-00005-of-00005.safetensors",
|
| 900 |
+
"model.visual.merger.norms.3.bias": "model-00005-of-00005.safetensors"
|
| 901 |
+
}
|
| 902 |
+
}
|
modeling_moss_vl.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "processing_moss_vl.MossVLProcessor",
|
| 4 |
+
"AutoImageProcessor": "processing_moss_vl.MossVLImageProcessorFast"
|
| 5 |
+
},
|
| 6 |
+
"size": {
|
| 7 |
+
"longest_edge": 16777216,
|
| 8 |
+
"shortest_edge": 4096
|
| 9 |
+
},
|
| 10 |
+
"multi_image_max_pixels": 943718400,
|
| 11 |
+
"patch_size": 16,
|
| 12 |
+
"temporal_patch_size": 1,
|
| 13 |
+
"merge_size": 2,
|
| 14 |
+
"image_mean": [
|
| 15 |
+
0.5,
|
| 16 |
+
0.5,
|
| 17 |
+
0.5
|
| 18 |
+
],
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.5,
|
| 21 |
+
0.5,
|
| 22 |
+
0.5
|
| 23 |
+
],
|
| 24 |
+
"processor_class": "MossVLProcessor",
|
| 25 |
+
"image_processor_type": "MossVLImageProcessorFast"
|
| 26 |
+
}
|
processing_moss_vl.py
ADDED
|
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The FNLP Vision Team and The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
Processor class for Moss-VL.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from typing import Any, Dict, List, Optional, Union
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
from torchvision.transforms.v2 import functional as F
|
| 24 |
+
from PIL import Image
|
| 25 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 26 |
+
from transformers.image_utils import ImageInput, SizeDict
|
| 27 |
+
from transformers.image_processing_utils_fast import group_images_by_shape, reorder_images
|
| 28 |
+
from transformers.utils import TensorType
|
| 29 |
+
from transformers.processing_utils import (
|
| 30 |
+
ImagesKwargs,
|
| 31 |
+
ProcessingKwargs,
|
| 32 |
+
ProcessorMixin,
|
| 33 |
+
Unpack,
|
| 34 |
+
VideosKwargs,
|
| 35 |
+
)
|
| 36 |
+
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 37 |
+
from transformers.utils import logging
|
| 38 |
+
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import Qwen2VLImageProcessorFast
|
| 39 |
+
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
logger = logging.get_logger(__name__)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class MossVLImageProcessorFast(Qwen2VLImageProcessorFast):
    """
    Custom image processor that overrides ``_preprocess`` to support ``multi_image_max_pixels``.

    Inherits from :class:`Qwen2VLImageProcessorFast`. The only behavioral difference is that
    the per-batch pixel budget (``multi_image_max_pixels``) is split proportionally across the
    images of a batch before calling ``smart_resize``.
    """

    # Multi-image batch total pixels limit (read from config; e.g. the
    # "multi_image_max_pixels" key in preprocessor_config.json). ``None`` means
    # "fall back to the per-image longest_edge limit" inside ``_preprocess``.
    multi_image_max_pixels = None

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        patch_size: int,
        temporal_patch_size: int,
        merge_size: int,
        disable_grouping: Optional[bool],
        return_tensors: Optional[Union[str, TensorType]],
        **kwargs,
    ):
        """Override _preprocess to use custom smart_resize with batch-level max_pixels.

        multi_image_max_pixels is treated as a batch-level total budget, proportionally allocated
        to each image based on its original pixel count. min_pixels remains a per-image
        constraint. multi_image_max_pixels can be configured separately from longest_edge.

        Args:
            images: List of image tensors; height/width are read from the last two dims,
                so images are assumed channels-first (..., C, H, W) — TODO confirm upstream.
            do_resize: Whether to apply the smart-resize step at all.
            size: Size dict; ``shortest_edge`` is the per-image min pixel count and
                ``longest_edge`` the per-image max pixel count.
            interpolation: Interpolation mode forwarded to ``self.resize``.
            do_rescale / rescale_factor / do_normalize / image_mean / image_std:
                Standard rescale + normalize parameters (fused in one pass).
            patch_size: Spatial patch edge length in pixels.
            temporal_patch_size: Number of frames folded into one temporal patch.
            merge_size: Spatial merge factor; resize snaps to ``patch_size * merge_size``.
            disable_grouping: Forwarded to ``group_images_by_shape``.
            return_tensors: Tensor type of the returned ``BatchFeature``.

        Returns:
            BatchFeature with ``pixel_values`` (flattened patches, concatenated over all
            images) and ``image_grid_thw`` (one ``[t, h, w]`` grid per image).
        """
        min_pixels = size["shortest_edge"]
        max_pixels = size["longest_edge"]  # Per-image upper limit
        # Use multi_image_max_pixels if configured, otherwise fall back to longest_edge
        multi_image_max_pixels = getattr(self, "multi_image_max_pixels", None) or max_pixels

        # Calculate total original pixels across all images in the batch
        # This is used to proportionally allocate max_pixels to each image
        total_original_pixels = sum(img.shape[-2] * img.shape[-1] for img in images)

        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            height, width = stacked_images.shape[-2:]
            if do_resize:
                # Calculate proportional max_pixels for images with this shape
                # Each image's max_pixels is allocated based on its proportion of total pixels
                original_pixels = height * width
                if total_original_pixels > 0:
                    proportion = original_pixels / total_original_pixels
                    proportional_max_pixels = int(multi_image_max_pixels * proportion)
                else:
                    proportional_max_pixels = multi_image_max_pixels

                # Ensure proportional max_pixels is within [min_pixels, max_pixels] range
                # min_pixels: per-image lower limit (shortest_edge)
                # max_pixels: per-image upper limit (longest_edge)
                proportional_max_pixels = max(proportional_max_pixels, min_pixels)
                proportional_max_pixels = min(proportional_max_pixels, max_pixels)

                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=patch_size * merge_size,
                    min_pixels=min_pixels,
                    max_pixels=proportional_max_pixels,
                )
                stacked_images = self.resize(
                    image=stacked_images,
                    size=SizeDict(height=resized_height, width=resized_width),
                    interpolation=interpolation,
                )
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Warn if multi-image batch exceeds multi_image_max_pixels due to min_pixels constraint
        if len(images) > 1:
            total_resized_pixels = sum(img.shape[-2] * img.shape[-1] for img in resized_images)
            if total_resized_pixels > multi_image_max_pixels:
                logger.warning_once(
                    f"Multi-image batch total pixels ({total_resized_pixels}) exceeds multi_image_max_pixels ({multi_image_max_pixels}). "
                    f"This may happen when image_count * min_pixels > multi_image_max_pixels."
                )

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        processed_grids = {}
        for shape, stacked_images in grouped_images.items():
            resized_height, resized_width = stacked_images.shape[-2:]
            # Fused rescale and normalize
            patches = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            if patches.ndim == 4:
                # add a temporal dimension if we have images
                patches = patches.unsqueeze(1)
            if patches.shape[1] % temporal_patch_size != 0:
                # Pad the temporal axis by repeating the last frame so it divides
                # temporal_patch_size. NOTE(review): this assumes at most one extra
                # repeat is needed (true for still images where shape[1] == 1) —
                # same behavior as the upstream Qwen2-VL processor.
                repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
                patches = torch.cat([patches, repeats], dim=1)
            batch_size, grid_t, channel = patches.shape[:3]
            grid_t = grid_t // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

            # Carve the (B, T, C, H, W) block into merge-grouped spatial patches.
            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            # Reorder dimensions to group grid and patch information for subsequent flattening.
            # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temp_patch_size, patch_h, patch_w)
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

            processed_images_grouped[shape] = flatten_patches
            # Every image of this shape group shares the same (t, h, w) grid.
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
        processed_grids = reorder_images(processed_grids, grouped_images_index)
        # Concatenate all per-image patch tensors along the patch axis.
        pixel_values = torch.cat(processed_images, dim=0)
        image_grid_thw = torch.tensor(processed_grids)

        return BatchFeature(
            data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
        )
|
| 185 |
+
|
| 186 |
+
def _to_numpy(x):
|
| 187 |
+
"""
|
| 188 |
+
Convert various tensor types to numpy array.
|
| 189 |
+
Supports torch.Tensor, tf.Tensor, jax.Array, np.ndarray, lists, and primitives.
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
x: Input value that can be a tensor from various frameworks or a Python primitive
|
| 193 |
+
|
| 194 |
+
Returns:
|
| 195 |
+
np.ndarray: NumPy array representation of the input
|
| 196 |
+
"""
|
| 197 |
+
# Already numpy
|
| 198 |
+
if isinstance(x, np.ndarray):
|
| 199 |
+
return x
|
| 200 |
+
|
| 201 |
+
# Torch tensor or TensorFlow tensor (both have .numpy() method)
|
| 202 |
+
if hasattr(x, 'numpy'):
|
| 203 |
+
# For torch tensors on CUDA, need to move to CPU first
|
| 204 |
+
if hasattr(x, 'cpu'):
|
| 205 |
+
return x.cpu().numpy()
|
| 206 |
+
# For TensorFlow or already on CPU
|
| 207 |
+
return x.numpy()
|
| 208 |
+
|
| 209 |
+
# JAX arrays and other array-like objects that support __array__ protocol
|
| 210 |
+
if hasattr(x, '__array__'):
|
| 211 |
+
return np.asarray(x)
|
| 212 |
+
|
| 213 |
+
# Python primitives (list, tuple, int, float)
|
| 214 |
+
return np.array(x)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class MossVLImagesKwargs(ImagesKwargs):
    # Extra image-processing kwargs accepted by MossVLProcessor.__call__ and
    # forwarded to the image processor.
    # Per-image pixel-count bounds used by smart_resize — presumably mapped to
    # size["shortest_edge"] / size["longest_edge"]; confirm against the image processor.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    # Spatial patch edge length in pixels.
    patch_size: Optional[int]
    # Number of frames folded into one temporal patch.
    temporal_patch_size: Optional[int]
    # Spatial merge factor applied when collapsing patches into tokens.
    merge_size: Optional[int]
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
class MossVLVideosKwargs(VideosKwargs, total=False):
    # Extra video-processing kwargs accepted by MossVLProcessor.__call__ and
    # forwarded to the video processor.
    # Target sampling rate in frames per second — TODO confirm semantics in the
    # video processor.
    video_fps: Optional[Union[int, float]]
    # Lower / upper bounds on the number of sampled frames per video.
    min_frames: Optional[int]
    max_frames: Optional[int]
    # Number of worker threads used for frame extraction — presumably consumed
    # by the video decoder; verify in video_processing_moss_vl.py.
    num_extract_threads: Optional[int]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class MossVLProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for ``MossVLProcessor.__call__``.

    Bundles the per-modality kwargs classes and declares the processor-level
    defaults: no text padding, no (mm) token type ids, and video metadata kept
    in the outputs so frame timestamps can later be injected into the prompt.
    """

    # Per-modality kwargs schemas.
    images_kwargs: MossVLImagesKwargs
    videos_kwargs: MossVLVideosKwargs

    # Removed a stale commented-out alternative `_defaults` block (left-padding
    # to a multiple of 8) that was dead code.
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_token_type_ids": False,
            "return_mm_token_type_ids": False,
        },
        # Metadata (fps, timestamps) is required later to expand video tokens.
        "videos_kwargs": {"return_metadata": True},
    }
|
| 254 |
+
|
| 255 |
+
class MossVLProcessor(ProcessorMixin):
|
| 256 |
+
r"""
|
| 257 |
+
Constructs a Moss-VL processor which wraps a Qwen2VL image processor, Moss-VL video processor and a Qwen2 tokenizer
|
| 258 |
+
into a single processor.
|
| 259 |
+
|
| 260 |
+
[`MossVLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`], [`MossVLVideoProcessor`] and [`Qwen2TokenizerFast`].
|
| 261 |
+
See the [`~MossVLProcessor.__call__`] and [`~MossVLProcessor.decode`] for more information.
|
| 262 |
+
|
| 263 |
+
Args:
|
| 264 |
+
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
| 265 |
+
The image processor is a required input.
|
| 266 |
+
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
| 267 |
+
The tokenizer is a required input.
|
| 268 |
+
video_processor ([`MossVLVideoProcessor`], *optional*):
|
| 269 |
+
The video processor is a required input.
|
| 270 |
+
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
| 271 |
+
in a chat into a tokenizable string.
|
| 272 |
+
"""
|
| 273 |
+
|
| 274 |
+
attributes = ["image_processor", "tokenizer", "video_processor"]
|
| 275 |
+
image_processor_class = "AutoImageProcessor"
|
| 276 |
+
video_processor_class = "AutoVideoProcessor"
|
| 277 |
+
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
| 278 |
+
|
| 279 |
+
def __init__(
|
| 280 |
+
self,
|
| 281 |
+
image_processor=None,
|
| 282 |
+
tokenizer=None,
|
| 283 |
+
video_processor=None,
|
| 284 |
+
chat_template=None,
|
| 285 |
+
**kwargs
|
| 286 |
+
):
|
| 287 |
+
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 291 |
+
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
self.image_token_id = (
|
| 295 |
+
tokenizer.image_token_id
|
| 296 |
+
if getattr(tokenizer, "image_token_id", None)
|
| 297 |
+
else tokenizer.convert_tokens_to_ids(self.image_token)
|
| 298 |
+
)
|
| 299 |
+
self.video_token_id = (
|
| 300 |
+
tokenizer.video_token_id
|
| 301 |
+
if getattr(tokenizer, "video_token_id", None)
|
| 302 |
+
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
self.vision_start_token = (
|
| 306 |
+
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
| 307 |
+
)
|
| 308 |
+
self.vision_end_token = (
|
| 309 |
+
"<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
# Placeholders used in input text
|
| 313 |
+
self.image_placeholder = "<|image|>"
|
| 314 |
+
self.video_placeholder = "<|video|>"
|
| 315 |
+
|
| 316 |
+
self.time_start_token = "<|time_start|>"
|
| 317 |
+
self.time_end_token = "<|time_end|>"
|
| 318 |
+
|
| 319 |
+
# EOS token for labels generation (assistant's response should end with this)
|
| 320 |
+
self.im_end_token = "<|im_end|>"
|
| 321 |
+
self.im_end_token_id = tokenizer.convert_tokens_to_ids(self.im_end_token)
|
| 322 |
+
|
| 323 |
+
# Vision-related token ids (all should be masked in labels)
|
| 324 |
+
self.vision_start_token_id = tokenizer.convert_tokens_to_ids(self.vision_start_token)
|
| 325 |
+
self.vision_end_token_id = tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 326 |
+
|
| 327 |
+
# Token ids that should always be masked in labels (e.g. <|image_pad|>)
|
| 328 |
+
self.mask_token_ids = {self.image_token_id}
|
| 329 |
+
|
| 330 |
+
def __call__(
|
| 331 |
+
self,
|
| 332 |
+
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
| 333 |
+
images: ImageInput = None,
|
| 334 |
+
videos: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]] = None,
|
| 335 |
+
labels_spans: Optional[Union[List[tuple], List[List[tuple]]]] = None,
|
| 336 |
+
ignore_index: int = -100,
|
| 337 |
+
**kwargs: Unpack[MossVLProcessorKwargs],
|
| 338 |
+
) -> BatchFeature:
|
| 339 |
+
"""
|
| 340 |
+
Main method to prepare for the model one or several sequences(s) and image(s)/video(s).
|
| 341 |
+
|
| 342 |
+
Args:
|
| 343 |
+
text (`str`, `list[str]`, `list[list[str]]`):
|
| 344 |
+
The sequence or batch of sequences to be encoded.
|
| 345 |
+
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
| 346 |
+
The image or batch of images to be prepared.
|
| 347 |
+
videos (`str`, `Dict`, `list[str]`, `list[Dict]`):
|
| 348 |
+
The video or batch of videos to be prepared. Each video can be:
|
| 349 |
+
- A string path to a video file
|
| 350 |
+
- A dict with keys:
|
| 351 |
+
- "video_path": str, path to the video file
|
| 352 |
+
- "segments": list of segments, where each segment is:
|
| 353 |
+
- [start, end]: a time segment (left-closed, right-open interval in seconds)
|
| 354 |
+
- [time]: a single frame at the specified time (in seconds)
|
| 355 |
+
The number of segments should match the number of video placeholders in the text.
|
| 356 |
+
labels_spans (`list[list[int]]`, `list[list[list[int]]]`, *optional*):
|
| 357 |
+
Character-level spans indicating assistant regions in original text.
|
| 358 |
+
Each span is a [start, end] list with inclusive start and exclusive end.
|
| 359 |
+
Example: [[10, 50], [100, 150]] means characters [10:50) and [100:150) are assistant.
|
| 360 |
+
Note: Use list (not tuple) for spans as they will be modified in place during processing.
|
| 361 |
+
When provided, the processor will generate `labels` in the output, where:
|
| 362 |
+
- Non-assistant tokens have value `ignore_index` (-100 by default)
|
| 363 |
+
- Image tokens always have value `ignore_index` even in assistant part
|
| 364 |
+
- Other assistant tokens have their token id as label
|
| 365 |
+
ignore_index (`int`, *optional*, defaults to -100):
|
| 366 |
+
Value for masked positions in labels.
|
| 367 |
+
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
| 368 |
+
If set, will return tensors of a particular framework. Acceptable values are:
|
| 369 |
+
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
| 370 |
+
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
| 371 |
+
- `'np'`: Return NumPy `np.ndarray` objects.
|
| 372 |
+
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
Returns:
|
| 376 |
+
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
| 377 |
+
- **input_ids** -- List of token ids to be fed to a model.
|
| 378 |
+
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
|
| 379 |
+
- **pixel_values** -- Pixel values to be fed to a model (concatenation of images and videos).
|
| 380 |
+
- **grid_thw** -- List of grid sizes (t, h, w) for each media item.
|
| 381 |
+
- **media_nums_per_sample** -- List of number of media items per sample.
|
| 382 |
+
- **labels** -- (Optional) Labels for training, only present when `labels_spans` is provided.
|
| 383 |
+
"""
|
| 384 |
+
# Merge kwargs with defaults
|
| 385 |
+
output_kwargs = self._merge_kwargs(
|
| 386 |
+
MossVLProcessorKwargs,
|
| 387 |
+
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
| 388 |
+
**kwargs,
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
# Step 1: Process images if provided
|
| 392 |
+
if images is not None:
|
| 393 |
+
images_kwargs = output_kwargs["images_kwargs"].copy()
|
| 394 |
+
images_kwargs["return_tensors"] = None
|
| 395 |
+
image_inputs = self.image_processor(images=images, **images_kwargs)
|
| 396 |
+
image_grid_thw = image_inputs["image_grid_thw"]
|
| 397 |
+
else:
|
| 398 |
+
image_inputs = {}
|
| 399 |
+
image_grid_thw = None
|
| 400 |
+
|
| 401 |
+
# Step 2: Process videos if provided
|
| 402 |
+
if videos is not None:
|
| 403 |
+
videos_kwargs = output_kwargs["videos_kwargs"].copy()
|
| 404 |
+
videos_kwargs["return_tensors"] = None
|
| 405 |
+
videos_inputs = self.video_processor(videos=videos, **videos_kwargs)
|
| 406 |
+
video_grid_thw = videos_inputs["video_grid_thw"]
|
| 407 |
+
# If user has not requested video metadata, pop it
|
| 408 |
+
if "return_metadata" not in kwargs:
|
| 409 |
+
video_metadata = videos_inputs.pop("video_metadata")
|
| 410 |
+
else:
|
| 411 |
+
video_metadata = videos_inputs["video_metadata"]
|
| 412 |
+
else:
|
| 413 |
+
videos_inputs = {}
|
| 414 |
+
video_grid_thw = None
|
| 415 |
+
video_metadata = None
|
| 416 |
+
|
| 417 |
+
# Step 3: Process text with placeholder replacement
|
| 418 |
+
if text is None or (isinstance(text, str) and len(text.strip()) == 0):
|
| 419 |
+
raise ValueError("Text input is required for MossVL processor and cannot be empty.")
|
| 420 |
+
|
| 421 |
+
if not isinstance(text, list):
|
| 422 |
+
text = [text]
|
| 423 |
+
|
| 424 |
+
text = text.copy() # Copy to avoid in-place modifications
|
| 425 |
+
|
| 426 |
+
# Prepare labels_spans if provided
|
| 427 |
+
# labels_spans format: List[List[List[int]]] - batch of samples, each sample has multiple spans
|
| 428 |
+
# Each span is [start, end] (list, not tuple) so it can be modified in place
|
| 429 |
+
should_create_labels = labels_spans is not None
|
| 430 |
+
if should_create_labels:
|
| 431 |
+
# Ensure batch format: convert single sample spans to batch format
|
| 432 |
+
# Single sample: [[start, end], [start, end], ...]
|
| 433 |
+
# Batch: [[[start, end], ...], [[start, end], ...], ...]
|
| 434 |
+
if labels_spans and isinstance(labels_spans[0], list) and len(labels_spans[0]) == 2 and isinstance(labels_spans[0][0], int):
|
| 435 |
+
labels_spans = [labels_spans]
|
| 436 |
+
|
| 437 |
+
# Step 3.0-pre: Check if we need to reorder (when both images and videos exist)
|
| 438 |
+
# If only one media type exists, we can skip the expensive split+reorder+concat
|
| 439 |
+
has_images = images is not None and "pixel_values" in image_inputs
|
| 440 |
+
has_videos = videos is not None and "pixel_values_videos" in videos_inputs
|
| 441 |
+
needs_reorder = has_images and has_videos
|
| 442 |
+
|
| 443 |
+
image_pixel_values_list = []
|
| 444 |
+
video_pixel_values_list = []
|
| 445 |
+
|
| 446 |
+
# Step 3.0: Record the order of media in original text (before replacement)
|
| 447 |
+
# This will be used later to correctly order pixel_values and grid_thw
|
| 448 |
+
media_order_per_sample = []
|
| 449 |
+
for i in range(len(text)):
|
| 450 |
+
media_order = []
|
| 451 |
+
temp_text = text[i]
|
| 452 |
+
pos = 0
|
| 453 |
+
while pos < len(temp_text):
|
| 454 |
+
img_pos = temp_text.find(self.image_placeholder, pos)
|
| 455 |
+
vid_pos = temp_text.find(self.video_placeholder, pos)
|
| 456 |
+
|
| 457 |
+
if img_pos == -1 and vid_pos == -1:
|
| 458 |
+
break
|
| 459 |
+
|
| 460 |
+
if img_pos != -1 and (vid_pos == -1 or img_pos < vid_pos):
|
| 461 |
+
media_order.append(("image", img_pos))
|
| 462 |
+
pos = img_pos + len(self.image_placeholder)
|
| 463 |
+
elif vid_pos != -1:
|
| 464 |
+
media_order.append(("video", vid_pos))
|
| 465 |
+
pos = vid_pos + len(self.video_placeholder)
|
| 466 |
+
|
| 467 |
+
media_order_per_sample.append(media_order)
|
| 468 |
+
|
| 469 |
+
# Step 3.0.1: Check if any sample has no media (empty samples need blank image)
|
| 470 |
+
# If there are empty samples, we need to enter slow path to handle them properly
|
| 471 |
+
has_empty_samples = any(len(order) == 0 for order in media_order_per_sample)
|
| 472 |
+
if has_empty_samples:
|
| 473 |
+
needs_reorder = True
|
| 474 |
+
|
| 475 |
+
# Split pixel values for reordering if needed
|
| 476 |
+
if needs_reorder:
|
| 477 |
+
if has_images:
|
| 478 |
+
flat_pixel_values = image_inputs["pixel_values"]
|
| 479 |
+
flat_grid_thw = image_inputs["image_grid_thw"]
|
| 480 |
+
# grid_thw is (t, h, w), num_patches = t * h * w
|
| 481 |
+
patch_counts = [int(np.prod(_to_numpy(grid))) for grid in flat_grid_thw]
|
| 482 |
+
if len(patch_counts) == 1:
|
| 483 |
+
# Single image case: no need to split
|
| 484 |
+
image_pixel_values_list = [flat_pixel_values]
|
| 485 |
+
elif len(patch_counts) > 1:
|
| 486 |
+
# Multiple images: split by cumulative counts
|
| 487 |
+
split_indices = np.cumsum(patch_counts)[:-1]
|
| 488 |
+
image_pixel_values_list = np.split(flat_pixel_values, split_indices)
|
| 489 |
+
|
| 490 |
+
if has_videos:
|
| 491 |
+
flat_video_values = videos_inputs["pixel_values_videos"]
|
| 492 |
+
flat_video_grid = videos_inputs["video_grid_thw"]
|
| 493 |
+
video_patch_counts = [int(np.prod(_to_numpy(grid))) for grid in flat_video_grid]
|
| 494 |
+
if len(video_patch_counts) == 1:
|
| 495 |
+
# Single video case: no need to split
|
| 496 |
+
video_pixel_values_list = [flat_video_values]
|
| 497 |
+
elif len(video_patch_counts) > 1:
|
| 498 |
+
# Multiple videos: split by cumulative counts
|
| 499 |
+
split_indices = np.cumsum(video_patch_counts)[:-1]
|
| 500 |
+
video_pixel_values_list = np.split(flat_video_values, split_indices)
|
| 501 |
+
|
| 502 |
+
# Step 3.1: Replace placeholders (simple replacement, no expansion yet)
|
| 503 |
+
# In MossVL, one image placeholder = one image token
|
| 504 |
+
# One video placeholder = one video token (will be expanded later)
|
| 505 |
+
for i in range(len(text)):
|
| 506 |
+
if should_create_labels:
|
| 507 |
+
# Replace and update spans for image placeholders
|
| 508 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 509 |
+
text[i], self.image_placeholder, self.image_token, labels_spans[i]
|
| 510 |
+
)
|
| 511 |
+
# Replace and update spans for video placeholders
|
| 512 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 513 |
+
text[i], self.video_placeholder, self.video_token, labels_spans[i]
|
| 514 |
+
)
|
| 515 |
+
else:
|
| 516 |
+
text[i] = text[i].replace(self.image_placeholder, self.image_token)
|
| 517 |
+
text[i] = text[i].replace(self.video_placeholder, self.video_token)
|
| 518 |
+
|
| 519 |
+
# Step 3.2: Validate token counts
|
| 520 |
+
n_images_in_text = [t.count(self.image_token) for t in text]
|
| 521 |
+
n_videos_in_text = [t.count(self.video_token) for t in text]
|
| 522 |
+
|
| 523 |
+
# Count placeholders in text
|
| 524 |
+
total_images_in_text = sum(n_images_in_text)
|
| 525 |
+
total_videos_in_text = sum(n_videos_in_text)
|
| 526 |
+
|
| 527 |
+
# Count actual images and videos provided
|
| 528 |
+
total_images_provided = len(image_grid_thw) if image_grid_thw is not None else 0
|
| 529 |
+
total_videos_provided = len(video_grid_thw) if video_grid_thw is not None else 0
|
| 530 |
+
|
| 531 |
+
# Validate image counts
|
| 532 |
+
if total_images_in_text != total_images_provided:
|
| 533 |
+
raise ValueError(
|
| 534 |
+
"Number of image tokens does not match number of images provided. "
|
| 535 |
+
f"Found {total_images_in_text} image tokens in text and {total_images_provided} images."
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
# Validate video counts
|
| 539 |
+
if total_videos_in_text != total_videos_provided:
|
| 540 |
+
raise ValueError(
|
| 541 |
+
"Number of video tokens does not match number of videos provided. "
|
| 542 |
+
f"Found {total_videos_in_text} video tokens in text and {total_videos_provided} videos."
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
# Step 3.3: Expand video tokens with timestamps
|
| 546 |
+
# Now expand each video token to multiple tokens (one per frame) with timestamps
|
| 547 |
+
if video_grid_thw is not None:
|
| 548 |
+
index = 0
|
| 549 |
+
for i in range(len(text)):
|
| 550 |
+
while self.video_token in text[i]:
|
| 551 |
+
metadata = video_metadata[index]
|
| 552 |
+
if metadata.fps is None:
|
| 553 |
+
logger.warning_once(
|
| 554 |
+
"MossVL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
|
| 555 |
+
"Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
|
| 556 |
+
"Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
|
| 557 |
+
)
|
| 558 |
+
metadata.fps = 24 if metadata.fps is None else metadata.fps
|
| 559 |
+
|
| 560 |
+
# Calculate timestamps
|
| 561 |
+
# Use actual_timestamps if available (for segments), otherwise use frames_indices
|
| 562 |
+
actual_timestamps = getattr(metadata, 'actual_timestamps', None)
|
| 563 |
+
curr_timestamp = self._calculate_timestamps(
|
| 564 |
+
metadata.frames_indices,
|
| 565 |
+
metadata.total_num_frames,
|
| 566 |
+
metadata.fps,
|
| 567 |
+
metadata.duration,
|
| 568 |
+
self.video_processor.temporal_patch_size,
|
| 569 |
+
actual_timestamps=actual_timestamps,
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
# Build video placeholder: one video token per frame with timestamp
|
| 573 |
+
# video_grid_thw[index][0] is the temporal dimension (number of frames after merging)
|
| 574 |
+
|
| 575 |
+
video_tokens = []
|
| 576 |
+
for frame_idx in range(video_grid_thw[index][0]):
|
| 577 |
+
curr_time = curr_timestamp[frame_idx]
|
| 578 |
+
# Format: <|time_start|>X.X seconds<|time_end|><|image_pad|>
|
| 579 |
+
video_tokens.append(
|
| 580 |
+
f"{self.time_start_token}{curr_time:.1f} seconds{self.time_end_token}{self.image_token}"
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# Wrap the entire video sequence with vision_start and vision_end tokens
|
| 584 |
+
video_placeholder = f"{self.vision_start_token}{''.join(video_tokens)}{self.vision_end_token}"
|
| 585 |
+
|
| 586 |
+
# Replace the video token with expanded sequence and update spans if needed
|
| 587 |
+
if should_create_labels:
|
| 588 |
+
text[i], labels_spans[i] = self._replace_and_update_spans(
|
| 589 |
+
text[i], self.video_token, video_placeholder, labels_spans[i], replace_count=1
|
| 590 |
+
)
|
| 591 |
+
else:
|
| 592 |
+
text[i] = text[i].replace(self.video_token, video_placeholder, 1)
|
| 593 |
+
index += 1
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
# Step 4: Tokenize text
|
| 598 |
+
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
| 599 |
+
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 600 |
+
|
| 601 |
+
# Request offset_mapping if we need to create labels
|
| 602 |
+
if should_create_labels:
|
| 603 |
+
output_kwargs["text_kwargs"]["return_offsets_mapping"] = True
|
| 604 |
+
|
| 605 |
+
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
| 606 |
+
|
| 607 |
+
# ignore check_special_mm_tokens nums in test and input ids.
|
| 608 |
+
# self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
|
| 609 |
+
|
| 610 |
+
# Create labels if labels_spans was provided
|
| 611 |
+
if should_create_labels:
|
| 612 |
+
offset_mapping = text_inputs.pop("offset_mapping")
|
| 613 |
+
labels = self._create_labels_from_spans(
|
| 614 |
+
text_inputs["input_ids"],
|
| 615 |
+
offset_mapping,
|
| 616 |
+
labels_spans,
|
| 617 |
+
ignore_index
|
| 618 |
+
)
|
| 619 |
+
|
| 620 |
+
if return_mm_token_type_ids:
|
| 621 |
+
array_ids = np.array(text_inputs["input_ids"])
|
| 622 |
+
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
| 623 |
+
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 624 |
+
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 625 |
+
|
| 626 |
+
# Step 5: Concatenate pixel_values and grid_thw in sequence order
|
| 627 |
+
# Prepare output
|
| 628 |
+
output_data = {**text_inputs}
|
| 629 |
+
|
| 630 |
+
if not needs_reorder:
|
| 631 |
+
# Fast path: only one media type, no reordering needed
|
| 632 |
+
final_pixel_values = []
|
| 633 |
+
final_grid_thw = []
|
| 634 |
+
|
| 635 |
+
if has_images:
|
| 636 |
+
final_pixel_values.append(image_inputs["pixel_values"])
|
| 637 |
+
final_grid_thw.extend(image_grid_thw)
|
| 638 |
+
|
| 639 |
+
if has_videos:
|
| 640 |
+
final_pixel_values.append(videos_inputs["pixel_values_videos"])
|
| 641 |
+
final_grid_thw.extend(video_grid_thw)
|
| 642 |
+
|
| 643 |
+
if final_pixel_values:
|
| 644 |
+
output_data["pixel_values"] = np.concatenate(final_pixel_values, axis=0) if len(final_pixel_values) > 1 else final_pixel_values[0]
|
| 645 |
+
|
| 646 |
+
if final_grid_thw:
|
| 647 |
+
output_data["grid_thw"] = np.stack(final_grid_thw, axis=0)
|
| 648 |
+
|
| 649 |
+
# Calculate media_nums_per_sample
|
| 650 |
+
media_nums_per_sample = []
|
| 651 |
+
for batch_idx in range(len(text)):
|
| 652 |
+
media_order = media_order_per_sample[batch_idx]
|
| 653 |
+
media_nums_per_sample.append(len(media_order) if len(media_order) > 0 else 1)
|
| 654 |
+
|
| 655 |
+
# Don't add media_nums_per_sample to output_data yet
|
| 656 |
+
# Will add it after BatchFeature to keep it as list
|
| 657 |
+
|
| 658 |
+
else:
|
| 659 |
+
# Slow path: both images and videos exist, need reordering
|
| 660 |
+
final_pixel_values = []
|
| 661 |
+
final_grid_thw = []
|
| 662 |
+
media_nums_per_sample = []
|
| 663 |
+
|
| 664 |
+
# Global indices to track position in flattened image/video arrays
|
| 665 |
+
global_image_idx = 0
|
| 666 |
+
global_video_idx = 0
|
| 667 |
+
|
| 668 |
+
for batch_idx in range(len(text)):
|
| 669 |
+
# Use the recorded media order from Step 3.0
|
| 670 |
+
media_order = media_order_per_sample[batch_idx]
|
| 671 |
+
|
| 672 |
+
if len(media_order) == 0:
|
| 673 |
+
# If no media provided for this sample, add a blank image
|
| 674 |
+
media_nums_per_sample.append(1)
|
| 675 |
+
min_pixels = 128 * 128
|
| 676 |
+
patch_size = getattr(self.image_processor, "patch_size", None) or 16
|
| 677 |
+
temporal_patch_size = getattr(self.image_processor, "temporal_patch_size", None) or 1
|
| 678 |
+
merge_size = getattr(self.image_processor, "merge_size", None) or 2
|
| 679 |
+
|
| 680 |
+
factor = patch_size * merge_size
|
| 681 |
+
side = int(np.ceil(np.sqrt(min_pixels) / factor) * factor)
|
| 682 |
+
grid_h = side // patch_size
|
| 683 |
+
grid_w = side // patch_size
|
| 684 |
+
grid_t = 1
|
| 685 |
+
|
| 686 |
+
# Channel = 3 (RGB)
|
| 687 |
+
channel = 3
|
| 688 |
+
dim = channel * temporal_patch_size * patch_size * patch_size
|
| 689 |
+
num_patches = grid_t * grid_h * grid_w
|
| 690 |
+
|
| 691 |
+
blank_pixel_values = np.zeros((num_patches, dim), dtype=np.float32)
|
| 692 |
+
blank_grid_thw = np.array([grid_t, grid_h, grid_w], dtype=np.int64)
|
| 693 |
+
|
| 694 |
+
final_pixel_values.append(blank_pixel_values)
|
| 695 |
+
final_grid_thw.append(blank_grid_thw)
|
| 696 |
+
else:
|
| 697 |
+
media_nums_per_sample.append(len(media_order))
|
| 698 |
+
|
| 699 |
+
# Collect media data according to the recorded order
|
| 700 |
+
for media_type, _ in media_order:
|
| 701 |
+
if media_type == "image" and image_grid_thw is not None:
|
| 702 |
+
# Get image data
|
| 703 |
+
if image_pixel_values_list:
|
| 704 |
+
final_pixel_values.append(image_pixel_values_list[global_image_idx])
|
| 705 |
+
final_grid_thw.append(image_grid_thw[global_image_idx])
|
| 706 |
+
global_image_idx += 1
|
| 707 |
+
elif media_type == "video" and video_grid_thw is not None:
|
| 708 |
+
# Get video data
|
| 709 |
+
if video_pixel_values_list:
|
| 710 |
+
final_pixel_values.append(video_pixel_values_list[global_video_idx])
|
| 711 |
+
final_grid_thw.append(video_grid_thw[global_video_idx])
|
| 712 |
+
global_video_idx += 1
|
| 713 |
+
|
| 714 |
+
# Concatenate/stack to unified format
|
| 715 |
+
if final_pixel_values:
|
| 716 |
+
output_data["pixel_values"] = np.concatenate(final_pixel_values, axis=0)
|
| 717 |
+
|
| 718 |
+
if final_grid_thw:
|
| 719 |
+
output_data["grid_thw"] = np.stack(final_grid_thw, axis=0)
|
| 720 |
+
|
| 721 |
+
# Don't add media_nums_per_sample to output_data yet
|
| 722 |
+
# Will add it after BatchFeature to keep it as list
|
| 723 |
+
|
| 724 |
+
# Create cross_attention_mask using media_nums_per_sample
|
| 725 |
+
if "input_ids" in output_data and "grid_thw" in output_data and media_nums_per_sample:
|
| 726 |
+
cross_attention_mask = self._create_cross_attention_mask(
|
| 727 |
+
output_data["input_ids"],
|
| 728 |
+
output_data["grid_thw"],
|
| 729 |
+
media_nums_per_sample,
|
| 730 |
+
output_data.get("attention_mask", None)
|
| 731 |
+
)
|
| 732 |
+
output_data["cross_attention_mask"] = cross_attention_mask
|
| 733 |
+
|
| 734 |
+
# Add labels to output if created
|
| 735 |
+
if should_create_labels:
|
| 736 |
+
output_data["labels"] = labels
|
| 737 |
+
|
| 738 |
+
# BatchFeature will handle conversion to pt/tf/jax/np based on tensor_type
|
| 739 |
+
batch_feature = BatchFeature(data=output_data, tensor_type=return_tensors)
|
| 740 |
+
|
| 741 |
+
# Add media_nums_per_sample after BatchFeature to keep it as list (not tensor)
|
| 742 |
+
if media_nums_per_sample:
|
| 743 |
+
batch_feature["media_nums_per_sample"] = media_nums_per_sample
|
| 744 |
+
|
| 745 |
+
return batch_feature
|
| 746 |
+
|
| 747 |
+
def _create_cross_attention_mask(self, input_ids, grid_thw, media_nums_per_sample, attention_mask=None):
    """
    Create cross_attention_mask of shape (batch_size, 1, text_len, num_images).
    Video frames are treated as individual images.
    Mask values: True for masked, False for visible.
    Causal masking: text can see images that appear at or before the text position.

    Args:
        input_ids: List of token ids (one list per batch sample; rows may be ragged)
        grid_thw: Grid sizes for each media item, indexable as (N, 3) with t (frames) first
        media_nums_per_sample: Number of media items per sample
        attention_mask: Optional attention mask to filter out padding positions

    Returns:
        Boolean mask of shape (batch_size, 1, max_text_len, max_num_frames),
        or None when no sample contains any frames.
    """
    batch_size = len(input_ids)
    max_text_len = max(len(ids) for ids in input_ids)

    # Calculate total frames per sample to find max_num_frames
    total_frames_per_sample = []
    # Running index into the flattened grid_thw list across the whole batch.
    media_idx = 0
    for b in range(batch_size):
        num_media = media_nums_per_sample[b]
        if num_media == 0:
            total_frames_per_sample.append(0)
            continue

        sample_frames = 0
        for _ in range(num_media):
            # grid_thw is (N, 3) where first dim is t (num_frames)
            # NOTE(review): t may be a numpy/torch scalar rather than int — summation works either way.
            t = grid_thw[media_idx][0]
            sample_frames += t
            media_idx += 1
        total_frames_per_sample.append(sample_frames)

    max_num_frames = max(total_frames_per_sample) if total_frames_per_sample else 0

    if max_num_frames == 0:
        # No media at all — caller treats None as "no cross-attention mask needed".
        return None

    # Vectorized implementation for speed

    # 1. Pad input_ids to create a tensor
    # We use -1 as pad value since token ids are positive
    input_ids_tensor = torch.full((batch_size, max_text_len), -1, dtype=torch.long)
    for b, ids in enumerate(input_ids):
        l = len(ids)
        input_ids_tensor[b, :l] = torch.tensor(ids, dtype=torch.long)

    # 2. Identify image tokens
    is_image_token = (input_ids_tensor == self.image_token_id)

    # 3. Compute cumulative image tokens (how many image tokens appeared up to position t)
    # shape: (batch_size, text_len)
    cum_image_tokens = is_image_token.cumsum(dim=1)

    # 4. Create frame indices
    # shape: (1, 1, max_num_frames)
    frame_indices = torch.arange(max_num_frames).reshape(1, 1, -1)

    # 5. Determine visibility based on causal relationship
    # Text at `t` sees frame `i` if `cum_image_tokens[t] > i`
    # Because if frame `i` is the (i+1)-th image token, it becomes visible when count reaches i+1
    # shape: (batch_size, text_len, max_num_frames)
    visible_mask = cum_image_tokens.unsqueeze(-1) > frame_indices

    # 6. Apply attention_mask if provided
    if attention_mask is not None:
        # Convert to tensor if needed
        if isinstance(attention_mask, torch.Tensor):
            attn_mask_tensor = attention_mask
        else:
            # List of lists
            attn_mask_tensor = torch.zeros((batch_size, max_text_len), dtype=torch.long)
            for b, mask_row in enumerate(attention_mask):
                l = len(mask_row)
                attn_mask_tensor[b, :l] = torch.tensor(mask_row, dtype=torch.long)

        # shape: (batch_size, text_len, 1) — padded text positions see nothing.
        valid_text = (attn_mask_tensor.unsqueeze(-1) == 1)
        visible_mask = visible_mask & valid_text

    # 7. Mask out frames that don't exist for a sample
    # shape: (batch_size, 1, 1)
    total_frames_tensor = torch.tensor(total_frames_per_sample).reshape(batch_size, 1, 1)
    # shape: (batch_size, 1, max_num_frames)
    valid_frames = frame_indices < total_frames_tensor

    visible_mask = visible_mask & valid_frames

    # 8. Create final mask (True for masked, False for visible)
    mask = ~visible_mask

    # 9. Add channel dimension: (batch_size, 1, text_len, max_num_frames)
    mask = mask.unsqueeze(1)

    return mask
|
| 842 |
+
|
| 843 |
+
def _replace_and_update_spans(
|
| 844 |
+
self,
|
| 845 |
+
text: str,
|
| 846 |
+
old_str: str,
|
| 847 |
+
new_str: str,
|
| 848 |
+
spans: List[List[int]],
|
| 849 |
+
replace_count: int = -1
|
| 850 |
+
) -> tuple:
|
| 851 |
+
"""
|
| 852 |
+
Replace occurrences of old_str with new_str and update spans accordingly.
|
| 853 |
+
|
| 854 |
+
Args:
|
| 855 |
+
text: The text to perform replacement on
|
| 856 |
+
old_str: String to be replaced
|
| 857 |
+
new_str: String to replace with
|
| 858 |
+
spans: List of [start, end] spans to update (modified in place)
|
| 859 |
+
replace_count: Maximum number of replacements (-1 for all)
|
| 860 |
+
|
| 861 |
+
Returns:
|
| 862 |
+
Tuple of (new_text, updated_spans)
|
| 863 |
+
"""
|
| 864 |
+
delta = len(new_str) - len(old_str)
|
| 865 |
+
result_text = text
|
| 866 |
+
count = 0
|
| 867 |
+
search_start = 0
|
| 868 |
+
|
| 869 |
+
while True:
|
| 870 |
+
pos = result_text.find(old_str, search_start)
|
| 871 |
+
if pos == -1:
|
| 872 |
+
break
|
| 873 |
+
if replace_count != -1 and count >= replace_count:
|
| 874 |
+
break
|
| 875 |
+
|
| 876 |
+
# Update all spans that come after this position
|
| 877 |
+
for span in spans:
|
| 878 |
+
if span[0] > pos:
|
| 879 |
+
# Span starts after replacement point
|
| 880 |
+
span[0] += delta
|
| 881 |
+
span[1] += delta
|
| 882 |
+
elif span[1] > pos:
|
| 883 |
+
# Span ends after replacement point (spans the replacement)
|
| 884 |
+
span[1] += delta
|
| 885 |
+
|
| 886 |
+
# Perform the replacement
|
| 887 |
+
result_text = result_text[:pos] + new_str + result_text[pos + len(old_str):]
|
| 888 |
+
search_start = pos + len(new_str)
|
| 889 |
+
count += 1
|
| 890 |
+
|
| 891 |
+
return result_text, spans
|
| 892 |
+
|
| 893 |
+
def _create_labels_from_spans(
|
| 894 |
+
self,
|
| 895 |
+
input_ids: List[List[int]],
|
| 896 |
+
offset_mapping: List[List[tuple]],
|
| 897 |
+
labels_spans: List[List[List[int]]],
|
| 898 |
+
ignore_index: int = -100,
|
| 899 |
+
mask_token_ids: Optional[set] = None
|
| 900 |
+
) -> List[List[int]]:
|
| 901 |
+
"""
|
| 902 |
+
Create labels from spans and offset_mapping.
|
| 903 |
+
|
| 904 |
+
Args:
|
| 905 |
+
input_ids: Tokenized input ids
|
| 906 |
+
offset_mapping: Character offsets for each token from tokenizer (special tokens included)
|
| 907 |
+
labels_spans: Updated spans indicating assistant regions (after text transformations)
|
| 908 |
+
ignore_index: Value for masked positions
|
| 909 |
+
mask_token_ids: Set of token ids that should always be masked (set to ignore_index)
|
| 910 |
+
in labels, regardless of whether they fall inside a span.
|
| 911 |
+
Defaults to self.mask_token_ids if not provided.
|
| 912 |
+
|
| 913 |
+
Returns:
|
| 914 |
+
labels: List of label ids, same shape as input_ids
|
| 915 |
+
|
| 916 |
+
Note:
|
| 917 |
+
- Tokenizer's offset_mapping already includes correct offsets for special tokens in text
|
| 918 |
+
- Only need to mask tokens inside <|vision_start|>...<|vision_end|>
|
| 919 |
+
- Tokens whose id is in mask_token_ids are always masked
|
| 920 |
+
- All other tokens in spans (including special tokens like <|im_end|>) get labels
|
| 921 |
+
"""
|
| 922 |
+
if mask_token_ids is None:
|
| 923 |
+
mask_token_ids = self.mask_token_ids
|
| 924 |
+
|
| 925 |
+
batch_labels = []
|
| 926 |
+
|
| 927 |
+
for batch_idx in range(len(input_ids)):
|
| 928 |
+
ids = input_ids[batch_idx]
|
| 929 |
+
offsets = offset_mapping[batch_idx]
|
| 930 |
+
spans = labels_spans[batch_idx]
|
| 931 |
+
|
| 932 |
+
labels = [ignore_index] * len(ids)
|
| 933 |
+
|
| 934 |
+
# Process each span: find token range and set labels
|
| 935 |
+
for span_start, span_end in spans:
|
| 936 |
+
in_vision = False
|
| 937 |
+
|
| 938 |
+
# Find tokens that overlap with this span
|
| 939 |
+
for token_idx, (token_id, (char_start, char_end)) in enumerate(zip(ids, offsets)):
|
| 940 |
+
# Skip tokens completely before this span
|
| 941 |
+
if char_end <= span_start:
|
| 942 |
+
continue
|
| 943 |
+
# Stop when tokens are completely after this span
|
| 944 |
+
if char_start >= span_end:
|
| 945 |
+
break
|
| 946 |
+
|
| 947 |
+
# Token overlaps with span, process it
|
| 948 |
+
# Track vision region: <|vision_start|> ... <|vision_end|>
|
| 949 |
+
if token_id == self.vision_start_token_id:
|
| 950 |
+
in_vision = True
|
| 951 |
+
continue
|
| 952 |
+
if token_id == self.vision_end_token_id:
|
| 953 |
+
in_vision = False
|
| 954 |
+
continue
|
| 955 |
+
|
| 956 |
+
# Skip tokens inside vision region
|
| 957 |
+
if in_vision:
|
| 958 |
+
continue
|
| 959 |
+
|
| 960 |
+
# Always mask special tokens that should never have labels
|
| 961 |
+
if token_id in mask_token_ids:
|
| 962 |
+
continue
|
| 963 |
+
|
| 964 |
+
# Set label for this token
|
| 965 |
+
labels[token_idx] = token_id
|
| 966 |
+
|
| 967 |
+
batch_labels.append(labels)
|
| 968 |
+
|
| 969 |
+
return batch_labels
|
| 970 |
+
|
| 971 |
+
def _calculate_timestamps(
|
| 972 |
+
self,
|
| 973 |
+
frames_indices: Optional[Union[List[int], np.ndarray]],
|
| 974 |
+
total_num_frames: int,
|
| 975 |
+
video_fps: float,
|
| 976 |
+
duration: float,
|
| 977 |
+
merge_size: int = 1,
|
| 978 |
+
actual_timestamps: Optional[List[float]] = None
|
| 979 |
+
):
|
| 980 |
+
"""
|
| 981 |
+
Calculate timestamps for video frames.
|
| 982 |
+
|
| 983 |
+
Args:
|
| 984 |
+
frames_indices: Actual frame indices extracted (if available)
|
| 985 |
+
total_num_frames: Total number of sampled frames
|
| 986 |
+
video_fps: Video frames per second
|
| 987 |
+
duration: Video duration in seconds
|
| 988 |
+
merge_size: Temporal merge size
|
| 989 |
+
actual_timestamps: Pre-calculated actual timestamps (for segments)
|
| 990 |
+
|
| 991 |
+
Returns:
|
| 992 |
+
List of timestamps (one per merged temporal patch)
|
| 993 |
+
"""
|
| 994 |
+
# If actual timestamps are provided (from segment), use them directly
|
| 995 |
+
if actual_timestamps is not None:
|
| 996 |
+
timestamps = list(actual_timestamps)
|
| 997 |
+
|
| 998 |
+
# Pad timestamps to be multiple of merge_size
|
| 999 |
+
if len(timestamps) % merge_size != 0:
|
| 1000 |
+
timestamps.extend([timestamps[-1]] * (merge_size - len(timestamps) % merge_size))
|
| 1001 |
+
|
| 1002 |
+
# Frames are merged by merge_size, so we average the timestamps within each temporal patch
|
| 1003 |
+
timestamps = [
|
| 1004 |
+
(timestamps[i] + timestamps[i + merge_size - 1]) / 2
|
| 1005 |
+
for i in range(0, len(timestamps), merge_size)
|
| 1006 |
+
]
|
| 1007 |
+
return timestamps
|
| 1008 |
+
|
| 1009 |
+
# Use frames_indices if available, otherwise generate uniformly sampled indices
|
| 1010 |
+
if frames_indices is not None:
|
| 1011 |
+
if isinstance(frames_indices, np.ndarray):
|
| 1012 |
+
indices = frames_indices.tolist()
|
| 1013 |
+
else:
|
| 1014 |
+
indices = list(frames_indices)
|
| 1015 |
+
else:
|
| 1016 |
+
# Generate uniformly sampled frame indices
|
| 1017 |
+
if total_num_frames <= 1:
|
| 1018 |
+
indices = [0]
|
| 1019 |
+
else:
|
| 1020 |
+
# Uniformly sample frames across the video duration
|
| 1021 |
+
indices = np.linspace(0, duration * video_fps - 1, total_num_frames).astype(np.int32).tolist()
|
| 1022 |
+
|
| 1023 |
+
# Pad indices to be multiple of merge_size
|
| 1024 |
+
if len(indices) % merge_size != 0:
|
| 1025 |
+
indices.extend([indices[-1]] * (merge_size - len(indices) % merge_size))
|
| 1026 |
+
|
| 1027 |
+
# Convert frame indices to timestamps
|
| 1028 |
+
timestamps = [idx / video_fps for idx in indices]
|
| 1029 |
+
|
| 1030 |
+
# Frames are merged by merge_size, so we average the timestamps within each temporal patch
|
| 1031 |
+
timestamps = [
|
| 1032 |
+
(timestamps[i] + timestamps[i + merge_size - 1]) / 2
|
| 1033 |
+
for i in range(0, len(timestamps), merge_size)
|
| 1034 |
+
]
|
| 1035 |
+
return timestamps
|
| 1036 |
+
|
| 1037 |
+
def batch_decode(self, *args, **kwargs):
    """
    Delegate to the underlying tokenizer's `batch_decode`.
    Refer to that method's docstring for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
|
| 1043 |
+
|
| 1044 |
+
def decode(self, *args, **kwargs):
    """
    Delegate to the underlying tokenizer's `decode`.
    Refer to that method's docstring for the accepted arguments.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
|
| 1050 |
+
|
| 1051 |
+
def post_process_image_text_to_text(
    self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
):
    """
    Decode the model's generated token ids back into text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            Output of the model's `generate` call, shaped
            `(batch_size, sequence_length)` or `(sequence_length,)`.
        skip_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to remove special tokens in the output.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up the tokenization spaces.
        **kwargs:
            Extra arguments forwarded to the tokenizer's `batch_decode` method.

    Returns:
        `list[str]`: The decoded text.
    """
    # Collect all decode options in one mapping before forwarding.
    decode_kwargs = dict(kwargs)
    decode_kwargs["skip_special_tokens"] = skip_special_tokens
    decode_kwargs["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
    return self.tokenizer.batch_decode(generated_outputs, **decode_kwargs)
|
| 1077 |
+
|
| 1078 |
+
|
| 1079 |
+
# Public API surface re-exported by this module.
__all__ = ["MossVLProcessor", "MossVLImageProcessorFast"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cu128
|
| 2 |
+
--extra-index-url https://pypi.nvidia.com
|
| 3 |
+
|
| 4 |
+
torch==2.8.0+cu128
|
| 5 |
+
torchvision==0.23.0+cu128
|
| 6 |
+
transformers==4.57.1
|
| 7 |
+
accelerate==1.12.0
|
| 8 |
+
flash-attn==2.8.1
|
| 9 |
+
torchcodec==0.7.0
|
| 10 |
+
numpy==2.4.3
|
| 11 |
+
pillow==12.1.1
|
| 12 |
+
joblib==1.5.2
|
| 13 |
+
einops==0.8.2
|
| 14 |
+
ninja==1.13.0
|
| 15 |
+
packaging==26.0
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52d44d7e09e05fb10f9ec5dc913bf1d62ff37ac249cb9ec47d891935149f5e3e
|
| 3 |
+
size 11423034
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<|time_start|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<|time_end|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"additional_special_tokens": [
|
| 231 |
+
"<|im_start|>",
|
| 232 |
+
"<|im_end|>",
|
| 233 |
+
"<|object_ref_start|>",
|
| 234 |
+
"<|object_ref_end|>",
|
| 235 |
+
"<|box_start|>",
|
| 236 |
+
"<|box_end|>",
|
| 237 |
+
"<|quad_start|>",
|
| 238 |
+
"<|quad_end|>",
|
| 239 |
+
"<|vision_start|>",
|
| 240 |
+
"<|vision_end|>",
|
| 241 |
+
"<|vision_pad|>",
|
| 242 |
+
"<|image_pad|>",
|
| 243 |
+
"<|video_pad|>",
|
| 244 |
+
"<|time_start|>",
|
| 245 |
+
"<|time_end|>"
|
| 246 |
+
],
|
| 247 |
+
"bos_token": null,
|
| 248 |
+
"clean_up_tokenization_spaces": false,
|
| 249 |
+
"eos_token": "<|im_end|>",
|
| 250 |
+
"errors": "replace",
|
| 251 |
+
"extra_special_tokens": {},
|
| 252 |
+
"model_max_length": 262144,
|
| 253 |
+
"pad_token": "<|endoftext|>",
|
| 254 |
+
"split_special_tokens": false,
|
| 255 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 256 |
+
"unk_token": null,
|
| 257 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n 
<|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|image|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|video|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n 
{%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
|
| 258 |
+
}
|
video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "processing_moss_vl.MossVLProcessor",
|
| 4 |
+
"AutoVideoProcessor": "video_processing_moss_vl.MossVLVideoProcessor"
|
| 5 |
+
},
|
| 6 |
+
"size": {
|
| 7 |
+
"longest_edge": 16777216,
|
| 8 |
+
"shortest_edge": 4096
|
| 9 |
+
},
|
| 10 |
+
"video_max_pixels": 943718400,
|
| 11 |
+
"patch_size": 16,
|
| 12 |
+
"temporal_patch_size": 1,
|
| 13 |
+
"merge_size": 2,
|
| 14 |
+
"video_fps": 1.0,
|
| 15 |
+
"min_frames": 1,
|
| 16 |
+
"max_frames": 256,
|
| 17 |
+
"num_extract_threads": 4,
|
| 18 |
+
"image_mean": [
|
| 19 |
+
0.5,
|
| 20 |
+
0.5,
|
| 21 |
+
0.5
|
| 22 |
+
],
|
| 23 |
+
"image_std": [
|
| 24 |
+
0.5,
|
| 25 |
+
0.5,
|
| 26 |
+
0.5
|
| 27 |
+
],
|
| 28 |
+
"processor_class": "MossVLProcessor",
|
| 29 |
+
"video_processor_type": "MossVLVideoProcessor"
|
| 30 |
+
}
|
video_processing_moss_vl.py
ADDED
|
@@ -0,0 +1,1132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2025 The FNLP Vision Team and The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""video processor class for Moss-VL."""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import logging as system_logging
|
| 19 |
+
import math
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
import subprocess
|
| 23 |
+
import traceback
|
| 24 |
+
from typing import Any, Dict, List, Optional, Union
|
| 25 |
+
|
| 26 |
+
import numpy as np
|
| 27 |
+
import torch
|
| 28 |
+
from joblib import Parallel, delayed
|
| 29 |
+
from torchcodec.decoders import VideoDecoder
|
| 30 |
+
|
| 31 |
+
from transformers.feature_extraction_utils import BatchFeature
|
| 32 |
+
from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size, validate_kwargs
|
| 33 |
+
from transformers.processing_utils import Unpack, VideosKwargs
|
| 34 |
+
from transformers.utils import TensorType, add_start_docstrings, logging
|
| 35 |
+
from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
|
| 36 |
+
from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
logger = logging.get_logger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# -----------------------------------------------------------------------------
|
| 43 |
+
# Torchcodec video frame extraction utilities
|
| 44 |
+
# -----------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
def check_video_for_extra_streams_and_errors(video_path: str) -> dict:
    """
    Inspect a video file with ffprobe and report abnormal streams or errors.

    Args:
        video_path: Path to the video file.

    Returns:
        A dictionary containing:
        - 'has_extra_streams': bool, whether there are streams other than video and audio.
        - 'unsupported_codec_errors': list, all "Unsupported codec" error messages.
        - 'ffprobe_output_error': str, other errors/warnings from ffprobe stderr.
        - 'ffprobe_successful': bool, whether ffprobe exited with return code 0.
        - 'stream_details': list, codec_type and index for each stream.
        - 'num_streams': int, total number of streams identified in the video file.
    """
    report = {
        'has_extra_streams': False,
        'unsupported_codec_errors': [],
        'ffprobe_output_error': '',
        'ffprobe_successful': False,
        'stream_details': [],
        'num_streams': 0
    }

    ffprobe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_streams",
        "-show_format",
        "-of", "json",
        video_path
    ]

    try:
        proc = subprocess.run(ffprobe_cmd, capture_output=True, text=True, check=False)
        report['ffprobe_successful'] = proc.returncode == 0

        if proc.stderr:
            report['ffprobe_output_error'] = proc.stderr
            codec_err_re = re.compile(r"Unsupported codec with id \d+ for input stream \d+")
            report['unsupported_codec_errors'] = codec_err_re.findall(proc.stderr)

        if proc.stdout:
            probe_info = json.loads(proc.stdout)
            if 'streams' in probe_info:
                report['num_streams'] = len(probe_info['streams'])
                for entry in probe_info['streams']:
                    entry_type = entry.get('codec_type')
                    report['stream_details'].append({'index': entry.get('index'), 'codec_type': entry_type})
                    if entry_type not in ('video', 'audio'):
                        report['has_extra_streams'] = True

            fmt_section = probe_info.get('format')
            if fmt_section is not None and 'nb_streams' in fmt_section:
                if report['num_streams'] == 0:
                    report['num_streams'] = fmt_section['nb_streams']
                elif report['num_streams'] != fmt_section['nb_streams']:
                    logger.warning(
                        f"Number of streams in 'streams' list ({report['num_streams']}) "
                        f"differs from 'nb_streams' in 'format' ({fmt_section['nb_streams']})."
                    )
    except FileNotFoundError:
        report['ffprobe_output_error'] = "ffprobe command not found. Please ensure FFmpeg is installed and in your PATH."
        report['ffprobe_successful'] = False
    except json.JSONDecodeError:
        report['ffprobe_output_error'] = "Failed to parse ffprobe JSON output. Check ffprobe installation or video file."
        report['ffprobe_successful'] = False
    except Exception as e:
        report['ffprobe_output_error'] = f"An unexpected error occurred: {e}"
        report['ffprobe_successful'] = False

    return report
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def remove_video_extra_stream_ffmpeg(input_video: str, output_video: str) -> bool:
    """
    Strip all non-primary streams from a video using ffmpeg.

    Keeps only the first video stream, dropping audio, subtitle and data
    streams as well as metadata and chapters; the stream itself is copied
    without re-encoding.

    Args:
        input_video: Path to input video.
        output_video: Path to output video.

    Returns:
        bool: True if successful, False otherwise.
    """
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-i", input_video,
        "-map", "0:v:0",        # keep only the first video stream
        "-c", "copy",           # stream copy, no re-encoding
        "-an",                  # drop audio
        "-sn",                  # drop subtitles
        "-dn",                  # drop data streams
        "-map_metadata", "-1",
        "-map_chapters", "-1",
        "-movflags", "faststart",
        output_video,
    ]

    try:
        subprocess.run(ffmpeg_cmd, shell=False, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        system_logging.error(f"Command execution failed with return code: {e.returncode}, video: {input_video}")
        system_logging.error(f"Error output:\n{e.stderr}")
        return False
    except FileNotFoundError:
        system_logging.error("Error: ffmpeg command not found. Please ensure ffmpeg is installed and in PATH.")
        return False
    except Exception as e:
        system_logging.error(f"Unexpected error executing command: {e}, video: {input_video}", exc_info=True)
        return False
    return True
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def clean_video_streams(video_path: str) -> str:
    """
    Clean a video file's streams when extra streams are detected.

    Probes the file with ffprobe; when streams other than video/audio are
    present, writes a cleaned copy next to the original (with a "_fix"
    suffix) and returns its path. If probing finds nothing abnormal, or the
    ffmpeg cleanup fails, the original path is returned unchanged.

    Args:
        video_path: Path to the video file.

    Returns:
        str: Path to cleaned video (or original if no cleaning needed).
    """
    probe_report = check_video_for_extra_streams_and_errors(video_path)
    if not probe_report['has_extra_streams']:
        return video_path

    directory, base_name = os.path.split(video_path)
    stem, ext = os.path.splitext(base_name)
    cleaned_path = os.path.join(directory, f"{stem}_fix{ext}")

    if remove_video_extra_stream_ffmpeg(video_path, cleaned_path):
        return cleaned_path
    logger.warning("Failed to remove extra streams with ffmpeg")
    return video_path
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def split_indices(indices: List[Union[int, float]], num_chunks: int) -> List[List[Union[int, float]]]:
    """
    Split an index list into evenly balanced, order-preserving chunks.

    Chunk sizes differ by at most one (the first ``len(indices) % num_chunks``
    chunks receive the extra element), so parallel decode workers get
    balanced workloads. At most ``len(indices)`` chunks are produced, so no
    chunk is ever empty. (The previous implementation dumped the whole
    remainder on the last chunk and produced empty chunks when
    ``num_chunks > len(indices)``.)

    Args:
        indices: List of indices to split.
        num_chunks: Desired number of chunks (must be >= 1).

    Returns:
        List of non-empty index chunks whose concatenation equals ``indices``
        (an empty list when ``indices`` is empty).

    Raises:
        ValueError: If ``num_chunks`` is less than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")
    if not indices:
        return []

    # Never create more chunks than there are elements.
    effective_chunks = min(num_chunks, len(indices))
    base_size, remainder = divmod(len(indices), effective_chunks)

    chunks = []
    start = 0
    for chunk_idx in range(effective_chunks):
        # The first `remainder` chunks take one extra element each.
        end = start + base_size + (1 if chunk_idx < remainder else 0)
        chunks.append(indices[start:end])
        start = end
    return chunks
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def decode_sequentially(indices: List[int], video_path: str, ffmpeg_threads: int = 0):
    """
    Decode the frames at the given indices from a video in one decoder pass.

    Args:
        indices: List of frame indices to decode.
        video_path: Path to the video file.
        ffmpeg_threads: Number of ffmpeg threads to use (0 lets ffmpeg decide).

    Returns:
        FrameBatch from torchcodec.
    """
    video_decoder = VideoDecoder(video_path, num_ffmpeg_threads=ffmpeg_threads)
    try:
        frame_batch = video_decoder.get_frames_at(indices)
    finally:
        # Drop the decoder reference promptly so decoder resources are freed.
        del video_decoder
    return frame_batch
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def decode_with_multithreading(indices: List[int], num_threads: int, video_path: str) -> dict:
    """
    Decode frames in parallel by splitting the index list across threads.

    Each thread opens its own decoder via ``decode_sequentially``; the
    per-thread frame batches are concatenated back in order.

    Args:
        indices: List of frame indices to decode.
        num_threads: Number of threads to use.
        video_path: Path to the video file.

    Returns:
        dict: Contains 'data', 'duration_seconds', 'pts_seconds' tensors.
    """
    index_chunks = split_indices(indices, num_chunks=num_threads)
    frame_batches = Parallel(n_jobs=num_threads, prefer="threads", verbose=0)(
        delayed(decode_sequentially)(index_chunk, video_path) for index_chunk in index_chunks
    )

    merged = {}
    for field in ("data", "duration_seconds", "pts_seconds"):
        merged[field] = torch.cat([getattr(batch, field) for batch in frame_batches], dim=0)
    return merged
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def decode_sequentially_timestamp(timestamp_list: List[float], video_path: str, ffmpeg_threads: int = 0):
    """
    Decode the frames played at the given timestamps in one decoder pass.

    Requested timestamps are clamped into the stream's decodable PTS range
    (derived from the decoder metadata) before decoding, so out-of-range
    requests do not fail.

    Args:
        timestamp_list: List of timestamps (in seconds) to decode.
        video_path: Path to the video file.
        ffmpeg_threads: Number of ffmpeg threads to use (0 lets ffmpeg decide).

    Returns:
        FrameBatch from torchcodec.
    """
    video_decoder = VideoDecoder(video_path, num_ffmpeg_threads=ffmpeg_threads)
    try:
        meta = video_decoder.metadata

        earliest_pts = meta.begin_stream_seconds_from_content
        if earliest_pts is None:
            earliest_pts = 0.0

        # Prefer deriving the latest decodable PTS from frame count and fps;
        # fall back to the stream end, then the container duration.
        if meta.num_frames_from_content and meta.average_fps:
            latest_pts = (meta.num_frames_from_content - 1) / meta.average_fps + earliest_pts
        elif meta.end_stream_seconds_from_content is not None:
            latest_pts = meta.end_stream_seconds_from_content
        else:
            latest_pts = meta.duration_seconds

        if latest_pts is not None and latest_pts > 0:
            timestamp_list = [max(earliest_pts, min(ts, latest_pts)) for ts in timestamp_list]
        elif earliest_pts > 0:
            timestamp_list = [max(earliest_pts, ts) for ts in timestamp_list]

        return video_decoder.get_frames_played_at(timestamp_list)
    finally:
        del video_decoder
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def timestamp_decode_with_multithreading(timestamp_list: List[float], num_threads: int, video_path: str) -> dict:
    """
    Decode frames in parallel from timestamps split across threads.

    Each thread opens its own decoder via ``decode_sequentially_timestamp``;
    the per-thread frame batches are concatenated back in order.

    Args:
        timestamp_list: List of timestamps (in seconds) to decode.
        num_threads: Number of threads to use.
        video_path: Path to the video file.

    Returns:
        dict: Contains 'data', 'duration_seconds', 'pts_seconds' tensors.
    """
    timestamp_chunks = split_indices(timestamp_list, num_chunks=num_threads)
    frame_batches = Parallel(n_jobs=num_threads, prefer="threads", verbose=0)(
        delayed(decode_sequentially_timestamp)(chunk, video_path) for chunk in timestamp_chunks
    )

    if not frame_batches:
        logger.warning("No frames were successfully decoded.")
        return {"data": torch.empty(0), "duration_seconds": torch.empty(0), "pts_seconds": torch.empty(0)}

    return {
        field: torch.cat([getattr(batch, field) for batch in frame_batches], dim=0)
        for field in ("data", "duration_seconds", "pts_seconds")
    }
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def extract_frames_with_torchcodec(
    video_path: str,
    sample_frames_count: int,
    num_threads: int = 4,
) -> Optional[dict]:
    """
    Extract uniformly sampled frames from a video using torchcodec with multithreading.

    Args:
        video_path: Path to the video file.
        sample_frames_count: Number of frames to sample.
        num_threads: Number of threads to use for extraction.

    Returns:
        dict: Contains 'data' (N, C, H, W), 'duration_seconds' (N,), 'pts_seconds' (N,)
        tensors, plus 'frame_indices' (the sampled frame indices, np.ndarray).
        Returns None if extraction fails.
    """
    try:
        video_path = clean_video_streams(video_path)

        # Only the metadata is needed here; release the decoder right away
        # (each worker thread opens its own decoder).
        decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
        total_frames_in_video = decoder.metadata.num_frames_from_content
        del decoder

        # Guard explicitly: metadata may not expose a frame count, which
        # previously surfaced only as a swallowed TypeError from min().
        if total_frames_in_video is None:
            logger.error("Cannot extract frames: video frame count is unavailable in metadata")
            return None

        effective_sample_count = min(sample_frames_count, total_frames_in_video)
        if effective_sample_count == 0:
            logger.error("Cannot extract frames: video has 0 frames or specified frame count is 0")
            return None

        # Generate uniform frame indices
        frame_indices = np.linspace(0, total_frames_in_video - 1, effective_sample_count).astype(np.int32)
        # Ensure indices are valid and remove duplicates
        frame_indices = np.unique(np.clip(frame_indices, 0, total_frames_in_video - 1))

        result = decode_with_multithreading(frame_indices.tolist(), num_threads=num_threads, video_path=video_path)
        # Add frame_indices to the result for later use (timestamp bookkeeping downstream)
        result["frame_indices"] = frame_indices
        return result

    except Exception:
        traceback.print_exc()
        return None
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def smart_resize(
    num_frames: int,
    height: int,
    width: int,
    temporal_factor: int = 1,
    factor: int = 32,
    min_pixels: int = 128 * 128,
    max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
    per_frame_min_pixels: int = None,
    per_frame_max_pixels: int = None,
):
    """
    Choose an output (height, width), each a multiple of `factor`, for a clip of
    `num_frames` frames.

    Constraints applied in order:
      1. per-frame upper bound (`per_frame_max_pixels`), if given;
      2. total 3D volume bounds (`min_pixels` <= t*h*w <= `max_pixels`);
      3. per-frame lower bound (`per_frame_min_pixels`), if given.

    Returns:
        Tuple[int, int]: the resized (height, width).

    Raises:
        ValueError: if `num_frames < temporal_factor`, a side is smaller than
            `factor`, or the aspect ratio exceeds 200.
    """
    if num_frames < temporal_factor:
        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
    if height < factor or width < factor:
        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )

    def _shrunk(dim: int, scale: float) -> int:
        # Round down to a multiple of `factor`, never below `factor` itself.
        return max(factor, math.floor(dim / scale / factor) * factor)

    def _grown(dim: int, scale: float) -> int:
        # Round up to a multiple of `factor`.
        return math.ceil(dim * scale / factor) * factor

    best_h = round(height / factor) * factor
    best_w = round(width / factor) * factor
    best_t = round(num_frames / temporal_factor) * temporal_factor

    # Step 1: per-frame upper limit.
    if per_frame_max_pixels is not None and best_h * best_w > per_frame_max_pixels:
        scale = math.sqrt((height * width) / per_frame_max_pixels)
        best_h, best_w = _shrunk(height, scale), _shrunk(width, scale)

    # Step 2: 3D volume bounds (frames * height * width).
    volume = best_t * best_h * best_w
    if volume > max_pixels:
        scale = math.sqrt((num_frames * height * width) / max_pixels)
        best_h, best_w = _shrunk(height, scale), _shrunk(width, scale)
    elif volume < min_pixels:
        scale = math.sqrt(min_pixels / (num_frames * height * width))
        best_h, best_w = _grown(height, scale), _grown(width, scale)

    # Step 3: per-frame lower limit, re-checked after the volume adjustment so a
    # single frame stays within [per_frame_min_pixels, per_frame_max_pixels].
    if per_frame_min_pixels is not None and best_h * best_w < per_frame_min_pixels:
        scale = math.sqrt(per_frame_min_pixels / (height * width))
        best_h, best_w = _grown(height, scale), _grown(width, scale)

    return best_h, best_w
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
class MossVLVideoProcessorInitKwargs(VideosKwargs):
    """Keyword arguments accepted when constructing `MossVLVideoProcessor`."""

    # Spatial patch size of the vision encoder.
    patch_size: Optional[int]
    # Temporal patch size of the vision encoder.
    temporal_patch_size: Optional[int]
    # Spatial merge size between the vision encoder and the LLM.
    merge_size: Optional[int]
    # Minimum number of frames to sample from a video.
    min_frames: Optional[int]
    # Maximum number of frames to sample from a video.
    max_frames: Optional[int]
    # Target sampling rate in frames per second.
    video_fps: Optional[Union[int, float]]
    # Number of threads used for frame extraction.
    num_extract_threads: Optional[int]
    # Total 3D volume budget across all videos; distributed proportionally per video by T*H*W
    video_max_pixels: Optional[int]
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
@add_start_docstrings(
    "Constructs a fast Moss-VL video processor that dynamically resizes videos based on the original videos.",
    BASE_VIDEO_PROCESSOR_DOCSTRING,
    """
    patch_size (`int`, *optional*, defaults to 16):
        The spacial patch size of the vision encoder.
    temporal_patch_size (`int`, *optional*, defaults to 1):
        The temporal patch size of the vision encoder.
    merge_size (`int`, *optional*, defaults to 2):
        The merge size of the vision encoder to llm encoder.
    video_fps (`float`, *optional*, defaults to 1.0):
        Target frames per second for video sampling.
    min_frames (`int`, *optional*, defaults to 1):
        Minimum number of frames to sample from a video.
    max_frames (`int`, *optional*, defaults to 256):
        Maximum number of frames to sample from a video.
    num_extract_threads (`int`, *optional*, defaults to 4):
        Number of threads to use for frame extraction.
    """,
)
class MossVLVideoProcessor(BaseVideoProcessor):
    # Class-level defaults; each can be overridden at construction time via
    # `MossVLVideoProcessorInitKwargs`.
    resample = PILImageResampling.BICUBIC
    # Per-frame pixel bounds used by smart_resize: shortest_edge acts as the
    # per-frame minimum pixel count, longest_edge as the per-frame maximum.
    size = {"shortest_edge": 128 * 32 * 32, "longest_edge": 32 * 32 * 768}
    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    # Vision-encoder patching geometry.
    patch_size = 16
    temporal_patch_size = 1
    merge_size = 2
    # Frame-sampling defaults: sample at `video_fps`, clamped to
    # [min_frames, max_frames].
    video_fps = 1.0
    min_frames = 1
    max_frames = 256
    num_extract_threads = 4
    do_sample_frames = True
    # Total 3D volume budget across all videos; distributed proportionally per video by T*H*W
    video_max_pixels = None  # read from config
    valid_kwargs = MossVLVideoProcessorInitKwargs
    model_input_names = ["pixel_values_videos", "video_grid_thw"]
|
| 475 |
+
|
| 476 |
+
def __init__(self, **kwargs: Unpack[MossVLVideoProcessorInitKwargs]):
|
| 477 |
+
super().__init__(**kwargs)
|
| 478 |
+
if self.size is not None and (
|
| 479 |
+
self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
|
| 480 |
+
):
|
| 481 |
+
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
| 482 |
+
|
| 483 |
+
def _further_process_kwargs(
|
| 484 |
+
self,
|
| 485 |
+
size: Optional[SizeDict] = None,
|
| 486 |
+
**kwargs,
|
| 487 |
+
) -> dict:
|
| 488 |
+
"""
|
| 489 |
+
Update kwargs that need further processing before being validated
|
| 490 |
+
Can be overridden by subclasses to customize the processing of kwargs.
|
| 491 |
+
"""
|
| 492 |
+
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
| 493 |
+
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
| 494 |
+
|
| 495 |
+
return super()._further_process_kwargs(size=size, **kwargs)
|
| 496 |
+
|
| 497 |
+
def _get_video_path_from_input(self, video_input: Union[str, Dict[str, Any]]) -> str:
|
| 498 |
+
"""Normalize a video input into a video path."""
|
| 499 |
+
if isinstance(video_input, dict):
|
| 500 |
+
return video_input["video_path"]
|
| 501 |
+
return video_input
|
| 502 |
+
|
| 503 |
+
def _get_video_duration_seconds(self, video_input: Union[str, Dict[str, Any]]) -> float:
|
| 504 |
+
"""Get video duration in seconds for weighted frame-budget allocation."""
|
| 505 |
+
video_path = clean_video_streams(self._get_video_path_from_input(video_input))
|
| 506 |
+
decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
|
| 507 |
+
try:
|
| 508 |
+
metadata = decoder.metadata
|
| 509 |
+
duration = None
|
| 510 |
+
if (
|
| 511 |
+
metadata.end_stream_seconds_from_content is not None
|
| 512 |
+
and metadata.begin_stream_seconds_from_content is not None
|
| 513 |
+
):
|
| 514 |
+
duration = metadata.end_stream_seconds_from_content - metadata.begin_stream_seconds_from_content
|
| 515 |
+
if duration is None or duration <= 0:
|
| 516 |
+
duration = metadata.duration_seconds
|
| 517 |
+
return max(0.0, float(duration or 0.0))
|
| 518 |
+
finally:
|
| 519 |
+
del decoder
|
| 520 |
+
|
| 521 |
+
def _allocate_max_frames_for_multiple_videos(
|
| 522 |
+
self,
|
| 523 |
+
video_inputs: List[Union[str, Dict[str, Any]]],
|
| 524 |
+
total_max_frames: Optional[int],
|
| 525 |
+
) -> List[Optional[int]]:
|
| 526 |
+
"""
|
| 527 |
+
Treat max_frames as a total budget for multi-video input and allocate it by duration.
|
| 528 |
+
|
| 529 |
+
The returned values are per-video max_frames. Segment dict inputs still keep their
|
| 530 |
+
existing per-segment weighting logic after receiving the video-level allocation.
|
| 531 |
+
"""
|
| 532 |
+
if not video_inputs:
|
| 533 |
+
return []
|
| 534 |
+
if total_max_frames is None or len(video_inputs) == 1:
|
| 535 |
+
return [total_max_frames] * len(video_inputs)
|
| 536 |
+
|
| 537 |
+
total_max_frames = int(total_max_frames)
|
| 538 |
+
num_videos = len(video_inputs)
|
| 539 |
+
if total_max_frames < num_videos:
|
| 540 |
+
logger.warning(
|
| 541 |
+
"Received max_frames=%s for %s videos. At least one frame per video is required, "
|
| 542 |
+
"so falling back to 1 frame per video.",
|
| 543 |
+
total_max_frames,
|
| 544 |
+
num_videos,
|
| 545 |
+
)
|
| 546 |
+
return [1] * num_videos
|
| 547 |
+
|
| 548 |
+
video_durations = [self._get_video_duration_seconds(video_input) for video_input in video_inputs]
|
| 549 |
+
total_duration = sum(video_durations)
|
| 550 |
+
|
| 551 |
+
# Reserve one frame per video first, then distribute the remaining budget by duration.
|
| 552 |
+
allocations = [1] * num_videos
|
| 553 |
+
remaining_budget = total_max_frames - num_videos
|
| 554 |
+
if remaining_budget == 0:
|
| 555 |
+
return allocations
|
| 556 |
+
|
| 557 |
+
if total_duration <= 0:
|
| 558 |
+
raw_extra_allocations = [remaining_budget / num_videos] * num_videos
|
| 559 |
+
else:
|
| 560 |
+
raw_extra_allocations = [
|
| 561 |
+
remaining_budget * (duration / total_duration) for duration in video_durations
|
| 562 |
+
]
|
| 563 |
+
|
| 564 |
+
base_extra_allocations = [int(math.floor(value)) for value in raw_extra_allocations]
|
| 565 |
+
allocations = [base + extra for base, extra in zip(allocations, base_extra_allocations)]
|
| 566 |
+
|
| 567 |
+
remainder = remaining_budget - sum(base_extra_allocations)
|
| 568 |
+
if remainder > 0:
|
| 569 |
+
fractional_parts = [
|
| 570 |
+
(raw_value - base_value, index)
|
| 571 |
+
for index, (raw_value, base_value) in enumerate(zip(raw_extra_allocations, base_extra_allocations))
|
| 572 |
+
]
|
| 573 |
+
fractional_parts.sort(key=lambda item: (-item[0], item[1]))
|
| 574 |
+
for _, index in fractional_parts[:remainder]:
|
| 575 |
+
allocations[index] += 1
|
| 576 |
+
|
| 577 |
+
return allocations
|
| 578 |
+
|
| 579 |
+
def calculate_num_frames(
|
| 580 |
+
self,
|
| 581 |
+
metadata: VideoMetadata,
|
| 582 |
+
num_frames: Optional[int] = None,
|
| 583 |
+
fps: Optional[Union[int, float]] = None,
|
| 584 |
+
min_frames: Optional[int] = None,
|
| 585 |
+
max_frames: Optional[int] = None,
|
| 586 |
+
**kwargs,
|
| 587 |
+
) -> int:
|
| 588 |
+
"""
|
| 589 |
+
Calculate the number of frames to sample using fps-based logic with min/max constraints.
|
| 590 |
+
|
| 591 |
+
Logic:
|
| 592 |
+
1. Calculate target_frames based on fps and video duration
|
| 593 |
+
2. Apply min_frames and max_frames constraints
|
| 594 |
+
3. Apply max_allowed_frames protection (rough cap from total video_max_pixels budget)
|
| 595 |
+
4. Return the number of frames to sample
|
| 596 |
+
|
| 597 |
+
Args:
|
| 598 |
+
metadata (`VideoMetadata`):
|
| 599 |
+
Metadata of the video containing information about total duration, fps and total number of frames.
|
| 600 |
+
num_frames (`int`, *optional*):
|
| 601 |
+
Maximum number of frames to sample. If provided, overrides fps-based calculation.
|
| 602 |
+
fps (`int` or `float`, *optional*):
|
| 603 |
+
Target frames to sample per second. Defaults to `self.video_fps`.
|
| 604 |
+
min_frames (`int`, *optional*):
|
| 605 |
+
Minimum number of frames to sample. If None, uses self.min_frames.
|
| 606 |
+
max_frames (`int`, *optional*):
|
| 607 |
+
Maximum number of frames to sample. If None, uses self.max_frames.
|
| 608 |
+
Returns:
|
| 609 |
+
int:
|
| 610 |
+
Number of frames to sample.
|
| 611 |
+
"""
|
| 612 |
+
if fps is not None and num_frames is not None:
|
| 613 |
+
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
|
| 614 |
+
|
| 615 |
+
total_num_frames = metadata.total_num_frames
|
| 616 |
+
|
| 617 |
+
# Use provided min/max or fall back to defaults
|
| 618 |
+
effective_min_frames = min_frames if min_frames is not None else self.min_frames
|
| 619 |
+
effective_max_frames = max_frames if max_frames is not None else self.max_frames
|
| 620 |
+
|
| 621 |
+
# Rough per-video frame cap derived from the multi-video total budget
|
| 622 |
+
# (exact allocation happens later in _preprocess via weighted distribution)
|
| 623 |
+
per_frame_min_pixels = self.size.get("shortest_edge", None) if self.size else None
|
| 624 |
+
video_max_pixels = getattr(self, "video_max_pixels", None)
|
| 625 |
+
if per_frame_min_pixels is not None and video_max_pixels is not None and per_frame_min_pixels > 0:
|
| 626 |
+
max_allowed_frames = video_max_pixels // per_frame_min_pixels
|
| 627 |
+
effective_max_frames = min(effective_max_frames, max_allowed_frames)
|
| 628 |
+
|
| 629 |
+
# Get video duration
|
| 630 |
+
if hasattr(metadata, 'duration') and metadata.duration is not None:
|
| 631 |
+
duration = metadata.duration
|
| 632 |
+
else:
|
| 633 |
+
video_fps = metadata.fps
|
| 634 |
+
if video_fps is not None and video_fps > 0:
|
| 635 |
+
duration = total_num_frames / video_fps
|
| 636 |
+
else:
|
| 637 |
+
# Fallback: assume 24 fps
|
| 638 |
+
video_fps = 24.0
|
| 639 |
+
duration = total_num_frames / video_fps
|
| 640 |
+
logger.warning_once(
|
| 641 |
+
"Could not determine video fps from metadata, defaulting to 24 fps for duration calculation."
|
| 642 |
+
)
|
| 643 |
+
|
| 644 |
+
# Use provided fps or default
|
| 645 |
+
target_fps = fps if fps is not None else self.video_fps
|
| 646 |
+
|
| 647 |
+
# Calculate target frames based on fps and duration
|
| 648 |
+
if num_frames is None:
|
| 649 |
+
# Calculate how many frames we should sample based on target fps
|
| 650 |
+
target_total_frames = int(math.ceil(duration * target_fps - 1e-6))
|
| 651 |
+
|
| 652 |
+
# Apply min/max constraints
|
| 653 |
+
sample_frames = max(target_total_frames, effective_min_frames)
|
| 654 |
+
sample_frames = min(sample_frames, effective_max_frames, total_num_frames)
|
| 655 |
+
else:
|
| 656 |
+
# If num_frames is explicitly provided, use it directly with constraints
|
| 657 |
+
sample_frames = min(max(num_frames, effective_min_frames), effective_max_frames, total_num_frames)
|
| 658 |
+
|
| 659 |
+
return sample_frames
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
    def _fetch_video_segment(
        self,
        video_path: str,
        segment: List[float],
        min_frames: Optional[int] = None,
        max_frames: Optional[int] = None,
        video_fps: Optional[float] = None,
    ):
        """
        Fetch video frames for a specific segment.

        Args:
            video_path: Path to the video file
            segment: [start, end] for a segment (left-closed, right-open) or [time] for a single frame
            min_frames: Minimum frames for this segment (weighted). Defaults to self.min_frames. Must be >= 1.
            max_frames: Maximum frames for this segment (weighted). Defaults to self.max_frames. Must be >= 1.
            video_fps: Target frames per second for video sampling. If None, uses self.video_fps.

        Returns:
            Tuple of (video_tensor, video_metadata)
        """
        # Use provided min/max or fall back to defaults, ensure >= 1
        min_frames = max(1, min_frames if min_frames is not None else self.min_frames)
        max_frames = max(1, max_frames if max_frames is not None else self.max_frames)
        # Use provided video_fps or fall back to self.video_fps
        target_video_fps = video_fps if video_fps is not None else self.video_fps

        video_path = clean_video_streams(video_path)
        decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
        try:
            torchcodec_metadata = decoder.metadata

            # NOTE: `video_fps` is rebound here to the source stream's average
            # fps (used only for the metadata below); the sampling rate used
            # for timestamp generation is `target_video_fps`.
            video_fps = torchcodec_metadata.average_fps

            # Calculate duration: prefer content-derived stream bounds, fall
            # back to the container-level duration.
            duration = None
            if torchcodec_metadata.end_stream_seconds_from_content is not None and torchcodec_metadata.begin_stream_seconds_from_content is not None:
                duration = torchcodec_metadata.end_stream_seconds_from_content - torchcodec_metadata.begin_stream_seconds_from_content
            if duration is None or duration <= 0:
                duration = torchcodec_metadata.duration_seconds

            if len(segment) == 1:
                # Single frame at specified time
                timestamp = segment[0]
                frame_batch = decoder.get_frames_played_at([timestamp])
                video_tensor = frame_batch.data
                actual_timestamps = [timestamp]
                sample_count = 1
            else:
                # Segment [start, end) - left-closed, right-open interval
                start_time, end_time = segment
                segment_duration = end_time - start_time

                # Calculate number of frames to sample for this segment,
                # clamped to the (already weighted) [min_frames, max_frames].
                target_frames = int(math.ceil(segment_duration * target_video_fps))
                target_frames = max(target_frames, min_frames)
                target_frames = min(target_frames, max_frames)

                # Generate timestamps for uniform sampling within segment
                if target_frames == 1:
                    actual_timestamps = [start_time]  # Use start_time for single frame
                else:
                    # Sample uniformly within [start, end), endpoint=False for left-closed right-open
                    actual_timestamps = np.linspace(start_time, end_time, target_frames, endpoint=False).tolist()

                # Use multithreading for extraction
                result = timestamp_decode_with_multithreading(actual_timestamps, self.num_extract_threads, video_path)
                video_tensor = result["data"]
                sample_count = len(actual_timestamps)

            # Create VideoMetadata describing the sampled frames (not the
            # whole source video): total_num_frames is the sampled count.
            video_metadata = VideoMetadata(
                total_num_frames=sample_count,
                fps=video_fps,
                duration=duration,
                video_backend="torchcodec",
                height=torchcodec_metadata.height,
                width=torchcodec_metadata.width,
                frames_indices=None
            )

            # Store actual timestamps as a custom attribute for _calculate_timestamps to use
            video_metadata.actual_timestamps = actual_timestamps

            return video_tensor, video_metadata
        finally:
            # Release decoder resources promptly, even on failure.
            del decoder
|
| 749 |
+
|
| 750 |
+
def fetch_videos(
|
| 751 |
+
self,
|
| 752 |
+
video_url_or_urls: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]],
|
| 753 |
+
sample_indices_fn=None,
|
| 754 |
+
video_fps: Optional[float] = None,
|
| 755 |
+
min_frames: Optional[int] = None,
|
| 756 |
+
max_frames: Optional[int] = None,
|
| 757 |
+
):
|
| 758 |
+
"""
|
| 759 |
+
Override fetch_videos to use torchcodec for frame extraction.
|
| 760 |
+
|
| 761 |
+
This method uses torchcodec with multithreading for efficient frame extraction.
|
| 762 |
+
Frame count is calculated by the calculate_num_frames method
|
| 763 |
+
(fps-based with min/max constraints).
|
| 764 |
+
|
| 765 |
+
Args:
|
| 766 |
+
video_url_or_urls: Can be one of:
|
| 767 |
+
- str: Single video path
|
| 768 |
+
- Dict: Video with segments {"video_path": str, "segments": List[List[float]]}
|
| 769 |
+
- List[Union[str, Dict]]: List of video paths or segment dicts
|
| 770 |
+
sample_indices_fn: (Not used) Kept for compatibility with base class signature.
|
| 771 |
+
video_fps: Target frames per second for video sampling. If None, uses self.video_fps.
|
| 772 |
+
min_frames: Minimum number of frames to sample. If None, uses self.min_frames.
|
| 773 |
+
max_frames: Maximum number of frames to sample. If None, uses self.max_frames.
|
| 774 |
+
|
| 775 |
+
Returns:
|
| 776 |
+
Tuple of (videos, metadata) where videos are torch.Tensors and metadata are VideoMetadata objects.
|
| 777 |
+
"""
|
| 778 |
+
# Use provided values or fall back to self defaults
|
| 779 |
+
effective_video_fps = video_fps if video_fps is not None else self.video_fps
|
| 780 |
+
effective_min_frames = min_frames if min_frames is not None else self.min_frames
|
| 781 |
+
effective_max_frames = max_frames if max_frames is not None else self.max_frames
|
| 782 |
+
# Handle recursive calls for lists
|
| 783 |
+
if isinstance(video_url_or_urls, list):
|
| 784 |
+
all_videos = []
|
| 785 |
+
all_metadata = []
|
| 786 |
+
if len(video_url_or_urls) == 1:
|
| 787 |
+
per_video_max_frames = [effective_max_frames]
|
| 788 |
+
else:
|
| 789 |
+
per_video_max_frames = self._allocate_max_frames_for_multiple_videos(
|
| 790 |
+
video_url_or_urls,
|
| 791 |
+
effective_max_frames,
|
| 792 |
+
)
|
| 793 |
+
for x, allocated_max_frames in zip(video_url_or_urls, per_video_max_frames):
|
| 794 |
+
result = self.fetch_videos(
|
| 795 |
+
x,
|
| 796 |
+
video_fps=effective_video_fps,
|
| 797 |
+
min_frames=effective_min_frames,
|
| 798 |
+
max_frames=allocated_max_frames,
|
| 799 |
+
)
|
| 800 |
+
# Check if result is from segment expansion (returns lists) or single item
|
| 801 |
+
if isinstance(result[0], list):
|
| 802 |
+
all_videos.extend(result[0])
|
| 803 |
+
all_metadata.extend(result[1])
|
| 804 |
+
else:
|
| 805 |
+
all_videos.append(result[0])
|
| 806 |
+
all_metadata.append(result[1])
|
| 807 |
+
return all_videos, all_metadata
|
| 808 |
+
|
| 809 |
+
# Handle dict with segments - returns lists (one per segment)
|
| 810 |
+
if isinstance(video_url_or_urls, dict):
|
| 811 |
+
video_path = video_url_or_urls["video_path"]
|
| 812 |
+
segments = video_url_or_urls["segments"]
|
| 813 |
+
|
| 814 |
+
# Calculate total duration of all time-range segments (len == 2) for weighted min/max frames
|
| 815 |
+
# Single-frame segments (len == 1) are excluded from weighting
|
| 816 |
+
segment_durations = []
|
| 817 |
+
for seg in segments:
|
| 818 |
+
if len(seg) == 2:
|
| 819 |
+
segment_durations.append(seg[1] - seg[0])
|
| 820 |
+
else:
|
| 821 |
+
segment_durations.append(None) # Single frame, no weighting
|
| 822 |
+
|
| 823 |
+
total_segment_duration = sum(d for d in segment_durations if d is not None)
|
| 824 |
+
|
| 825 |
+
videos = []
|
| 826 |
+
metadata = []
|
| 827 |
+
for i, segment in enumerate(segments):
|
| 828 |
+
if len(segment) == 1:
|
| 829 |
+
# Single frame - no weighted min/max, just extract directly
|
| 830 |
+
video, meta = self._fetch_video_segment(video_path, segment, video_fps=effective_video_fps)
|
| 831 |
+
else:
|
| 832 |
+
# Time-range segment - apply weighted min/max frames
|
| 833 |
+
if total_segment_duration > 0:
|
| 834 |
+
weight = segment_durations[i] / total_segment_duration
|
| 835 |
+
else:
|
| 836 |
+
# Fallback: equal weight among time-range segments
|
| 837 |
+
num_range_segments = sum(1 for d in segment_durations if d is not None)
|
| 838 |
+
weight = 1.0 / num_range_segments if num_range_segments > 0 else 1.0
|
| 839 |
+
|
| 840 |
+
# Calculate weighted min/max frames (ensure >= 1)
|
| 841 |
+
weighted_min_frames = max(1, int(round(effective_min_frames * weight)))
|
| 842 |
+
weighted_max_frames = max(1, int(round(effective_max_frames * weight)))
|
| 843 |
+
|
| 844 |
+
video, meta = self._fetch_video_segment(
|
| 845 |
+
video_path, segment,
|
| 846 |
+
min_frames=weighted_min_frames,
|
| 847 |
+
max_frames=weighted_max_frames,
|
| 848 |
+
video_fps=effective_video_fps,
|
| 849 |
+
)
|
| 850 |
+
videos.append(video)
|
| 851 |
+
metadata.append(meta)
|
| 852 |
+
return videos, metadata
|
| 853 |
+
|
| 854 |
+
# Single video path
|
| 855 |
+
video_path = video_url_or_urls
|
| 856 |
+
|
| 857 |
+
# Clean video streams first (remove extra streams if needed)
|
| 858 |
+
video_path = clean_video_streams(video_path)
|
| 859 |
+
|
| 860 |
+
decoder = None
|
| 861 |
+
try:
|
| 862 |
+
# Create VideoDecoder only once for both metadata and frame extraction
|
| 863 |
+
decoder = VideoDecoder(video_path, num_ffmpeg_threads=0)
|
| 864 |
+
torchcodec_metadata = decoder.metadata
|
| 865 |
+
|
| 866 |
+
duration = None
|
| 867 |
+
if torchcodec_metadata.end_stream_seconds_from_content is not None and torchcodec_metadata.begin_stream_seconds_from_content is not None:
|
| 868 |
+
duration = torchcodec_metadata.end_stream_seconds_from_content - torchcodec_metadata.begin_stream_seconds_from_content
|
| 869 |
+
|
| 870 |
+
if duration is None or duration <= 0:
|
| 871 |
+
duration = torchcodec_metadata.duration_seconds
|
| 872 |
+
|
| 873 |
+
# Use num_frames_from_content for accurate frame count (consistent with extraction)
|
| 874 |
+
total_frames_in_video = torchcodec_metadata.num_frames_from_content
|
| 875 |
+
|
| 876 |
+
# Create VideoMetadata object for sample_frames method
|
| 877 |
+
temp_metadata = VideoMetadata(
|
| 878 |
+
total_num_frames=total_frames_in_video,
|
| 879 |
+
fps=torchcodec_metadata.average_fps,
|
| 880 |
+
duration=duration,
|
| 881 |
+
video_backend="torchcodec",
|
| 882 |
+
height=torchcodec_metadata.height,
|
| 883 |
+
width=torchcodec_metadata.width,
|
| 884 |
+
frames_indices=None
|
| 885 |
+
)
|
| 886 |
+
|
| 887 |
+
# Use calculate_num_frames method to get the number of frames to sample
|
| 888 |
+
sample_frames_count = self.calculate_num_frames(
|
| 889 |
+
temp_metadata,
|
| 890 |
+
fps=effective_video_fps,
|
| 891 |
+
min_frames=effective_min_frames,
|
| 892 |
+
max_frames=effective_max_frames,
|
| 893 |
+
)
|
| 894 |
+
|
| 895 |
+
# Ensure sample count is valid
|
| 896 |
+
effective_sample_count = min(sample_frames_count, total_frames_in_video)
|
| 897 |
+
if effective_sample_count == 0:
|
| 898 |
+
raise ValueError(f"Cannot extract frames: video has 0 frames or specified frame count is 0")
|
| 899 |
+
|
| 900 |
+
# Generate uniform frame indices
|
| 901 |
+
frame_indices = np.linspace(0, total_frames_in_video - 1, effective_sample_count).astype(np.int32)
|
| 902 |
+
# Ensure indices are valid and remove duplicates
|
| 903 |
+
frame_indices = np.unique(np.clip(frame_indices, 0, total_frames_in_video - 1))
|
| 904 |
+
|
| 905 |
+
# Extract frames using multithreading (decoder is created inside each thread for thread safety)
|
| 906 |
+
result = decode_with_multithreading(frame_indices.tolist(), num_threads=self.num_extract_threads, video_path=video_path)
|
| 907 |
+
|
| 908 |
+
# Extract frame tensor (N, C, H, W)
|
| 909 |
+
frames_tensor = result["data"]
|
| 910 |
+
|
| 911 |
+
# Create final VideoMetadata object
|
| 912 |
+
video_metadata = VideoMetadata(
|
| 913 |
+
total_num_frames=len(frame_indices),
|
| 914 |
+
fps=torchcodec_metadata.average_fps,
|
| 915 |
+
duration=duration,
|
| 916 |
+
video_backend="torchcodec",
|
| 917 |
+
height=torchcodec_metadata.height,
|
| 918 |
+
width=torchcodec_metadata.width,
|
| 919 |
+
frames_indices=frame_indices
|
| 920 |
+
)
|
| 921 |
+
|
| 922 |
+
# Ensure frames are in (T, C, H, W) format
|
| 923 |
+
if frames_tensor.dim() == 4: # (N, C, H, W)
|
| 924 |
+
video_tensor = frames_tensor
|
| 925 |
+
else:
|
| 926 |
+
raise ValueError(f"Unexpected frame tensor shape: {frames_tensor.shape}")
|
| 927 |
+
|
| 928 |
+
return video_tensor, video_metadata
|
| 929 |
+
|
| 930 |
+
except Exception as e:
|
| 931 |
+
logger.error(f"Error loading video {video_path}: {e}")
|
| 932 |
+
traceback.print_exc()
|
| 933 |
+
raise ValueError(f"Failed to load video {video_path}: {e}")
|
| 934 |
+
finally:
|
| 935 |
+
if decoder is not None:
|
| 936 |
+
del decoder
|
| 937 |
+
|
| 938 |
+
def _preprocess(
|
| 939 |
+
self,
|
| 940 |
+
videos: list[torch.Tensor],
|
| 941 |
+
do_convert_rgb: bool = True,
|
| 942 |
+
do_resize: bool = True,
|
| 943 |
+
size: Optional[SizeDict] = None,
|
| 944 |
+
interpolation: PILImageResampling = PILImageResampling.BICUBIC,
|
| 945 |
+
do_rescale: bool = True,
|
| 946 |
+
rescale_factor: float = 1 / 255.0,
|
| 947 |
+
do_normalize: bool = True,
|
| 948 |
+
image_mean: Optional[Union[float, list[float]]] = None,
|
| 949 |
+
image_std: Optional[Union[float, list[float]]] = None,
|
| 950 |
+
patch_size: Optional[int] = None,
|
| 951 |
+
temporal_patch_size: Optional[int] = None,
|
| 952 |
+
merge_size: Optional[int] = None,
|
| 953 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
| 954 |
+
**kwargs,
|
| 955 |
+
):
|
| 956 |
+
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
|
| 957 |
+
resized_videos_grouped = {}
|
| 958 |
+
|
| 959 |
+
video_max_pixels = getattr(self, "video_max_pixels", None)
|
| 960 |
+
if video_max_pixels is not None:
|
| 961 |
+
total_volume = sum(
|
| 962 |
+
sv.shape[0] * sv.shape[1] * sv.shape[3] * sv.shape[4]
|
| 963 |
+
for sv in grouped_videos.values()
|
| 964 |
+
)
|
| 965 |
+
else:
|
| 966 |
+
total_volume = 0
|
| 967 |
+
|
| 968 |
+
for shape, stacked_videos in grouped_videos.items():
|
| 969 |
+
B, T, C, H, W = stacked_videos.shape
|
| 970 |
+
num_frames, height, width = T, H, W
|
| 971 |
+
# Convert to RGB if needed (reuse from base class)
|
| 972 |
+
if do_convert_rgb:
|
| 973 |
+
stacked_videos = self.convert_to_rgb(stacked_videos)
|
| 974 |
+
if do_resize:
|
| 975 |
+
if video_max_pixels is not None and total_volume > 0:
|
| 976 |
+
allocated_max_pixels = int(video_max_pixels * (T * H * W) / total_volume)
|
| 977 |
+
else:
|
| 978 |
+
allocated_max_pixels = size.longest_edge
|
| 979 |
+
resized_height, resized_width = smart_resize(
|
| 980 |
+
num_frames=num_frames,
|
| 981 |
+
height=height,
|
| 982 |
+
width=width,
|
| 983 |
+
temporal_factor=temporal_patch_size,
|
| 984 |
+
factor=patch_size * merge_size,
|
| 985 |
+
min_pixels=size.shortest_edge,
|
| 986 |
+
max_pixels=allocated_max_pixels,
|
| 987 |
+
per_frame_min_pixels=size.shortest_edge,
|
| 988 |
+
per_frame_max_pixels=size.longest_edge,
|
| 989 |
+
)
|
| 990 |
+
stacked_videos = stacked_videos.view(B * T, C, H, W)
|
| 991 |
+
stacked_videos = self.resize(
|
| 992 |
+
stacked_videos,
|
| 993 |
+
size=SizeDict(height=resized_height, width=resized_width),
|
| 994 |
+
interpolation=interpolation,
|
| 995 |
+
)
|
| 996 |
+
stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
|
| 997 |
+
resized_videos_grouped[shape] = stacked_videos
|
| 998 |
+
resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
|
| 999 |
+
|
| 1000 |
+
# Group videos by size for further processing
|
| 1001 |
+
# Needed in case do_resize is False, or resize returns videos with different sizes
|
| 1002 |
+
grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
|
| 1003 |
+
processed_videos_grouped = {}
|
| 1004 |
+
processed_grids = {}
|
| 1005 |
+
for shape, stacked_videos in grouped_videos.items():
|
| 1006 |
+
resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
|
| 1007 |
+
|
| 1008 |
+
# Fused rescale and normalize
|
| 1009 |
+
stacked_videos = self.rescale_and_normalize(
|
| 1010 |
+
stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
| 1011 |
+
)
|
| 1012 |
+
patches = stacked_videos
|
| 1013 |
+
|
| 1014 |
+
# Check that videos have `num_frames` divisible by `temporal_patch_size`
|
| 1015 |
+
if patches.shape[1] % temporal_patch_size != 0:
|
| 1016 |
+
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
| 1017 |
+
patches = torch.cat([patches, repeats], dim=1)
|
| 1018 |
+
batch_size, grid_t, channel = patches.shape[:3]
|
| 1019 |
+
grid_t = grid_t // temporal_patch_size
|
| 1020 |
+
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
|
| 1021 |
+
|
| 1022 |
+
patches = patches.view(
|
| 1023 |
+
batch_size,
|
| 1024 |
+
grid_t,
|
| 1025 |
+
temporal_patch_size,
|
| 1026 |
+
channel,
|
| 1027 |
+
grid_h // merge_size,
|
| 1028 |
+
merge_size,
|
| 1029 |
+
patch_size,
|
| 1030 |
+
grid_w // merge_size,
|
| 1031 |
+
merge_size,
|
| 1032 |
+
patch_size,
|
| 1033 |
+
)
|
| 1034 |
+
patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
|
| 1035 |
+
flatten_patches = patches.reshape(
|
| 1036 |
+
batch_size,
|
| 1037 |
+
grid_t * grid_h * grid_w,
|
| 1038 |
+
channel * temporal_patch_size * patch_size * patch_size,
|
| 1039 |
+
)
|
| 1040 |
+
|
| 1041 |
+
processed_videos_grouped[shape] = flatten_patches
|
| 1042 |
+
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
| 1043 |
+
|
| 1044 |
+
processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
|
| 1045 |
+
processed_grids = reorder_videos(processed_grids, grouped_videos_index)
|
| 1046 |
+
pixel_values_videos = torch.cat(processed_videos, dim=0)
|
| 1047 |
+
video_grid_thw = torch.tensor(processed_grids)
|
| 1048 |
+
data = {
|
| 1049 |
+
"pixel_values_videos": pixel_values_videos,
|
| 1050 |
+
"video_grid_thw": video_grid_thw,
|
| 1051 |
+
}
|
| 1052 |
+
|
| 1053 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
| 1054 |
+
|
| 1055 |
+
def preprocess(
    self,
    videos: Union[str, Dict[str, Any], List[Union[str, Dict[str, Any]]]],
    **kwargs,
) -> BatchFeature:
    """Preprocess videos for the model.

    Overrides the base class so that two video input formats are accepted:

    1. A plain path string: ``"path/to/video.mp4"``.
    2. A dict with segments:
       ``{"video_path": "...", "segment": [[start, end], [time], ...]}``.

    Args:
        videos: Video input(s) in one of the supported formats.
        **kwargs: Extra options forwarded to ``_preprocess``.

    Returns:
        BatchFeature with ``pixel_values_videos``, ``video_grid_thw`` and,
        when requested via ``return_metadata``, ``video_metadata``.
    """
    # Reject unknown keyword arguments up front.
    validate_kwargs(
        captured_kwargs=kwargs.keys(),
        valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
    )

    # Fill in any option the caller omitted from the processor's own attributes.
    for option in self.valid_kwargs.__annotations__:
        kwargs.setdefault(option, getattr(self, option, None))

    # Options consumed here rather than handed down to _preprocess.
    return_tensors = kwargs.pop("return_tensors", None)
    return_metadata = kwargs.pop("return_metadata", False)
    input_data_format = kwargs.pop("input_data_format", None)
    device = kwargs.pop("device", None)
    # Metadata generation and frame sampling are handled internally;
    # "data_format" is unused — drop them silently if supplied.
    for consumed in ("video_metadata", "do_sample_frames", "data_format"):
        kwargs.pop(consumed, None)

    # Always operate on a list of videos.
    videos = videos if isinstance(videos, list) else [videos]

    # Per-batch frame-sampling configuration (may be passed explicitly).
    sampling = {
        "video_fps": kwargs.pop("video_fps", None),
        "min_frames": kwargs.pop("min_frames", None),
        "max_frames": kwargs.pop("max_frames", None),
    }

    # fetch_videos understands both string paths and segment dicts.
    video_tensors, video_metadata = self.fetch_videos(videos, **sampling)

    # Convert the decoded frames into model-ready tensors.
    prepared = self._prepare_input_videos(
        videos=video_tensors,
        input_data_format=input_data_format,
        device=device,
    )

    # Normalize and sanity-check the remaining options before delegating.
    kwargs = self._further_process_kwargs(**kwargs)
    self._validate_preprocess_kwargs(**kwargs)

    result = self._preprocess(videos=prepared, return_tensors=return_tensors, **kwargs)

    if return_metadata:
        result["video_metadata"] = video_metadata

    return result
| 1129 |
+
|
| 1130 |
+
|
| 1131 |
+
__all__ = ["MossVLVideoProcessor"]
|
| 1132 |
+
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|