v0.75
Browse files- Modelfile +16 -0
- README.md +11 -9
- chat_template.jinja +7 -0
- config.json +71 -4
- generation_config.json +3 -10
- model-00001-of-00004.safetensors +2 -2
- model-00002-of-00004.safetensors +2 -2
- model-00003-of-00004.safetensors +2 -2
- model-00004-of-00004.safetensors +2 -2
- model.safetensors.index.json +2 -1
- preprocessor_config.json +11 -1
- tokenizer_config.json +3 -2
- video_preprocessor_config.json +45 -0
Modelfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ollama modelfile auto-generated by llamafactory
|
| 2 |
+
|
| 3 |
+
FROM .
|
| 4 |
+
|
| 5 |
+
TEMPLATE """{{ if .System }}<|im_start|>system
|
| 6 |
+
{{ .System }}<|im_end|>
|
| 7 |
+
{{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
|
| 8 |
+
{{ .Content }}<|im_end|>
|
| 9 |
+
<|im_start|>assistant
|
| 10 |
+
{{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
|
| 11 |
+
{{ end }}{{ end }}"""
|
| 12 |
+
|
| 13 |
+
SYSTEM """You are a helpful assistant."""
|
| 14 |
+
|
| 15 |
+
PARAMETER stop "<|im_end|>"
|
| 16 |
+
PARAMETER num_ctx 4096
|
README.md
CHANGED
|
@@ -1,26 +1,27 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
datasets:
|
| 4 |
-
- KwaiVGI/VideoGen-RewardBench
|
| 5 |
-
- TIGER-Lab/GenAI-Bench
|
| 6 |
base_model:
|
| 7 |
-
- Qwen/Qwen2.5-VL-7B-Instruct
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
| 11 |
## Model Summary
|
| 12 |
|
| 13 |
VR-Thinker is the first Multimodal Reward Model utilizing Thinking-with-Image framework.
|
| 14 |
|
| 15 |
For further details, please refer to the following:
|
|
|
|
| 16 |
- 📰 Paper: https://arxiv.org/pdf/2510.10518
|
| 17 |
- 📚 Github: https://github.com/qunzhongwang/vr-thinker
|
| 18 |
- 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
|
| 19 |
|
| 20 |
### Quick Start
|
|
|
|
| 21 |
We provide a sample test interface here:
|
| 22 |
|
| 23 |
-
|
| 24 |
import json
|
| 25 |
import random
|
| 26 |
import torch
|
|
@@ -144,17 +145,18 @@ output_text = processor.batch_decode(
|
|
| 144 |
)
|
| 145 |
print(output_text)
|
| 146 |
|
| 147 |
-
|
| 148 |
|
| 149 |
## Citation
|
|
|
|
| 150 |
```
|
| 151 |
@misc{wang2025vrthinkerboostingvideoreward,
|
| 152 |
-
title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
|
| 153 |
author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
|
| 154 |
year={2025},
|
| 155 |
eprint={2510.10518},
|
| 156 |
archivePrefix={arXiv},
|
| 157 |
primaryClass={cs.CV},
|
| 158 |
-
url={https://arxiv.org/abs/2510.10518},
|
| 159 |
}
|
| 160 |
-
```
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
datasets:
|
| 4 |
+
- KwaiVGI/VideoGen-RewardBench
|
| 5 |
+
- TIGER-Lab/GenAI-Bench
|
| 6 |
base_model:
|
| 7 |
+
- Qwen/Qwen2.5-VL-7B-Instruct
|
| 8 |
---
|
| 9 |
|
|
|
|
| 10 |
## Model Summary
|
| 11 |
|
| 12 |
VR-Thinker is the first Multimodal Reward Model utilizing Thinking-with-Image framework.
|
| 13 |
|
| 14 |
For further details, please refer to the following:
|
| 15 |
+
|
| 16 |
- 📰 Paper: https://arxiv.org/pdf/2510.10518
|
| 17 |
- 📚 Github: https://github.com/qunzhongwang/vr-thinker
|
| 18 |
- 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
|
| 19 |
|
| 20 |
### Quick Start
|
| 21 |
+
|
| 22 |
We provide a sample test interface here:
|
| 23 |
|
| 24 |
+
```python
|
| 25 |
import json
|
| 26 |
import random
|
| 27 |
import torch
|
|
|
|
| 145 |
)
|
| 146 |
print(output_text)
|
| 147 |
|
| 148 |
+
```
|
| 149 |
|
| 150 |
## Citation
|
| 151 |
+
|
| 152 |
```
|
| 153 |
@misc{wang2025vrthinkerboostingvideoreward,
|
| 154 |
+
title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
|
| 155 |
author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
|
| 156 |
year={2025},
|
| 157 |
eprint={2510.10518},
|
| 158 |
archivePrefix={arXiv},
|
| 159 |
primaryClass={cs.CV},
|
| 160 |
+
url={https://arxiv.org/abs/2510.10518},
|
| 161 |
}
|
| 162 |
+
```
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
|
| 2 |
+
You are a helpful assistant.<|im_end|>
|
| 3 |
+
{% endif %}<|im_start|>{{ message['role'] }}
|
| 4 |
+
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
|
| 5 |
+
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
|
| 6 |
+
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
|
| 7 |
+
{% endif %}
|
config.json
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
"Qwen2_5_VLForConditionalGeneration"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
| 6 |
-
"
|
| 7 |
"eos_token_id": 151645,
|
| 8 |
"hidden_act": "silu",
|
| 9 |
"hidden_size": 3584,
|
|
@@ -16,6 +16,7 @@
|
|
| 16 |
"num_attention_heads": 28,
|
| 17 |
"num_hidden_layers": 28,
|
| 18 |
"num_key_value_heads": 4,
|
|
|
|
| 19 |
"rms_norm_eps": 1e-06,
|
| 20 |
"rope_scaling": {
|
| 21 |
"mrope_section": [
|
|
@@ -28,14 +29,80 @@
|
|
| 28 |
},
|
| 29 |
"rope_theta": 1000000.0,
|
| 30 |
"sliding_window": 32768,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
"tie_word_embeddings": false,
|
| 32 |
-
"
|
| 33 |
-
"transformers_version": "4.51.0",
|
| 34 |
"use_cache": false,
|
| 35 |
"use_sliding_window": false,
|
| 36 |
"video_token_id": 151656,
|
| 37 |
"vision_config": {
|
| 38 |
"depth": 32,
|
|
|
|
| 39 |
"fullatt_block_indexes": [
|
| 40 |
7,
|
| 41 |
15,
|
|
@@ -46,6 +113,7 @@
|
|
| 46 |
"hidden_size": 1280,
|
| 47 |
"in_channels": 3,
|
| 48 |
"in_chans": 3,
|
|
|
|
| 49 |
"intermediate_size": 3420,
|
| 50 |
"model_type": "qwen2_5_vl",
|
| 51 |
"num_heads": 16,
|
|
@@ -55,7 +123,6 @@
|
|
| 55 |
"spatial_patch_size": 14,
|
| 56 |
"temporal_patch_size": 2,
|
| 57 |
"tokens_per_second": 2,
|
| 58 |
-
"torch_dtype": "bfloat16",
|
| 59 |
"window_size": 112
|
| 60 |
},
|
| 61 |
"vision_end_token_id": 151653,
|
|
|
|
| 3 |
"Qwen2_5_VLForConditionalGeneration"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
| 6 |
+
"dtype": "float16",
|
| 7 |
"eos_token_id": 151645,
|
| 8 |
"hidden_act": "silu",
|
| 9 |
"hidden_size": 3584,
|
|
|
|
| 16 |
"num_attention_heads": 28,
|
| 17 |
"num_hidden_layers": 28,
|
| 18 |
"num_key_value_heads": 4,
|
| 19 |
+
"pad_token_id": 151643,
|
| 20 |
"rms_norm_eps": 1e-06,
|
| 21 |
"rope_scaling": {
|
| 22 |
"mrope_section": [
|
|
|
|
| 29 |
},
|
| 30 |
"rope_theta": 1000000.0,
|
| 31 |
"sliding_window": 32768,
|
| 32 |
+
"text_config": {
|
| 33 |
+
"_name_or_path": "/u/qw3460/huggingface/CodeGoat24/UnifiedReward-Think-qwen-7b",
|
| 34 |
+
"architectures": [
|
| 35 |
+
"Qwen2_5_VLForConditionalGeneration"
|
| 36 |
+
],
|
| 37 |
+
"attention_dropout": 0.0,
|
| 38 |
+
"dtype": "float16",
|
| 39 |
+
"eos_token_id": 151645,
|
| 40 |
+
"hidden_act": "silu",
|
| 41 |
+
"hidden_size": 3584,
|
| 42 |
+
"initializer_range": 0.02,
|
| 43 |
+
"intermediate_size": 18944,
|
| 44 |
+
"layer_types": [
|
| 45 |
+
"full_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention"
|
| 73 |
+
],
|
| 74 |
+
"max_position_embeddings": 128000,
|
| 75 |
+
"max_window_layers": 28,
|
| 76 |
+
"model_type": "qwen2_5_vl_text",
|
| 77 |
+
"num_attention_heads": 28,
|
| 78 |
+
"num_hidden_layers": 28,
|
| 79 |
+
"num_key_value_heads": 4,
|
| 80 |
+
"pad_token_id": 151643,
|
| 81 |
+
"rms_norm_eps": 1e-06,
|
| 82 |
+
"rope_scaling": {
|
| 83 |
+
"mrope_section": [
|
| 84 |
+
16,
|
| 85 |
+
24,
|
| 86 |
+
24
|
| 87 |
+
],
|
| 88 |
+
"rope_type": "default",
|
| 89 |
+
"type": "default"
|
| 90 |
+
},
|
| 91 |
+
"rope_theta": 1000000.0,
|
| 92 |
+
"sliding_window": null,
|
| 93 |
+
"use_cache": true,
|
| 94 |
+
"use_sliding_window": false,
|
| 95 |
+
"vision_token_id": 151654,
|
| 96 |
+
"vocab_size": 152064
|
| 97 |
+
},
|
| 98 |
"tie_word_embeddings": false,
|
| 99 |
+
"transformers_version": "4.57.0",
|
|
|
|
| 100 |
"use_cache": false,
|
| 101 |
"use_sliding_window": false,
|
| 102 |
"video_token_id": 151656,
|
| 103 |
"vision_config": {
|
| 104 |
"depth": 32,
|
| 105 |
+
"dtype": "float16",
|
| 106 |
"fullatt_block_indexes": [
|
| 107 |
7,
|
| 108 |
15,
|
|
|
|
| 113 |
"hidden_size": 1280,
|
| 114 |
"in_channels": 3,
|
| 115 |
"in_chans": 3,
|
| 116 |
+
"initializer_range": 0.02,
|
| 117 |
"intermediate_size": 3420,
|
| 118 |
"model_type": "qwen2_5_vl",
|
| 119 |
"num_heads": 16,
|
|
|
|
| 123 |
"spatial_patch_size": 14,
|
| 124 |
"temporal_patch_size": 2,
|
| 125 |
"tokens_per_second": 2,
|
|
|
|
| 126 |
"window_size": 112
|
| 127 |
},
|
| 128 |
"vision_end_token_id": 151653,
|
generation_config.json
CHANGED
|
@@ -1,14 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
"do_sample": true,
|
| 5 |
-
"eos_token_id": [
|
| 6 |
-
151645,
|
| 7 |
-
151643
|
| 8 |
-
],
|
| 9 |
"pad_token_id": 151643,
|
| 10 |
-
"
|
| 11 |
-
"temperature": 1e-06,
|
| 12 |
-
"transformers_version": "4.51.0",
|
| 13 |
"use_cache": false
|
| 14 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"eos_token_id": 151645,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"pad_token_id": 151643,
|
| 5 |
+
"transformers_version": "4.57.0",
|
|
|
|
|
|
|
| 6 |
"use_cache": false
|
| 7 |
}
|
model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e50303dfcd3eb8d3a72ed190b561eb3c1c3cc0239867374435da2a0189ae007f
|
| 3 |
+
size 4968242840
|
model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9637600b2ca95d9e652f03b1ae68f0c562a79898482a5c86e8e00608172b51ab
|
| 3 |
+
size 4991495688
|
model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f986cd21096e61027ff75ac1e0723504cccac4a85017b38544a59f901fd45d44
|
| 3 |
+
size 4932750920
|
model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9dfe701842c06f8b698372ecf3e71762081cffc7b95ed5ff23d020d37294ec77
|
| 3 |
+
size 1691924368
|
model.safetensors.index.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
-
"
|
|
|
|
| 4 |
},
|
| 5 |
"weight_map": {
|
| 6 |
"lm_head.weight": "model-00004-of-00004.safetensors",
|
|
|
|
| 1 |
{
|
| 2 |
"metadata": {
|
| 3 |
+
"total_parameters": 8292166656,
|
| 4 |
+
"total_size": 16584333312
|
| 5 |
},
|
| 6 |
"weight_map": {
|
| 7 |
"lm_head.weight": "model-00004-of-00004.safetensors",
|
preprocessor_config.json
CHANGED
|
@@ -1,6 +1,13 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"do_convert_rgb": true,
|
| 3 |
"do_normalize": true,
|
|
|
|
| 4 |
"do_rescale": true,
|
| 5 |
"do_resize": true,
|
| 6 |
"image_mean": [
|
|
@@ -8,19 +15,22 @@
|
|
| 8 |
0.4578275,
|
| 9 |
0.40821073
|
| 10 |
],
|
| 11 |
-
"image_processor_type": "
|
| 12 |
"image_std": [
|
| 13 |
0.26862954,
|
| 14 |
0.26130258,
|
| 15 |
0.27577711
|
| 16 |
],
|
|
|
|
| 17 |
"max_pixels": 12845056,
|
| 18 |
"merge_size": 2,
|
| 19 |
"min_pixels": 3136,
|
|
|
|
| 20 |
"patch_size": 14,
|
| 21 |
"processor_class": "Qwen2_5_VLProcessor",
|
| 22 |
"resample": 3,
|
| 23 |
"rescale_factor": 0.00392156862745098,
|
|
|
|
| 24 |
"size": {
|
| 25 |
"longest_edge": 12845056,
|
| 26 |
"shortest_edge": 3136
|
|
|
|
| 1 |
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"disable_grouping": null,
|
| 7 |
+
"do_center_crop": null,
|
| 8 |
"do_convert_rgb": true,
|
| 9 |
"do_normalize": true,
|
| 10 |
+
"do_pad": null,
|
| 11 |
"do_rescale": true,
|
| 12 |
"do_resize": true,
|
| 13 |
"image_mean": [
|
|
|
|
| 15 |
0.4578275,
|
| 16 |
0.40821073
|
| 17 |
],
|
| 18 |
+
"image_processor_type": "Qwen2VLImageProcessorFast",
|
| 19 |
"image_std": [
|
| 20 |
0.26862954,
|
| 21 |
0.26130258,
|
| 22 |
0.27577711
|
| 23 |
],
|
| 24 |
+
"input_data_format": null,
|
| 25 |
"max_pixels": 12845056,
|
| 26 |
"merge_size": 2,
|
| 27 |
"min_pixels": 3136,
|
| 28 |
+
"pad_size": null,
|
| 29 |
"patch_size": 14,
|
| 30 |
"processor_class": "Qwen2_5_VLProcessor",
|
| 31 |
"resample": 3,
|
| 32 |
"rescale_factor": 0.00392156862745098,
|
| 33 |
+
"return_tensors": null,
|
| 34 |
"size": {
|
| 35 |
"longest_edge": 12845056,
|
| 36 |
"shortest_edge": 3136
|
tokenizer_config.json
CHANGED
|
@@ -195,15 +195,16 @@
|
|
| 195 |
"<|video_pad|>"
|
| 196 |
],
|
| 197 |
"bos_token": null,
|
| 198 |
-
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
| 199 |
"clean_up_tokenization_spaces": false,
|
| 200 |
"eos_token": "<|im_end|>",
|
| 201 |
"errors": "replace",
|
| 202 |
"extra_special_tokens": {},
|
| 203 |
"model_max_length": 131072,
|
| 204 |
"pad_token": "<|endoftext|>",
|
|
|
|
| 205 |
"processor_class": "Qwen2_5_VLProcessor",
|
| 206 |
"split_special_tokens": false,
|
| 207 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 208 |
-
"unk_token": null
|
|
|
|
| 209 |
}
|
|
|
|
| 195 |
"<|video_pad|>"
|
| 196 |
],
|
| 197 |
"bos_token": null,
|
|
|
|
| 198 |
"clean_up_tokenization_spaces": false,
|
| 199 |
"eos_token": "<|im_end|>",
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
"pad_token": "<|endoftext|>",
|
| 204 |
+
"padding_side": "left",
|
| 205 |
"processor_class": "Qwen2_5_VLProcessor",
|
| 206 |
"split_special_tokens": false,
|
| 207 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 208 |
+
"unk_token": null,
|
| 209 |
+
"use_fast": true
|
| 210 |
}
|
video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": null,
|
| 3 |
+
"data_format": "channels_first",
|
| 4 |
+
"default_to_square": true,
|
| 5 |
+
"device": null,
|
| 6 |
+
"do_center_crop": null,
|
| 7 |
+
"do_convert_rgb": true,
|
| 8 |
+
"do_normalize": true,
|
| 9 |
+
"do_rescale": true,
|
| 10 |
+
"do_resize": true,
|
| 11 |
+
"do_sample_frames": false,
|
| 12 |
+
"fps": null,
|
| 13 |
+
"image_mean": [
|
| 14 |
+
0.48145466,
|
| 15 |
+
0.4578275,
|
| 16 |
+
0.40821073
|
| 17 |
+
],
|
| 18 |
+
"image_processor_type": "Qwen2VLImageProcessorFast",
|
| 19 |
+
"image_std": [
|
| 20 |
+
0.26862954,
|
| 21 |
+
0.26130258,
|
| 22 |
+
0.27577711
|
| 23 |
+
],
|
| 24 |
+
"input_data_format": null,
|
| 25 |
+
"max_frames": 768,
|
| 26 |
+
"max_pixels": 12845056,
|
| 27 |
+
"merge_size": 2,
|
| 28 |
+
"min_frames": 4,
|
| 29 |
+
"min_pixels": 3136,
|
| 30 |
+
"num_frames": null,
|
| 31 |
+
"pad_size": null,
|
| 32 |
+
"patch_size": 14,
|
| 33 |
+
"processor_class": "Qwen2_5_VLProcessor",
|
| 34 |
+
"resample": 3,
|
| 35 |
+
"rescale_factor": 0.00392156862745098,
|
| 36 |
+
"return_metadata": false,
|
| 37 |
+
"return_tensors": null,
|
| 38 |
+
"size": {
|
| 39 |
+
"longest_edge": 12845056,
|
| 40 |
+
"shortest_edge": 3136
|
| 41 |
+
},
|
| 42 |
+
"temporal_patch_size": 2,
|
| 43 |
+
"video_metadata": null,
|
| 44 |
+
"video_processor_type": "Qwen2VLVideoProcessor"
|
| 45 |
+
}
|