Safetensors
qwen2_5_vl
qunwang13 committed on
Commit
fe3e695
·
verified ·
1 Parent(s): ec08d7b
Modelfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ollama modelfile auto-generated by llamafactory
2
+
3
+ FROM .
4
+
5
+ TEMPLATE """{{ if .System }}<|im_start|>system
6
+ {{ .System }}<|im_end|>
7
+ {{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
8
+ {{ .Content }}<|im_end|>
9
+ <|im_start|>assistant
10
+ {{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
11
+ {{ end }}{{ end }}"""
12
+
13
+ SYSTEM """You are a helpful assistant."""
14
+
15
+ PARAMETER stop "<|im_end|>"
16
+ PARAMETER num_ctx 4096
README.md CHANGED
@@ -1,26 +1,27 @@
1
  ---
2
  license: mit
3
  datasets:
4
- - KwaiVGI/VideoGen-RewardBench
5
- - TIGER-Lab/GenAI-Bench
6
  base_model:
7
- - Qwen/Qwen2.5-VL-7B-Instruct
8
  ---
9
 
10
-
11
  ## Model Summary
12
 
13
  VR-Thinker is the first Multimodal Reward Model utilizing the Thinking-with-Image framework.
14
 
15
  For further details, please refer to the following:
 
16
  - 📰 Paper: https://arxiv.org/pdf/2510.10518
17
  - 📚 GitHub: https://github.com/qunzhongwang/vr-thinker
18
  - 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
19
 
20
  ### Quick Start
 
21
  We provide a sample test interface here:
22
 
23
- ~~~python
24
  import json
25
  import random
26
  import torch
@@ -144,17 +145,18 @@ output_text = processor.batch_decode(
144
  )
145
  print(output_text)
146
 
147
- ~~~
148
 
149
  ## Citation
 
150
  ```
151
  @misc{wang2025vrthinkerboostingvideoreward,
152
- title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
153
  author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
154
  year={2025},
155
  eprint={2510.10518},
156
  archivePrefix={arXiv},
157
  primaryClass={cs.CV},
158
- url={https://arxiv.org/abs/2510.10518},
159
  }
160
- ```
 
1
  ---
2
  license: mit
3
  datasets:
4
+ - KwaiVGI/VideoGen-RewardBench
5
+ - TIGER-Lab/GenAI-Bench
6
  base_model:
7
+ - Qwen/Qwen2.5-VL-7B-Instruct
8
  ---
9
 
 
10
  ## Model Summary
11
 
12
  VR-Thinker is the first Multimodal Reward Model utilizing the Thinking-with-Image framework.
13
 
14
  For further details, please refer to the following:
15
+
16
  - 📰 Paper: https://arxiv.org/pdf/2510.10518
17
  - 📚 GitHub: https://github.com/qunzhongwang/vr-thinker
18
  - 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
19
 
20
  ### Quick Start
21
+
22
  We provide a sample test interface here:
23
 
24
+ ```python
25
  import json
26
  import random
27
  import torch
 
145
  )
146
  print(output_text)
147
 
148
+ ```
149
 
150
  ## Citation
151
+
152
  ```
153
  @misc{wang2025vrthinkerboostingvideoreward,
154
+ title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
155
  author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
156
  year={2025},
157
  eprint={2510.10518},
158
  archivePrefix={arXiv},
159
  primaryClass={cs.CV},
160
+ url={https://arxiv.org/abs/2510.10518},
161
  }
162
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json CHANGED
@@ -3,7 +3,7 @@
3
  "Qwen2_5_VLForConditionalGeneration"
4
  ],
5
  "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
  "eos_token_id": 151645,
8
  "hidden_act": "silu",
9
  "hidden_size": 3584,
@@ -16,6 +16,7 @@
16
  "num_attention_heads": 28,
17
  "num_hidden_layers": 28,
18
  "num_key_value_heads": 4,
 
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": {
21
  "mrope_section": [
@@ -28,14 +29,80 @@
28
  },
29
  "rope_theta": 1000000.0,
30
  "sliding_window": 32768,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "tie_word_embeddings": false,
32
- "torch_dtype": "bfloat16",
33
- "transformers_version": "4.51.0",
34
  "use_cache": false,
35
  "use_sliding_window": false,
36
  "video_token_id": 151656,
37
  "vision_config": {
38
  "depth": 32,
 
39
  "fullatt_block_indexes": [
40
  7,
41
  15,
@@ -46,6 +113,7 @@
46
  "hidden_size": 1280,
47
  "in_channels": 3,
48
  "in_chans": 3,
 
49
  "intermediate_size": 3420,
50
  "model_type": "qwen2_5_vl",
51
  "num_heads": 16,
@@ -55,7 +123,6 @@
55
  "spatial_patch_size": 14,
56
  "temporal_patch_size": 2,
57
  "tokens_per_second": 2,
58
- "torch_dtype": "bfloat16",
59
  "window_size": 112
60
  },
61
  "vision_end_token_id": 151653,
 
3
  "Qwen2_5_VLForConditionalGeneration"
4
  ],
5
  "attention_dropout": 0.0,
6
+ "dtype": "float16",
7
  "eos_token_id": 151645,
8
  "hidden_act": "silu",
9
  "hidden_size": 3584,
 
16
  "num_attention_heads": 28,
17
  "num_hidden_layers": 28,
18
  "num_key_value_heads": 4,
19
+ "pad_token_id": 151643,
20
  "rms_norm_eps": 1e-06,
21
  "rope_scaling": {
22
  "mrope_section": [
 
29
  },
30
  "rope_theta": 1000000.0,
31
  "sliding_window": 32768,
32
+ "text_config": {
33
+ "_name_or_path": "/u/qw3460/huggingface/CodeGoat24/UnifiedReward-Think-qwen-7b",
34
+ "architectures": [
35
+ "Qwen2_5_VLForConditionalGeneration"
36
+ ],
37
+ "attention_dropout": 0.0,
38
+ "dtype": "float16",
39
+ "eos_token_id": 151645,
40
+ "hidden_act": "silu",
41
+ "hidden_size": 3584,
42
+ "initializer_range": 0.02,
43
+ "intermediate_size": 18944,
44
+ "layer_types": [
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention"
73
+ ],
74
+ "max_position_embeddings": 128000,
75
+ "max_window_layers": 28,
76
+ "model_type": "qwen2_5_vl_text",
77
+ "num_attention_heads": 28,
78
+ "num_hidden_layers": 28,
79
+ "num_key_value_heads": 4,
80
+ "pad_token_id": 151643,
81
+ "rms_norm_eps": 1e-06,
82
+ "rope_scaling": {
83
+ "mrope_section": [
84
+ 16,
85
+ 24,
86
+ 24
87
+ ],
88
+ "rope_type": "default",
89
+ "type": "default"
90
+ },
91
+ "rope_theta": 1000000.0,
92
+ "sliding_window": null,
93
+ "use_cache": true,
94
+ "use_sliding_window": false,
95
+ "vision_token_id": 151654,
96
+ "vocab_size": 152064
97
+ },
98
  "tie_word_embeddings": false,
99
+ "transformers_version": "4.57.0",
 
100
  "use_cache": false,
101
  "use_sliding_window": false,
102
  "video_token_id": 151656,
103
  "vision_config": {
104
  "depth": 32,
105
+ "dtype": "float16",
106
  "fullatt_block_indexes": [
107
  7,
108
  15,
 
113
  "hidden_size": 1280,
114
  "in_channels": 3,
115
  "in_chans": 3,
116
+ "initializer_range": 0.02,
117
  "intermediate_size": 3420,
118
  "model_type": "qwen2_5_vl",
119
  "num_heads": 16,
 
123
  "spatial_patch_size": 14,
124
  "temporal_patch_size": 2,
125
  "tokens_per_second": 2,
 
126
  "window_size": 112
127
  },
128
  "vision_end_token_id": 151653,
generation_config.json CHANGED
@@ -1,14 +1,7 @@
1
  {
2
- "attn_implementation": "flash_attention_2",
3
- "bos_token_id": 151643,
4
- "do_sample": true,
5
- "eos_token_id": [
6
- 151645,
7
- 151643
8
- ],
9
  "pad_token_id": 151643,
10
- "repetition_penalty": 1.05,
11
- "temperature": 1e-06,
12
- "transformers_version": "4.51.0",
13
  "use_cache": false
14
  }
 
1
  {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
 
 
 
 
 
4
  "pad_token_id": 151643,
5
+ "transformers_version": "4.57.0",
 
 
6
  "use_cache": false
7
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:982daf945bb565606bb5bfdbdebea90af99bf6fd0bd53bcafdb9287a9168bdf1
3
- size 4968243304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e50303dfcd3eb8d3a72ed190b561eb3c1c3cc0239867374435da2a0189ae007f
3
+ size 4968242840
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb019a6f5257d1e28a46b1e3bb7b1cc960baf5e3bdc24489f6d5d06c0e37fbba
3
- size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9637600b2ca95d9e652f03b1ae68f0c562a79898482a5c86e8e00608172b51ab
3
+ size 4991495688
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eea859c37ea2936b8856f15f7514832e82acaa76214ef28d778e9e4beb16428e
3
- size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f986cd21096e61027ff75ac1e0723504cccac4a85017b38544a59f901fd45d44
3
+ size 4932750920
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27d0377350f75fb60d987c9f958610a927bfec678d3d48ee4b8a75c0ea10aed7
3
- size 1691924384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dfe701842c06f8b698372ecf3e71762081cffc7b95ed5ff23d020d37294ec77
3
+ size 1691924368
model.safetensors.index.json CHANGED
@@ -1,6 +1,7 @@
1
  {
2
  "metadata": {
3
- "total_size": 16584333824
 
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00004-of-00004.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_parameters": 8292166656,
4
+ "total_size": 16584333312
5
  },
6
  "weight_map": {
7
  "lm_head.weight": "model-00004-of-00004.safetensors",
preprocessor_config.json CHANGED
@@ -1,6 +1,13 @@
1
  {
 
 
 
 
 
 
2
  "do_convert_rgb": true,
3
  "do_normalize": true,
 
4
  "do_rescale": true,
5
  "do_resize": true,
6
  "image_mean": [
@@ -8,19 +15,22 @@
8
  0.4578275,
9
  0.40821073
10
  ],
11
- "image_processor_type": "Qwen2VLImageProcessor",
12
  "image_std": [
13
  0.26862954,
14
  0.26130258,
15
  0.27577711
16
  ],
 
17
  "max_pixels": 12845056,
18
  "merge_size": 2,
19
  "min_pixels": 3136,
 
20
  "patch_size": 14,
21
  "processor_class": "Qwen2_5_VLProcessor",
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
 
24
  "size": {
25
  "longest_edge": 12845056,
26
  "shortest_edge": 3136
 
1
  {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
  "do_convert_rgb": true,
9
  "do_normalize": true,
10
+ "do_pad": null,
11
  "do_rescale": true,
12
  "do_resize": true,
13
  "image_mean": [
 
15
  0.4578275,
16
  0.40821073
17
  ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
  "image_std": [
20
  0.26862954,
21
  0.26130258,
22
  0.27577711
23
  ],
24
+ "input_data_format": null,
25
  "max_pixels": 12845056,
26
  "merge_size": 2,
27
  "min_pixels": 3136,
28
+ "pad_size": null,
29
  "patch_size": 14,
30
  "processor_class": "Qwen2_5_VLProcessor",
31
  "resample": 3,
32
  "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
  "size": {
35
  "longest_edge": 12845056,
36
  "shortest_edge": 3136
tokenizer_config.json CHANGED
@@ -195,15 +195,16 @@
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
198
- "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
199
  "clean_up_tokenization_spaces": false,
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
  "model_max_length": 131072,
204
  "pad_token": "<|endoftext|>",
 
205
  "processor_class": "Qwen2_5_VLProcessor",
206
  "split_special_tokens": false,
207
  "tokenizer_class": "Qwen2Tokenizer",
208
- "unk_token": null
 
209
  }
 
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
 
198
  "clean_up_tokenization_spaces": false,
199
  "eos_token": "<|im_end|>",
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
  "pad_token": "<|endoftext|>",
204
+ "padding_side": "left",
205
  "processor_class": "Qwen2_5_VLProcessor",
206
  "split_special_tokens": false,
207
  "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null,
209
+ "use_fast": true
210
  }
video_preprocessor_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_sample_frames": false,
12
+ "fps": null,
13
+ "image_mean": [
14
+ 0.48145466,
15
+ 0.4578275,
16
+ 0.40821073
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "input_data_format": null,
25
+ "max_frames": 768,
26
+ "max_pixels": 12845056,
27
+ "merge_size": 2,
28
+ "min_frames": 4,
29
+ "min_pixels": 3136,
30
+ "num_frames": null,
31
+ "pad_size": null,
32
+ "patch_size": 14,
33
+ "processor_class": "Qwen2_5_VLProcessor",
34
+ "resample": 3,
35
+ "rescale_factor": 0.00392156862745098,
36
+ "return_metadata": false,
37
+ "return_tensors": null,
38
+ "size": {
39
+ "longest_edge": 12845056,
40
+ "shortest_edge": 3136
41
+ },
42
+ "temporal_patch_size": 2,
43
+ "video_metadata": null,
44
+ "video_processor_type": "Qwen2VLVideoProcessor"
45
+ }