v0.75

Browse files

Files changed (13) hide show

Modelfile +16 -0
README.md +11 -9
chat_template.jinja +7 -0
config.json +71 -4
generation_config.json +3 -10
model-00001-of-00004.safetensors +2 -2
model-00002-of-00004.safetensors +2 -2
model-00003-of-00004.safetensors +2 -2
model-00004-of-00004.safetensors +2 -2
model.safetensors.index.json +2 -1
preprocessor_config.json +11 -1
tokenizer_config.json +3 -2
video_preprocessor_config.json +45 -0

Modelfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# ollama modelfile auto-generated by llamafactory
+FROM .
+TEMPLATE """{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+<|im_start|>assistant
+{{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
+{{ end }}{{ end }}"""
+SYSTEM """You are a helpful assistant."""
+PARAMETER stop "<|im_end|>"
+PARAMETER num_ctx 4096

README.md CHANGED Viewed

@@ -1,26 +1,27 @@
 ---
 license: mit
 datasets:
-- KwaiVGI/VideoGen-RewardBench
-- TIGER-Lab/GenAI-Bench
 base_model:
-- Qwen/Qwen2.5-VL-7B-Instruct
 ---
 ## Model Summary
 VR-Thinker is the first Multimodal Reward Model utilizing Thinking-with-Image framework.
 For further details, please refer to the following:
 - 📰 Paper: https://arxiv.org/pdf/2510.10518
 - 📚 Github: https://github.com/qunzhongwang/vr-thinker
 - 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
 ### Quick Start
 We provide a sample test interface here:
-~~~python
 import json
 import random
 import torch
@@ -144,17 +145,18 @@ output_text = processor.batch_decode(
 )
 print(output_text)
-~~~
 ## Citation
 ```
 @misc{wang2025vrthinkerboostingvideoreward,
-      title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
       author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
       year={2025},
       eprint={2510.10518},
       archivePrefix={arXiv},
       primaryClass={cs.CV},
-      url={https://arxiv.org/abs/2510.10518},
 }
-```

 ---
 license: mit
 datasets:
+  - KwaiVGI/VideoGen-RewardBench
+  - TIGER-Lab/GenAI-Bench
 base_model:
+  - Qwen/Qwen2.5-VL-7B-Instruct
 ---
 ## Model Summary
 VR-Thinker is the first Multimodal Reward Model to utilize the Thinking-with-Image framework.
 For further details, please refer to the following:
 - 📰 Paper: https://arxiv.org/pdf/2510.10518
 - 📚 GitHub: https://github.com/qunzhongwang/vr-thinker
 - 👋 Contact: [Qunzhong Wang](http://qunzhongwang.github.io/)
 ### Quick Start
 We provide a sample test interface here:
+```python
 import json
 import random
 import torch
 )
 print(output_text)
+```
 ## Citation
 ```
 @misc{wang2025vrthinkerboostingvideoreward,
+      title={VR-Thinker: Boosting Video Reward Models through Thinking-with-Image Reasoning},
       author={Qunzhong Wang and Jie Liu and Jiajun Liang and Yilei Jiang and Yuanxing Zhang and Jinyuan Chen and Yaozhi Zheng and Xintao Wang and Pengfei Wan and Xiangyu Yue and Jiaheng Liu},
       year={2025},
       eprint={2510.10518},
       archivePrefix={arXiv},
       primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2510.10518},
 }
+```

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json CHANGED Viewed

@@ -3,7 +3,7 @@
     "Qwen2_5_VLForConditionalGeneration"
   ],
   "attention_dropout": 0.0,
-  "bos_token_id": 151643,
   "eos_token_id": 151645,
   "hidden_act": "silu",
   "hidden_size": 3584,
@@ -16,6 +16,7 @@
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
   "rms_norm_eps": 1e-06,
   "rope_scaling": {
     "mrope_section": [
@@ -28,14 +29,80 @@
   },
   "rope_theta": 1000000.0,
   "sliding_window": 32768,
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.51.0",
   "use_cache": false,
   "use_sliding_window": false,
   "video_token_id": 151656,
   "vision_config": {
     "depth": 32,
     "fullatt_block_indexes": [
       7,
       15,
@@ -46,6 +113,7 @@
     "hidden_size": 1280,
     "in_channels": 3,
     "in_chans": 3,
     "intermediate_size": 3420,
     "model_type": "qwen2_5_vl",
     "num_heads": 16,
@@ -55,7 +123,6 @@
     "spatial_patch_size": 14,
     "temporal_patch_size": 2,
     "tokens_per_second": 2,
-    "torch_dtype": "bfloat16",
     "window_size": 112
   },
   "vision_end_token_id": 151653,

     "Qwen2_5_VLForConditionalGeneration"
   ],
   "attention_dropout": 0.0,
+  "dtype": "float16",
   "eos_token_id": 151645,
   "hidden_act": "silu",
   "hidden_size": 3584,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
+  "pad_token_id": 151643,
   "rms_norm_eps": 1e-06,
   "rope_scaling": {
     "mrope_section": [
   },
   "rope_theta": 1000000.0,
   "sliding_window": 32768,
+  "text_config": {
+    "_name_or_path": "/u/qw3460/huggingface/CodeGoat24/UnifiedReward-Think-qwen-7b",
+    "architectures": [
+      "Qwen2_5_VLForConditionalGeneration"
+    ],
+    "attention_dropout": 0.0,
+    "dtype": "float16",
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 128000,
+    "max_window_layers": 28,
+    "model_type": "qwen2_5_vl_text",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "pad_token_id": 151643,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vision_token_id": 151654,
+    "vocab_size": 152064
+  },
   "tie_word_embeddings": false,
+  "transformers_version": "4.57.0",
   "use_cache": false,
   "use_sliding_window": false,
   "video_token_id": 151656,
   "vision_config": {
     "depth": 32,
+    "dtype": "float16",
     "fullatt_block_indexes": [
       7,
       15,
     "hidden_size": 1280,
     "in_channels": 3,
     "in_chans": 3,
+    "initializer_range": 0.02,
     "intermediate_size": 3420,
     "model_type": "qwen2_5_vl",
     "num_heads": 16,
     "spatial_patch_size": 14,
     "temporal_patch_size": 2,
     "tokens_per_second": 2,
     "window_size": 112
   },
   "vision_end_token_id": 151653,

generation_config.json CHANGED Viewed

@@ -1,14 +1,7 @@
 {
-  "attn_implementation": "flash_attention_2",
-  "bos_token_id": 151643,
-  "do_sample": true,
-  "eos_token_id": [
-    151645,
-    151643
-  ],
   "pad_token_id": 151643,
-  "repetition_penalty": 1.05,
-  "temperature": 1e-06,
-  "transformers_version": "4.51.0",
   "use_cache": false
 }

 {
+  "_from_model_config": true,
+  "eos_token_id": 151645,
   "pad_token_id": 151643,
+  "transformers_version": "4.57.0",
   "use_cache": false
 }

model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:982daf945bb565606bb5bfdbdebea90af99bf6fd0bd53bcafdb9287a9168bdf1
-size 4968243304

 version https://git-lfs.github.com/spec/v1
+oid sha256:e50303dfcd3eb8d3a72ed190b561eb3c1c3cc0239867374435da2a0189ae007f
+size 4968242840

model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb019a6f5257d1e28a46b1e3bb7b1cc960baf5e3bdc24489f6d5d06c0e37fbba
-size 4991495816

 version https://git-lfs.github.com/spec/v1
+oid sha256:9637600b2ca95d9e652f03b1ae68f0c562a79898482a5c86e8e00608172b51ab
+size 4991495688

model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eea859c37ea2936b8856f15f7514832e82acaa76214ef28d778e9e4beb16428e
-size 4932751040

 version https://git-lfs.github.com/spec/v1
+oid sha256:f986cd21096e61027ff75ac1e0723504cccac4a85017b38544a59f901fd45d44
+size 4932750920

model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27d0377350f75fb60d987c9f958610a927bfec678d3d48ee4b8a75c0ea10aed7
-size 1691924384

 version https://git-lfs.github.com/spec/v1
+oid sha256:9dfe701842c06f8b698372ecf3e71762081cffc7b95ed5ff23d020d37294ec77
+size 1691924368

model.safetensors.index.json CHANGED Viewed

@@ -1,6 +1,7 @@
 {
   "metadata": {
-    "total_size": 16584333824
   },
   "weight_map": {
     "lm_head.weight": "model-00004-of-00004.safetensors",

 {
   "metadata": {
+    "total_parameters": 8292166656,
+    "total_size": 16584333312
   },
   "weight_map": {
     "lm_head.weight": "model-00004-of-00004.safetensors",

preprocessor_config.json CHANGED Viewed

@@ -1,6 +1,13 @@
 {
   "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
   "image_mean": [
@@ -8,19 +15,22 @@
     0.4578275,
     0.40821073
   ],
-  "image_processor_type": "Qwen2VLImageProcessor",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
   "max_pixels": 12845056,
   "merge_size": 2,
   "min_pixels": 3136,
   "patch_size": 14,
   "processor_class": "Qwen2_5_VLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
     "longest_edge": 12845056,
     "shortest_edge": 3136

 {
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
   "do_convert_rgb": true,
   "do_normalize": true,
+  "do_pad": null,
   "do_rescale": true,
   "do_resize": true,
   "image_mean": [
     0.4578275,
     0.40821073
   ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
+  "input_data_format": null,
   "max_pixels": 12845056,
   "merge_size": 2,
   "min_pixels": 3136,
+  "pad_size": null,
   "patch_size": 14,
   "processor_class": "Qwen2_5_VLProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
   "size": {
     "longest_edge": 12845056,
     "shortest_edge": 3136

tokenizer_config.json CHANGED Viewed

@@ -195,15 +195,16 @@
     "<|video_pad|>"
   ],
   "bos_token": null,
-  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "processor_class": "Qwen2_5_VLProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
-  "unk_token": null
 }

     "<|video_pad|>"
   ],
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
+  "padding_side": "left",
   "processor_class": "Qwen2_5_VLProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "use_fast": true
 }

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "fps": null,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "num_frames": null,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}