brandonbeiler committed on
Commit 33fd097 · verified · 1 Parent(s): 1d94f42

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,76 @@
1
+ ---
2
+ language:
3
+ - en
4
+ - zh
5
+ tags:
6
+ - fp8
7
+ - quantization
8
+ - dynamic
9
+ - vision-language
10
+ - multimodal
11
+ - vllm
12
+ - llm-compressor
13
+ - skywork_chat
14
+ - Skywork+R1V
15
+ pipeline_tag: image-text-to-text
16
+ inference: false
17
+ license: mit
18
+ ---
19
+ # 🔥 Skywork-R1V3-38B-FP8-Dynamic: Optimized Vision-Language Model 🔥
20
+ This is an **FP8 dynamic quantized** version of [Skywork/Skywork-R1V3-38B](https://huggingface.co/Skywork/Skywork-R1V3-38B), optimized for high-performance inference with vLLM.
21
+ The model uses **dynamic FP8 quantization** for ease of use and deployment, achieving a significant speedup with minimal accuracy degradation on vision-language tasks.
22
+ ## 🚀 Key Features
23
+ - **FP8 Dynamic Quantization**: No calibration required, ready to use immediately
24
+ - **Vision-Language Optimized**: Specialized quantization recipe that preserves visual understanding
25
+ - **vLLM Ready**: Seamless integration with vLLM for production deployment
26
+ - **Memory Efficient**: ~50% memory reduction compared to the FP16 original
27
+ - **Performance Boost**: Significantly faster inference on H100/L40S GPUs
28
+ ## 📊 Model Details
29
+ - **Original Model**: [Skywork/Skywork-R1V3-38B](https://huggingface.co/Skywork/Skywork-R1V3-38B)
30
+ - **Source Model**: Skywork/Skywork-R1V3-38B
31
+ - **Quantized Model**: Skywork-R1V3-38B-FP8-Dynamic
32
+ - **Quantization Method**: FP8 Dynamic (W8A8)
33
+ - **Quantization Library**: [LLM Compressor](https://github.com/vllm-project/llm-compressor) v0.6.1a20250708
34
+ - **Quantized by**: [brandonbeiler](https://huggingface.co/brandonbeiler)
35
+ ## 🔧 Usage
36
+ ### With vLLM (Recommended)
37
+ ```python
38
+ from vllm import LLM, SamplingParams
39
+
40
+ # Load the quantized model
41
+
42
+ model = LLM(
43
+ model="brandonbeiler/Skywork-R1V3-38B-FP8-Dynamic",
44
+ tensor_parallel_size=1, # Adjust based on your GPU setup
45
+ limit_mm_per_prompt={"image": 20},
46
+ trust_remote_code=True, # required for older versions of vLLM
47
+ max_model_len=32768, # Decrease if you run into memory issues
48
+ gpu_memory_utilization=0.8, # Adjust based on your GPU memory
49
+ )
50
+
51
+ # Generate response
52
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=8000) # adjust temperature as desired
53
+ response = model.generate("Describe this image: <image>", sampling_params)
54
+ print(response[0].outputs[0].text)
55
+ ```
56
+
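+ The example above only shows the call shape; to attach a real image, vLLM's offline API takes the image alongside the prompt via `multi_modal_data`. The following is an illustrative sketch rather than a verified recipe for this exact checkpoint: `example.jpg` is a placeholder path, and the exact `<image>` placeholder handling ultimately follows the model's chat template.
+
+ ```python
+ from PIL import Image
+
+ # Hypothetical local file; replace with your own image
+ image = Image.open("example.jpg").convert("RGB")
+
+ outputs = model.generate(
+     {
+         "prompt": "Describe this image: <image>",
+         "multi_modal_data": {"image": image},
+     },
+     sampling_params,
+ )
+ print(outputs[0].outputs[0].text)
+ ```
+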
57
+ ## 🏗️ Technical Specifications
58
+ ### Hardware Requirements
59
+ - **Inference**: ? VRAM (plus additional VRAM for context)
60
+ - **Supported GPUs**: H100, L40S, A100 (80GB), RTX 4090 (2x for tensor parallelism)
61
+ - **GPU Architecture**: Ada Lovelace, Hopper (for optimal FP8 performance)
62
+ ### Quantization Details
63
+ - **Weights**: FP8 E4M3 with static per-channel scales
64
+ - **Activations**: FP8 E4M3 with dynamic per-token scales
65
+ - **Preserved Components**: Vision tower, embeddings, normalization layers, lm_head, and the mlp1 projector (see the reproduction sketch below)
66
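+
+ A rough sketch of how such a checkpoint can be produced with LLM Compressor, mirroring the `recipe.yaml` shipped in this repository (illustrative only, not a verbatim record of the actual quantization run; loading of the source model is elided):
+
+ ```python
+ from llmcompressor import oneshot  # in older releases: from llmcompressor.transformers import oneshot
+ from llmcompressor.modifiers.quantization import QuantizationModifier
+
+ # FP8 dynamic (W8A8) on all Linear layers, keeping the vision tower,
+ # lm_head and the mlp1 projector in higher precision (matches recipe.yaml)
+ recipe = QuantizationModifier(
+     targets="Linear",
+     scheme="FP8_DYNAMIC",
+     ignore=["re:.*lm_head", "re:.*vision.*", "re:mlp1.*"],
+ )
+
+ # `base_model` is the original Skywork/Skywork-R1V3-38B loaded in BF16;
+ # no calibration data is required for the dynamic scheme
+ oneshot(model=base_model, recipe=recipe)
+ base_model.save_pretrained("Skywork-R1V3-38B-FP8-Dynamic", save_compressed=True)
+ ```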
+ ## 🔬 Package Versions
67
+ This model was created using:
68
+ ```
69
+ llmcompressor==0.6.1a20250708
70
+ compressed-tensors==latest
71
+ transformers==4.52.4
72
+ torch==2.7.0
73
+ vllm==0.9.2
74
+ ```
75
+
76
+ *Quantized with ❤️ using LLM Compressor for the open-source community*
added_tokens.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151666,
4
+ "</quad>": 151669,
5
+ "</ref>": 151671,
6
+ "</tool_call>": 151658,
7
+ "<IMG_CONTEXT>": 151667,
8
+ "<box>": 151672,
9
+ "<img>": 151665,
10
+ "<quad>": 151668,
11
+ "<ref>": 151670,
12
+ "<tool_call>": 151657,
13
+ "<|box_end|>": 151649,
14
+ "<|box_start|>": 151648,
15
+ "<|endoftext|>": 151643,
16
+ "<|file_sep|>": 151664,
17
+ "<|fim_middle|>": 151660,
18
+ "<|fim_pad|>": 151662,
19
+ "<|fim_prefix|>": 151659,
20
+ "<|fim_suffix|>": 151661,
21
+ "<|im_end|>": 151645,
22
+ "<|im_start|>": 151644,
23
+ "<|image_pad|>": 151655,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|video_pad|>": 151656,
30
+ "<|vision_end|>": 151653,
31
+ "<|vision_pad|>": 151654,
32
+ "<|vision_start|>": 151652
33
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'FIRST, think through the problem step-by-step. Explain each step clearly, including any relevant concepts or formulas. Reflect on why each step is necessary and check for potential errors. Consider alternative approaches and justify the chosen method. Enclose this entire reasoning process within <think></think> tags. THEN, provide the final answer enclosed in \boxed{}.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n<think>\n' }}
54
+ {%- endif %}
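
For reference, a minimal sketch of rendering this template through the standard `transformers` API (the message content is an arbitrary example): with `add_generation_prompt=True` the rendered prompt ends with the `<|im_start|>assistant\n<think>\n` prefix defined at the bottom of the template.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("brandonbeiler/Skywork-R1V3-38B-FP8-Dynamic")

messages = [{"role": "user", "content": "Describe this image: <image>"}]

# Renders chat_template.jinja; ends with "<|im_start|>assistant\n<think>\n"
# because add_generation_prompt=True
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```
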
config.json ADDED
@@ -0,0 +1,308 @@
1
+ {
2
+ "architectures": [
3
+ "SkyworkR1VChatModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "Skywork/Skywork-R1V3-38B--configuration_skywork_chat.SkyworkChatConfig",
7
+ "AutoModel": "Skywork/Skywork-R1V3-38B--modeling_skywork_chat.SkyworkChatModel",
8
+ "AutoModelForCausalLM": "Skywork/Skywork-R1V3-38B--modeling_skywork_chat.SkyworkChatModel"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 448,
13
+ "freeze_adapter": false,
14
+ "freeze_llm": false,
15
+ "freeze_vision": false,
16
+ "hidden_size": 5120,
17
+ "llm_config": {
18
+ "architectures": [
19
+ "Qwen2ForCausalLM"
20
+ ],
21
+ "attention_dropout": 0.0,
22
+ "attn_implementation": "flash_attention_2",
23
+ "bos_token_id": 151643,
24
+ "eos_token_id": 151645,
25
+ "hidden_act": "silu",
26
+ "hidden_size": 5120,
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 27648,
29
+ "max_position_embeddings": 32768,
30
+ "max_window_layers": 70,
31
+ "model_type": "qwen2",
32
+ "num_attention_heads": 40,
33
+ "num_hidden_layers": 64,
34
+ "num_key_value_heads": 8,
35
+ "rms_norm_eps": 1e-06,
36
+ "rope_scaling": null,
37
+ "rope_theta": 1000000.0,
38
+ "sliding_window": 131072,
39
+ "torch_dtype": "bfloat16",
40
+ "use_bfloat16": true,
41
+ "use_cache": false,
42
+ "use_sliding_window": false,
43
+ "vocab_size": 151674
44
+ },
45
+ "max_dynamic_patch": 6,
46
+ "min_dynamic_patch": 1,
47
+ "model_type": "skywork_chat",
48
+ "pad2square": false,
49
+ "ps_version": "v2",
50
+ "quantization_config": {
51
+ "config_groups": {
52
+ "group_0": {
53
+ "input_activations": {
54
+ "actorder": null,
55
+ "block_structure": null,
56
+ "dynamic": true,
57
+ "group_size": null,
58
+ "num_bits": 8,
59
+ "observer": null,
60
+ "observer_kwargs": {},
61
+ "strategy": "token",
62
+ "symmetric": true,
63
+ "type": "float"
64
+ },
65
+ "output_activations": null,
66
+ "targets": [
67
+ "Linear"
68
+ ],
69
+ "weights": {
70
+ "actorder": null,
71
+ "block_structure": null,
72
+ "dynamic": false,
73
+ "group_size": null,
74
+ "num_bits": 8,
75
+ "observer": "minmax",
76
+ "observer_kwargs": {},
77
+ "strategy": "channel",
78
+ "symmetric": true,
79
+ "type": "float"
80
+ }
81
+ }
82
+ },
83
+ "format": "float-quantized",
84
+ "global_compression_ratio": null,
85
+ "ignore": [
86
+ "vision_model.encoder.layers.0.attn.qkv",
87
+ "vision_model.encoder.layers.0.attn.proj",
88
+ "vision_model.encoder.layers.0.mlp.fc1",
89
+ "vision_model.encoder.layers.0.mlp.fc2",
90
+ "vision_model.encoder.layers.1.attn.qkv",
91
+ "vision_model.encoder.layers.1.attn.proj",
92
+ "vision_model.encoder.layers.1.mlp.fc1",
93
+ "vision_model.encoder.layers.1.mlp.fc2",
94
+ "vision_model.encoder.layers.2.attn.qkv",
95
+ "vision_model.encoder.layers.2.attn.proj",
96
+ "vision_model.encoder.layers.2.mlp.fc1",
97
+ "vision_model.encoder.layers.2.mlp.fc2",
98
+ "vision_model.encoder.layers.3.attn.qkv",
99
+ "vision_model.encoder.layers.3.attn.proj",
100
+ "vision_model.encoder.layers.3.mlp.fc1",
101
+ "vision_model.encoder.layers.3.mlp.fc2",
102
+ "vision_model.encoder.layers.4.attn.qkv",
103
+ "vision_model.encoder.layers.4.attn.proj",
104
+ "vision_model.encoder.layers.4.mlp.fc1",
105
+ "vision_model.encoder.layers.4.mlp.fc2",
106
+ "vision_model.encoder.layers.5.attn.qkv",
107
+ "vision_model.encoder.layers.5.attn.proj",
108
+ "vision_model.encoder.layers.5.mlp.fc1",
109
+ "vision_model.encoder.layers.5.mlp.fc2",
110
+ "vision_model.encoder.layers.6.attn.qkv",
111
+ "vision_model.encoder.layers.6.attn.proj",
112
+ "vision_model.encoder.layers.6.mlp.fc1",
113
+ "vision_model.encoder.layers.6.mlp.fc2",
114
+ "vision_model.encoder.layers.7.attn.qkv",
115
+ "vision_model.encoder.layers.7.attn.proj",
116
+ "vision_model.encoder.layers.7.mlp.fc1",
117
+ "vision_model.encoder.layers.7.mlp.fc2",
118
+ "vision_model.encoder.layers.8.attn.qkv",
119
+ "vision_model.encoder.layers.8.attn.proj",
120
+ "vision_model.encoder.layers.8.mlp.fc1",
121
+ "vision_model.encoder.layers.8.mlp.fc2",
122
+ "vision_model.encoder.layers.9.attn.qkv",
123
+ "vision_model.encoder.layers.9.attn.proj",
124
+ "vision_model.encoder.layers.9.mlp.fc1",
125
+ "vision_model.encoder.layers.9.mlp.fc2",
126
+ "vision_model.encoder.layers.10.attn.qkv",
127
+ "vision_model.encoder.layers.10.attn.proj",
128
+ "vision_model.encoder.layers.10.mlp.fc1",
129
+ "vision_model.encoder.layers.10.mlp.fc2",
130
+ "vision_model.encoder.layers.11.attn.qkv",
131
+ "vision_model.encoder.layers.11.attn.proj",
132
+ "vision_model.encoder.layers.11.mlp.fc1",
133
+ "vision_model.encoder.layers.11.mlp.fc2",
134
+ "vision_model.encoder.layers.12.attn.qkv",
135
+ "vision_model.encoder.layers.12.attn.proj",
136
+ "vision_model.encoder.layers.12.mlp.fc1",
137
+ "vision_model.encoder.layers.12.mlp.fc2",
138
+ "vision_model.encoder.layers.13.attn.qkv",
139
+ "vision_model.encoder.layers.13.attn.proj",
140
+ "vision_model.encoder.layers.13.mlp.fc1",
141
+ "vision_model.encoder.layers.13.mlp.fc2",
142
+ "vision_model.encoder.layers.14.attn.qkv",
143
+ "vision_model.encoder.layers.14.attn.proj",
144
+ "vision_model.encoder.layers.14.mlp.fc1",
145
+ "vision_model.encoder.layers.14.mlp.fc2",
146
+ "vision_model.encoder.layers.15.attn.qkv",
147
+ "vision_model.encoder.layers.15.attn.proj",
148
+ "vision_model.encoder.layers.15.mlp.fc1",
149
+ "vision_model.encoder.layers.15.mlp.fc2",
150
+ "vision_model.encoder.layers.16.attn.qkv",
151
+ "vision_model.encoder.layers.16.attn.proj",
152
+ "vision_model.encoder.layers.16.mlp.fc1",
153
+ "vision_model.encoder.layers.16.mlp.fc2",
154
+ "vision_model.encoder.layers.17.attn.qkv",
155
+ "vision_model.encoder.layers.17.attn.proj",
156
+ "vision_model.encoder.layers.17.mlp.fc1",
157
+ "vision_model.encoder.layers.17.mlp.fc2",
158
+ "vision_model.encoder.layers.18.attn.qkv",
159
+ "vision_model.encoder.layers.18.attn.proj",
160
+ "vision_model.encoder.layers.18.mlp.fc1",
161
+ "vision_model.encoder.layers.18.mlp.fc2",
162
+ "vision_model.encoder.layers.19.attn.qkv",
163
+ "vision_model.encoder.layers.19.attn.proj",
164
+ "vision_model.encoder.layers.19.mlp.fc1",
165
+ "vision_model.encoder.layers.19.mlp.fc2",
166
+ "vision_model.encoder.layers.20.attn.qkv",
167
+ "vision_model.encoder.layers.20.attn.proj",
168
+ "vision_model.encoder.layers.20.mlp.fc1",
169
+ "vision_model.encoder.layers.20.mlp.fc2",
170
+ "vision_model.encoder.layers.21.attn.qkv",
171
+ "vision_model.encoder.layers.21.attn.proj",
172
+ "vision_model.encoder.layers.21.mlp.fc1",
173
+ "vision_model.encoder.layers.21.mlp.fc2",
174
+ "vision_model.encoder.layers.22.attn.qkv",
175
+ "vision_model.encoder.layers.22.attn.proj",
176
+ "vision_model.encoder.layers.22.mlp.fc1",
177
+ "vision_model.encoder.layers.22.mlp.fc2",
178
+ "vision_model.encoder.layers.23.attn.qkv",
179
+ "vision_model.encoder.layers.23.attn.proj",
180
+ "vision_model.encoder.layers.23.mlp.fc1",
181
+ "vision_model.encoder.layers.23.mlp.fc2",
182
+ "vision_model.encoder.layers.24.attn.qkv",
183
+ "vision_model.encoder.layers.24.attn.proj",
184
+ "vision_model.encoder.layers.24.mlp.fc1",
185
+ "vision_model.encoder.layers.24.mlp.fc2",
186
+ "vision_model.encoder.layers.25.attn.qkv",
187
+ "vision_model.encoder.layers.25.attn.proj",
188
+ "vision_model.encoder.layers.25.mlp.fc1",
189
+ "vision_model.encoder.layers.25.mlp.fc2",
190
+ "vision_model.encoder.layers.26.attn.qkv",
191
+ "vision_model.encoder.layers.26.attn.proj",
192
+ "vision_model.encoder.layers.26.mlp.fc1",
193
+ "vision_model.encoder.layers.26.mlp.fc2",
194
+ "vision_model.encoder.layers.27.attn.qkv",
195
+ "vision_model.encoder.layers.27.attn.proj",
196
+ "vision_model.encoder.layers.27.mlp.fc1",
197
+ "vision_model.encoder.layers.27.mlp.fc2",
198
+ "vision_model.encoder.layers.28.attn.qkv",
199
+ "vision_model.encoder.layers.28.attn.proj",
200
+ "vision_model.encoder.layers.28.mlp.fc1",
201
+ "vision_model.encoder.layers.28.mlp.fc2",
202
+ "vision_model.encoder.layers.29.attn.qkv",
203
+ "vision_model.encoder.layers.29.attn.proj",
204
+ "vision_model.encoder.layers.29.mlp.fc1",
205
+ "vision_model.encoder.layers.29.mlp.fc2",
206
+ "vision_model.encoder.layers.30.attn.qkv",
207
+ "vision_model.encoder.layers.30.attn.proj",
208
+ "vision_model.encoder.layers.30.mlp.fc1",
209
+ "vision_model.encoder.layers.30.mlp.fc2",
210
+ "vision_model.encoder.layers.31.attn.qkv",
211
+ "vision_model.encoder.layers.31.attn.proj",
212
+ "vision_model.encoder.layers.31.mlp.fc1",
213
+ "vision_model.encoder.layers.31.mlp.fc2",
214
+ "vision_model.encoder.layers.32.attn.qkv",
215
+ "vision_model.encoder.layers.32.attn.proj",
216
+ "vision_model.encoder.layers.32.mlp.fc1",
217
+ "vision_model.encoder.layers.32.mlp.fc2",
218
+ "vision_model.encoder.layers.33.attn.qkv",
219
+ "vision_model.encoder.layers.33.attn.proj",
220
+ "vision_model.encoder.layers.33.mlp.fc1",
221
+ "vision_model.encoder.layers.33.mlp.fc2",
222
+ "vision_model.encoder.layers.34.attn.qkv",
223
+ "vision_model.encoder.layers.34.attn.proj",
224
+ "vision_model.encoder.layers.34.mlp.fc1",
225
+ "vision_model.encoder.layers.34.mlp.fc2",
226
+ "vision_model.encoder.layers.35.attn.qkv",
227
+ "vision_model.encoder.layers.35.attn.proj",
228
+ "vision_model.encoder.layers.35.mlp.fc1",
229
+ "vision_model.encoder.layers.35.mlp.fc2",
230
+ "vision_model.encoder.layers.36.attn.qkv",
231
+ "vision_model.encoder.layers.36.attn.proj",
232
+ "vision_model.encoder.layers.36.mlp.fc1",
233
+ "vision_model.encoder.layers.36.mlp.fc2",
234
+ "vision_model.encoder.layers.37.attn.qkv",
235
+ "vision_model.encoder.layers.37.attn.proj",
236
+ "vision_model.encoder.layers.37.mlp.fc1",
237
+ "vision_model.encoder.layers.37.mlp.fc2",
238
+ "vision_model.encoder.layers.38.attn.qkv",
239
+ "vision_model.encoder.layers.38.attn.proj",
240
+ "vision_model.encoder.layers.38.mlp.fc1",
241
+ "vision_model.encoder.layers.38.mlp.fc2",
242
+ "vision_model.encoder.layers.39.attn.qkv",
243
+ "vision_model.encoder.layers.39.attn.proj",
244
+ "vision_model.encoder.layers.39.mlp.fc1",
245
+ "vision_model.encoder.layers.39.mlp.fc2",
246
+ "vision_model.encoder.layers.40.attn.qkv",
247
+ "vision_model.encoder.layers.40.attn.proj",
248
+ "vision_model.encoder.layers.40.mlp.fc1",
249
+ "vision_model.encoder.layers.40.mlp.fc2",
250
+ "vision_model.encoder.layers.41.attn.qkv",
251
+ "vision_model.encoder.layers.41.attn.proj",
252
+ "vision_model.encoder.layers.41.mlp.fc1",
253
+ "vision_model.encoder.layers.41.mlp.fc2",
254
+ "vision_model.encoder.layers.42.attn.qkv",
255
+ "vision_model.encoder.layers.42.attn.proj",
256
+ "vision_model.encoder.layers.42.mlp.fc1",
257
+ "vision_model.encoder.layers.42.mlp.fc2",
258
+ "vision_model.encoder.layers.43.attn.qkv",
259
+ "vision_model.encoder.layers.43.attn.proj",
260
+ "vision_model.encoder.layers.43.mlp.fc1",
261
+ "vision_model.encoder.layers.43.mlp.fc2",
262
+ "vision_model.encoder.layers.44.attn.qkv",
263
+ "vision_model.encoder.layers.44.attn.proj",
264
+ "vision_model.encoder.layers.44.mlp.fc1",
265
+ "vision_model.encoder.layers.44.mlp.fc2",
266
+ "language_model.lm_head",
267
+ "mlp1.1",
268
+ "mlp1.3"
269
+ ],
270
+ "kv_cache_scheme": null,
271
+ "quant_method": "compressed-tensors",
272
+ "quantization_status": "compressed"
273
+ },
274
+ "select_layer": -1,
275
+ "template": "skywork-r1v-chat",
276
+ "tie_word_embeddings": false,
277
+ "torch_dtype": "bfloat16",
278
+ "transformers_version": null,
279
+ "use_backbone_lora": 0,
280
+ "use_llm_lora": 0,
281
+ "use_thumbnail": true,
282
+ "vision_config": {
283
+ "architectures": [
284
+ "InternVisionModel"
285
+ ],
286
+ "attention_dropout": 0.0,
287
+ "drop_path_rate": 0.1,
288
+ "dropout": 0.0,
289
+ "hidden_act": "gelu",
290
+ "hidden_size": 3200,
291
+ "image_size": 448,
292
+ "initializer_factor": 0.1,
293
+ "initializer_range": 1e-10,
294
+ "intermediate_size": 12800,
295
+ "layer_norm_eps": 1e-06,
296
+ "model_type": "",
297
+ "norm_type": "rms_norm",
298
+ "num_attention_heads": 25,
299
+ "num_channels": 3,
300
+ "num_hidden_layers": 45,
301
+ "patch_size": 14,
302
+ "qk_normalization": true,
303
+ "qkv_bias": false,
304
+ "torch_dtype": "bfloat16",
305
+ "use_bfloat16": true,
306
+ "use_flash_attn": true
307
+ }
308
+ }
configuration_skywork_chat.py ADDED
@@ -0,0 +1,91 @@
1
+ import copy
2
+
3
+ from transformers import AutoConfig, LlamaConfig
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.utils import logging
6
+
7
+ from .configuration_skywork_vit import SkyworkVisionConfig
8
+ from .configuration_skywork_lm2 import SkyworkLM2Config
9
+ from transformers import Qwen2Config, Qwen2ForCausalLM
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ class SkyworkChatConfig(PretrainedConfig):
15
+ model_type = 'skywork_chat'
16
+ is_composition = True
17
+
18
+ def __init__(
19
+ self,
20
+ vision_config=None,
21
+ llm_config=None,
22
+ use_backbone_lora=0,
23
+ use_llm_lora=0,
24
+ select_layer=-1,
25
+ force_image_size=None,
26
+ downsample_ratio=0.5,
27
+ template=None,
28
+ dynamic_image_size=False,
29
+ use_thumbnail=False,
30
+ ps_version='v1',
31
+ min_dynamic_patch=1,
32
+ max_dynamic_patch=6,
33
+ **kwargs):
34
+ super().__init__(**kwargs)
35
+ if vision_config is None:
36
+ vision_config = {'architectures': ['SkyworkVisionModel']}
37
+ logger.info('vision_config is None. Initializing the SkyworkVisionConfig with default values.')
38
+
39
+ if llm_config is None:
40
+ llm_config = {'architectures': ['Qwen2ForCausalLM']}
41
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2ForCausalLM` architecture).')
42
+
43
+ self.vision_config = SkyworkVisionConfig(**vision_config)
44
+ if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
45
+ self.llm_config = LlamaConfig(**llm_config)
46
+ elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM':
47
+ self.llm_config = Qwen2Config(**llm_config)
48
+ else:
49
+ raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
50
+
51
+
52
+ self.use_backbone_lora = use_backbone_lora
53
+ self.use_llm_lora = use_llm_lora
54
+ self.select_layer = select_layer
55
+ self.force_image_size = force_image_size
56
+ self.downsample_ratio = downsample_ratio
57
+ self.template = template
58
+ self.dynamic_image_size = dynamic_image_size
59
+ self.use_thumbnail = use_thumbnail
60
+ self.ps_version = ps_version # pixel shuffle version
61
+ self.min_dynamic_patch = min_dynamic_patch
62
+ self.max_dynamic_patch = max_dynamic_patch
63
+
64
+ logger.info(f'vision_select_layer: {self.select_layer}')
65
+ logger.info(f'ps_version: {self.ps_version}')
66
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
67
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
68
+
69
+ def to_dict(self):
70
+ """
71
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
72
+ Returns:
73
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
74
+ """
75
+ output = copy.deepcopy(self.__dict__)
76
+ output['vision_config'] = self.vision_config.to_dict()
77
+ output['llm_config'] = self.llm_config.to_dict()
78
+ output['model_type'] = self.__class__.model_type
79
+ output['use_backbone_lora'] = self.use_backbone_lora
80
+ output['use_llm_lora'] = self.use_llm_lora
81
+ output['select_layer'] = self.select_layer
82
+ output['force_image_size'] = self.force_image_size
83
+ output['downsample_ratio'] = self.downsample_ratio
84
+ output['template'] = self.template
85
+ output['dynamic_image_size'] = self.dynamic_image_size
86
+ output['use_thumbnail'] = self.use_thumbnail
87
+ output['ps_version'] = self.ps_version
88
+ output['min_dynamic_patch'] = self.min_dynamic_patch
89
+ output['max_dynamic_patch'] = self.max_dynamic_patch
90
+
91
+ return output
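
As a quick, illustrative sanity check (not part of the committed files), the configuration class above is reachable through the `auto_map` entries in `config.json`:

```python
from transformers import AutoConfig

# trust_remote_code pulls configuration_skywork_chat.py via the
# auto_map entry in config.json
config = AutoConfig.from_pretrained(
    "brandonbeiler/Skywork-R1V3-38B-FP8-Dynamic",
    trust_remote_code=True,
)
print(config.model_type)                # skywork_chat
print(config.llm_config.architectures)  # ['Qwen2ForCausalLM']
```
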
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.52.4"
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f5eb4f056eedb2060883a1fd8c6c98a94241370888ee26485a477b4beb5f011
3
+ size 4988569440
model-00002-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37c5d0d80a6bc831595b1b15a8c2200dcab17247bdde8abd634b4bf4a455257b
3
+ size 4937253584
model-00003-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9077f7ab6996897805edf72ff0cc599dd57906a37902421038951189c9d49cb
3
+ size 4997644696
model-00004-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1807f64a9d47085f8191976aafee778c367266a7f52ea1995f9dd85451a4af77
3
+ size 4877704976
model-00005-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8779cc0d2aacab68944be6178173ac739458d00b892645dedc7d2473b562aaad
3
+ size 4877705072
model-00006-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523708a187a2b5d13d98f4718cd542ed03033217ee2bf9fef5168e9dd33a58ec
3
+ size 4877705072
model-00007-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10db9a2a255a2b21084722111486e4c4cd7857822a3ba3613aaad80e016ea317
3
+ size 4877705072
model-00008-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30ef9ecc7f802559c0f85ea2df195edaf5cfada57fa1d77b698d0775d9d4863c
3
+ size 4877705072
model-00009-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0644533ba86abada73df8c3635a3112787b7d3458a0f7a5cd6cb0d6de8095e0c
3
+ size 4531533888
model-00010-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e87b670c1730f4876b97ae2fc99c0212b2a555e00f3a54ad8298ef6192f98fe7
3
+ size 1736714912
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_skywork_chat.py ADDED
@@ -0,0 +1,357 @@
1
+ import warnings
2
+ import re
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch.utils.checkpoint
6
+ import transformers
7
+ from torch import nn
8
+ from torch.nn import CrossEntropyLoss
9
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
10
+ LlamaTokenizer)
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+ from transformers.modeling_utils import PreTrainedModel
13
+ from transformers.utils import ModelOutput, logging
14
+
15
+ from .configuration_skywork_chat import SkyworkChatConfig
16
+ from .conversation import get_conv_template
17
+ from .modeling_skywork_vit import SkyworkVisionModel, has_flash_attn
18
+ from .modeling_skywork_lm2 import SkyworkLM2ForCausalLM
19
+
20
+ from transformers import Qwen2Config, Qwen2ForCausalLM
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def version_cmp(v1, v2, op='eq'):
26
+ import operator
27
+
28
+ from packaging import version
29
+ op_func = getattr(operator, op)
30
+ return op_func(version.parse(v1), version.parse(v2))
31
+
32
+
33
+ class SkyworkChatModel(PreTrainedModel):
34
+ config_class = SkyworkChatConfig
35
+ main_input_name = 'pixel_values'
36
+ base_model_prefix = 'language_model'
37
+ _supports_flash_attn_2 = True
38
+ _no_split_modules = ['SkyworkVisionModel', 'LlamaDecoderLayer', 'SkyworkLM2DecoderLayer']
39
+
40
+ def __init__(self, config: SkyworkChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
41
+ super().__init__(config)
42
+
43
+ assert version_cmp(transformers.__version__, '4.36.2', 'ge')
44
+ image_size = config.force_image_size or config.vision_config.image_size
45
+ patch_size = config.vision_config.patch_size
46
+ self.patch_size = patch_size
47
+ self.select_layer = config.select_layer
48
+ self.template = config.template
49
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
50
+ self.downsample_ratio = config.downsample_ratio
51
+ self.ps_version = config.ps_version
52
+ use_flash_attn = use_flash_attn if has_flash_attn else False
53
+ config.vision_config.use_flash_attn = True if use_flash_attn else False
54
+ config.llm_config.attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
55
+
56
+ logger.info(f'num_image_token: {self.num_image_token}')
57
+ logger.info(f'ps_version: {self.ps_version}')
58
+ if vision_model is not None:
59
+ self.vision_model = vision_model
60
+ else:
61
+ self.vision_model = SkyworkVisionModel(config.vision_config)
62
+ if language_model is not None:
63
+ self.language_model = language_model
64
+ else:
65
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
66
+ self.language_model = LlamaForCausalLM(config.llm_config)
67
+ elif config.llm_config.architectures[0] == 'SkyworkLM2ForCausalLM':
68
+ self.language_model = SkyworkLM2ForCausalLM(config.llm_config)
69
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
70
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
71
+ else:
72
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
73
+
74
+ vit_hidden_size = config.vision_config.hidden_size
75
+ llm_hidden_size = config.llm_config.hidden_size
76
+
77
+ self.mlp1 = nn.Sequential(
78
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
79
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
80
+ nn.GELU(),
81
+ nn.Linear(llm_hidden_size, llm_hidden_size)
82
+ )
83
+
84
+ self.img_context_token_id = None
85
+ self.conv_template = get_conv_template(self.template)
86
+ self.system_message = self.conv_template.system_message
87
+
88
+ def forward(
89
+ self,
90
+ pixel_values: torch.FloatTensor,
91
+ input_ids: torch.LongTensor = None,
92
+ attention_mask: Optional[torch.Tensor] = None,
93
+ position_ids: Optional[torch.LongTensor] = None,
94
+ image_flags: Optional[torch.LongTensor] = None,
95
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
96
+ labels: Optional[torch.LongTensor] = None,
97
+ use_cache: Optional[bool] = None,
98
+ output_attentions: Optional[bool] = None,
99
+ output_hidden_states: Optional[bool] = None,
100
+ return_dict: Optional[bool] = None,
101
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
102
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
103
+
104
+ image_flags = image_flags.squeeze(-1)
105
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
106
+
107
+ vit_embeds = self.extract_feature(pixel_values)
108
+ vit_embeds = vit_embeds[image_flags == 1]
109
+ vit_batch_size = pixel_values.shape[0]
110
+
111
+ B, N, C = input_embeds.shape
112
+ input_embeds = input_embeds.reshape(B * N, C)
113
+
114
+ if torch.distributed.get_rank() == 0:
115
+ print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
116
+
117
+ input_ids = input_ids.reshape(B * N)
118
+ selected = (input_ids == self.img_context_token_id)
119
+ try:
120
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
121
+ except Exception as e:
122
+ vit_embeds = vit_embeds.reshape(-1, C)
123
+ print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
124
+ f'vit_embeds.shape={vit_embeds.shape}')
125
+ n_token = selected.sum()
126
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
127
+
128
+ input_embeds = input_embeds.reshape(B, N, C)
129
+
130
+ outputs = self.language_model(
131
+ inputs_embeds=input_embeds,
132
+ attention_mask=attention_mask,
133
+ position_ids=position_ids,
134
+ past_key_values=past_key_values,
135
+ use_cache=use_cache,
136
+ output_attentions=output_attentions,
137
+ output_hidden_states=output_hidden_states,
138
+ return_dict=return_dict,
139
+ )
140
+ logits = outputs.logits
141
+
142
+ loss = None
143
+ if labels is not None:
144
+ # Shift so that tokens < n predict n
145
+ shift_logits = logits[..., :-1, :].contiguous()
146
+ shift_labels = labels[..., 1:].contiguous()
147
+ # Flatten the tokens
148
+ loss_fct = CrossEntropyLoss()
149
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
150
+ shift_labels = shift_labels.view(-1)
151
+ # Enable model parallelism
152
+ shift_labels = shift_labels.to(shift_logits.device)
153
+ loss = loss_fct(shift_logits, shift_labels)
154
+
155
+ if not return_dict:
156
+ output = (logits,) + outputs[1:]
157
+ return (loss,) + output if loss is not None else output
158
+
159
+ return CausalLMOutputWithPast(
160
+ loss=loss,
161
+ logits=logits,
162
+ past_key_values=outputs.past_key_values,
163
+ hidden_states=outputs.hidden_states,
164
+ attentions=outputs.attentions,
165
+ )
166
+
167
+ def pixel_shuffle(self, x, scale_factor=0.5):
168
+ n, w, h, c = x.size()
169
+ # N, W, H, C --> N, W, H * scale, C // scale
170
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
171
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
172
+ x = x.permute(0, 2, 1, 3).contiguous()
173
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
174
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
175
+ int(c / (scale_factor * scale_factor)))
176
+ if self.ps_version == 'v1':
177
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
178
+ 'which results in a transposed image.')
179
+ else:
180
+ x = x.permute(0, 2, 1, 3).contiguous()
181
+ return x
182
+
183
+ def extract_feature(self, pixel_values):
184
+ if self.select_layer == -1:
185
+ vit_embeds = self.vision_model(
186
+ pixel_values=pixel_values,
187
+ output_hidden_states=False,
188
+ return_dict=True).last_hidden_state
189
+ else:
190
+ vit_embeds = self.vision_model(
191
+ pixel_values=pixel_values,
192
+ output_hidden_states=True,
193
+ return_dict=True).hidden_states[self.select_layer]
194
+ vit_embeds = vit_embeds[:, 1:, :]
195
+
196
+ h = w = int(vit_embeds.shape[1] ** 0.5)
197
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
198
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
199
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
200
+ vit_embeds = self.mlp1(vit_embeds)
201
+ return vit_embeds
202
+
203
+ def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
204
+ history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
205
+ IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
206
+ if history is not None or return_history:
207
+ print('Now multi-turn chat is not supported in batch_chat.')
208
+ raise NotImplementedError
209
+
210
+ if image_counts is not None:
211
+ num_patches_list = image_counts
212
+ print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
213
+
214
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
215
+ self.img_context_token_id = img_context_token_id
216
+
217
+
218
+ if verbose and pixel_values is not None:
219
+ image_bs = pixel_values.shape[0]
220
+ print(f'dynamic ViT batch size: {image_bs}')
221
+
222
+ queries = []
223
+ for idx, num_patches in enumerate(num_patches_list):
224
+ question = questions[idx]
225
+ if pixel_values is not None and '<image>' not in question:
226
+ question = '<image>\n' + question
227
+ template = get_conv_template(self.template)
228
+ template.system_message = self.system_message
229
+ template.append_message(template.roles[0], question)
230
+ template.append_message(template.roles[1], None)
231
+ query = template.get_prompt()
232
+
233
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
234
+ query = query.replace('<image>', image_tokens, 1)
235
+ queries.append(query)
236
+
237
+ tokenizer.padding_side = 'left'
238
+ model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
239
+ input_ids = model_inputs['input_ids'].to(self.device)
240
+ attention_mask = model_inputs['attention_mask'].to(self.device)
241
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
242
+ generation_config['eos_token_id'] = eos_token_id
243
+ generation_output = self.generate(
244
+ pixel_values=pixel_values,
245
+ input_ids=input_ids,
246
+ attention_mask=attention_mask,
247
+ **generation_config
248
+ )
249
+ responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
250
+ responses = [response.split(template.sep.strip())[0].strip() for response in responses]
251
+ return responses
252
+
253
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
254
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
255
+ verbose=False, mode="think"):
256
+
257
+ if history is None and pixel_values is not None and '<image>' not in question:
258
+ question = '<image>\n' + question
259
+
260
+ if num_patches_list is None:
261
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
262
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
263
+
264
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
265
+ self.img_context_token_id = img_context_token_id
266
+
267
+ template = get_conv_template(self.template)
268
+ template.system_message = self.system_message
269
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
270
+
271
+
272
+ history = [] if history is None else history
273
+ for (old_question, old_answer) in history:
274
+ template.append_message(template.roles[0], old_question)
275
+ template.append_message(template.roles[1], old_answer)
276
+ template.append_message(template.roles[0], question)
277
+ template.append_message(template.roles[1], None)
278
+ query = template.get_prompt()
279
+ if mode != "think":
280
+ query = re.sub(r'\n<think>', '', query, count=1)
281
+
282
+
283
+ if verbose and pixel_values is not None:
284
+ image_bs = pixel_values.shape[0]
285
+ print(f'dynamic ViT batch size: {image_bs}')
286
+
287
+ for num_patches in num_patches_list:
288
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
289
+ query = query.replace('<image>', image_tokens, 1)
290
+
291
+
292
+ model_inputs = tokenizer(query, return_tensors='pt')
293
+ input_ids = model_inputs['input_ids'].to(self.device)
294
+ attention_mask = model_inputs['attention_mask'].to(self.device)
295
+ generation_config['eos_token_id'] = eos_token_id
296
+ generation_output = self.generate(
297
+ pixel_values=pixel_values,
298
+ input_ids=input_ids,
299
+ attention_mask=attention_mask,
300
+ **generation_config
301
+ )
302
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
303
+ response = response.split(template.sep.strip())[0].strip()
304
+ history.append((question, response))
305
+
306
+ if return_history:
307
+ return response, history
308
+ else:
309
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
310
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
311
+ if verbose:
312
+ print(query_to_print, response)
313
+ return response
314
+
315
+ @torch.no_grad()
316
+ def generate(
317
+ self,
318
+ pixel_values: Optional[torch.FloatTensor] = None,
319
+ input_ids: Optional[torch.FloatTensor] = None,
320
+ attention_mask: Optional[torch.LongTensor] = None,
321
+ visual_features: Optional[torch.FloatTensor] = None,
322
+ generation_config: Optional[GenerationConfig] = None,
323
+ output_hidden_states: Optional[bool] = None,
324
+ **generate_kwargs,
325
+ ) -> torch.LongTensor:
326
+
327
+ assert self.img_context_token_id is not None
328
+ if pixel_values is not None:
329
+ if visual_features is not None:
330
+ vit_embeds = visual_features
331
+ else:
332
+ vit_embeds = self.extract_feature(pixel_values)
333
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
334
+ B, N, C = input_embeds.shape
335
+ input_embeds = input_embeds.reshape(B * N, C)
336
+
337
+ input_ids = input_ids.reshape(B * N)
338
+ selected = (input_ids == self.img_context_token_id)
339
+
340
+ assert selected.sum() != 0
341
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
342
+
343
+ input_embeds = input_embeds.reshape(B, N, C)
344
+ else:
345
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
346
+
347
+
348
+ outputs = self.language_model.generate(
349
+ inputs_embeds=input_embeds,
350
+ attention_mask=attention_mask,
351
+ generation_config=generation_config,
352
+ output_hidden_states=output_hidden_states,
353
+ use_cache=True,
354
+ **generate_kwargs,
355
+ )
356
+
357
+ return outputs
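
For reference, the `chat()` interface defined above is invoked roughly as follows. This is an illustrative sketch only: it assumes `model` is an already-loaded `SkyworkChatModel` with its tokenizer, and that `pixel_values` (preprocessed image tiles) come from the upstream Skywork-R1V3 preprocessing utilities, which are not included in this commit.

```python
# `pixel_values`: preprocessed image tiles from the upstream pipeline,
# e.g. a [num_patches, 3, 448, 448] bfloat16 tensor on the model's device.
# generation_config is a plain dict of transformers generate() kwargs.
generation_config = dict(max_new_tokens=1024, do_sample=False)

question = "<image>\nDescribe this image in detail."
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(response)
```
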
recipe.yaml ADDED
@@ -0,0 +1,6 @@
1
+ default_stage:
2
+ default_modifiers:
3
+ QuantizationModifier:
4
+ targets: [Linear]
5
+ ignore: ['re:.*lm_head', 're:.*vision.*', 're:mlp1.*']
6
+ scheme: FP8_DYNAMIC
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f9ba4b4a6625b5047a1356f6081b641c3e4e6a4a198facbd4bef217747d1685
3
+ size 11423548
tokenizer_config.json ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<img>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "151666": {
191
+ "content": "</img>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "151667": {
199
+ "content": "<IMG_CONTEXT>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "151668": {
207
+ "content": "<quad>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "151669": {
215
+ "content": "</quad>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<ref>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</ref>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ }
254
+ },
255
+ "additional_special_tokens": [
256
+ "<|im_start|>",
257
+ "<|im_end|>",
258
+ "<|object_ref_start|>",
259
+ "<|object_ref_end|>",
260
+ "<|box_start|>",
261
+ "<|box_end|>",
262
+ "<|quad_start|>",
263
+ "<|quad_end|>",
264
+ "<|vision_start|>",
265
+ "<|vision_end|>",
266
+ "<|vision_pad|>",
267
+ "<|image_pad|>",
268
+ "<|video_pad|>"
269
+ ],
270
+ "bos_token": null,
271
+ "clean_up_tokenization_spaces": false,
272
+ "eos_token": "<|im_end|>",
273
+ "errors": "replace",
274
+ "extra_special_tokens": {},
275
+ "model_max_length": 16384,
276
+ "pad_token": "<|endoftext|>",
277
+ "split_special_tokens": false,
278
+ "tokenizer_class": "Qwen2Tokenizer",
279
+ "unk_token": null
280
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff