ChibuUkachi committed
Commit c7cc540 · verified · 1 parent: b498718

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tekken.json filter=lfs diff=lfs merge=lfs -text
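
The commit message indicates the folder was pushed with huggingface_hub. A minimal sketch of that kind of upload (the folder path and repo id below are placeholders, not values recorded in this commit):

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./Ministral-3-14B-Instruct-2512-FP8-DYNAMIC-VISION",  # assumed local output dir
    repo_id="<user>/<repo>",  # placeholder target repo
    repo_type="model",
)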
config.json ADDED
@@ -0,0 +1,281 @@
+ {
+   "architectures": [
+     "Mistral3ForConditionalGeneration"
+   ],
+   "dtype": "bfloat16",
+   "image_token_index": 10,
+   "model_type": "mistral3",
+   "multimodal_projector_bias": false,
+   "projector_hidden_act": "gelu",
+   "quantization_config": {
+     "config_groups": {
+       "group_0": {
+         "format": "float-quantized",
+         "input_activations": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": true,
+           "group_size": null,
+           "num_bits": 8,
+           "observer": null,
+           "observer_kwargs": {},
+           "scale_dtype": null,
+           "strategy": "token",
+           "symmetric": true,
+           "type": "float",
+           "zp_dtype": null
+         },
+         "output_activations": null,
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": false,
+           "group_size": null,
+           "num_bits": 8,
+           "observer": "minmax",
+           "observer_kwargs": {},
+           "scale_dtype": null,
+           "strategy": "channel",
+           "symmetric": true,
+           "type": "float",
+           "zp_dtype": null
+         }
+       }
+     },
+     "format": "float-quantized",
+     "global_compression_ratio": null,
+     "ignore": [
+       "model.vision_tower.transformer.layers.0.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.0.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.0.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.0.attention.k_proj",
+       "model.vision_tower.transformer.layers.0.attention.v_proj",
+       "model.vision_tower.transformer.layers.0.attention.q_proj",
+       "model.vision_tower.transformer.layers.0.attention.o_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.1.attention.k_proj",
+       "model.vision_tower.transformer.layers.1.attention.v_proj",
+       "model.vision_tower.transformer.layers.1.attention.q_proj",
+       "model.vision_tower.transformer.layers.1.attention.o_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.2.attention.k_proj",
+       "model.vision_tower.transformer.layers.2.attention.v_proj",
+       "model.vision_tower.transformer.layers.2.attention.q_proj",
+       "model.vision_tower.transformer.layers.2.attention.o_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.3.attention.k_proj",
+       "model.vision_tower.transformer.layers.3.attention.v_proj",
+       "model.vision_tower.transformer.layers.3.attention.q_proj",
+       "model.vision_tower.transformer.layers.3.attention.o_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.4.attention.k_proj",
+       "model.vision_tower.transformer.layers.4.attention.v_proj",
+       "model.vision_tower.transformer.layers.4.attention.q_proj",
+       "model.vision_tower.transformer.layers.4.attention.o_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.5.attention.k_proj",
+       "model.vision_tower.transformer.layers.5.attention.v_proj",
+       "model.vision_tower.transformer.layers.5.attention.q_proj",
+       "model.vision_tower.transformer.layers.5.attention.o_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.6.attention.k_proj",
+       "model.vision_tower.transformer.layers.6.attention.v_proj",
+       "model.vision_tower.transformer.layers.6.attention.q_proj",
+       "model.vision_tower.transformer.layers.6.attention.o_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.7.attention.k_proj",
+       "model.vision_tower.transformer.layers.7.attention.v_proj",
+       "model.vision_tower.transformer.layers.7.attention.q_proj",
+       "model.vision_tower.transformer.layers.7.attention.o_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.8.attention.k_proj",
+       "model.vision_tower.transformer.layers.8.attention.v_proj",
+       "model.vision_tower.transformer.layers.8.attention.q_proj",
+       "model.vision_tower.transformer.layers.8.attention.o_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.9.attention.k_proj",
+       "model.vision_tower.transformer.layers.9.attention.v_proj",
+       "model.vision_tower.transformer.layers.9.attention.q_proj",
+       "model.vision_tower.transformer.layers.9.attention.o_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.10.attention.k_proj",
+       "model.vision_tower.transformer.layers.10.attention.v_proj",
+       "model.vision_tower.transformer.layers.10.attention.q_proj",
+       "model.vision_tower.transformer.layers.10.attention.o_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.11.attention.k_proj",
+       "model.vision_tower.transformer.layers.11.attention.v_proj",
+       "model.vision_tower.transformer.layers.11.attention.q_proj",
+       "model.vision_tower.transformer.layers.11.attention.o_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.12.attention.k_proj",
+       "model.vision_tower.transformer.layers.12.attention.v_proj",
+       "model.vision_tower.transformer.layers.12.attention.q_proj",
+       "model.vision_tower.transformer.layers.12.attention.o_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.13.attention.k_proj",
+       "model.vision_tower.transformer.layers.13.attention.v_proj",
+       "model.vision_tower.transformer.layers.13.attention.q_proj",
+       "model.vision_tower.transformer.layers.13.attention.o_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.14.attention.k_proj",
+       "model.vision_tower.transformer.layers.14.attention.v_proj",
+       "model.vision_tower.transformer.layers.14.attention.q_proj",
+       "model.vision_tower.transformer.layers.14.attention.o_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.15.attention.k_proj",
+       "model.vision_tower.transformer.layers.15.attention.v_proj",
+       "model.vision_tower.transformer.layers.15.attention.q_proj",
+       "model.vision_tower.transformer.layers.15.attention.o_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.16.attention.k_proj",
+       "model.vision_tower.transformer.layers.16.attention.v_proj",
+       "model.vision_tower.transformer.layers.16.attention.q_proj",
+       "model.vision_tower.transformer.layers.16.attention.o_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.17.attention.k_proj",
+       "model.vision_tower.transformer.layers.17.attention.v_proj",
+       "model.vision_tower.transformer.layers.17.attention.q_proj",
+       "model.vision_tower.transformer.layers.17.attention.o_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.18.attention.k_proj",
+       "model.vision_tower.transformer.layers.18.attention.v_proj",
+       "model.vision_tower.transformer.layers.18.attention.q_proj",
+       "model.vision_tower.transformer.layers.18.attention.o_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.19.attention.k_proj",
+       "model.vision_tower.transformer.layers.19.attention.v_proj",
+       "model.vision_tower.transformer.layers.19.attention.q_proj",
+       "model.vision_tower.transformer.layers.19.attention.o_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.20.attention.k_proj",
+       "model.vision_tower.transformer.layers.20.attention.v_proj",
+       "model.vision_tower.transformer.layers.20.attention.q_proj",
+       "model.vision_tower.transformer.layers.20.attention.o_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.21.attention.k_proj",
+       "model.vision_tower.transformer.layers.21.attention.v_proj",
+       "model.vision_tower.transformer.layers.21.attention.q_proj",
+       "model.vision_tower.transformer.layers.21.attention.o_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.22.attention.k_proj",
+       "model.vision_tower.transformer.layers.22.attention.v_proj",
+       "model.vision_tower.transformer.layers.22.attention.q_proj",
+       "model.vision_tower.transformer.layers.22.attention.o_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.23.attention.k_proj",
+       "model.vision_tower.transformer.layers.23.attention.v_proj",
+       "model.vision_tower.transformer.layers.23.attention.q_proj",
+       "model.vision_tower.transformer.layers.23.attention.o_proj",
+       "model.multi_modal_projector.patch_merger.merging_layer",
+       "model.multi_modal_projector.linear_1",
+       "model.multi_modal_projector.linear_2",
+       "lm_head"
+     ],
+     "kv_cache_scheme": null,
+     "quant_method": "compressed-tensors",
+     "quantization_status": "compressed",
+     "sparsity_config": {},
+     "transform_config": {},
+     "version": "0.13.1.dev4+gb055afc"
+   },
+   "spatial_merge_size": 2,
+   "text_config": {
+     "attention_dropout": 0.0,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 5120,
+     "initializer_range": 0.02,
+     "intermediate_size": 16384,
+     "max_position_embeddings": 262144,
+     "model_type": "ministral3",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 40,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-05,
+     "rope_parameters": {
+       "beta_fast": 32.0,
+       "beta_slow": 1.0,
+       "factor": 16.0,
+       "llama_4_scaling_beta": 0.1,
+       "mscale": 1.0,
+       "mscale_all_dim": 1.0,
+       "original_max_position_embeddings": 16384,
+       "rope_theta": 1000000000.0,
+       "rope_type": "yarn",
+       "type": "yarn"
+     },
+     "sliding_window": null,
+     "use_cache": true,
+     "vocab_size": 131072
+   },
+   "transformers_version": "5.0.0rc1",
+   "vision_config": {
+     "attention_dropout": 0.0,
+     "head_dim": 64,
+     "hidden_act": "silu",
+     "hidden_size": 1024,
+     "image_size": 1540,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "model_type": "pixtral",
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "patch_size": 14,
+     "rope_parameters": {
+       "rope_theta": 10000.0,
+       "rope_type": "default"
+     }
+   },
+   "vision_feature_layer": -1
+ }
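
config.json declares a compressed-tensors FP8 checkpoint with bfloat16 compute. A minimal load sketch, assuming a transformers build that ships Mistral3ForConditionalGeneration and has the compressed-tensors package installed (the path is a placeholder):

import torch
from transformers import Mistral3ForConditionalGeneration

model = Mistral3ForConditionalGeneration.from_pretrained(
    "path/to/this-checkpoint",  # placeholder
    dtype=torch.bfloat16,  # matches "dtype": "bfloat16" above (transformers v5 kwarg name)
    device_map="auto",
)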
consolidated.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3d4dd5d413c3954d2dac20bc487c2328b99f1a5d478a953636dfa0c2aeb7d33
+ size 15730613144
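
consolidated.safetensors is stored as a Git LFS pointer (the ~15.7 GB object lives in LFS). Once fetched, its tensor inventory can be read from the file header without loading weights; a small sketch using safetensors, assuming a local copy:

from safetensors import safe_open

with safe_open("consolidated.safetensors", framework="pt", device="cpu") as f:
    for name in f.keys():
        ts = f.get_slice(name)
        print(name, ts.get_dtype(), ts.get_shape())  # header metadata only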
convert_ministral_hf_to_mistral.py ADDED
@@ -0,0 +1,252 @@
+ # coding=utf-8
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import argparse
+ import gc
+ import json
+ import os
+ import re
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from safetensors.torch import safe_open, save_file
+
+ from transformers import Mistral3Config, Mistral3ForConditionalGeneration
+
+ # fmt: off
+ STATE_DICT_MAPPING = {
+     r"^language_model\.lm_head": r"output",
+     r"^language_model\.model\.norm": r"norm",
+     r"^language_model\.model\.embed_tokens": r"tok_embeddings",
+     r"^language_model\.model\.layers\.(\d+)\.input_layernorm": r"layers.\1.attention_norm",
+     r"^language_model\.model\.layers\.(\d+)\.post_attention_layernorm": r"layers.\1.ffn_norm",
+     r"^language_model\.model\.layers\.(\d+)\.self_attn\.(q|k|v|o)_proj": r"layers.\1.attention.w\2",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.gate_proj": r"layers.\1.feed_forward.w1",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.down_proj": r"layers.\1.feed_forward.w2",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.up_proj": r"layers.\1.feed_forward.w3",
+     r"multi_modal_projector.patch_merger.merging_layer.weight": r"patch_merger.merging_layer.weight",
+     r"multi_modal_projector.norm.weight": r"pre_mm_projector_norm.weight",
+     r"multi_modal_projector.linear_1.weight": r"vision_language_adapter.w_in.weight",
+     r"multi_modal_projector.linear_2.weight": r"vision_language_adapter.w_out.weight",
+     r"vision_tower.ln_pre.weight": r"vision_encoder.ln_pre.weight",
+     r"vision_tower.patch_conv.weight": r"vision_encoder.patch_conv.weight",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.attention_norm": r"vision_encoder.transformer.layers.\1.attention_norm",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.ffn_norm": r"vision_encoder.transformer.layers.\1.ffn_norm",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.attention\.(q|k|v|o)_proj": r"vision_encoder.transformer.layers.\1.attention.w\2",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.gate_proj": r"vision_encoder.transformer.layers.\1.feed_forward.w1",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.down_proj": r"vision_encoder.transformer.layers.\1.feed_forward.w2",
+     r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.up_proj": r"vision_encoder.transformer.layers.\1.feed_forward.w3",
+ }
+ # fmt: on
+
+ SKIP_KEYS = []
+
+
+ def add_quantization_config(config, hf_model: Mistral3ForConditionalGeneration):
+     """Copy the quantization config from the loaded HF model into the Mistral params,
+     remapping the ignored module names to Mistral naming."""
+     quantization_config = hf_model.hf_quantizer.quantization_config
+     mistral_ignore = []  # keys to ignore in the quantization config
+
+     for hf_key in quantization_config.quantization_config.ignore:
+         mistral_key = map_hf_key_to_mistral(hf_key)
+         mistral_ignore.append(mistral_key)
+
+     quantization_config.quantization_config.ignore = mistral_ignore
+     config["quantization_config"] = quantization_config.to_dict()
+
+     return config
+
+
+ def map_hf_key_to_mistral(hf_key):
+     """Map a key from HF format to Mistral format."""
+     for pattern, replacement in STATE_DICT_MAPPING.items():
+         new_key, n_replace = re.subn(pattern, replacement, hf_key)
+         if n_replace > 0:
+             return new_key.replace("weight_scale", "qscale_weight")
+
+     # If no mapping is found, return the original key
+     return hf_key.replace("weight_scale", "qscale_weight")
+
+
+ def permute_for_mistral_rope(tensor, n_heads, dim1, dim2):
+     """Reverse the RoPE permutation to get back to Mistral format."""
+     tensor = tensor.view(n_heads, 2, dim1 // n_heads // 2, dim2)
+     tensor = tensor.transpose(1, 2)
+     tensor = tensor.reshape(dim1, dim2)
+     return tensor
+
+
+ def convert_state_dict(hf_state_dict, config):
+     """Convert an HF Ministral state dict to Mistral format."""
+     mistral_dict = {}
+
+     text_config = config["text_config"]
+     vision_config = config["vision_config"]
+
+     text_num_attention_heads = text_config["num_attention_heads"]
+     text_hidden_size = text_config["hidden_size"]
+     text_head_dim = text_config["head_dim"]
+     text_num_key_value_heads = text_config["num_key_value_heads"]
+
+     text_key_value_dim = text_head_dim * text_num_key_value_heads
+     text_query_dim = text_head_dim * text_num_attention_heads
+
+     vision_num_attention_heads = vision_config["num_attention_heads"]
+     vision_hidden_size = vision_config["hidden_size"]
+     vision_head_dim = vision_config["head_dim"]
+     vision_num_key_value_heads = vision_num_attention_heads
+
+     vision_key_value_dim = vision_head_dim * vision_num_key_value_heads
+     vision_query_dim = vision_head_dim * vision_num_attention_heads
+
+     for hf_key, tensor in hf_state_dict.items():
+         if hf_key in SKIP_KEYS:
+             continue
+
+         mistral_key = map_hf_key_to_mistral(hf_key)
+
+         if "language_model" in hf_key:
+             if hf_key.endswith("q_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, text_num_attention_heads, text_query_dim, text_hidden_size)
+             elif hf_key.endswith("q_proj.weight_scale") and tensor.size(0) == text_num_attention_heads:
+                 tensor = permute_for_mistral_rope(tensor, text_num_attention_heads, text_query_dim, 1)
+             elif hf_key.endswith("k_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, text_num_key_value_heads, text_key_value_dim, text_hidden_size)
+             elif hf_key.endswith("k_proj.weight_scale") and tensor.size(0) == text_num_key_value_heads:
+                 tensor = permute_for_mistral_rope(tensor, text_num_key_value_heads, text_key_value_dim, 1)
+
+         if "vision_tower" in hf_key:
+             if hf_key.endswith("q_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, vision_num_attention_heads, vision_query_dim, vision_hidden_size)
+             elif hf_key.endswith("q_proj.weight_scale") and tensor.size(0) == vision_num_attention_heads:
+                 tensor = permute_for_mistral_rope(tensor, vision_num_attention_heads, vision_query_dim, 1)
+             elif hf_key.endswith("k_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, vision_num_key_value_heads, vision_key_value_dim, vision_hidden_size)
+             elif hf_key.endswith("k_proj.weight_scale") and tensor.size(0) == vision_num_key_value_heads:
+                 tensor = permute_for_mistral_rope(tensor, vision_num_key_value_heads, vision_key_value_dim, 1)
+
+         mistral_dict[mistral_key] = tensor
+
+     return mistral_dict
+
+
+ def write_model(
+     input_path_or_repo,
+     output_dir,
+     unquantized_model_path=None,
+ ):
+     print("Converting HF Ministral model to Mistral format.")
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Load the HF Ministral model
+     print(f"Loading HF Ministral model from {input_path_or_repo}...")
+     hf_model = Mistral3ForConditionalGeneration.from_pretrained(input_path_or_repo)
+
+     if os.path.exists(input_path_or_repo):
+         local_path = input_path_or_repo
+     else:
+         local_path = snapshot_download(input_path_or_repo)
+
+     # Convert config
+     if unquantized_model_path is not None:
+         if not os.path.exists(unquantized_model_path):
+             unquantized_model_path = snapshot_download(unquantized_model_path)
+         config_path = os.path.join(unquantized_model_path, "params.json")
+         with open(config_path, "r") as f:
+             config = json.load(f)
+
+         config = add_quantization_config(config, hf_model)
+
+         with open(os.path.join(output_dir, "params.json"), "w") as f:
+             json.dump(config, f, indent=2)
+     else:
+         raise ValueError("unquantized_model_path is required to build params.json")
+
+     # Convert state dict
+     print("Converting state dict...")
+     tensor_files = sorted(f for f in os.listdir(local_path) if f.endswith(".safetensors"))
+
+     hf_state_dict = {}
+
+     for file in tensor_files:
+         file_path = os.path.join(local_path, file)
+         with safe_open(file_path, framework="pt", device="cuda") as f:
+             for key in f.keys():
+                 hf_state_dict[key] = f.get_tensor(key)
+
+     mistral_config = Mistral3Config().to_dict()
+     mistral_state_dict = convert_state_dict(hf_state_dict, mistral_config)
+
+     # Save the state dict
+     save_file(mistral_state_dict, os.path.join(output_dir, "consolidated.safetensors"))
+
+     del hf_state_dict, mistral_state_dict
+     gc.collect()
+     print("Model converted successfully.")
+
+
+ def write_tokenizer(input_path_or_repo: str, output_dir: str):
+     """Extract and save the tokenizer from the Ministral model."""
+     from transformers import MistralCommonBackend
+
+     print("Extracting tokenizer...")
+     tokenizer = MistralCommonBackend.from_pretrained(input_path_or_repo)
+     tokenizer.save_pretrained(output_dir)
+     print("Tokenizer saved successfully.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Convert HF Ministral weights to Mistral format")
+     parser.add_argument(
+         "--input_path_or_repo",
+         type=str,
+         default="inference-optimization/Ministral-3-14B-Instruct-2512-BF16-FP8-DYNAMIC-BASE",
+         help="Path or repo containing HF Ministral model",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="Ministral-3-14B-Instruct-2512-FP8-DYNAMIC-VISION",
+         help="Location to write Mistral model and tokenizer",
+     )
+     parser.add_argument(
+         "--skip_tokenizer",
+         action="store_true",
+         help="Skip tokenizer conversion",
+     )
+     parser.add_argument(
+         "--unquantized_model_path",
+         type=str,
+         default="mistralai/Ministral-3-14B-Instruct-2512-BF16",
+         help="Path to the unquantized model",
+     )
+     args = parser.parse_args()
+
+     write_model(
+         args.input_path_or_repo,
+         args.output_dir,
+         unquantized_model_path=args.unquantized_model_path,
+     )
+
+     if not args.skip_tokenizer:
+         write_tokenizer(
+             args.input_path_or_repo,
+             args.output_dir,
+         )
+
+
+ if __name__ == "__main__":
+     main()
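
With the argparse defaults above, running python convert_ministral_hf_to_mistral.py downloads the FP8-dynamic HF checkpoint, writes params.json (the unquantized repo's params plus the remapped quantization config) and consolidated.safetensors into the output directory, then saves the tokenizer. Because every tensor is opened with device="cuda" and held in a single dict, peak GPU memory is on the order of the full checkpoint size.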
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "max_length": 262144,
+   "pad_token_id": 11,
+   "transformers_version": "5.0.0rc1"
+ }
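
These defaults are picked up automatically by model.generate(); they can also be inspected directly. A short sketch (placeholder path):

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("path/to/this-checkpoint")  # placeholder
assert (gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id) == (1, 2, 11)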
params.json ADDED
@@ -0,0 +1,266 @@
+ {
+   "dim": 5120,
+   "n_layers": 40,
+   "head_dim": 128,
+   "hidden_dim": 16384,
+   "n_heads": 32,
+   "n_kv_heads": 8,
+   "rope_theta": 1000000000.0,
+   "norm_eps": 1e-05,
+   "vocab_size": 131072,
+   "tied_embeddings": false,
+   "max_position_embeddings": 262144,
+   "llama_4_scaling": {
+     "original_max_position_embeddings": 16384,
+     "beta": 0.1
+   },
+   "q_lora_rank": null,
+   "qk_rope_head_dim": null,
+   "qk_nope_head_dim": null,
+   "kv_lora_rank": null,
+   "v_head_dim": null,
+   "yarn": {
+     "original_max_position_embeddings": 16384,
+     "factor": 16,
+     "apply_scale": false,
+     "beta": 32,
+     "alpha": 1
+   },
+   "vision_encoder": {
+     "image_token_id": 10,
+     "image_break_token_id": 12,
+     "image_end_token_id": 13,
+     "intermediate_size": 4096,
+     "num_hidden_layers": 24,
+     "num_attention_heads": 16,
+     "mm_projector_id": "patch_merge",
+     "spatial_merge_size": 2,
+     "hidden_size": 1024,
+     "num_channels": 3,
+     "image_size": 1540,
+     "max_image_size": 1540,
+     "patch_size": 14,
+     "rope_theta": 10000.0,
+     "add_pre_mm_projector_layer_norm": true,
+     "adapter_bias": false
+   },
+   "quantization_config": {
+     "config_groups": {
+       "group_0": {
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "num_bits": 8,
+           "type": "float",
+           "symmetric": true,
+           "group_size": null,
+           "strategy": "channel",
+           "block_structure": null,
+           "dynamic": false,
+           "actorder": null,
+           "scale_dtype": null,
+           "zp_dtype": null,
+           "observer": "minmax",
+           "observer_kwargs": {}
+         },
+         "input_activations": {
+           "num_bits": 8,
+           "type": "float",
+           "symmetric": true,
+           "group_size": null,
+           "strategy": "token",
+           "block_structure": null,
+           "dynamic": true,
+           "actorder": null,
+           "scale_dtype": null,
+           "zp_dtype": null,
+           "observer": null,
+           "observer_kwargs": {}
+         },
+         "output_activations": null,
+         "format": "float-quantized"
+       }
+     },
+     "quant_method": "compressed-tensors",
+     "kv_cache_scheme": null,
+     "format": "float-quantized",
+     "quantization_status": "compressed",
+     "global_compression_ratio": null,
+     "ignore": [
+       "model.vision_tower.transformer.layers.0.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.0.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.0.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.0.attention.k_proj",
+       "model.vision_tower.transformer.layers.0.attention.v_proj",
+       "model.vision_tower.transformer.layers.0.attention.q_proj",
+       "model.vision_tower.transformer.layers.0.attention.o_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.1.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.1.attention.k_proj",
+       "model.vision_tower.transformer.layers.1.attention.v_proj",
+       "model.vision_tower.transformer.layers.1.attention.q_proj",
+       "model.vision_tower.transformer.layers.1.attention.o_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.2.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.2.attention.k_proj",
+       "model.vision_tower.transformer.layers.2.attention.v_proj",
+       "model.vision_tower.transformer.layers.2.attention.q_proj",
+       "model.vision_tower.transformer.layers.2.attention.o_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.3.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.3.attention.k_proj",
+       "model.vision_tower.transformer.layers.3.attention.v_proj",
+       "model.vision_tower.transformer.layers.3.attention.q_proj",
+       "model.vision_tower.transformer.layers.3.attention.o_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.4.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.4.attention.k_proj",
+       "model.vision_tower.transformer.layers.4.attention.v_proj",
+       "model.vision_tower.transformer.layers.4.attention.q_proj",
+       "model.vision_tower.transformer.layers.4.attention.o_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.5.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.5.attention.k_proj",
+       "model.vision_tower.transformer.layers.5.attention.v_proj",
+       "model.vision_tower.transformer.layers.5.attention.q_proj",
+       "model.vision_tower.transformer.layers.5.attention.o_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.6.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.6.attention.k_proj",
+       "model.vision_tower.transformer.layers.6.attention.v_proj",
+       "model.vision_tower.transformer.layers.6.attention.q_proj",
+       "model.vision_tower.transformer.layers.6.attention.o_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.7.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.7.attention.k_proj",
+       "model.vision_tower.transformer.layers.7.attention.v_proj",
+       "model.vision_tower.transformer.layers.7.attention.q_proj",
+       "model.vision_tower.transformer.layers.7.attention.o_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.8.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.8.attention.k_proj",
+       "model.vision_tower.transformer.layers.8.attention.v_proj",
+       "model.vision_tower.transformer.layers.8.attention.q_proj",
+       "model.vision_tower.transformer.layers.8.attention.o_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.9.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.9.attention.k_proj",
+       "model.vision_tower.transformer.layers.9.attention.v_proj",
+       "model.vision_tower.transformer.layers.9.attention.q_proj",
+       "model.vision_tower.transformer.layers.9.attention.o_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.10.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.10.attention.k_proj",
+       "model.vision_tower.transformer.layers.10.attention.v_proj",
+       "model.vision_tower.transformer.layers.10.attention.q_proj",
+       "model.vision_tower.transformer.layers.10.attention.o_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.11.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.11.attention.k_proj",
+       "model.vision_tower.transformer.layers.11.attention.v_proj",
+       "model.vision_tower.transformer.layers.11.attention.q_proj",
+       "model.vision_tower.transformer.layers.11.attention.o_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.12.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.12.attention.k_proj",
+       "model.vision_tower.transformer.layers.12.attention.v_proj",
+       "model.vision_tower.transformer.layers.12.attention.q_proj",
+       "model.vision_tower.transformer.layers.12.attention.o_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.13.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.13.attention.k_proj",
+       "model.vision_tower.transformer.layers.13.attention.v_proj",
+       "model.vision_tower.transformer.layers.13.attention.q_proj",
+       "model.vision_tower.transformer.layers.13.attention.o_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.14.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.14.attention.k_proj",
+       "model.vision_tower.transformer.layers.14.attention.v_proj",
+       "model.vision_tower.transformer.layers.14.attention.q_proj",
+       "model.vision_tower.transformer.layers.14.attention.o_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.15.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.15.attention.k_proj",
+       "model.vision_tower.transformer.layers.15.attention.v_proj",
+       "model.vision_tower.transformer.layers.15.attention.q_proj",
+       "model.vision_tower.transformer.layers.15.attention.o_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.16.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.16.attention.k_proj",
+       "model.vision_tower.transformer.layers.16.attention.v_proj",
+       "model.vision_tower.transformer.layers.16.attention.q_proj",
+       "model.vision_tower.transformer.layers.16.attention.o_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.17.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.17.attention.k_proj",
+       "model.vision_tower.transformer.layers.17.attention.v_proj",
+       "model.vision_tower.transformer.layers.17.attention.q_proj",
+       "model.vision_tower.transformer.layers.17.attention.o_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.18.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.18.attention.k_proj",
+       "model.vision_tower.transformer.layers.18.attention.v_proj",
+       "model.vision_tower.transformer.layers.18.attention.q_proj",
+       "model.vision_tower.transformer.layers.18.attention.o_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.19.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.19.attention.k_proj",
+       "model.vision_tower.transformer.layers.19.attention.v_proj",
+       "model.vision_tower.transformer.layers.19.attention.q_proj",
+       "model.vision_tower.transformer.layers.19.attention.o_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.20.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.20.attention.k_proj",
+       "model.vision_tower.transformer.layers.20.attention.v_proj",
+       "model.vision_tower.transformer.layers.20.attention.q_proj",
+       "model.vision_tower.transformer.layers.20.attention.o_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.21.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.21.attention.k_proj",
+       "model.vision_tower.transformer.layers.21.attention.v_proj",
+       "model.vision_tower.transformer.layers.21.attention.q_proj",
+       "model.vision_tower.transformer.layers.21.attention.o_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.22.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.22.attention.k_proj",
+       "model.vision_tower.transformer.layers.22.attention.v_proj",
+       "model.vision_tower.transformer.layers.22.attention.q_proj",
+       "model.vision_tower.transformer.layers.22.attention.o_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.gate_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.up_proj",
+       "model.vision_tower.transformer.layers.23.feed_forward.down_proj",
+       "model.vision_tower.transformer.layers.23.attention.k_proj",
+       "model.vision_tower.transformer.layers.23.attention.v_proj",
+       "model.vision_tower.transformer.layers.23.attention.q_proj",
+       "model.vision_tower.transformer.layers.23.attention.o_proj",
+       "model.multi_modal_projector.patch_merger.merging_layer",
+       "model.multi_modal_projector.linear_1",
+       "model.multi_modal_projector.linear_2",
+       "lm_head"
+     ],
+     "sparsity_config": {}
+   }
+ }
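
params.json is the Mistral-native counterpart of config.json (dim ↔ hidden_size, n_layers ↔ num_hidden_layers, and so on). A quick stdlib cross-check of that correspondence, assuming both files sit in the working directory:

import json

with open("config.json") as f:
    hf = json.load(f)["text_config"]
with open("params.json") as f:
    mistral = json.load(f)

# Field names differ between the two formats; the values should agree.
assert mistral["dim"] == hf["hidden_size"]                 # 5120
assert mistral["n_layers"] == hf["num_hidden_layers"]      # 40
assert mistral["n_heads"] == hf["num_attention_heads"]     # 32
assert mistral["n_kv_heads"] == hf["num_key_value_heads"]  # 8
assert mistral["vocab_size"] == hf["vocab_size"]           # 131072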
recipe.yaml ADDED
@@ -0,0 +1,6 @@
+ default_stage:
+   default_modifiers:
+     QuantizationModifier:
+       targets: [Linear]
+       ignore: ['re:.*lm_head', 're:.*vision_tower.*', 're:.*multi_modal_projector.*']
+       scheme: FP8_DYNAMIC
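
recipe.yaml is an llm-compressor recipe. Expressed in Python it would look roughly like the following (a hedged sketch: the oneshot kwargs and model path are assumptions, not part of this commit):

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Mirrors recipe.yaml: FP8 dynamic quantization of Linear layers,
# skipping lm_head, the vision tower, and the multimodal projector.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*"],
)

oneshot(model="path/to/bf16-model", recipe=recipe, output_dir="quantized-out")  # placeholders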
tekken.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e29d19ea32eb7e26e6c0572d57cb7f9eca0f4420e0e0fe6ae1cf3be94da1c0d6
+ size 16753777
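
tekken.json is the Tekken tokenizer definition (also an LFS pointer, ~16.8 MB object). Once fetched, it can be loaded with mistral-common; a small sketch assuming a local copy:

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

tokenizer = MistralTokenizer.from_file("tekken.json")
tokens = tokenizer.instruct_tokenizer.tokenizer.encode("Hello, world!", bos=True, eos=False)
print(tokens)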