Snider Virgil committed on
Commit
2197926
·
1 Parent(s): f152007

feat: merge LEK into lemmy weights

Browse files

LEK-2 LoRA merged into Gemma 4 26B A4B MoE attention projections.
Converged at loss 0.00034 in 219 steps via patience-stop (best at 189).
No KV-shared layers to restore (num_kv_shared_layers=0 on lemmy).
MoE routing amplification: 5.7M trainable LoRA params achieve a
basin depth within 2x of lemrd's 22.5M LoRA params, suggesting
routing multiplicity provides implicit LoRA replication across
expert paths. Patched transformers/integrations/moe.py for MPS
histc Int dtype incompatibility.

Co-Authored-By: Virgil <virgil@lethean.io>

README.md CHANGED
@@ -1,10 +1,7 @@
1
  ---
 
2
  library_name: mlx
3
- license: eupl-1.2
4
  pipeline_tag: image-text-to-text
5
  tags:
6
  - mlx
7
- - gguf
8
- base_model:
9
- - LetheanNetwork/lemmy
10
- ---
 
1
  ---
2
+ language: en
3
  library_name: mlx
 
4
  pipeline_tag: image-text-to-text
5
  tags:
6
  - mlx
7
+ ---
 
 
 
chat_template.jinja CHANGED
@@ -11,15 +11,34 @@
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
 
 
 
 
14
  {%- if value['type'] | upper == 'STRING' -%}
15
  {%- if value['enum'] -%}
16
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
  enum:{{ format_argument(value['enum']) }}
18
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  {%- elif value['type'] | upper == 'ARRAY' -%}
20
  {%- if value['items'] is mapping and value['items'] -%}
21
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
- items:{
23
  {%- set ns_items = namespace(found_first=false) -%}
24
  {%- for item_key, item_value in value['items'] | dictsort -%}
25
  {%- if item_value is not none -%}
@@ -52,32 +71,6 @@
52
  }
53
  {%- endif -%}
54
  {%- endif -%}
55
- {%- if value['nullable'] %}
56
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
- nullable:true
58
- {%- endif -%}
59
- {%- if value['type'] | upper == 'OBJECT' -%}
60
- {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
- properties:{
63
- {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
- }
65
- {%- elif value is mapping -%}
66
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
- properties:{
68
- {{- format_parameters(value, value['required'] | default([])) -}}
69
- }
70
- {%- endif -%}
71
- {%- if value['required'] -%}
72
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
- required:[
74
- {%- for item in value['required'] | default([]) -%}
75
- <|"|>{{- item -}}<|"|>
76
- {%- if not loop.last %},{% endif -%}
77
- {%- endfor -%}
78
- ]
79
- {%- endif -%}
80
- {%- endif -%}
81
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
  type:<|"|>{{ value['type'] | upper }}<|"|>}
83
  {%- endif -%}
@@ -157,31 +150,16 @@
157
  {{- ns.result | trim -}}
158
  {%- endmacro -%}
159
 
160
- {%- macro format_tool_response_block(tool_name, response) -%}
161
- {{- '<|tool_response>' -}}
162
- {%- if response is mapping -%}
163
- {{- 'response:' + tool_name + '{' -}}
164
- {%- for key, value in response | dictsort -%}
165
- {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
- {%- if not loop.last %},{% endif -%}
167
- {%- endfor -%}
168
- {{- '}' -}}
169
- {%- else -%}
170
- {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
- {%- endif -%}
172
- {{- '<tool_response|>' -}}
173
- {%- endmacro -%}
174
-
175
  {%- set ns = namespace(prev_message_type=None) -%}
176
  {%- set loop_messages = messages -%}
177
- {{- bos_token -}}
178
  {#- Handle System/Tool Definitions Block -#}
179
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
  {{- '<|turn>system\n' -}}
181
 
182
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
183
  {%- if enable_thinking is defined and enable_thinking -%}
184
- {{- '<|think|>\n' -}}
185
  {%- set ns.prev_message_type = 'think' -%}
186
  {%- endif -%}
187
 
@@ -202,41 +180,11 @@
202
  {{- '<turn|>\n' -}}
203
  {%- endif %}
204
 
205
- {#- Pre-scan: find last user message index for reasoning guard -#}
206
- {%- set ns_turn = namespace(last_user_idx=-1) -%}
207
- {%- for i in range(loop_messages | length) -%}
208
- {%- if loop_messages[i]['role'] == 'user' -%}
209
- {%- set ns_turn.last_user_idx = i -%}
210
- {%- endif -%}
211
- {%- endfor -%}
212
-
213
  {#- Loop through messages -#}
214
  {%- for message in loop_messages -%}
215
- {%- if message['role'] != 'tool' -%}
216
  {%- set ns.prev_message_type = None -%}
217
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
218
- {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
219
- {%- set prev_nt = namespace(role=None, found=false) -%}
220
- {%- if loop.index0 > 0 -%}
221
- {%- for j in range(loop.index0 - 1, -1, -1) -%}
222
- {%- if not prev_nt.found -%}
223
- {%- if loop_messages[j]['role'] != 'tool' -%}
224
- {%- set prev_nt.role = loop_messages[j]['role'] -%}
225
- {%- set prev_nt.found = true -%}
226
- {%- endif -%}
227
- {%- endif -%}
228
- {%- endfor -%}
229
- {%- endif -%}
230
- {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
231
- {%- if not continue_same_model_turn -%}
232
  {{- '<|turn>' + role + '\n' }}
233
- {%- endif -%}
234
-
235
- {#- Render reasoning/reasoning_content as thinking channel -#}
236
- {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
237
- {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
238
- {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
239
- {%- endif -%}
240
 
241
  {%- if message['tool_calls'] -%}
242
  {%- for tool_call in message['tool_calls'] -%}
@@ -257,49 +205,23 @@
257
  {%- set ns.prev_message_type = 'tool_call' -%}
258
  {%- endif -%}
259
 
260
- {%- set ns_tr_out = namespace(flag=false) -%}
261
- {%- if message.get('tool_responses') -%}
262
- {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
263
  {%- for tool_response in message['tool_responses'] -%}
264
- {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
265
- {%- set ns_tr_out.flag = true -%}
266
- {%- set ns.prev_message_type = 'tool_response' -%}
267
- {%- endfor -%}
268
- {%- elif message.get('tool_calls') -%}
269
- {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
270
- {%- set ns_tool_scan = namespace(stopped=false) -%}
271
- {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
272
- {%- if ns_tool_scan.stopped -%}
273
- {%- elif loop_messages[k]['role'] != 'tool' -%}
274
- {%- set ns_tool_scan.stopped = true -%}
275
- {%- else -%}
276
- {%- set follow = loop_messages[k] -%}
277
- {#- Resolve tool_call_id to function name -#}
278
- {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
279
- {%- for tc in message['tool_calls'] -%}
280
- {%- if tc.get('id') == follow.get('tool_call_id') -%}
281
- {%- set ns_tname.name = tc['function']['name'] -%}
282
- {%- endif -%}
283
  {%- endfor -%}
284
- {#- Handle content as string or content-parts array -#}
285
- {%- set tool_body = follow.get('content') -%}
286
- {%- if tool_body is string -%}
287
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
288
- {%- elif tool_body is sequence and tool_body is not string -%}
289
- {%- set ns_txt = namespace(s='') -%}
290
- {%- for part in tool_body -%}
291
- {%- if part.get('type') == 'text' -%}
292
- {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
293
- {%- endif -%}
294
- {%- endfor -%}
295
- {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
296
- {%- else -%}
297
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
298
- {%- endif -%}
299
- {%- set ns_tr_out.flag = true -%}
300
- {%- set ns.prev_message_type = 'tool_response' -%}
301
  {%- endif -%}
 
302
  {%- endfor -%}
 
303
  {%- endif -%}
304
 
305
  {%- if message['content'] is string -%}
@@ -317,31 +239,28 @@
317
  {{- item['text'] | trim -}}
318
  {%- endif -%}
319
  {%- elif item['type'] == 'image' -%}
320
- {{- '<|image|>' -}}
321
  {%- set ns.prev_message_type = 'image' -%}
322
  {%- elif item['type'] == 'audio' -%}
323
  {{- '<|audio|>' -}}
324
  {%- set ns.prev_message_type = 'audio' -%}
325
  {%- elif item['type'] == 'video' -%}
326
- {{- '<|video|>' -}}
327
  {%- set ns.prev_message_type = 'video' -%}
328
  {%- endif -%}
329
  {%- endfor -%}
330
  {%- endif -%}
331
 
332
- {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
333
- {{- '<|tool_response>' -}}
334
- {%- elif not (ns_tr_out.flag and not message.get('content')) -%}
335
  {{- '<turn|>\n' -}}
336
  {%- endif -%}
337
- {%- endif -%}
338
  {%- endfor -%}
339
 
340
  {%- if add_generation_prompt -%}
341
- {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
342
  {{- '<|turn>model\n' -}}
343
- {%- if not enable_thinking | default(false) -%}
344
- {{- '<|channel>thought\n<channel|>' -}}
345
- {%- endif -%}
346
  {%- endif -%}
347
  {%- endif -%}
 
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
14
+ {%- if value['nullable'] %}
15
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
16
+ nullable:true
17
+ {%- endif -%}
18
  {%- if value['type'] | upper == 'STRING' -%}
19
  {%- if value['enum'] -%}
20
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
21
  enum:{{ format_argument(value['enum']) }}
22
  {%- endif -%}
23
+ {%- elif value['type'] | upper == 'OBJECT' -%}
24
+ ,properties:{
25
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
26
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
27
+ {%- elif value is mapping -%}
28
+ {{- format_parameters(value, value['required'] | default([])) -}}
29
+ {%- endif -%}
30
+ }
31
+ {%- if value['required'] -%}
32
+ ,required:[
33
+ {%- for item in value['required'] | default([]) -%}
34
+ <|"|>{{- item -}}<|"|>
35
+ {%- if not loop.last %},{% endif -%}
36
+ {%- endfor -%}
37
+ ]
38
+ {%- endif -%}
39
  {%- elif value['type'] | upper == 'ARRAY' -%}
40
  {%- if value['items'] is mapping and value['items'] -%}
41
+ ,items:{
 
42
  {%- set ns_items = namespace(found_first=false) -%}
43
  {%- for item_key, item_value in value['items'] | dictsort -%}
44
  {%- if item_value is not none -%}
 
71
  }
72
  {%- endif -%}
73
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
75
  type:<|"|>{{ value['type'] | upper }}<|"|>}
76
  {%- endif -%}
 
150
  {{- ns.result | trim -}}
151
  {%- endmacro -%}
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  {%- set ns = namespace(prev_message_type=None) -%}
154
  {%- set loop_messages = messages -%}
155
+ {{ bos_token }}
156
  {#- Handle System/Tool Definitions Block -#}
157
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
158
  {{- '<|turn>system\n' -}}
159
 
160
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
161
  {%- if enable_thinking is defined and enable_thinking -%}
162
+ {{- '<|think|>' -}}
163
  {%- set ns.prev_message_type = 'think' -%}
164
  {%- endif -%}
165
 
 
180
  {{- '<turn|>\n' -}}
181
  {%- endif %}
182
 
 
 
 
 
 
 
 
 
183
  {#- Loop through messages -#}
184
  {%- for message in loop_messages -%}
 
185
  {%- set ns.prev_message_type = None -%}
186
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  {{- '<|turn>' + role + '\n' }}
 
 
 
 
 
 
 
188
 
189
  {%- if message['tool_calls'] -%}
190
  {%- for tool_call in message['tool_calls'] -%}
 
205
  {%- set ns.prev_message_type = 'tool_call' -%}
206
  {%- endif -%}
207
 
208
+ {%- if message['tool_responses'] -%}
209
+ {#- Tool Response handling -#}
 
210
  {%- for tool_response in message['tool_responses'] -%}
211
+ {{- '<|tool_response>' -}}
212
+ {%- if tool_response['response'] is mapping -%}
213
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
214
+ {%- for key, value in tool_response['response'] | dictsort -%}
215
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
216
+ {%- if not loop.last %},{% endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  {%- endfor -%}
218
+ {{- '}' -}}
219
+ {%- else -%}
220
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  {%- endif -%}
222
+ {{- '<tool_response|>' -}}
223
  {%- endfor -%}
224
+ {%- set ns.prev_message_type = 'tool_response' -%}
225
  {%- endif -%}
226
 
227
  {%- if message['content'] is string -%}
 
239
  {{- item['text'] | trim -}}
240
  {%- endif -%}
241
  {%- elif item['type'] == 'image' -%}
242
+ {{- '\n\n<|image|>\n\n' -}}
243
  {%- set ns.prev_message_type = 'image' -%}
244
  {%- elif item['type'] == 'audio' -%}
245
  {{- '<|audio|>' -}}
246
  {%- set ns.prev_message_type = 'audio' -%}
247
  {%- elif item['type'] == 'video' -%}
248
+ {{- '\n\n<|video|>\n\n' -}}
249
  {%- set ns.prev_message_type = 'video' -%}
250
  {%- endif -%}
251
  {%- endfor -%}
252
  {%- endif -%}
253
 
254
+ {%- if not (message['tool_responses'] and not message['content']) -%}
 
 
255
  {{- '<turn|>\n' -}}
256
  {%- endif -%}
 
257
  {%- endfor -%}
258
 
259
  {%- if add_generation_prompt -%}
260
+ {%- if ns.prev_message_type != 'tool_response' -%}
261
  {{- '<|turn>model\n' -}}
262
+ {%- endif -%}
263
+ {%- if not enable_thinking | default(false) -%}
264
+ {{- '<|channel>thought\n<channel|>' -}}
265
  {%- endif -%}
266
  {%- endif -%}
config.json CHANGED
@@ -22,164 +22,644 @@
22
  "group_size": 64,
23
  "bits": 4,
24
  "mode": "affine",
 
 
 
 
 
 
 
 
 
 
 
 
25
  "language_model.model.layers.0.router.proj": {
26
  "group_size": 64,
27
  "bits": 8
28
  },
29
- "language_model.model.layers.1.router.proj": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  "group_size": 64,
31
  "bits": 8
32
  },
33
- "language_model.model.layers.2.router.proj": {
34
  "group_size": 64,
35
  "bits": 8
36
  },
37
- "language_model.model.layers.3.router.proj": {
38
  "group_size": 64,
39
  "bits": 8
40
  },
41
- "language_model.model.layers.4.router.proj": {
42
  "group_size": 64,
43
  "bits": 8
44
  },
45
- "language_model.model.layers.5.router.proj": {
46
  "group_size": 64,
47
  "bits": 8
48
  },
49
- "language_model.model.layers.6.router.proj": {
50
  "group_size": 64,
51
  "bits": 8
52
  },
53
- "language_model.model.layers.7.router.proj": {
54
  "group_size": 64,
55
  "bits": 8
56
  },
57
- "language_model.model.layers.8.router.proj": {
58
  "group_size": 64,
59
  "bits": 8
60
  },
61
- "language_model.model.layers.9.router.proj": {
62
  "group_size": 64,
63
  "bits": 8
64
  },
65
- "language_model.model.layers.10.router.proj": {
66
  "group_size": 64,
67
  "bits": 8
68
  },
69
- "language_model.model.layers.11.router.proj": {
70
  "group_size": 64,
71
  "bits": 8
72
  },
73
- "language_model.model.layers.12.router.proj": {
74
  "group_size": 64,
75
  "bits": 8
76
  },
77
- "language_model.model.layers.13.router.proj": {
78
  "group_size": 64,
79
  "bits": 8
80
  },
81
- "language_model.model.layers.14.router.proj": {
82
  "group_size": 64,
83
  "bits": 8
84
  },
85
- "language_model.model.layers.15.router.proj": {
86
  "group_size": 64,
87
  "bits": 8
88
  },
89
- "language_model.model.layers.16.router.proj": {
90
  "group_size": 64,
91
  "bits": 8
92
  },
93
- "language_model.model.layers.17.router.proj": {
94
  "group_size": 64,
95
  "bits": 8
96
  },
97
- "language_model.model.layers.18.router.proj": {
98
  "group_size": 64,
99
  "bits": 8
100
  },
101
- "language_model.model.layers.19.router.proj": {
102
  "group_size": 64,
103
  "bits": 8
104
  },
105
- "language_model.model.layers.20.router.proj": {
106
  "group_size": 64,
107
  "bits": 8
108
  },
109
- "language_model.model.layers.21.router.proj": {
110
  "group_size": 64,
111
  "bits": 8
112
  },
113
- "language_model.model.layers.22.router.proj": {
114
  "group_size": 64,
115
  "bits": 8
116
  },
117
- "language_model.model.layers.23.router.proj": {
118
  "group_size": 64,
119
  "bits": 8
120
  },
121
- "language_model.model.layers.24.router.proj": {
122
  "group_size": 64,
123
  "bits": 8
124
  },
125
- "language_model.model.layers.25.router.proj": {
126
  "group_size": 64,
127
  "bits": 8
128
  },
129
- "language_model.model.layers.26.router.proj": {
130
  "group_size": 64,
131
  "bits": 8
132
  },
133
- "language_model.model.layers.27.router.proj": {
134
  "group_size": 64,
135
  "bits": 8
136
  },
137
- "language_model.model.layers.28.router.proj": {
138
  "group_size": 64,
139
  "bits": 8
140
  },
141
- "language_model.model.layers.29.router.proj": {
142
  "group_size": 64,
143
  "bits": 8
144
- }
145
- },
146
- "quantization_config": {
147
- "group_size": 64,
148
- "bits": 4,
149
- "mode": "affine",
150
- "language_model.model.layers.0.router.proj": {
151
  "group_size": 64,
152
  "bits": 8
153
  },
154
- "language_model.model.layers.1.router.proj": {
155
  "group_size": 64,
156
  "bits": 8
157
  },
158
- "language_model.model.layers.2.router.proj": {
159
  "group_size": 64,
160
  "bits": 8
161
  },
162
- "language_model.model.layers.3.router.proj": {
163
  "group_size": 64,
164
  "bits": 8
165
  },
166
- "language_model.model.layers.4.router.proj": {
167
  "group_size": 64,
168
  "bits": 8
169
  },
170
- "language_model.model.layers.5.router.proj": {
171
  "group_size": 64,
172
  "bits": 8
173
  },
174
- "language_model.model.layers.6.router.proj": {
175
  "group_size": 64,
176
  "bits": 8
177
  },
178
- "language_model.model.layers.7.router.proj": {
179
  "group_size": 64,
180
  "bits": 8
181
  },
182
- "language_model.model.layers.8.router.proj": {
183
  "group_size": 64,
184
  "bits": 8
185
  },
@@ -187,82 +667,322 @@
187
  "group_size": 64,
188
  "bits": 8
189
  },
 
 
 
 
 
 
 
 
 
 
 
 
190
  "language_model.model.layers.10.router.proj": {
191
  "group_size": 64,
192
  "bits": 8
193
  },
 
 
 
 
 
 
 
 
 
 
 
 
194
  "language_model.model.layers.11.router.proj": {
195
  "group_size": 64,
196
  "bits": 8
197
  },
 
 
 
 
 
 
 
 
 
 
 
 
198
  "language_model.model.layers.12.router.proj": {
199
  "group_size": 64,
200
  "bits": 8
201
  },
 
 
 
 
 
 
 
 
 
 
 
 
202
  "language_model.model.layers.13.router.proj": {
203
  "group_size": 64,
204
  "bits": 8
205
  },
 
 
 
 
 
 
 
 
 
 
 
 
206
  "language_model.model.layers.14.router.proj": {
207
  "group_size": 64,
208
  "bits": 8
209
  },
 
 
 
 
 
 
 
 
 
 
 
 
210
  "language_model.model.layers.15.router.proj": {
211
  "group_size": 64,
212
  "bits": 8
213
  },
 
 
 
 
 
 
 
 
 
 
 
 
214
  "language_model.model.layers.16.router.proj": {
215
  "group_size": 64,
216
  "bits": 8
217
  },
 
 
 
 
 
 
 
 
 
 
 
 
218
  "language_model.model.layers.17.router.proj": {
219
  "group_size": 64,
220
  "bits": 8
221
  },
 
 
 
 
 
 
 
 
 
 
 
 
222
  "language_model.model.layers.18.router.proj": {
223
  "group_size": 64,
224
  "bits": 8
225
  },
 
 
 
 
 
 
 
 
 
 
 
 
226
  "language_model.model.layers.19.router.proj": {
227
  "group_size": 64,
228
  "bits": 8
229
  },
 
 
 
 
 
 
 
 
 
 
 
 
230
  "language_model.model.layers.20.router.proj": {
231
  "group_size": 64,
232
  "bits": 8
233
  },
 
 
 
 
 
 
 
 
 
 
 
 
234
  "language_model.model.layers.21.router.proj": {
235
  "group_size": 64,
236
  "bits": 8
237
  },
 
 
 
 
 
 
 
 
 
 
 
 
238
  "language_model.model.layers.22.router.proj": {
239
  "group_size": 64,
240
  "bits": 8
241
  },
 
 
 
 
 
 
 
 
 
 
 
 
242
  "language_model.model.layers.23.router.proj": {
243
  "group_size": 64,
244
  "bits": 8
245
  },
 
 
 
 
 
 
 
 
 
 
 
 
246
  "language_model.model.layers.24.router.proj": {
247
  "group_size": 64,
248
  "bits": 8
249
  },
 
 
 
 
 
 
 
 
 
 
 
 
250
  "language_model.model.layers.25.router.proj": {
251
  "group_size": 64,
252
  "bits": 8
253
  },
 
 
 
 
 
 
 
 
 
 
 
 
254
  "language_model.model.layers.26.router.proj": {
255
  "group_size": 64,
256
  "bits": 8
257
  },
 
 
 
 
 
 
 
 
 
 
 
 
258
  "language_model.model.layers.27.router.proj": {
259
  "group_size": 64,
260
  "bits": 8
261
  },
 
 
 
 
 
 
 
 
 
 
 
 
262
  "language_model.model.layers.28.router.proj": {
263
  "group_size": 64,
264
  "bits": 8
265
  },
 
 
 
 
 
 
 
 
 
 
 
 
266
  "language_model.model.layers.29.router.proj": {
267
  "group_size": 64,
268
  "bits": 8
@@ -350,5 +1070,48 @@
350
  "tie_word_embeddings": true,
351
  "transformers_version": "5.5.0.dev0",
352
  "video_token_id": 258884,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  "vision_soft_tokens_per_image": 280
354
  }
 
22
  "group_size": 64,
23
  "bits": 4,
24
  "mode": "affine",
25
+ "language_model.model.layers.0.mlp.gate_proj": {
26
+ "group_size": 64,
27
+ "bits": 8
28
+ },
29
+ "language_model.model.layers.0.mlp.down_proj": {
30
+ "group_size": 64,
31
+ "bits": 8
32
+ },
33
+ "language_model.model.layers.0.mlp.up_proj": {
34
+ "group_size": 64,
35
+ "bits": 8
36
+ },
37
  "language_model.model.layers.0.router.proj": {
38
  "group_size": 64,
39
  "bits": 8
40
  },
41
+ "language_model.model.layers.1.mlp.gate_proj": {
42
+ "group_size": 64,
43
+ "bits": 8
44
+ },
45
+ "language_model.model.layers.1.mlp.down_proj": {
46
+ "group_size": 64,
47
+ "bits": 8
48
+ },
49
+ "language_model.model.layers.1.mlp.up_proj": {
50
+ "group_size": 64,
51
+ "bits": 8
52
+ },
53
+ "language_model.model.layers.1.router.proj": {
54
+ "group_size": 64,
55
+ "bits": 8
56
+ },
57
+ "language_model.model.layers.2.mlp.gate_proj": {
58
+ "group_size": 64,
59
+ "bits": 8
60
+ },
61
+ "language_model.model.layers.2.mlp.down_proj": {
62
+ "group_size": 64,
63
+ "bits": 8
64
+ },
65
+ "language_model.model.layers.2.mlp.up_proj": {
66
+ "group_size": 64,
67
+ "bits": 8
68
+ },
69
+ "language_model.model.layers.2.router.proj": {
70
+ "group_size": 64,
71
+ "bits": 8
72
+ },
73
+ "language_model.model.layers.3.mlp.gate_proj": {
74
+ "group_size": 64,
75
+ "bits": 8
76
+ },
77
+ "language_model.model.layers.3.mlp.down_proj": {
78
+ "group_size": 64,
79
+ "bits": 8
80
+ },
81
+ "language_model.model.layers.3.mlp.up_proj": {
82
+ "group_size": 64,
83
+ "bits": 8
84
+ },
85
+ "language_model.model.layers.3.router.proj": {
86
+ "group_size": 64,
87
+ "bits": 8
88
+ },
89
+ "language_model.model.layers.4.mlp.gate_proj": {
90
+ "group_size": 64,
91
+ "bits": 8
92
+ },
93
+ "language_model.model.layers.4.mlp.down_proj": {
94
+ "group_size": 64,
95
+ "bits": 8
96
+ },
97
+ "language_model.model.layers.4.mlp.up_proj": {
98
+ "group_size": 64,
99
+ "bits": 8
100
+ },
101
+ "language_model.model.layers.4.router.proj": {
102
+ "group_size": 64,
103
+ "bits": 8
104
+ },
105
+ "language_model.model.layers.5.mlp.gate_proj": {
106
+ "group_size": 64,
107
+ "bits": 8
108
+ },
109
+ "language_model.model.layers.5.mlp.down_proj": {
110
+ "group_size": 64,
111
+ "bits": 8
112
+ },
113
+ "language_model.model.layers.5.mlp.up_proj": {
114
+ "group_size": 64,
115
+ "bits": 8
116
+ },
117
+ "language_model.model.layers.5.router.proj": {
118
+ "group_size": 64,
119
+ "bits": 8
120
+ },
121
+ "language_model.model.layers.6.mlp.gate_proj": {
122
+ "group_size": 64,
123
+ "bits": 8
124
+ },
125
+ "language_model.model.layers.6.mlp.down_proj": {
126
+ "group_size": 64,
127
+ "bits": 8
128
+ },
129
+ "language_model.model.layers.6.mlp.up_proj": {
130
+ "group_size": 64,
131
+ "bits": 8
132
+ },
133
+ "language_model.model.layers.6.router.proj": {
134
+ "group_size": 64,
135
+ "bits": 8
136
+ },
137
+ "language_model.model.layers.7.mlp.gate_proj": {
138
+ "group_size": 64,
139
+ "bits": 8
140
+ },
141
+ "language_model.model.layers.7.mlp.down_proj": {
142
+ "group_size": 64,
143
+ "bits": 8
144
+ },
145
+ "language_model.model.layers.7.mlp.up_proj": {
146
+ "group_size": 64,
147
+ "bits": 8
148
+ },
149
+ "language_model.model.layers.7.router.proj": {
150
+ "group_size": 64,
151
+ "bits": 8
152
+ },
153
+ "language_model.model.layers.8.mlp.gate_proj": {
154
+ "group_size": 64,
155
+ "bits": 8
156
+ },
157
+ "language_model.model.layers.8.mlp.down_proj": {
158
+ "group_size": 64,
159
+ "bits": 8
160
+ },
161
+ "language_model.model.layers.8.mlp.up_proj": {
162
+ "group_size": 64,
163
+ "bits": 8
164
+ },
165
+ "language_model.model.layers.8.router.proj": {
166
+ "group_size": 64,
167
+ "bits": 8
168
+ },
169
+ "language_model.model.layers.9.mlp.gate_proj": {
170
+ "group_size": 64,
171
+ "bits": 8
172
+ },
173
+ "language_model.model.layers.9.mlp.down_proj": {
174
+ "group_size": 64,
175
+ "bits": 8
176
+ },
177
+ "language_model.model.layers.9.mlp.up_proj": {
178
+ "group_size": 64,
179
+ "bits": 8
180
+ },
181
+ "language_model.model.layers.9.router.proj": {
182
+ "group_size": 64,
183
+ "bits": 8
184
+ },
185
+ "language_model.model.layers.10.mlp.gate_proj": {
186
+ "group_size": 64,
187
+ "bits": 8
188
+ },
189
+ "language_model.model.layers.10.mlp.down_proj": {
190
+ "group_size": 64,
191
+ "bits": 8
192
+ },
193
+ "language_model.model.layers.10.mlp.up_proj": {
194
+ "group_size": 64,
195
+ "bits": 8
196
+ },
197
+ "language_model.model.layers.10.router.proj": {
198
+ "group_size": 64,
199
+ "bits": 8
200
+ },
201
+ "language_model.model.layers.11.mlp.gate_proj": {
202
+ "group_size": 64,
203
+ "bits": 8
204
+ },
205
+ "language_model.model.layers.11.mlp.down_proj": {
206
+ "group_size": 64,
207
+ "bits": 8
208
+ },
209
+ "language_model.model.layers.11.mlp.up_proj": {
210
+ "group_size": 64,
211
+ "bits": 8
212
+ },
213
+ "language_model.model.layers.11.router.proj": {
214
+ "group_size": 64,
215
+ "bits": 8
216
+ },
217
+ "language_model.model.layers.12.mlp.gate_proj": {
218
+ "group_size": 64,
219
+ "bits": 8
220
+ },
221
+ "language_model.model.layers.12.mlp.down_proj": {
222
+ "group_size": 64,
223
+ "bits": 8
224
+ },
225
+ "language_model.model.layers.12.mlp.up_proj": {
226
+ "group_size": 64,
227
+ "bits": 8
228
+ },
229
+ "language_model.model.layers.12.router.proj": {
230
+ "group_size": 64,
231
+ "bits": 8
232
+ },
233
+ "language_model.model.layers.13.mlp.gate_proj": {
234
+ "group_size": 64,
235
+ "bits": 8
236
+ },
237
+ "language_model.model.layers.13.mlp.down_proj": {
238
+ "group_size": 64,
239
+ "bits": 8
240
+ },
241
+ "language_model.model.layers.13.mlp.up_proj": {
242
+ "group_size": 64,
243
+ "bits": 8
244
+ },
245
+ "language_model.model.layers.13.router.proj": {
246
+ "group_size": 64,
247
+ "bits": 8
248
+ },
249
+ "language_model.model.layers.14.mlp.gate_proj": {
250
+ "group_size": 64,
251
+ "bits": 8
252
+ },
253
+ "language_model.model.layers.14.mlp.down_proj": {
254
+ "group_size": 64,
255
+ "bits": 8
256
+ },
257
+ "language_model.model.layers.14.mlp.up_proj": {
258
+ "group_size": 64,
259
+ "bits": 8
260
+ },
261
+ "language_model.model.layers.14.router.proj": {
262
+ "group_size": 64,
263
+ "bits": 8
264
+ },
265
+ "language_model.model.layers.15.mlp.gate_proj": {
266
+ "group_size": 64,
267
+ "bits": 8
268
+ },
269
+ "language_model.model.layers.15.mlp.down_proj": {
270
+ "group_size": 64,
271
+ "bits": 8
272
+ },
273
+ "language_model.model.layers.15.mlp.up_proj": {
274
+ "group_size": 64,
275
+ "bits": 8
276
+ },
277
+ "language_model.model.layers.15.router.proj": {
278
+ "group_size": 64,
279
+ "bits": 8
280
+ },
281
+ "language_model.model.layers.16.mlp.gate_proj": {
282
+ "group_size": 64,
283
+ "bits": 8
284
+ },
285
+ "language_model.model.layers.16.mlp.down_proj": {
286
+ "group_size": 64,
287
+ "bits": 8
288
+ },
289
+ "language_model.model.layers.16.mlp.up_proj": {
290
+ "group_size": 64,
291
+ "bits": 8
292
+ },
293
+ "language_model.model.layers.16.router.proj": {
294
+ "group_size": 64,
295
+ "bits": 8
296
+ },
297
+ "language_model.model.layers.17.mlp.gate_proj": {
298
+ "group_size": 64,
299
+ "bits": 8
300
+ },
301
+ "language_model.model.layers.17.mlp.down_proj": {
302
+ "group_size": 64,
303
+ "bits": 8
304
+ },
305
+ "language_model.model.layers.17.mlp.up_proj": {
306
+ "group_size": 64,
307
+ "bits": 8
308
+ },
309
+ "language_model.model.layers.17.router.proj": {
310
+ "group_size": 64,
311
+ "bits": 8
312
+ },
313
+ "language_model.model.layers.18.mlp.gate_proj": {
314
+ "group_size": 64,
315
+ "bits": 8
316
+ },
317
+ "language_model.model.layers.18.mlp.down_proj": {
318
+ "group_size": 64,
319
+ "bits": 8
320
+ },
321
+ "language_model.model.layers.18.mlp.up_proj": {
322
+ "group_size": 64,
323
+ "bits": 8
324
+ },
325
+ "language_model.model.layers.18.router.proj": {
326
+ "group_size": 64,
327
+ "bits": 8
328
+ },
329
+ "language_model.model.layers.19.mlp.gate_proj": {
330
+ "group_size": 64,
331
+ "bits": 8
332
+ },
333
+ "language_model.model.layers.19.mlp.down_proj": {
334
+ "group_size": 64,
335
+ "bits": 8
336
+ },
337
+ "language_model.model.layers.19.mlp.up_proj": {
338
+ "group_size": 64,
339
+ "bits": 8
340
+ },
341
+ "language_model.model.layers.19.router.proj": {
342
+ "group_size": 64,
343
+ "bits": 8
344
+ },
345
+ "language_model.model.layers.20.mlp.gate_proj": {
346
+ "group_size": 64,
347
+ "bits": 8
348
+ },
349
+ "language_model.model.layers.20.mlp.down_proj": {
350
+ "group_size": 64,
351
+ "bits": 8
352
+ },
353
+ "language_model.model.layers.20.mlp.up_proj": {
354
+ "group_size": 64,
355
+ "bits": 8
356
+ },
357
+ "language_model.model.layers.20.router.proj": {
358
+ "group_size": 64,
359
+ "bits": 8
360
+ },
361
+ "language_model.model.layers.21.mlp.gate_proj": {
362
+ "group_size": 64,
363
+ "bits": 8
364
+ },
365
+ "language_model.model.layers.21.mlp.down_proj": {
366
+ "group_size": 64,
367
+ "bits": 8
368
+ },
369
+ "language_model.model.layers.21.mlp.up_proj": {
370
+ "group_size": 64,
371
+ "bits": 8
372
+ },
373
+ "language_model.model.layers.21.router.proj": {
374
+ "group_size": 64,
375
+ "bits": 8
376
+ },
377
+ "language_model.model.layers.22.mlp.gate_proj": {
378
+ "group_size": 64,
379
+ "bits": 8
380
+ },
381
+ "language_model.model.layers.22.mlp.down_proj": {
382
+ "group_size": 64,
383
+ "bits": 8
384
+ },
385
+ "language_model.model.layers.22.mlp.up_proj": {
386
+ "group_size": 64,
387
+ "bits": 8
388
+ },
389
+ "language_model.model.layers.22.router.proj": {
390
+ "group_size": 64,
391
+ "bits": 8
392
+ },
393
+ "language_model.model.layers.23.mlp.gate_proj": {
394
+ "group_size": 64,
395
+ "bits": 8
396
+ },
397
+ "language_model.model.layers.23.mlp.down_proj": {
398
+ "group_size": 64,
399
+ "bits": 8
400
+ },
401
+ "language_model.model.layers.23.mlp.up_proj": {
402
+ "group_size": 64,
403
+ "bits": 8
404
+ },
405
+ "language_model.model.layers.23.router.proj": {
406
+ "group_size": 64,
407
+ "bits": 8
408
+ },
409
+ "language_model.model.layers.24.mlp.gate_proj": {
410
+ "group_size": 64,
411
+ "bits": 8
412
+ },
413
+ "language_model.model.layers.24.mlp.down_proj": {
414
+ "group_size": 64,
415
+ "bits": 8
416
+ },
417
+ "language_model.model.layers.24.mlp.up_proj": {
418
+ "group_size": 64,
419
+ "bits": 8
420
+ },
421
+ "language_model.model.layers.24.router.proj": {
422
+ "group_size": 64,
423
+ "bits": 8
424
+ },
425
+ "language_model.model.layers.25.mlp.gate_proj": {
426
+ "group_size": 64,
427
+ "bits": 8
428
+ },
429
+ "language_model.model.layers.25.mlp.down_proj": {
430
+ "group_size": 64,
431
+ "bits": 8
432
+ },
433
+ "language_model.model.layers.25.mlp.up_proj": {
434
+ "group_size": 64,
435
+ "bits": 8
436
+ },
437
+ "language_model.model.layers.25.router.proj": {
438
+ "group_size": 64,
439
+ "bits": 8
440
+ },
441
+ "language_model.model.layers.26.mlp.gate_proj": {
442
+ "group_size": 64,
443
+ "bits": 8
444
+ },
445
+ "language_model.model.layers.26.mlp.down_proj": {
446
+ "group_size": 64,
447
+ "bits": 8
448
+ },
449
+ "language_model.model.layers.26.mlp.up_proj": {
450
+ "group_size": 64,
451
+ "bits": 8
452
+ },
453
+ "language_model.model.layers.26.router.proj": {
454
+ "group_size": 64,
455
+ "bits": 8
456
+ },
457
+ "language_model.model.layers.27.mlp.gate_proj": {
458
+ "group_size": 64,
459
+ "bits": 8
460
+ },
461
+ "language_model.model.layers.27.mlp.down_proj": {
462
+ "group_size": 64,
463
+ "bits": 8
464
+ },
465
+ "language_model.model.layers.27.mlp.up_proj": {
466
+ "group_size": 64,
467
+ "bits": 8
468
+ },
469
+ "language_model.model.layers.27.router.proj": {
470
+ "group_size": 64,
471
+ "bits": 8
472
+ },
473
+ "language_model.model.layers.28.mlp.gate_proj": {
474
+ "group_size": 64,
475
+ "bits": 8
476
+ },
477
+ "language_model.model.layers.28.mlp.down_proj": {
478
+ "group_size": 64,
479
+ "bits": 8
480
+ },
481
+ "language_model.model.layers.28.mlp.up_proj": {
482
+ "group_size": 64,
483
+ "bits": 8
484
+ },
485
+ "language_model.model.layers.28.router.proj": {
486
+ "group_size": 64,
487
+ "bits": 8
488
+ },
489
+ "language_model.model.layers.29.mlp.gate_proj": {
490
+ "group_size": 64,
491
+ "bits": 8
492
+ },
493
+ "language_model.model.layers.29.mlp.down_proj": {
494
+ "group_size": 64,
495
+ "bits": 8
496
+ },
497
+ "language_model.model.layers.29.mlp.up_proj": {
498
+ "group_size": 64,
499
+ "bits": 8
500
+ },
501
+ "language_model.model.layers.29.router.proj": {
502
+ "group_size": 64,
503
+ "bits": 8
504
+ }
505
+ },
506
+ "quantization_config": {
507
+ "group_size": 64,
508
+ "bits": 4,
509
+ "mode": "affine",
510
+ "language_model.model.layers.0.mlp.gate_proj": {
511
+ "group_size": 64,
512
+ "bits": 8
513
+ },
514
+ "language_model.model.layers.0.mlp.down_proj": {
515
  "group_size": 64,
516
  "bits": 8
517
  },
518
+ "language_model.model.layers.0.mlp.up_proj": {
519
  "group_size": 64,
520
  "bits": 8
521
  },
522
+ "language_model.model.layers.0.router.proj": {
523
  "group_size": 64,
524
  "bits": 8
525
  },
526
+ "language_model.model.layers.1.mlp.gate_proj": {
527
  "group_size": 64,
528
  "bits": 8
529
  },
530
+ "language_model.model.layers.1.mlp.down_proj": {
531
  "group_size": 64,
532
  "bits": 8
533
  },
534
+ "language_model.model.layers.1.mlp.up_proj": {
535
  "group_size": 64,
536
  "bits": 8
537
  },
538
+ "language_model.model.layers.1.router.proj": {
539
  "group_size": 64,
540
  "bits": 8
541
  },
542
+ "language_model.model.layers.2.mlp.gate_proj": {
543
  "group_size": 64,
544
  "bits": 8
545
  },
546
+ "language_model.model.layers.2.mlp.down_proj": {
547
  "group_size": 64,
548
  "bits": 8
549
  },
550
+ "language_model.model.layers.2.mlp.up_proj": {
551
  "group_size": 64,
552
  "bits": 8
553
  },
554
+ "language_model.model.layers.2.router.proj": {
555
  "group_size": 64,
556
  "bits": 8
557
  },
558
+ "language_model.model.layers.3.mlp.gate_proj": {
559
  "group_size": 64,
560
  "bits": 8
561
  },
562
+ "language_model.model.layers.3.mlp.down_proj": {
563
  "group_size": 64,
564
  "bits": 8
565
  },
566
+ "language_model.model.layers.3.mlp.up_proj": {
567
  "group_size": 64,
568
  "bits": 8
569
  },
570
+ "language_model.model.layers.3.router.proj": {
571
  "group_size": 64,
572
  "bits": 8
573
  },
574
+ "language_model.model.layers.4.mlp.gate_proj": {
575
  "group_size": 64,
576
  "bits": 8
577
  },
578
+ "language_model.model.layers.4.mlp.down_proj": {
579
  "group_size": 64,
580
  "bits": 8
581
  },
582
+ "language_model.model.layers.4.mlp.up_proj": {
583
  "group_size": 64,
584
  "bits": 8
585
  },
586
+ "language_model.model.layers.4.router.proj": {
587
  "group_size": 64,
588
  "bits": 8
589
  },
590
+ "language_model.model.layers.5.mlp.gate_proj": {
591
  "group_size": 64,
592
  "bits": 8
593
  },
594
+ "language_model.model.layers.5.mlp.down_proj": {
595
  "group_size": 64,
596
  "bits": 8
597
  },
598
+ "language_model.model.layers.5.mlp.up_proj": {
599
  "group_size": 64,
600
  "bits": 8
601
  },
602
+ "language_model.model.layers.5.router.proj": {
603
  "group_size": 64,
604
  "bits": 8
605
  },
606
+ "language_model.model.layers.6.mlp.gate_proj": {
607
  "group_size": 64,
608
  "bits": 8
609
  },
610
+ "language_model.model.layers.6.mlp.down_proj": {
611
  "group_size": 64,
612
  "bits": 8
613
  },
614
+ "language_model.model.layers.6.mlp.up_proj": {
615
  "group_size": 64,
616
  "bits": 8
617
  },
618
+ "language_model.model.layers.6.router.proj": {
619
  "group_size": 64,
620
  "bits": 8
621
  },
622
+ "language_model.model.layers.7.mlp.gate_proj": {
623
  "group_size": 64,
624
  "bits": 8
625
  },
626
+ "language_model.model.layers.7.mlp.down_proj": {
627
  "group_size": 64,
628
  "bits": 8
629
+ },
630
+ "language_model.model.layers.7.mlp.up_proj": {
 
 
 
 
 
631
  "group_size": 64,
632
  "bits": 8
633
  },
634
+ "language_model.model.layers.7.router.proj": {
635
  "group_size": 64,
636
  "bits": 8
637
  },
638
+ "language_model.model.layers.8.mlp.gate_proj": {
639
  "group_size": 64,
640
  "bits": 8
641
  },
642
+ "language_model.model.layers.8.mlp.down_proj": {
643
  "group_size": 64,
644
  "bits": 8
645
  },
646
+ "language_model.model.layers.8.mlp.up_proj": {
647
  "group_size": 64,
648
  "bits": 8
649
  },
650
+ "language_model.model.layers.8.router.proj": {
651
  "group_size": 64,
652
  "bits": 8
653
  },
654
+ "language_model.model.layers.9.mlp.gate_proj": {
655
  "group_size": 64,
656
  "bits": 8
657
  },
658
+ "language_model.model.layers.9.mlp.down_proj": {
659
  "group_size": 64,
660
  "bits": 8
661
  },
662
+ "language_model.model.layers.9.mlp.up_proj": {
663
  "group_size": 64,
664
  "bits": 8
665
  },
 
667
  "group_size": 64,
668
  "bits": 8
669
  },
670
+ "language_model.model.layers.10.mlp.gate_proj": {
671
+ "group_size": 64,
672
+ "bits": 8
673
+ },
674
+ "language_model.model.layers.10.mlp.down_proj": {
675
+ "group_size": 64,
676
+ "bits": 8
677
+ },
678
+ "language_model.model.layers.10.mlp.up_proj": {
679
+ "group_size": 64,
680
+ "bits": 8
681
+ },
682
  "language_model.model.layers.10.router.proj": {
683
  "group_size": 64,
684
  "bits": 8
685
  },
686
+ "language_model.model.layers.11.mlp.gate_proj": {
687
+ "group_size": 64,
688
+ "bits": 8
689
+ },
690
+ "language_model.model.layers.11.mlp.down_proj": {
691
+ "group_size": 64,
692
+ "bits": 8
693
+ },
694
+ "language_model.model.layers.11.mlp.up_proj": {
695
+ "group_size": 64,
696
+ "bits": 8
697
+ },
698
  "language_model.model.layers.11.router.proj": {
699
  "group_size": 64,
700
  "bits": 8
701
  },
702
+ "language_model.model.layers.12.mlp.gate_proj": {
703
+ "group_size": 64,
704
+ "bits": 8
705
+ },
706
+ "language_model.model.layers.12.mlp.down_proj": {
707
+ "group_size": 64,
708
+ "bits": 8
709
+ },
710
+ "language_model.model.layers.12.mlp.up_proj": {
711
+ "group_size": 64,
712
+ "bits": 8
713
+ },
714
  "language_model.model.layers.12.router.proj": {
715
  "group_size": 64,
716
  "bits": 8
717
  },
718
+ "language_model.model.layers.13.mlp.gate_proj": {
719
+ "group_size": 64,
720
+ "bits": 8
721
+ },
722
+ "language_model.model.layers.13.mlp.down_proj": {
723
+ "group_size": 64,
724
+ "bits": 8
725
+ },
726
+ "language_model.model.layers.13.mlp.up_proj": {
727
+ "group_size": 64,
728
+ "bits": 8
729
+ },
730
  "language_model.model.layers.13.router.proj": {
731
  "group_size": 64,
732
  "bits": 8
733
  },
734
+ "language_model.model.layers.14.mlp.gate_proj": {
735
+ "group_size": 64,
736
+ "bits": 8
737
+ },
738
+ "language_model.model.layers.14.mlp.down_proj": {
739
+ "group_size": 64,
740
+ "bits": 8
741
+ },
742
+ "language_model.model.layers.14.mlp.up_proj": {
743
+ "group_size": 64,
744
+ "bits": 8
745
+ },
746
  "language_model.model.layers.14.router.proj": {
747
  "group_size": 64,
748
  "bits": 8
749
  },
750
+ "language_model.model.layers.15.mlp.gate_proj": {
751
+ "group_size": 64,
752
+ "bits": 8
753
+ },
754
+ "language_model.model.layers.15.mlp.down_proj": {
755
+ "group_size": 64,
756
+ "bits": 8
757
+ },
758
+ "language_model.model.layers.15.mlp.up_proj": {
759
+ "group_size": 64,
760
+ "bits": 8
761
+ },
762
  "language_model.model.layers.15.router.proj": {
763
  "group_size": 64,
764
  "bits": 8
765
  },
766
+ "language_model.model.layers.16.mlp.gate_proj": {
767
+ "group_size": 64,
768
+ "bits": 8
769
+ },
770
+ "language_model.model.layers.16.mlp.down_proj": {
771
+ "group_size": 64,
772
+ "bits": 8
773
+ },
774
+ "language_model.model.layers.16.mlp.up_proj": {
775
+ "group_size": 64,
776
+ "bits": 8
777
+ },
778
  "language_model.model.layers.16.router.proj": {
779
  "group_size": 64,
780
  "bits": 8
781
  },
782
+ "language_model.model.layers.17.mlp.gate_proj": {
783
+ "group_size": 64,
784
+ "bits": 8
785
+ },
786
+ "language_model.model.layers.17.mlp.down_proj": {
787
+ "group_size": 64,
788
+ "bits": 8
789
+ },
790
+ "language_model.model.layers.17.mlp.up_proj": {
791
+ "group_size": 64,
792
+ "bits": 8
793
+ },
794
  "language_model.model.layers.17.router.proj": {
795
  "group_size": 64,
796
  "bits": 8
797
  },
798
+ "language_model.model.layers.18.mlp.gate_proj": {
799
+ "group_size": 64,
800
+ "bits": 8
801
+ },
802
+ "language_model.model.layers.18.mlp.down_proj": {
803
+ "group_size": 64,
804
+ "bits": 8
805
+ },
806
+ "language_model.model.layers.18.mlp.up_proj": {
807
+ "group_size": 64,
808
+ "bits": 8
809
+ },
810
  "language_model.model.layers.18.router.proj": {
811
  "group_size": 64,
812
  "bits": 8
813
  },
814
+ "language_model.model.layers.19.mlp.gate_proj": {
815
+ "group_size": 64,
816
+ "bits": 8
817
+ },
818
+ "language_model.model.layers.19.mlp.down_proj": {
819
+ "group_size": 64,
820
+ "bits": 8
821
+ },
822
+ "language_model.model.layers.19.mlp.up_proj": {
823
+ "group_size": 64,
824
+ "bits": 8
825
+ },
826
  "language_model.model.layers.19.router.proj": {
827
  "group_size": 64,
828
  "bits": 8
829
  },
830
+ "language_model.model.layers.20.mlp.gate_proj": {
831
+ "group_size": 64,
832
+ "bits": 8
833
+ },
834
+ "language_model.model.layers.20.mlp.down_proj": {
835
+ "group_size": 64,
836
+ "bits": 8
837
+ },
838
+ "language_model.model.layers.20.mlp.up_proj": {
839
+ "group_size": 64,
840
+ "bits": 8
841
+ },
842
  "language_model.model.layers.20.router.proj": {
843
  "group_size": 64,
844
  "bits": 8
845
  },
846
+ "language_model.model.layers.21.mlp.gate_proj": {
847
+ "group_size": 64,
848
+ "bits": 8
849
+ },
850
+ "language_model.model.layers.21.mlp.down_proj": {
851
+ "group_size": 64,
852
+ "bits": 8
853
+ },
854
+ "language_model.model.layers.21.mlp.up_proj": {
855
+ "group_size": 64,
856
+ "bits": 8
857
+ },
858
  "language_model.model.layers.21.router.proj": {
859
  "group_size": 64,
860
  "bits": 8
861
  },
862
+ "language_model.model.layers.22.mlp.gate_proj": {
863
+ "group_size": 64,
864
+ "bits": 8
865
+ },
866
+ "language_model.model.layers.22.mlp.down_proj": {
867
+ "group_size": 64,
868
+ "bits": 8
869
+ },
870
+ "language_model.model.layers.22.mlp.up_proj": {
871
+ "group_size": 64,
872
+ "bits": 8
873
+ },
874
  "language_model.model.layers.22.router.proj": {
875
  "group_size": 64,
876
  "bits": 8
877
  },
878
+ "language_model.model.layers.23.mlp.gate_proj": {
879
+ "group_size": 64,
880
+ "bits": 8
881
+ },
882
+ "language_model.model.layers.23.mlp.down_proj": {
883
+ "group_size": 64,
884
+ "bits": 8
885
+ },
886
+ "language_model.model.layers.23.mlp.up_proj": {
887
+ "group_size": 64,
888
+ "bits": 8
889
+ },
890
  "language_model.model.layers.23.router.proj": {
891
  "group_size": 64,
892
  "bits": 8
893
  },
894
+ "language_model.model.layers.24.mlp.gate_proj": {
895
+ "group_size": 64,
896
+ "bits": 8
897
+ },
898
+ "language_model.model.layers.24.mlp.down_proj": {
899
+ "group_size": 64,
900
+ "bits": 8
901
+ },
902
+ "language_model.model.layers.24.mlp.up_proj": {
903
+ "group_size": 64,
904
+ "bits": 8
905
+ },
906
  "language_model.model.layers.24.router.proj": {
907
  "group_size": 64,
908
  "bits": 8
909
  },
910
+ "language_model.model.layers.25.mlp.gate_proj": {
911
+ "group_size": 64,
912
+ "bits": 8
913
+ },
914
+ "language_model.model.layers.25.mlp.down_proj": {
915
+ "group_size": 64,
916
+ "bits": 8
917
+ },
918
+ "language_model.model.layers.25.mlp.up_proj": {
919
+ "group_size": 64,
920
+ "bits": 8
921
+ },
922
  "language_model.model.layers.25.router.proj": {
923
  "group_size": 64,
924
  "bits": 8
925
  },
926
+ "language_model.model.layers.26.mlp.gate_proj": {
927
+ "group_size": 64,
928
+ "bits": 8
929
+ },
930
+ "language_model.model.layers.26.mlp.down_proj": {
931
+ "group_size": 64,
932
+ "bits": 8
933
+ },
934
+ "language_model.model.layers.26.mlp.up_proj": {
935
+ "group_size": 64,
936
+ "bits": 8
937
+ },
938
  "language_model.model.layers.26.router.proj": {
939
  "group_size": 64,
940
  "bits": 8
941
  },
942
+ "language_model.model.layers.27.mlp.gate_proj": {
943
+ "group_size": 64,
944
+ "bits": 8
945
+ },
946
+ "language_model.model.layers.27.mlp.down_proj": {
947
+ "group_size": 64,
948
+ "bits": 8
949
+ },
950
+ "language_model.model.layers.27.mlp.up_proj": {
951
+ "group_size": 64,
952
+ "bits": 8
953
+ },
954
  "language_model.model.layers.27.router.proj": {
955
  "group_size": 64,
956
  "bits": 8
957
  },
958
+ "language_model.model.layers.28.mlp.gate_proj": {
959
+ "group_size": 64,
960
+ "bits": 8
961
+ },
962
+ "language_model.model.layers.28.mlp.down_proj": {
963
+ "group_size": 64,
964
+ "bits": 8
965
+ },
966
+ "language_model.model.layers.28.mlp.up_proj": {
967
+ "group_size": 64,
968
+ "bits": 8
969
+ },
970
  "language_model.model.layers.28.router.proj": {
971
  "group_size": 64,
972
  "bits": 8
973
  },
974
+ "language_model.model.layers.29.mlp.gate_proj": {
975
+ "group_size": 64,
976
+ "bits": 8
977
+ },
978
+ "language_model.model.layers.29.mlp.down_proj": {
979
+ "group_size": 64,
980
+ "bits": 8
981
+ },
982
+ "language_model.model.layers.29.mlp.up_proj": {
983
+ "group_size": 64,
984
+ "bits": 8
985
+ },
986
  "language_model.model.layers.29.router.proj": {
987
  "group_size": 64,
988
  "bits": 8
 
1070
  "tie_word_embeddings": true,
1071
  "transformers_version": "5.5.0.dev0",
1072
  "video_token_id": 258884,
1073
+ "vision_config": {
1074
+ "_name_or_path": "",
1075
+ "architectures": null,
1076
+ "attention_bias": false,
1077
+ "attention_dropout": 0.0,
1078
+ "chunk_size_feed_forward": 0,
1079
+ "default_output_length": 280,
1080
+ "dtype": "bfloat16",
1081
+ "global_head_dim": 72,
1082
+ "head_dim": 72,
1083
+ "hidden_activation": "gelu_pytorch_tanh",
1084
+ "hidden_size": 1152,
1085
+ "id2label": {
1086
+ "0": "LABEL_0",
1087
+ "1": "LABEL_1"
1088
+ },
1089
+ "initializer_range": 0.02,
1090
+ "intermediate_size": 4304,
1091
+ "is_encoder_decoder": false,
1092
+ "label2id": {
1093
+ "LABEL_0": 0,
1094
+ "LABEL_1": 1
1095
+ },
1096
+ "max_position_embeddings": 131072,
1097
+ "model_type": "gemma4_vision",
1098
+ "num_attention_heads": 16,
1099
+ "num_hidden_layers": 27,
1100
+ "num_key_value_heads": 16,
1101
+ "output_attentions": false,
1102
+ "output_hidden_states": false,
1103
+ "patch_size": 16,
1104
+ "pooling_kernel_size": 3,
1105
+ "position_embedding_size": 10240,
1106
+ "problem_type": null,
1107
+ "return_dict": true,
1108
+ "rms_norm_eps": 1e-06,
1109
+ "rope_parameters": {
1110
+ "rope_theta": 100.0,
1111
+ "rope_type": "default"
1112
+ },
1113
+ "standardize": true,
1114
+ "use_clipped_linears": false
1115
+ },
1116
  "vision_soft_tokens_per_image": 280
1117
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79044a795347440cadc6356659b328ac910063e5453d75289916db136bcf454a
3
- size 5320218487
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fda4abbfbd00608b7feb45ee1fae06ef4260d6bc621bd4d9790fe59d9b3bf91
3
+ size 5275612613
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b2fbff6ea86ca622457c6cff481404e83fba26a91ae81610e00b4ff9f137798
3
- size 5363328422
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e4a3fc2299cd049ef3e532ad142926b658be3e7f38739bcb3b72fd4e3f4779f
3
+ size 5296718228
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec1c68e5bc23c05e87b261b7a200c09d2c36b5b01ee6dcc5d8461078363aecf2
3
- size 3516685531
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42caaaf01e37ee338eb4fbabde70dd380416a9bee9b5a0ce0c1dfeb497635726
3
+ size 5036507675
model.safetensors.index.json CHANGED
@@ -1,9 +1,11 @@
1
  {
2
  "metadata": {
3
- "total_size": 14200055868,
4
- "total_parameters": 25233053440
5
  },
6
  "weight_map": {
 
 
 
7
  "language_model.model.embed_tokens.biases": "model-00001-of-00003.safetensors",
8
  "language_model.model.embed_tokens.scales": "model-00001-of-00003.safetensors",
9
  "language_model.model.embed_tokens.weight": "model-00001-of-00003.safetensors",
@@ -103,9 +105,9 @@
103
  "language_model.model.layers.10.experts.switch_glu.gate_proj.biases": "model-00001-of-00003.safetensors",
104
  "language_model.model.layers.10.experts.switch_glu.gate_proj.scales": "model-00001-of-00003.safetensors",
105
  "language_model.model.layers.10.experts.switch_glu.gate_proj.weight": "model-00001-of-00003.safetensors",
106
- "language_model.model.layers.10.experts.switch_glu.up_proj.biases": "model-00001-of-00003.safetensors",
107
- "language_model.model.layers.10.experts.switch_glu.up_proj.scales": "model-00001-of-00003.safetensors",
108
- "language_model.model.layers.10.experts.switch_glu.up_proj.weight": "model-00001-of-00003.safetensors",
109
  "language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
110
  "language_model.model.layers.10.layer_scalar": "model-00002-of-00003.safetensors",
111
  "language_model.model.layers.10.mlp.down_proj.biases": "model-00001-of-00003.safetensors",
@@ -631,9 +633,9 @@
631
  "language_model.model.layers.20.self_attn.v_proj.biases": "model-00002-of-00003.safetensors",
632
  "language_model.model.layers.20.self_attn.v_proj.scales": "model-00002-of-00003.safetensors",
633
  "language_model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
634
- "language_model.model.layers.21.experts.switch_glu.down_proj.biases": "model-00002-of-00003.safetensors",
635
- "language_model.model.layers.21.experts.switch_glu.down_proj.scales": "model-00002-of-00003.safetensors",
636
- "language_model.model.layers.21.experts.switch_glu.down_proj.weight": "model-00002-of-00003.safetensors",
637
  "language_model.model.layers.21.experts.switch_glu.gate_proj.biases": "model-00002-of-00003.safetensors",
638
  "language_model.model.layers.21.experts.switch_glu.gate_proj.scales": "model-00002-of-00003.safetensors",
639
  "language_model.model.layers.21.experts.switch_glu.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -641,7 +643,7 @@
641
  "language_model.model.layers.21.experts.switch_glu.up_proj.scales": "model-00002-of-00003.safetensors",
642
  "language_model.model.layers.21.experts.switch_glu.up_proj.weight": "model-00002-of-00003.safetensors",
643
  "language_model.model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
644
- "language_model.model.layers.21.layer_scalar": "model-00002-of-00003.safetensors",
645
  "language_model.model.layers.21.mlp.down_proj.biases": "model-00002-of-00003.safetensors",
646
  "language_model.model.layers.21.mlp.down_proj.scales": "model-00002-of-00003.safetensors",
647
  "language_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
@@ -653,10 +655,10 @@
653
  "language_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
654
  "language_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
655
  "language_model.model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
656
- "language_model.model.layers.21.post_feedforward_layernorm_1.weight": "model-00002-of-00003.safetensors",
657
- "language_model.model.layers.21.post_feedforward_layernorm_2.weight": "model-00002-of-00003.safetensors",
658
  "language_model.model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
659
- "language_model.model.layers.21.pre_feedforward_layernorm_2.weight": "model-00002-of-00003.safetensors",
660
  "language_model.model.layers.21.router.per_expert_scale": "model-00002-of-00003.safetensors",
661
  "language_model.model.layers.21.router.proj.biases": "model-00002-of-00003.safetensors",
662
  "language_model.model.layers.21.router.proj.scales": "model-00002-of-00003.safetensors",
@@ -680,47 +682,47 @@
680
  "language_model.model.layers.22.experts.switch_glu.down_proj.scales": "model-00003-of-00003.safetensors",
681
  "language_model.model.layers.22.experts.switch_glu.down_proj.weight": "model-00003-of-00003.safetensors",
682
  "language_model.model.layers.22.experts.switch_glu.gate_proj.biases": "model-00003-of-00003.safetensors",
683
- "language_model.model.layers.22.experts.switch_glu.gate_proj.scales": "model-00002-of-00003.safetensors",
684
- "language_model.model.layers.22.experts.switch_glu.gate_proj.weight": "model-00002-of-00003.safetensors",
685
  "language_model.model.layers.22.experts.switch_glu.up_proj.biases": "model-00003-of-00003.safetensors",
686
  "language_model.model.layers.22.experts.switch_glu.up_proj.scales": "model-00003-of-00003.safetensors",
687
  "language_model.model.layers.22.experts.switch_glu.up_proj.weight": "model-00003-of-00003.safetensors",
688
- "language_model.model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
689
  "language_model.model.layers.22.layer_scalar": "model-00003-of-00003.safetensors",
690
- "language_model.model.layers.22.mlp.down_proj.biases": "model-00002-of-00003.safetensors",
691
- "language_model.model.layers.22.mlp.down_proj.scales": "model-00002-of-00003.safetensors",
692
- "language_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
693
- "language_model.model.layers.22.mlp.gate_proj.biases": "model-00002-of-00003.safetensors",
694
- "language_model.model.layers.22.mlp.gate_proj.scales": "model-00002-of-00003.safetensors",
695
- "language_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
696
- "language_model.model.layers.22.mlp.up_proj.biases": "model-00002-of-00003.safetensors",
697
- "language_model.model.layers.22.mlp.up_proj.scales": "model-00002-of-00003.safetensors",
698
- "language_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
699
- "language_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
700
- "language_model.model.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
701
  "language_model.model.layers.22.post_feedforward_layernorm_1.weight": "model-00003-of-00003.safetensors",
702
  "language_model.model.layers.22.post_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
703
- "language_model.model.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
704
  "language_model.model.layers.22.pre_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
705
- "language_model.model.layers.22.router.per_expert_scale": "model-00002-of-00003.safetensors",
706
- "language_model.model.layers.22.router.proj.biases": "model-00002-of-00003.safetensors",
707
- "language_model.model.layers.22.router.proj.scales": "model-00002-of-00003.safetensors",
708
- "language_model.model.layers.22.router.proj.weight": "model-00002-of-00003.safetensors",
709
- "language_model.model.layers.22.router.scale": "model-00002-of-00003.safetensors",
710
- "language_model.model.layers.22.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
711
- "language_model.model.layers.22.self_attn.k_proj.biases": "model-00002-of-00003.safetensors",
712
- "language_model.model.layers.22.self_attn.k_proj.scales": "model-00002-of-00003.safetensors",
713
- "language_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
714
- "language_model.model.layers.22.self_attn.o_proj.biases": "model-00002-of-00003.safetensors",
715
- "language_model.model.layers.22.self_attn.o_proj.scales": "model-00002-of-00003.safetensors",
716
- "language_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
717
- "language_model.model.layers.22.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
718
- "language_model.model.layers.22.self_attn.q_proj.biases": "model-00002-of-00003.safetensors",
719
- "language_model.model.layers.22.self_attn.q_proj.scales": "model-00002-of-00003.safetensors",
720
- "language_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
721
- "language_model.model.layers.22.self_attn.v_proj.biases": "model-00002-of-00003.safetensors",
722
- "language_model.model.layers.22.self_attn.v_proj.scales": "model-00002-of-00003.safetensors",
723
- "language_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
724
  "language_model.model.layers.23.experts.switch_glu.down_proj.biases": "model-00003-of-00003.safetensors",
725
  "language_model.model.layers.23.experts.switch_glu.down_proj.scales": "model-00003-of-00003.safetensors",
726
  "language_model.model.layers.23.experts.switch_glu.down_proj.weight": "model-00003-of-00003.safetensors",
@@ -1342,6 +1344,361 @@
1342
  "language_model.model.layers.9.self_attn.v_proj.biases": "model-00001-of-00003.safetensors",
1343
  "language_model.model.layers.9.self_attn.v_proj.scales": "model-00001-of-00003.safetensors",
1344
  "language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
1345
- "language_model.model.norm.weight": "model-00003-of-00003.safetensors"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1346
  }
1347
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 15608614044
 
4
  },
5
  "weight_map": {
6
+ "embed_vision.embedding_projection.biases": "model-00003-of-00003.safetensors",
7
+ "embed_vision.embedding_projection.scales": "model-00003-of-00003.safetensors",
8
+ "embed_vision.embedding_projection.weight": "model-00003-of-00003.safetensors",
9
  "language_model.model.embed_tokens.biases": "model-00001-of-00003.safetensors",
10
  "language_model.model.embed_tokens.scales": "model-00001-of-00003.safetensors",
11
  "language_model.model.embed_tokens.weight": "model-00001-of-00003.safetensors",
 
105
  "language_model.model.layers.10.experts.switch_glu.gate_proj.biases": "model-00001-of-00003.safetensors",
106
  "language_model.model.layers.10.experts.switch_glu.gate_proj.scales": "model-00001-of-00003.safetensors",
107
  "language_model.model.layers.10.experts.switch_glu.gate_proj.weight": "model-00001-of-00003.safetensors",
108
+ "language_model.model.layers.10.experts.switch_glu.up_proj.biases": "model-00002-of-00003.safetensors",
109
+ "language_model.model.layers.10.experts.switch_glu.up_proj.scales": "model-00002-of-00003.safetensors",
110
+ "language_model.model.layers.10.experts.switch_glu.up_proj.weight": "model-00002-of-00003.safetensors",
111
  "language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
112
  "language_model.model.layers.10.layer_scalar": "model-00002-of-00003.safetensors",
113
  "language_model.model.layers.10.mlp.down_proj.biases": "model-00001-of-00003.safetensors",
 
633
  "language_model.model.layers.20.self_attn.v_proj.biases": "model-00002-of-00003.safetensors",
634
  "language_model.model.layers.20.self_attn.v_proj.scales": "model-00002-of-00003.safetensors",
635
  "language_model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
636
+ "language_model.model.layers.21.experts.switch_glu.down_proj.biases": "model-00003-of-00003.safetensors",
637
+ "language_model.model.layers.21.experts.switch_glu.down_proj.scales": "model-00003-of-00003.safetensors",
638
+ "language_model.model.layers.21.experts.switch_glu.down_proj.weight": "model-00003-of-00003.safetensors",
639
  "language_model.model.layers.21.experts.switch_glu.gate_proj.biases": "model-00002-of-00003.safetensors",
640
  "language_model.model.layers.21.experts.switch_glu.gate_proj.scales": "model-00002-of-00003.safetensors",
641
  "language_model.model.layers.21.experts.switch_glu.gate_proj.weight": "model-00002-of-00003.safetensors",
 
643
  "language_model.model.layers.21.experts.switch_glu.up_proj.scales": "model-00002-of-00003.safetensors",
644
  "language_model.model.layers.21.experts.switch_glu.up_proj.weight": "model-00002-of-00003.safetensors",
645
  "language_model.model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
646
+ "language_model.model.layers.21.layer_scalar": "model-00003-of-00003.safetensors",
647
  "language_model.model.layers.21.mlp.down_proj.biases": "model-00002-of-00003.safetensors",
648
  "language_model.model.layers.21.mlp.down_proj.scales": "model-00002-of-00003.safetensors",
649
  "language_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
 
655
  "language_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
656
  "language_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
657
  "language_model.model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
658
+ "language_model.model.layers.21.post_feedforward_layernorm_1.weight": "model-00003-of-00003.safetensors",
659
+ "language_model.model.layers.21.post_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
660
  "language_model.model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
661
+ "language_model.model.layers.21.pre_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
662
  "language_model.model.layers.21.router.per_expert_scale": "model-00002-of-00003.safetensors",
663
  "language_model.model.layers.21.router.proj.biases": "model-00002-of-00003.safetensors",
664
  "language_model.model.layers.21.router.proj.scales": "model-00002-of-00003.safetensors",
 
682
  "language_model.model.layers.22.experts.switch_glu.down_proj.scales": "model-00003-of-00003.safetensors",
683
  "language_model.model.layers.22.experts.switch_glu.down_proj.weight": "model-00003-of-00003.safetensors",
684
  "language_model.model.layers.22.experts.switch_glu.gate_proj.biases": "model-00003-of-00003.safetensors",
685
+ "language_model.model.layers.22.experts.switch_glu.gate_proj.scales": "model-00003-of-00003.safetensors",
686
+ "language_model.model.layers.22.experts.switch_glu.gate_proj.weight": "model-00003-of-00003.safetensors",
687
  "language_model.model.layers.22.experts.switch_glu.up_proj.biases": "model-00003-of-00003.safetensors",
688
  "language_model.model.layers.22.experts.switch_glu.up_proj.scales": "model-00003-of-00003.safetensors",
689
  "language_model.model.layers.22.experts.switch_glu.up_proj.weight": "model-00003-of-00003.safetensors",
690
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
691
  "language_model.model.layers.22.layer_scalar": "model-00003-of-00003.safetensors",
692
+ "language_model.model.layers.22.mlp.down_proj.biases": "model-00003-of-00003.safetensors",
693
+ "language_model.model.layers.22.mlp.down_proj.scales": "model-00003-of-00003.safetensors",
694
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
695
+ "language_model.model.layers.22.mlp.gate_proj.biases": "model-00003-of-00003.safetensors",
696
+ "language_model.model.layers.22.mlp.gate_proj.scales": "model-00003-of-00003.safetensors",
697
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
698
+ "language_model.model.layers.22.mlp.up_proj.biases": "model-00003-of-00003.safetensors",
699
+ "language_model.model.layers.22.mlp.up_proj.scales": "model-00003-of-00003.safetensors",
700
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
701
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
702
+ "language_model.model.layers.22.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
703
  "language_model.model.layers.22.post_feedforward_layernorm_1.weight": "model-00003-of-00003.safetensors",
704
  "language_model.model.layers.22.post_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
705
+ "language_model.model.layers.22.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
706
  "language_model.model.layers.22.pre_feedforward_layernorm_2.weight": "model-00003-of-00003.safetensors",
707
+ "language_model.model.layers.22.router.per_expert_scale": "model-00003-of-00003.safetensors",
708
+ "language_model.model.layers.22.router.proj.biases": "model-00003-of-00003.safetensors",
709
+ "language_model.model.layers.22.router.proj.scales": "model-00003-of-00003.safetensors",
710
+ "language_model.model.layers.22.router.proj.weight": "model-00003-of-00003.safetensors",
711
+ "language_model.model.layers.22.router.scale": "model-00003-of-00003.safetensors",
712
+ "language_model.model.layers.22.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
713
+ "language_model.model.layers.22.self_attn.k_proj.biases": "model-00003-of-00003.safetensors",
714
+ "language_model.model.layers.22.self_attn.k_proj.scales": "model-00003-of-00003.safetensors",
715
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
716
+ "language_model.model.layers.22.self_attn.o_proj.biases": "model-00003-of-00003.safetensors",
717
+ "language_model.model.layers.22.self_attn.o_proj.scales": "model-00003-of-00003.safetensors",
718
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
719
+ "language_model.model.layers.22.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
720
+ "language_model.model.layers.22.self_attn.q_proj.biases": "model-00003-of-00003.safetensors",
721
+ "language_model.model.layers.22.self_attn.q_proj.scales": "model-00003-of-00003.safetensors",
722
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
723
+ "language_model.model.layers.22.self_attn.v_proj.biases": "model-00003-of-00003.safetensors",
724
+ "language_model.model.layers.22.self_attn.v_proj.scales": "model-00003-of-00003.safetensors",
725
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
726
  "language_model.model.layers.23.experts.switch_glu.down_proj.biases": "model-00003-of-00003.safetensors",
727
  "language_model.model.layers.23.experts.switch_glu.down_proj.scales": "model-00003-of-00003.safetensors",
728
  "language_model.model.layers.23.experts.switch_glu.down_proj.weight": "model-00003-of-00003.safetensors",
 
1344
  "language_model.model.layers.9.self_attn.v_proj.biases": "model-00001-of-00003.safetensors",
1345
  "language_model.model.layers.9.self_attn.v_proj.scales": "model-00001-of-00003.safetensors",
1346
  "language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
1347
+ "language_model.model.norm.weight": "model-00003-of-00003.safetensors",
1348
+ "vision_tower.encoder.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors",
1349
+ "vision_tower.encoder.layers.0.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1350
+ "vision_tower.encoder.layers.0.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1351
+ "vision_tower.encoder.layers.0.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1352
+ "vision_tower.encoder.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1353
+ "vision_tower.encoder.layers.0.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1354
+ "vision_tower.encoder.layers.0.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1355
+ "vision_tower.encoder.layers.0.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1356
+ "vision_tower.encoder.layers.0.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1357
+ "vision_tower.encoder.layers.0.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1358
+ "vision_tower.encoder.layers.0.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1359
+ "vision_tower.encoder.layers.0.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1360
+ "vision_tower.encoder.layers.0.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1361
+ "vision_tower.encoder.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors",
1362
+ "vision_tower.encoder.layers.1.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1363
+ "vision_tower.encoder.layers.1.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1364
+ "vision_tower.encoder.layers.1.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1365
+ "vision_tower.encoder.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1366
+ "vision_tower.encoder.layers.1.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1367
+ "vision_tower.encoder.layers.1.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1368
+ "vision_tower.encoder.layers.1.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1369
+ "vision_tower.encoder.layers.1.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1370
+ "vision_tower.encoder.layers.1.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1371
+ "vision_tower.encoder.layers.1.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1372
+ "vision_tower.encoder.layers.1.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1373
+ "vision_tower.encoder.layers.1.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1374
+ "vision_tower.encoder.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors",
1375
+ "vision_tower.encoder.layers.10.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1376
+ "vision_tower.encoder.layers.10.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1377
+ "vision_tower.encoder.layers.10.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1378
+ "vision_tower.encoder.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1379
+ "vision_tower.encoder.layers.10.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1380
+ "vision_tower.encoder.layers.10.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1381
+ "vision_tower.encoder.layers.10.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1382
+ "vision_tower.encoder.layers.10.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1383
+ "vision_tower.encoder.layers.10.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1384
+ "vision_tower.encoder.layers.10.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1385
+ "vision_tower.encoder.layers.10.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1386
+ "vision_tower.encoder.layers.10.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1387
+ "vision_tower.encoder.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors",
1388
+ "vision_tower.encoder.layers.11.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1389
+ "vision_tower.encoder.layers.11.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1390
+ "vision_tower.encoder.layers.11.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1391
+ "vision_tower.encoder.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1392
+ "vision_tower.encoder.layers.11.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1393
+ "vision_tower.encoder.layers.11.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1394
+ "vision_tower.encoder.layers.11.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1395
+ "vision_tower.encoder.layers.11.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1396
+ "vision_tower.encoder.layers.11.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1397
+ "vision_tower.encoder.layers.11.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1398
+ "vision_tower.encoder.layers.11.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1399
+ "vision_tower.encoder.layers.11.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1400
+ "vision_tower.encoder.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors",
1401
+ "vision_tower.encoder.layers.12.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1402
+ "vision_tower.encoder.layers.12.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1403
+ "vision_tower.encoder.layers.12.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1404
+ "vision_tower.encoder.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1405
+ "vision_tower.encoder.layers.12.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1406
+ "vision_tower.encoder.layers.12.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1407
+ "vision_tower.encoder.layers.12.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1408
+ "vision_tower.encoder.layers.12.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1409
+ "vision_tower.encoder.layers.12.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1410
+ "vision_tower.encoder.layers.12.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1411
+ "vision_tower.encoder.layers.12.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1412
+ "vision_tower.encoder.layers.12.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1413
+ "vision_tower.encoder.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors",
1414
+ "vision_tower.encoder.layers.13.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1415
+ "vision_tower.encoder.layers.13.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1416
+ "vision_tower.encoder.layers.13.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1417
+ "vision_tower.encoder.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1418
+ "vision_tower.encoder.layers.13.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1419
+ "vision_tower.encoder.layers.13.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1420
+ "vision_tower.encoder.layers.13.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1421
+ "vision_tower.encoder.layers.13.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1422
+ "vision_tower.encoder.layers.13.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1423
+ "vision_tower.encoder.layers.13.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1424
+ "vision_tower.encoder.layers.13.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1425
+ "vision_tower.encoder.layers.13.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1426
+ "vision_tower.encoder.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors",
1427
+ "vision_tower.encoder.layers.14.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1428
+ "vision_tower.encoder.layers.14.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1429
+ "vision_tower.encoder.layers.14.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1430
+ "vision_tower.encoder.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1431
+ "vision_tower.encoder.layers.14.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1432
+ "vision_tower.encoder.layers.14.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1433
+ "vision_tower.encoder.layers.14.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1434
+ "vision_tower.encoder.layers.14.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1435
+ "vision_tower.encoder.layers.14.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1436
+ "vision_tower.encoder.layers.14.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1437
+ "vision_tower.encoder.layers.14.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1438
+ "vision_tower.encoder.layers.14.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1439
+ "vision_tower.encoder.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors",
1440
+ "vision_tower.encoder.layers.15.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1441
+ "vision_tower.encoder.layers.15.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1442
+ "vision_tower.encoder.layers.15.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1443
+ "vision_tower.encoder.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1444
+ "vision_tower.encoder.layers.15.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1445
+ "vision_tower.encoder.layers.15.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1446
+ "vision_tower.encoder.layers.15.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1447
+ "vision_tower.encoder.layers.15.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1448
+ "vision_tower.encoder.layers.15.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1449
+ "vision_tower.encoder.layers.15.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1450
+ "vision_tower.encoder.layers.15.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1451
+ "vision_tower.encoder.layers.15.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1452
+ "vision_tower.encoder.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors",
1453
+ "vision_tower.encoder.layers.16.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1454
+ "vision_tower.encoder.layers.16.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1455
+ "vision_tower.encoder.layers.16.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1456
+ "vision_tower.encoder.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1457
+ "vision_tower.encoder.layers.16.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1458
+ "vision_tower.encoder.layers.16.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1459
+ "vision_tower.encoder.layers.16.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1460
+ "vision_tower.encoder.layers.16.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1461
+ "vision_tower.encoder.layers.16.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1462
+ "vision_tower.encoder.layers.16.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1463
+ "vision_tower.encoder.layers.16.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1464
+ "vision_tower.encoder.layers.16.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1465
+ "vision_tower.encoder.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors",
1466
+ "vision_tower.encoder.layers.17.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1467
+ "vision_tower.encoder.layers.17.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1468
+ "vision_tower.encoder.layers.17.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1469
+ "vision_tower.encoder.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1470
+ "vision_tower.encoder.layers.17.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1471
+ "vision_tower.encoder.layers.17.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1472
+ "vision_tower.encoder.layers.17.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1473
+ "vision_tower.encoder.layers.17.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1474
+ "vision_tower.encoder.layers.17.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1475
+ "vision_tower.encoder.layers.17.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1476
+ "vision_tower.encoder.layers.17.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1477
+ "vision_tower.encoder.layers.17.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1478
+ "vision_tower.encoder.layers.18.input_layernorm.weight": "model-00003-of-00003.safetensors",
1479
+ "vision_tower.encoder.layers.18.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1480
+ "vision_tower.encoder.layers.18.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1481
+ "vision_tower.encoder.layers.18.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1482
+ "vision_tower.encoder.layers.18.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1483
+ "vision_tower.encoder.layers.18.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1484
+ "vision_tower.encoder.layers.18.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1485
+ "vision_tower.encoder.layers.18.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1486
+ "vision_tower.encoder.layers.18.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1487
+ "vision_tower.encoder.layers.18.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1488
+ "vision_tower.encoder.layers.18.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1489
+ "vision_tower.encoder.layers.18.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1490
+ "vision_tower.encoder.layers.18.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1491
+ "vision_tower.encoder.layers.19.input_layernorm.weight": "model-00003-of-00003.safetensors",
1492
+ "vision_tower.encoder.layers.19.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1493
+ "vision_tower.encoder.layers.19.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1494
+ "vision_tower.encoder.layers.19.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1495
+ "vision_tower.encoder.layers.19.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1496
+ "vision_tower.encoder.layers.19.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1497
+ "vision_tower.encoder.layers.19.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1498
+ "vision_tower.encoder.layers.19.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1499
+ "vision_tower.encoder.layers.19.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1500
+ "vision_tower.encoder.layers.19.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1501
+ "vision_tower.encoder.layers.19.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1502
+ "vision_tower.encoder.layers.19.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1503
+ "vision_tower.encoder.layers.19.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1504
+ "vision_tower.encoder.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors",
1505
+ "vision_tower.encoder.layers.2.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1506
+ "vision_tower.encoder.layers.2.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1507
+ "vision_tower.encoder.layers.2.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1508
+ "vision_tower.encoder.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1509
+ "vision_tower.encoder.layers.2.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1510
+ "vision_tower.encoder.layers.2.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1511
+ "vision_tower.encoder.layers.2.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1512
+ "vision_tower.encoder.layers.2.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1513
+ "vision_tower.encoder.layers.2.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1514
+ "vision_tower.encoder.layers.2.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1515
+ "vision_tower.encoder.layers.2.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1516
+ "vision_tower.encoder.layers.2.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1517
+ "vision_tower.encoder.layers.20.input_layernorm.weight": "model-00003-of-00003.safetensors",
1518
+ "vision_tower.encoder.layers.20.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1519
+ "vision_tower.encoder.layers.20.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1520
+ "vision_tower.encoder.layers.20.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1521
+ "vision_tower.encoder.layers.20.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1522
+ "vision_tower.encoder.layers.20.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1523
+ "vision_tower.encoder.layers.20.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1524
+ "vision_tower.encoder.layers.20.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1525
+ "vision_tower.encoder.layers.20.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1526
+ "vision_tower.encoder.layers.20.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1527
+ "vision_tower.encoder.layers.20.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1528
+ "vision_tower.encoder.layers.20.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1529
+ "vision_tower.encoder.layers.20.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1530
+ "vision_tower.encoder.layers.21.input_layernorm.weight": "model-00003-of-00003.safetensors",
1531
+ "vision_tower.encoder.layers.21.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1532
+ "vision_tower.encoder.layers.21.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1533
+ "vision_tower.encoder.layers.21.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1534
+ "vision_tower.encoder.layers.21.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1535
+ "vision_tower.encoder.layers.21.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1536
+ "vision_tower.encoder.layers.21.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1537
+ "vision_tower.encoder.layers.21.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1538
+ "vision_tower.encoder.layers.21.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1539
+ "vision_tower.encoder.layers.21.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1540
+ "vision_tower.encoder.layers.21.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1541
+ "vision_tower.encoder.layers.21.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1542
+ "vision_tower.encoder.layers.21.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1543
+ "vision_tower.encoder.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
1544
+ "vision_tower.encoder.layers.22.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1545
+ "vision_tower.encoder.layers.22.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1546
+ "vision_tower.encoder.layers.22.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1547
+ "vision_tower.encoder.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1548
+ "vision_tower.encoder.layers.22.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1549
+ "vision_tower.encoder.layers.22.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1550
+ "vision_tower.encoder.layers.22.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1551
+ "vision_tower.encoder.layers.22.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1552
+ "vision_tower.encoder.layers.22.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1553
+ "vision_tower.encoder.layers.22.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1554
+ "vision_tower.encoder.layers.22.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1555
+ "vision_tower.encoder.layers.22.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1556
+ "vision_tower.encoder.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
1557
+ "vision_tower.encoder.layers.23.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1558
+ "vision_tower.encoder.layers.23.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1559
+ "vision_tower.encoder.layers.23.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1560
+ "vision_tower.encoder.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1561
+ "vision_tower.encoder.layers.23.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1562
+ "vision_tower.encoder.layers.23.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1563
+ "vision_tower.encoder.layers.23.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1564
+ "vision_tower.encoder.layers.23.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1565
+ "vision_tower.encoder.layers.23.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1566
+ "vision_tower.encoder.layers.23.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1567
+ "vision_tower.encoder.layers.23.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1568
+ "vision_tower.encoder.layers.23.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1569
+ "vision_tower.encoder.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
1570
+ "vision_tower.encoder.layers.24.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1571
+ "vision_tower.encoder.layers.24.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1572
+ "vision_tower.encoder.layers.24.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1573
+ "vision_tower.encoder.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1574
+ "vision_tower.encoder.layers.24.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1575
+ "vision_tower.encoder.layers.24.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1576
+ "vision_tower.encoder.layers.24.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1577
+ "vision_tower.encoder.layers.24.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1578
+ "vision_tower.encoder.layers.24.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1579
+ "vision_tower.encoder.layers.24.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1580
+ "vision_tower.encoder.layers.24.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1581
+ "vision_tower.encoder.layers.24.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1582
+ "vision_tower.encoder.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
1583
+ "vision_tower.encoder.layers.25.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1584
+ "vision_tower.encoder.layers.25.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1585
+ "vision_tower.encoder.layers.25.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1586
+ "vision_tower.encoder.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1587
+ "vision_tower.encoder.layers.25.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1588
+ "vision_tower.encoder.layers.25.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1589
+ "vision_tower.encoder.layers.25.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1590
+ "vision_tower.encoder.layers.25.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1591
+ "vision_tower.encoder.layers.25.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1592
+ "vision_tower.encoder.layers.25.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1593
+ "vision_tower.encoder.layers.25.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1594
+ "vision_tower.encoder.layers.25.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1595
+ "vision_tower.encoder.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
1596
+ "vision_tower.encoder.layers.26.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1597
+ "vision_tower.encoder.layers.26.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1598
+ "vision_tower.encoder.layers.26.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1599
+ "vision_tower.encoder.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1600
+ "vision_tower.encoder.layers.26.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1601
+ "vision_tower.encoder.layers.26.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1602
+ "vision_tower.encoder.layers.26.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1603
+ "vision_tower.encoder.layers.26.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1604
+ "vision_tower.encoder.layers.26.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1605
+ "vision_tower.encoder.layers.26.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1606
+ "vision_tower.encoder.layers.26.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1607
+ "vision_tower.encoder.layers.26.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1608
+ "vision_tower.encoder.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors",
1609
+ "vision_tower.encoder.layers.3.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1610
+ "vision_tower.encoder.layers.3.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1611
+ "vision_tower.encoder.layers.3.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1612
+ "vision_tower.encoder.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1613
+ "vision_tower.encoder.layers.3.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1614
+ "vision_tower.encoder.layers.3.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1615
+ "vision_tower.encoder.layers.3.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1616
+ "vision_tower.encoder.layers.3.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1617
+ "vision_tower.encoder.layers.3.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1618
+ "vision_tower.encoder.layers.3.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1619
+ "vision_tower.encoder.layers.3.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1620
+ "vision_tower.encoder.layers.3.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1621
+ "vision_tower.encoder.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors",
1622
+ "vision_tower.encoder.layers.4.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1623
+ "vision_tower.encoder.layers.4.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1624
+ "vision_tower.encoder.layers.4.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1625
+ "vision_tower.encoder.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1626
+ "vision_tower.encoder.layers.4.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1627
+ "vision_tower.encoder.layers.4.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1628
+ "vision_tower.encoder.layers.4.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1629
+ "vision_tower.encoder.layers.4.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1630
+ "vision_tower.encoder.layers.4.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1631
+ "vision_tower.encoder.layers.4.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1632
+ "vision_tower.encoder.layers.4.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1633
+ "vision_tower.encoder.layers.4.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1634
+ "vision_tower.encoder.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors",
1635
+ "vision_tower.encoder.layers.5.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1636
+ "vision_tower.encoder.layers.5.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1637
+ "vision_tower.encoder.layers.5.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1638
+ "vision_tower.encoder.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1639
+ "vision_tower.encoder.layers.5.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1640
+ "vision_tower.encoder.layers.5.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1641
+ "vision_tower.encoder.layers.5.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1642
+ "vision_tower.encoder.layers.5.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1643
+ "vision_tower.encoder.layers.5.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1644
+ "vision_tower.encoder.layers.5.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1645
+ "vision_tower.encoder.layers.5.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1646
+ "vision_tower.encoder.layers.5.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1647
+ "vision_tower.encoder.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors",
1648
+ "vision_tower.encoder.layers.6.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1649
+ "vision_tower.encoder.layers.6.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1650
+ "vision_tower.encoder.layers.6.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1651
+ "vision_tower.encoder.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1652
+ "vision_tower.encoder.layers.6.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1653
+ "vision_tower.encoder.layers.6.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1654
+ "vision_tower.encoder.layers.6.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1655
+ "vision_tower.encoder.layers.6.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1656
+ "vision_tower.encoder.layers.6.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1657
+ "vision_tower.encoder.layers.6.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1658
+ "vision_tower.encoder.layers.6.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1659
+ "vision_tower.encoder.layers.6.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1660
+ "vision_tower.encoder.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors",
1661
+ "vision_tower.encoder.layers.7.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1662
+ "vision_tower.encoder.layers.7.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1663
+ "vision_tower.encoder.layers.7.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1664
+ "vision_tower.encoder.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1665
+ "vision_tower.encoder.layers.7.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1666
+ "vision_tower.encoder.layers.7.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1667
+ "vision_tower.encoder.layers.7.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1668
+ "vision_tower.encoder.layers.7.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1669
+ "vision_tower.encoder.layers.7.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1670
+ "vision_tower.encoder.layers.7.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1671
+ "vision_tower.encoder.layers.7.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1672
+ "vision_tower.encoder.layers.7.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1673
+ "vision_tower.encoder.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors",
1674
+ "vision_tower.encoder.layers.8.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1675
+ "vision_tower.encoder.layers.8.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1676
+ "vision_tower.encoder.layers.8.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1677
+ "vision_tower.encoder.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1678
+ "vision_tower.encoder.layers.8.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1679
+ "vision_tower.encoder.layers.8.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1680
+ "vision_tower.encoder.layers.8.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1681
+ "vision_tower.encoder.layers.8.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1682
+ "vision_tower.encoder.layers.8.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1683
+ "vision_tower.encoder.layers.8.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1684
+ "vision_tower.encoder.layers.8.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1685
+ "vision_tower.encoder.layers.8.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1686
+ "vision_tower.encoder.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors",
1687
+ "vision_tower.encoder.layers.9.mlp.down_proj.linear.weight": "model-00003-of-00003.safetensors",
1688
+ "vision_tower.encoder.layers.9.mlp.gate_proj.linear.weight": "model-00003-of-00003.safetensors",
1689
+ "vision_tower.encoder.layers.9.mlp.up_proj.linear.weight": "model-00003-of-00003.safetensors",
1690
+ "vision_tower.encoder.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
1691
+ "vision_tower.encoder.layers.9.post_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1692
+ "vision_tower.encoder.layers.9.pre_feedforward_layernorm.weight": "model-00003-of-00003.safetensors",
1693
+ "vision_tower.encoder.layers.9.self_attn.k_norm.weight": "model-00003-of-00003.safetensors",
1694
+ "vision_tower.encoder.layers.9.self_attn.k_proj.linear.weight": "model-00003-of-00003.safetensors",
1695
+ "vision_tower.encoder.layers.9.self_attn.o_proj.linear.weight": "model-00003-of-00003.safetensors",
1696
+ "vision_tower.encoder.layers.9.self_attn.q_norm.weight": "model-00003-of-00003.safetensors",
1697
+ "vision_tower.encoder.layers.9.self_attn.q_proj.linear.weight": "model-00003-of-00003.safetensors",
1698
+ "vision_tower.encoder.layers.9.self_attn.v_proj.linear.weight": "model-00003-of-00003.safetensors",
1699
+ "vision_tower.patch_embedder.input_proj.weight": "model-00003-of-00003.safetensors",
1700
+ "vision_tower.patch_embedder.position_embedding_table": "model-00003-of-00003.safetensors",
1701
+ "vision_tower.std_bias": "model-00003-of-00003.safetensors",
1702
+ "vision_tower.std_scale": "model-00003-of-00003.safetensors"
1703
  }
1704
  }
processor_config.json CHANGED
@@ -1,27 +1,5 @@
1
  {
2
- "audio_ms_per_token": 40,
3
  "audio_seq_length": 750,
4
- "feature_extractor": {
5
- "dither": 0.0,
6
- "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
- "feature_size": 128,
8
- "fft_length": 512,
9
- "fft_overdrive": false,
10
- "frame_length": 320,
11
- "hop_length": 160,
12
- "input_scale_factor": 1.0,
13
- "max_frequency": 8000.0,
14
- "mel_floor": 0.001,
15
- "min_frequency": 0.0,
16
- "padding_side": "right",
17
- "padding_value": 0.0,
18
- "per_bin_mean": null,
19
- "per_bin_stddev": null,
20
- "preemphasis": 0.0,
21
- "preemphasis_htk_flavor": true,
22
- "return_attention_mask": true,
23
- "sampling_rate": 16000
24
- },
25
  "image_processor": {
26
  "do_convert_rgb": true,
27
  "do_normalize": false,
@@ -43,33 +21,22 @@
43
  "patch_size": 16,
44
  "pooling_kernel_size": 3,
45
  "resample": 3,
46
- "rescale_factor": 0.00392156862745098
 
 
 
 
47
  },
48
  "image_seq_length": 280,
49
  "processor_class": "Gemma4Processor",
50
- "video_processor": {
51
- "do_convert_rgb": true,
52
- "do_normalize": true,
53
- "do_rescale": true,
54
- "do_resize": true,
55
- "do_sample_frames": true,
56
- "image_mean": [
57
- 0.0,
58
- 0.0,
59
- 0.0
60
- ],
61
- "image_std": [
62
- 1.0,
63
- 1.0,
64
- 1.0
65
- ],
66
- "max_soft_tokens": 70,
67
- "num_frames": 32,
68
- "patch_size": 16,
69
- "pooling_kernel_size": 3,
70
- "resample": 3,
71
- "rescale_factor": 0.00392156862745098,
72
- "return_metadata": false,
73
- "video_processor_type": "Gemma4VideoProcessor"
74
- }
75
- }
 
1
  {
 
2
  "audio_seq_length": 750,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "image_processor": {
4
  "do_convert_rgb": true,
5
  "do_normalize": false,
 
21
  "patch_size": 16,
22
  "pooling_kernel_size": 3,
23
  "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 224,
27
+ "width": 224
28
+ }
29
  },
30
  "image_seq_length": 280,
31
  "processor_class": "Gemma4Processor",
32
+ "feature_extractor": {
33
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
34
+ "sampling_rate": 16000,
35
+ "num_mel_filters": 128,
36
+ "fft_length": 512,
37
+ "hop_length": 160,
38
+ "chunk_duration": 8.0,
39
+ "overlap_duration": 1.0
40
+ },
41
+ "audio_ms_per_token": 40
42
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -17,50 +17,71 @@
17
  "<|video|>"
18
  ],
19
  "image_token": "<|image|>",
 
20
  "mask_token": "<mask>",
21
  "model_max_length": 1000000000000000019884624838656,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "pad_token": "<pad>",
23
  "padding_side": "left",
24
  "processor_class": "Gemma4Processor",
25
  "response_schema": {
26
- "type": "object",
27
  "properties": {
 
 
 
28
  "role": {
29
  "const": "assistant"
30
  },
31
  "thinking": {
32
  "type": "string"
33
  },
34
- "content": {
35
- "type": "string"
36
- },
37
  "tool_calls": {
38
- "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>",
39
- "type": "array",
40
  "items": {
41
- "type": "object",
42
  "properties": {
43
- "type": {
44
- "const": "function"
45
- },
46
  "function": {
47
- "type": "object",
48
- "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})",
49
  "properties": {
50
- "name": {
51
- "type": "string"
52
- },
53
  "arguments": {
 
54
  "type": "object",
55
- "x-parser": "gemma4-tool-call",
56
- "additionalProperties": {}
 
 
57
  }
58
- }
 
 
 
 
 
59
  }
60
- }
61
- }
 
 
 
62
  }
63
  },
 
64
  "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
65
  },
66
  "soc_token": "<|channel>",
 
17
  "<|video|>"
18
  ],
19
  "image_token": "<|image|>",
20
+ "is_local": true,
21
  "mask_token": "<mask>",
22
  "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
  "pad_token": "<pad>",
44
  "padding_side": "left",
45
  "processor_class": "Gemma4Processor",
46
  "response_schema": {
 
47
  "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
  "role": {
52
  "const": "assistant"
53
  },
54
  "thinking": {
55
  "type": "string"
56
  },
 
 
 
57
  "tool_calls": {
 
 
58
  "items": {
 
59
  "properties": {
 
 
 
60
  "function": {
 
 
61
  "properties": {
 
 
 
62
  "arguments": {
63
+ "additionalProperties": {},
64
  "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
  }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
  }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
  }
83
  },
84
+ "type": "object",
85
  "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
86
  },
87
  "soc_token": "<|channel>",