Snider Virgil committed on
Commit
826e60a
·
1 Parent(s): 4407e16

feat: merge LEK into lemrd weights

Browse files

LEK-2 LoRA merged into Gemma 4 31B Dense attention projections.
Converged at loss 0.0002 in 278 steps via patience-stop (best at 248).
Gradient checkpointing enabled. KV-shared layers restored from base.

Co-Authored-By: Virgil <virgil@lethean.io>

README.md CHANGED
@@ -1,9 +1,7 @@
1
  ---
2
- library_name: mlx
3
- license: apache-2.0
4
- license_link: https://ai.google.dev/gemma/docs/gemma_4_license
5
- pipeline_tag: text-generation
6
- base_model: google/gemma-4-31b-it
7
  tags:
8
  - mlx
 
 
9
  ---
 
1
  ---
2
+ language: en
 
 
 
 
3
  tags:
4
  - mlx
5
+ pipeline_tag: image-text-to-text
6
+ library_name: mlx
7
  ---
chat_template.jinja CHANGED
@@ -11,15 +11,34 @@
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
 
 
 
 
14
  {%- if value['type'] | upper == 'STRING' -%}
15
  {%- if value['enum'] -%}
16
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
  enum:{{ format_argument(value['enum']) }}
18
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  {%- elif value['type'] | upper == 'ARRAY' -%}
20
  {%- if value['items'] is mapping and value['items'] -%}
21
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
- items:{
23
  {%- set ns_items = namespace(found_first=false) -%}
24
  {%- for item_key, item_value in value['items'] | dictsort -%}
25
  {%- if item_value is not none -%}
@@ -52,32 +71,6 @@
52
  }
53
  {%- endif -%}
54
  {%- endif -%}
55
- {%- if value['nullable'] %}
56
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
- nullable:true
58
- {%- endif -%}
59
- {%- if value['type'] | upper == 'OBJECT' -%}
60
- {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
- properties:{
63
- {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
- }
65
- {%- elif value is mapping -%}
66
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
- properties:{
68
- {{- format_parameters(value, value['required'] | default([])) -}}
69
- }
70
- {%- endif -%}
71
- {%- if value['required'] -%}
72
- {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
- required:[
74
- {%- for item in value['required'] | default([]) -%}
75
- <|"|>{{- item -}}<|"|>
76
- {%- if not loop.last %},{% endif -%}
77
- {%- endfor -%}
78
- ]
79
- {%- endif -%}
80
- {%- endif -%}
81
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
  type:<|"|>{{ value['type'] | upper }}<|"|>}
83
  {%- endif -%}
@@ -157,31 +150,16 @@
157
  {{- ns.result | trim -}}
158
  {%- endmacro -%}
159
 
160
- {%- macro format_tool_response_block(tool_name, response) -%}
161
- {{- '<|tool_response>' -}}
162
- {%- if response is mapping -%}
163
- {{- 'response:' + tool_name + '{' -}}
164
- {%- for key, value in response | dictsort -%}
165
- {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
- {%- if not loop.last %},{% endif -%}
167
- {%- endfor -%}
168
- {{- '}' -}}
169
- {%- else -%}
170
- {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
- {%- endif -%}
172
- {{- '<tool_response|>' -}}
173
- {%- endmacro -%}
174
-
175
  {%- set ns = namespace(prev_message_type=None) -%}
176
  {%- set loop_messages = messages -%}
177
- {{- bos_token -}}
178
  {#- Handle System/Tool Definitions Block -#}
179
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
  {{- '<|turn>system\n' -}}
181
 
182
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
183
  {%- if enable_thinking is defined and enable_thinking -%}
184
- {{- '<|think|>\n' -}}
185
  {%- set ns.prev_message_type = 'think' -%}
186
  {%- endif -%}
187
 
@@ -202,41 +180,11 @@
202
  {{- '<turn|>\n' -}}
203
  {%- endif %}
204
 
205
- {#- Pre-scan: find last user message index for reasoning guard -#}
206
- {%- set ns_turn = namespace(last_user_idx=-1) -%}
207
- {%- for i in range(loop_messages | length) -%}
208
- {%- if loop_messages[i]['role'] == 'user' -%}
209
- {%- set ns_turn.last_user_idx = i -%}
210
- {%- endif -%}
211
- {%- endfor -%}
212
-
213
  {#- Loop through messages -#}
214
  {%- for message in loop_messages -%}
215
- {%- if message['role'] != 'tool' -%}
216
  {%- set ns.prev_message_type = None -%}
217
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
218
- {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
219
- {%- set prev_nt = namespace(role=None, found=false) -%}
220
- {%- if loop.index0 > 0 -%}
221
- {%- for j in range(loop.index0 - 1, -1, -1) -%}
222
- {%- if not prev_nt.found -%}
223
- {%- if loop_messages[j]['role'] != 'tool' -%}
224
- {%- set prev_nt.role = loop_messages[j]['role'] -%}
225
- {%- set prev_nt.found = true -%}
226
- {%- endif -%}
227
- {%- endif -%}
228
- {%- endfor -%}
229
- {%- endif -%}
230
- {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
231
- {%- if not continue_same_model_turn -%}
232
  {{- '<|turn>' + role + '\n' }}
233
- {%- endif -%}
234
-
235
- {#- Render reasoning/reasoning_content as thinking channel -#}
236
- {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
237
- {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
238
- {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
239
- {%- endif -%}
240
 
241
  {%- if message['tool_calls'] -%}
242
  {%- for tool_call in message['tool_calls'] -%}
@@ -257,49 +205,23 @@
257
  {%- set ns.prev_message_type = 'tool_call' -%}
258
  {%- endif -%}
259
 
260
- {%- set ns_tr_out = namespace(flag=false) -%}
261
- {%- if message.get('tool_responses') -%}
262
- {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
263
  {%- for tool_response in message['tool_responses'] -%}
264
- {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
265
- {%- set ns_tr_out.flag = true -%}
266
- {%- set ns.prev_message_type = 'tool_response' -%}
267
- {%- endfor -%}
268
- {%- elif message.get('tool_calls') -%}
269
- {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
270
- {%- set ns_tool_scan = namespace(stopped=false) -%}
271
- {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
272
- {%- if ns_tool_scan.stopped -%}
273
- {%- elif loop_messages[k]['role'] != 'tool' -%}
274
- {%- set ns_tool_scan.stopped = true -%}
275
- {%- else -%}
276
- {%- set follow = loop_messages[k] -%}
277
- {#- Resolve tool_call_id to function name -#}
278
- {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
279
- {%- for tc in message['tool_calls'] -%}
280
- {%- if tc.get('id') == follow.get('tool_call_id') -%}
281
- {%- set ns_tname.name = tc['function']['name'] -%}
282
- {%- endif -%}
283
  {%- endfor -%}
284
- {#- Handle content as string or content-parts array -#}
285
- {%- set tool_body = follow.get('content') -%}
286
- {%- if tool_body is string -%}
287
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
288
- {%- elif tool_body is sequence and tool_body is not string -%}
289
- {%- set ns_txt = namespace(s='') -%}
290
- {%- for part in tool_body -%}
291
- {%- if part.get('type') == 'text' -%}
292
- {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
293
- {%- endif -%}
294
- {%- endfor -%}
295
- {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
296
- {%- else -%}
297
- {{- format_tool_response_block(ns_tname.name, tool_body) -}}
298
- {%- endif -%}
299
- {%- set ns_tr_out.flag = true -%}
300
- {%- set ns.prev_message_type = 'tool_response' -%}
301
  {%- endif -%}
 
302
  {%- endfor -%}
 
303
  {%- endif -%}
304
 
305
  {%- if message['content'] is string -%}
@@ -317,31 +239,28 @@
317
  {{- item['text'] | trim -}}
318
  {%- endif -%}
319
  {%- elif item['type'] == 'image' -%}
320
- {{- '<|image|>' -}}
321
  {%- set ns.prev_message_type = 'image' -%}
322
  {%- elif item['type'] == 'audio' -%}
323
  {{- '<|audio|>' -}}
324
  {%- set ns.prev_message_type = 'audio' -%}
325
  {%- elif item['type'] == 'video' -%}
326
- {{- '<|video|>' -}}
327
  {%- set ns.prev_message_type = 'video' -%}
328
  {%- endif -%}
329
  {%- endfor -%}
330
  {%- endif -%}
331
 
332
- {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
333
- {{- '<|tool_response>' -}}
334
- {%- elif not (ns_tr_out.flag and not message.get('content')) -%}
335
  {{- '<turn|>\n' -}}
336
  {%- endif -%}
337
- {%- endif -%}
338
  {%- endfor -%}
339
 
340
  {%- if add_generation_prompt -%}
341
- {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
342
  {{- '<|turn>model\n' -}}
343
- {%- if not enable_thinking | default(false) -%}
344
- {{- '<|channel>thought\n<channel|>' -}}
345
- {%- endif -%}
346
  {%- endif -%}
347
  {%- endif -%}
 
11
  description:<|"|>{{ value['description'] }}<|"|>
12
  {%- set add_comma = true -%}
13
  {%- endif -%}
14
+ {%- if value['nullable'] %}
15
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
16
+ nullable:true
17
+ {%- endif -%}
18
  {%- if value['type'] | upper == 'STRING' -%}
19
  {%- if value['enum'] -%}
20
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
21
  enum:{{ format_argument(value['enum']) }}
22
  {%- endif -%}
23
+ {%- elif value['type'] | upper == 'OBJECT' -%}
24
+ ,properties:{
25
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
26
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
27
+ {%- elif value is mapping -%}
28
+ {{- format_parameters(value, value['required'] | default([])) -}}
29
+ {%- endif -%}
30
+ }
31
+ {%- if value['required'] -%}
32
+ ,required:[
33
+ {%- for item in value['required'] | default([]) -%}
34
+ <|"|>{{- item -}}<|"|>
35
+ {%- if not loop.last %},{% endif -%}
36
+ {%- endfor -%}
37
+ ]
38
+ {%- endif -%}
39
  {%- elif value['type'] | upper == 'ARRAY' -%}
40
  {%- if value['items'] is mapping and value['items'] -%}
41
+ ,items:{
 
42
  {%- set ns_items = namespace(found_first=false) -%}
43
  {%- for item_key, item_value in value['items'] | dictsort -%}
44
  {%- if item_value is not none -%}
 
71
  }
72
  {%- endif -%}
73
  {%- endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
75
  type:<|"|>{{ value['type'] | upper }}<|"|>}
76
  {%- endif -%}
 
150
  {{- ns.result | trim -}}
151
  {%- endmacro -%}
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  {%- set ns = namespace(prev_message_type=None) -%}
154
  {%- set loop_messages = messages -%}
155
+ {{ bos_token }}
156
  {#- Handle System/Tool Definitions Block -#}
157
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
158
  {{- '<|turn>system\n' -}}
159
 
160
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
161
  {%- if enable_thinking is defined and enable_thinking -%}
162
+ {{- '<|think|>' -}}
163
  {%- set ns.prev_message_type = 'think' -%}
164
  {%- endif -%}
165
 
 
180
  {{- '<turn|>\n' -}}
181
  {%- endif %}
182
 
 
 
 
 
 
 
 
 
183
  {#- Loop through messages -#}
184
  {%- for message in loop_messages -%}
 
185
  {%- set ns.prev_message_type = None -%}
186
  {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  {{- '<|turn>' + role + '\n' }}
 
 
 
 
 
 
 
188
 
189
  {%- if message['tool_calls'] -%}
190
  {%- for tool_call in message['tool_calls'] -%}
 
205
  {%- set ns.prev_message_type = 'tool_call' -%}
206
  {%- endif -%}
207
 
208
+ {%- if message['tool_responses'] -%}
209
+ {#- Tool Response handling -#}
 
210
  {%- for tool_response in message['tool_responses'] -%}
211
+ {{- '<|tool_response>' -}}
212
+ {%- if tool_response['response'] is mapping -%}
213
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
214
+ {%- for key, value in tool_response['response'] | dictsort -%}
215
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
216
+ {%- if not loop.last %},{% endif -%}
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  {%- endfor -%}
218
+ {{- '}' -}}
219
+ {%- else -%}
220
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  {%- endif -%}
222
+ {{- '<tool_response|>' -}}
223
  {%- endfor -%}
224
+ {%- set ns.prev_message_type = 'tool_response' -%}
225
  {%- endif -%}
226
 
227
  {%- if message['content'] is string -%}
 
239
  {{- item['text'] | trim -}}
240
  {%- endif -%}
241
  {%- elif item['type'] == 'image' -%}
242
+ {{- '\n\n<|image|>\n\n' -}}
243
  {%- set ns.prev_message_type = 'image' -%}
244
  {%- elif item['type'] == 'audio' -%}
245
  {{- '<|audio|>' -}}
246
  {%- set ns.prev_message_type = 'audio' -%}
247
  {%- elif item['type'] == 'video' -%}
248
+ {{- '\n\n<|video|>\n\n' -}}
249
  {%- set ns.prev_message_type = 'video' -%}
250
  {%- endif -%}
251
  {%- endfor -%}
252
  {%- endif -%}
253
 
254
+ {%- if not (message['tool_responses'] and not message['content']) -%}
 
 
255
  {{- '<turn|>\n' -}}
256
  {%- endif -%}
 
257
  {%- endfor -%}
258
 
259
  {%- if add_generation_prompt -%}
260
+ {%- if ns.prev_message_type != 'tool_response' -%}
261
  {{- '<|turn>model\n' -}}
262
+ {%- endif -%}
263
+ {%- if not enable_thinking | default(false) -%}
264
+ {{- '<|channel>thought\n<channel|>' -}}
265
  {%- endif -%}
266
  {%- endif -%}
config.json CHANGED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3fe7c584ab9fb211c3071ac035628ccc6861c4a7e1b59c200f58b33ddb81435
3
  size 5258667341
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c001c0a7989a0e5929058bb6227a7779f1bd9592b4e6539bc3efb98fd39d2a72
3
  size 5258667341
model-00002-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed27cbf128295cb8c5fe462ccee517f5840f034b8310af432707c6b01089d2ce
3
  size 5328997408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b23e23aca1aed3a6e972be17e41fa1fc5091413c363d24e1e587c4dada4335b
3
  size 5328997408
model-00003-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a6cf32b03f6953bb42d863a4cc28e07289b6ea9fd501dfa9882b19e5539b048
3
  size 5270505988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:078ebc355c9dec2b23a4c41e13da14eeb201900d182a6854a808ec785704bf64
3
  size 5270505988
model-00004-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:615bb5fb4e2150da2ef820f2beae5dcf7939fae8b6405c5d8cfb376a16d18ff3
3
  size 5329041180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29916abad100fe44f0a183a38ea33f452ed9e45c155724b2c95187a26b4d9351
3
  size 5329041180
model-00005-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1f2fad40a9535464bc81e522c13021d2478413be623cd07d2db00a2b19fc01d
3
  size 5346547174
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb9b388965776b1b88183c9dca1d71024f4661211238fd0ef6f236146b669d32
3
  size 5346547174
model-00006-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dbf38950088a46ed5c9baab1921f5aa75ad4a498ee51e25765c951148983761
3
  size 5270505993
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbdfdae266e2c1d4fc4ab61e25bdbda4489d8dafd16fd69e3461f93db4ae9ca
3
  size 5270505993
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f0db64eef976c2985704d0aa7cbf4bc852d62a572bc59a02b20e62fe4f72806
3
- size 813126760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ed30c6b22197c1d2701394290c46210677050ca06a13f24d39957beb693a382
3
+ size 1958855331
model.safetensors.index.json CHANGED
@@ -1,9 +1,11 @@
1
  {
2
  "metadata": {
3
- "total_size": 32617177720,
4
- "total_parameters": 30697345280
5
  },
6
  "weight_map": {
 
 
 
7
  "language_model.model.embed_tokens.biases": "model-00001-of-00007.safetensors",
8
  "language_model.model.embed_tokens.scales": "model-00001-of-00007.safetensors",
9
  "language_model.model.embed_tokens.weight": "model-00001-of-00007.safetensors",
@@ -1657,6 +1659,361 @@
1657
  "language_model.model.layers.9.self_attn.v_proj.biases": "model-00002-of-00007.safetensors",
1658
  "language_model.model.layers.9.self_attn.v_proj.scales": "model-00002-of-00007.safetensors",
1659
  "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
1660
- "language_model.model.norm.weight": "model-00007-of-00007.safetensors"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1661
  }
1662
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 33762858712
 
4
  },
5
  "weight_map": {
6
+ "embed_vision.embedding_projection.biases": "model-00007-of-00007.safetensors",
7
+ "embed_vision.embedding_projection.scales": "model-00007-of-00007.safetensors",
8
+ "embed_vision.embedding_projection.weight": "model-00007-of-00007.safetensors",
9
  "language_model.model.embed_tokens.biases": "model-00001-of-00007.safetensors",
10
  "language_model.model.embed_tokens.scales": "model-00001-of-00007.safetensors",
11
  "language_model.model.embed_tokens.weight": "model-00001-of-00007.safetensors",
 
1659
  "language_model.model.layers.9.self_attn.v_proj.biases": "model-00002-of-00007.safetensors",
1660
  "language_model.model.layers.9.self_attn.v_proj.scales": "model-00002-of-00007.safetensors",
1661
  "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
1662
+ "language_model.model.norm.weight": "model-00007-of-00007.safetensors",
1663
+ "vision_tower.encoder.layers.0.input_layernorm.weight": "model-00007-of-00007.safetensors",
1664
+ "vision_tower.encoder.layers.0.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1665
+ "vision_tower.encoder.layers.0.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1666
+ "vision_tower.encoder.layers.0.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1667
+ "vision_tower.encoder.layers.0.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1668
+ "vision_tower.encoder.layers.0.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1669
+ "vision_tower.encoder.layers.0.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1670
+ "vision_tower.encoder.layers.0.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1671
+ "vision_tower.encoder.layers.0.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1672
+ "vision_tower.encoder.layers.0.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1673
+ "vision_tower.encoder.layers.0.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1674
+ "vision_tower.encoder.layers.0.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1675
+ "vision_tower.encoder.layers.0.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1676
+ "vision_tower.encoder.layers.1.input_layernorm.weight": "model-00007-of-00007.safetensors",
1677
+ "vision_tower.encoder.layers.1.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1678
+ "vision_tower.encoder.layers.1.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1679
+ "vision_tower.encoder.layers.1.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1680
+ "vision_tower.encoder.layers.1.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1681
+ "vision_tower.encoder.layers.1.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1682
+ "vision_tower.encoder.layers.1.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1683
+ "vision_tower.encoder.layers.1.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1684
+ "vision_tower.encoder.layers.1.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1685
+ "vision_tower.encoder.layers.1.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1686
+ "vision_tower.encoder.layers.1.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1687
+ "vision_tower.encoder.layers.1.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1688
+ "vision_tower.encoder.layers.1.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1689
+ "vision_tower.encoder.layers.10.input_layernorm.weight": "model-00007-of-00007.safetensors",
1690
+ "vision_tower.encoder.layers.10.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1691
+ "vision_tower.encoder.layers.10.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1692
+ "vision_tower.encoder.layers.10.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1693
+ "vision_tower.encoder.layers.10.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1694
+ "vision_tower.encoder.layers.10.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1695
+ "vision_tower.encoder.layers.10.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1696
+ "vision_tower.encoder.layers.10.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1697
+ "vision_tower.encoder.layers.10.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1698
+ "vision_tower.encoder.layers.10.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1699
+ "vision_tower.encoder.layers.10.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1700
+ "vision_tower.encoder.layers.10.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1701
+ "vision_tower.encoder.layers.10.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1702
+ "vision_tower.encoder.layers.11.input_layernorm.weight": "model-00007-of-00007.safetensors",
1703
+ "vision_tower.encoder.layers.11.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1704
+ "vision_tower.encoder.layers.11.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1705
+ "vision_tower.encoder.layers.11.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1706
+ "vision_tower.encoder.layers.11.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1707
+ "vision_tower.encoder.layers.11.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1708
+ "vision_tower.encoder.layers.11.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1709
+ "vision_tower.encoder.layers.11.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1710
+ "vision_tower.encoder.layers.11.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1711
+ "vision_tower.encoder.layers.11.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1712
+ "vision_tower.encoder.layers.11.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1713
+ "vision_tower.encoder.layers.11.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1714
+ "vision_tower.encoder.layers.11.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1715
+ "vision_tower.encoder.layers.12.input_layernorm.weight": "model-00007-of-00007.safetensors",
1716
+ "vision_tower.encoder.layers.12.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1717
+ "vision_tower.encoder.layers.12.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1718
+ "vision_tower.encoder.layers.12.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1719
+ "vision_tower.encoder.layers.12.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1720
+ "vision_tower.encoder.layers.12.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1721
+ "vision_tower.encoder.layers.12.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1722
+ "vision_tower.encoder.layers.12.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1723
+ "vision_tower.encoder.layers.12.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1724
+ "vision_tower.encoder.layers.12.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1725
+ "vision_tower.encoder.layers.12.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1726
+ "vision_tower.encoder.layers.12.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1727
+ "vision_tower.encoder.layers.12.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1728
+ "vision_tower.encoder.layers.13.input_layernorm.weight": "model-00007-of-00007.safetensors",
1729
+ "vision_tower.encoder.layers.13.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1730
+ "vision_tower.encoder.layers.13.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1731
+ "vision_tower.encoder.layers.13.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1732
+ "vision_tower.encoder.layers.13.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1733
+ "vision_tower.encoder.layers.13.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1734
+ "vision_tower.encoder.layers.13.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1735
+ "vision_tower.encoder.layers.13.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1736
+ "vision_tower.encoder.layers.13.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1737
+ "vision_tower.encoder.layers.13.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1738
+ "vision_tower.encoder.layers.13.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1739
+ "vision_tower.encoder.layers.13.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1740
+ "vision_tower.encoder.layers.13.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1741
+ "vision_tower.encoder.layers.14.input_layernorm.weight": "model-00007-of-00007.safetensors",
1742
+ "vision_tower.encoder.layers.14.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1743
+ "vision_tower.encoder.layers.14.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1744
+ "vision_tower.encoder.layers.14.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1745
+ "vision_tower.encoder.layers.14.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1746
+ "vision_tower.encoder.layers.14.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1747
+ "vision_tower.encoder.layers.14.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1748
+ "vision_tower.encoder.layers.14.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1749
+ "vision_tower.encoder.layers.14.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1750
+ "vision_tower.encoder.layers.14.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1751
+ "vision_tower.encoder.layers.14.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1752
+ "vision_tower.encoder.layers.14.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1753
+ "vision_tower.encoder.layers.14.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1754
+ "vision_tower.encoder.layers.15.input_layernorm.weight": "model-00007-of-00007.safetensors",
1755
+ "vision_tower.encoder.layers.15.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1756
+ "vision_tower.encoder.layers.15.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1757
+ "vision_tower.encoder.layers.15.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1758
+ "vision_tower.encoder.layers.15.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1759
+ "vision_tower.encoder.layers.15.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1760
+ "vision_tower.encoder.layers.15.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1761
+ "vision_tower.encoder.layers.15.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1762
+ "vision_tower.encoder.layers.15.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1763
+ "vision_tower.encoder.layers.15.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1764
+ "vision_tower.encoder.layers.15.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1765
+ "vision_tower.encoder.layers.15.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1766
+ "vision_tower.encoder.layers.15.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1767
+ "vision_tower.encoder.layers.16.input_layernorm.weight": "model-00007-of-00007.safetensors",
1768
+ "vision_tower.encoder.layers.16.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1769
+ "vision_tower.encoder.layers.16.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1770
+ "vision_tower.encoder.layers.16.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1771
+ "vision_tower.encoder.layers.16.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1772
+ "vision_tower.encoder.layers.16.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1773
+ "vision_tower.encoder.layers.16.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1774
+ "vision_tower.encoder.layers.16.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1775
+ "vision_tower.encoder.layers.16.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1776
+ "vision_tower.encoder.layers.16.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1777
+ "vision_tower.encoder.layers.16.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1778
+ "vision_tower.encoder.layers.16.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1779
+ "vision_tower.encoder.layers.16.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1780
+ "vision_tower.encoder.layers.17.input_layernorm.weight": "model-00007-of-00007.safetensors",
1781
+ "vision_tower.encoder.layers.17.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1782
+ "vision_tower.encoder.layers.17.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1783
+ "vision_tower.encoder.layers.17.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1784
+ "vision_tower.encoder.layers.17.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1785
+ "vision_tower.encoder.layers.17.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1786
+ "vision_tower.encoder.layers.17.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1787
+ "vision_tower.encoder.layers.17.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1788
+ "vision_tower.encoder.layers.17.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1789
+ "vision_tower.encoder.layers.17.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1790
+ "vision_tower.encoder.layers.17.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1791
+ "vision_tower.encoder.layers.17.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1792
+ "vision_tower.encoder.layers.17.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1793
+ "vision_tower.encoder.layers.18.input_layernorm.weight": "model-00007-of-00007.safetensors",
1794
+ "vision_tower.encoder.layers.18.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1795
+ "vision_tower.encoder.layers.18.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1796
+ "vision_tower.encoder.layers.18.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1797
+ "vision_tower.encoder.layers.18.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1798
+ "vision_tower.encoder.layers.18.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1799
+ "vision_tower.encoder.layers.18.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1800
+ "vision_tower.encoder.layers.18.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1801
+ "vision_tower.encoder.layers.18.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1802
+ "vision_tower.encoder.layers.18.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1803
+ "vision_tower.encoder.layers.18.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1804
+ "vision_tower.encoder.layers.18.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1805
+ "vision_tower.encoder.layers.18.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1806
+ "vision_tower.encoder.layers.19.input_layernorm.weight": "model-00007-of-00007.safetensors",
1807
+ "vision_tower.encoder.layers.19.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1808
+ "vision_tower.encoder.layers.19.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1809
+ "vision_tower.encoder.layers.19.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1810
+ "vision_tower.encoder.layers.19.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1811
+ "vision_tower.encoder.layers.19.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1812
+ "vision_tower.encoder.layers.19.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1813
+ "vision_tower.encoder.layers.19.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1814
+ "vision_tower.encoder.layers.19.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1815
+ "vision_tower.encoder.layers.19.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1816
+ "vision_tower.encoder.layers.19.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1817
+ "vision_tower.encoder.layers.19.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1818
+ "vision_tower.encoder.layers.19.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1819
+ "vision_tower.encoder.layers.2.input_layernorm.weight": "model-00007-of-00007.safetensors",
1820
+ "vision_tower.encoder.layers.2.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1821
+ "vision_tower.encoder.layers.2.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1822
+ "vision_tower.encoder.layers.2.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1823
+ "vision_tower.encoder.layers.2.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1824
+ "vision_tower.encoder.layers.2.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1825
+ "vision_tower.encoder.layers.2.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1826
+ "vision_tower.encoder.layers.2.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1827
+ "vision_tower.encoder.layers.2.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1828
+ "vision_tower.encoder.layers.2.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1829
+ "vision_tower.encoder.layers.2.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1830
+ "vision_tower.encoder.layers.2.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1831
+ "vision_tower.encoder.layers.2.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1832
+ "vision_tower.encoder.layers.20.input_layernorm.weight": "model-00007-of-00007.safetensors",
1833
+ "vision_tower.encoder.layers.20.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1834
+ "vision_tower.encoder.layers.20.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1835
+ "vision_tower.encoder.layers.20.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1836
+ "vision_tower.encoder.layers.20.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1837
+ "vision_tower.encoder.layers.20.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1838
+ "vision_tower.encoder.layers.20.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1839
+ "vision_tower.encoder.layers.20.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1840
+ "vision_tower.encoder.layers.20.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1841
+ "vision_tower.encoder.layers.20.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1842
+ "vision_tower.encoder.layers.20.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1843
+ "vision_tower.encoder.layers.20.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1844
+ "vision_tower.encoder.layers.20.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1845
+ "vision_tower.encoder.layers.21.input_layernorm.weight": "model-00007-of-00007.safetensors",
1846
+ "vision_tower.encoder.layers.21.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1847
+ "vision_tower.encoder.layers.21.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1848
+ "vision_tower.encoder.layers.21.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1849
+ "vision_tower.encoder.layers.21.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1850
+ "vision_tower.encoder.layers.21.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1851
+ "vision_tower.encoder.layers.21.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1852
+ "vision_tower.encoder.layers.21.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1853
+ "vision_tower.encoder.layers.21.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1854
+ "vision_tower.encoder.layers.21.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1855
+ "vision_tower.encoder.layers.21.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1856
+ "vision_tower.encoder.layers.21.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1857
+ "vision_tower.encoder.layers.21.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1858
+ "vision_tower.encoder.layers.22.input_layernorm.weight": "model-00007-of-00007.safetensors",
1859
+ "vision_tower.encoder.layers.22.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1860
+ "vision_tower.encoder.layers.22.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1861
+ "vision_tower.encoder.layers.22.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1862
+ "vision_tower.encoder.layers.22.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1863
+ "vision_tower.encoder.layers.22.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1864
+ "vision_tower.encoder.layers.22.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1865
+ "vision_tower.encoder.layers.22.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1866
+ "vision_tower.encoder.layers.22.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1867
+ "vision_tower.encoder.layers.22.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1868
+ "vision_tower.encoder.layers.22.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1869
+ "vision_tower.encoder.layers.22.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1870
+ "vision_tower.encoder.layers.22.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1871
+ "vision_tower.encoder.layers.23.input_layernorm.weight": "model-00007-of-00007.safetensors",
1872
+ "vision_tower.encoder.layers.23.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1873
+ "vision_tower.encoder.layers.23.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1874
+ "vision_tower.encoder.layers.23.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1875
+ "vision_tower.encoder.layers.23.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1876
+ "vision_tower.encoder.layers.23.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1877
+ "vision_tower.encoder.layers.23.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1878
+ "vision_tower.encoder.layers.23.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1879
+ "vision_tower.encoder.layers.23.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1880
+ "vision_tower.encoder.layers.23.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1881
+ "vision_tower.encoder.layers.23.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1882
+ "vision_tower.encoder.layers.23.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1883
+ "vision_tower.encoder.layers.23.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1884
+ "vision_tower.encoder.layers.24.input_layernorm.weight": "model-00007-of-00007.safetensors",
1885
+ "vision_tower.encoder.layers.24.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1886
+ "vision_tower.encoder.layers.24.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1887
+ "vision_tower.encoder.layers.24.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1888
+ "vision_tower.encoder.layers.24.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1889
+ "vision_tower.encoder.layers.24.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1890
+ "vision_tower.encoder.layers.24.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1891
+ "vision_tower.encoder.layers.24.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1892
+ "vision_tower.encoder.layers.24.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1893
+ "vision_tower.encoder.layers.24.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1894
+ "vision_tower.encoder.layers.24.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1895
+ "vision_tower.encoder.layers.24.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1896
+ "vision_tower.encoder.layers.24.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1897
+ "vision_tower.encoder.layers.25.input_layernorm.weight": "model-00007-of-00007.safetensors",
1898
+ "vision_tower.encoder.layers.25.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1899
+ "vision_tower.encoder.layers.25.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1900
+ "vision_tower.encoder.layers.25.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1901
+ "vision_tower.encoder.layers.25.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1902
+ "vision_tower.encoder.layers.25.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1903
+ "vision_tower.encoder.layers.25.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1904
+ "vision_tower.encoder.layers.25.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1905
+ "vision_tower.encoder.layers.25.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1906
+ "vision_tower.encoder.layers.25.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1907
+ "vision_tower.encoder.layers.25.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1908
+ "vision_tower.encoder.layers.25.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1909
+ "vision_tower.encoder.layers.25.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1910
+ "vision_tower.encoder.layers.26.input_layernorm.weight": "model-00007-of-00007.safetensors",
1911
+ "vision_tower.encoder.layers.26.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1912
+ "vision_tower.encoder.layers.26.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1913
+ "vision_tower.encoder.layers.26.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1914
+ "vision_tower.encoder.layers.26.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1915
+ "vision_tower.encoder.layers.26.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1916
+ "vision_tower.encoder.layers.26.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1917
+ "vision_tower.encoder.layers.26.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1918
+ "vision_tower.encoder.layers.26.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1919
+ "vision_tower.encoder.layers.26.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1920
+ "vision_tower.encoder.layers.26.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1921
+ "vision_tower.encoder.layers.26.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1922
+ "vision_tower.encoder.layers.26.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1923
+ "vision_tower.encoder.layers.3.input_layernorm.weight": "model-00007-of-00007.safetensors",
1924
+ "vision_tower.encoder.layers.3.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1925
+ "vision_tower.encoder.layers.3.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1926
+ "vision_tower.encoder.layers.3.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1927
+ "vision_tower.encoder.layers.3.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1928
+ "vision_tower.encoder.layers.3.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1929
+ "vision_tower.encoder.layers.3.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1930
+ "vision_tower.encoder.layers.3.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1931
+ "vision_tower.encoder.layers.3.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1932
+ "vision_tower.encoder.layers.3.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1933
+ "vision_tower.encoder.layers.3.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1934
+ "vision_tower.encoder.layers.3.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1935
+ "vision_tower.encoder.layers.3.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1936
+ "vision_tower.encoder.layers.4.input_layernorm.weight": "model-00007-of-00007.safetensors",
1937
+ "vision_tower.encoder.layers.4.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1938
+ "vision_tower.encoder.layers.4.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1939
+ "vision_tower.encoder.layers.4.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1940
+ "vision_tower.encoder.layers.4.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1941
+ "vision_tower.encoder.layers.4.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1942
+ "vision_tower.encoder.layers.4.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1943
+ "vision_tower.encoder.layers.4.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1944
+ "vision_tower.encoder.layers.4.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1945
+ "vision_tower.encoder.layers.4.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1946
+ "vision_tower.encoder.layers.4.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1947
+ "vision_tower.encoder.layers.4.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1948
+ "vision_tower.encoder.layers.4.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1949
+ "vision_tower.encoder.layers.5.input_layernorm.weight": "model-00007-of-00007.safetensors",
1950
+ "vision_tower.encoder.layers.5.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1951
+ "vision_tower.encoder.layers.5.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1952
+ "vision_tower.encoder.layers.5.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1953
+ "vision_tower.encoder.layers.5.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1954
+ "vision_tower.encoder.layers.5.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1955
+ "vision_tower.encoder.layers.5.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1956
+ "vision_tower.encoder.layers.5.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1957
+ "vision_tower.encoder.layers.5.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1958
+ "vision_tower.encoder.layers.5.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1959
+ "vision_tower.encoder.layers.5.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1960
+ "vision_tower.encoder.layers.5.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1961
+ "vision_tower.encoder.layers.5.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1962
+ "vision_tower.encoder.layers.6.input_layernorm.weight": "model-00007-of-00007.safetensors",
1963
+ "vision_tower.encoder.layers.6.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1964
+ "vision_tower.encoder.layers.6.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1965
+ "vision_tower.encoder.layers.6.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1966
+ "vision_tower.encoder.layers.6.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1967
+ "vision_tower.encoder.layers.6.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1968
+ "vision_tower.encoder.layers.6.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1969
+ "vision_tower.encoder.layers.6.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1970
+ "vision_tower.encoder.layers.6.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1971
+ "vision_tower.encoder.layers.6.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1972
+ "vision_tower.encoder.layers.6.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1973
+ "vision_tower.encoder.layers.6.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1974
+ "vision_tower.encoder.layers.6.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1975
+ "vision_tower.encoder.layers.7.input_layernorm.weight": "model-00007-of-00007.safetensors",
1976
+ "vision_tower.encoder.layers.7.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1977
+ "vision_tower.encoder.layers.7.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1978
+ "vision_tower.encoder.layers.7.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1979
+ "vision_tower.encoder.layers.7.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1980
+ "vision_tower.encoder.layers.7.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1981
+ "vision_tower.encoder.layers.7.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1982
+ "vision_tower.encoder.layers.7.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1983
+ "vision_tower.encoder.layers.7.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1984
+ "vision_tower.encoder.layers.7.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1985
+ "vision_tower.encoder.layers.7.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1986
+ "vision_tower.encoder.layers.7.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
1987
+ "vision_tower.encoder.layers.7.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
1988
+ "vision_tower.encoder.layers.8.input_layernorm.weight": "model-00007-of-00007.safetensors",
1989
+ "vision_tower.encoder.layers.8.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
1990
+ "vision_tower.encoder.layers.8.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
1991
+ "vision_tower.encoder.layers.8.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
1992
+ "vision_tower.encoder.layers.8.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
1993
+ "vision_tower.encoder.layers.8.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1994
+ "vision_tower.encoder.layers.8.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
1995
+ "vision_tower.encoder.layers.8.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
1996
+ "vision_tower.encoder.layers.8.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
1997
+ "vision_tower.encoder.layers.8.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
1998
+ "vision_tower.encoder.layers.8.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
1999
+ "vision_tower.encoder.layers.8.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
2000
+ "vision_tower.encoder.layers.8.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
2001
+ "vision_tower.encoder.layers.9.input_layernorm.weight": "model-00007-of-00007.safetensors",
2002
+ "vision_tower.encoder.layers.9.mlp.down_proj.linear.weight": "model-00007-of-00007.safetensors",
2003
+ "vision_tower.encoder.layers.9.mlp.gate_proj.linear.weight": "model-00007-of-00007.safetensors",
2004
+ "vision_tower.encoder.layers.9.mlp.up_proj.linear.weight": "model-00007-of-00007.safetensors",
2005
+ "vision_tower.encoder.layers.9.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
2006
+ "vision_tower.encoder.layers.9.post_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
2007
+ "vision_tower.encoder.layers.9.pre_feedforward_layernorm.weight": "model-00007-of-00007.safetensors",
2008
+ "vision_tower.encoder.layers.9.self_attn.k_norm.weight": "model-00007-of-00007.safetensors",
2009
+ "vision_tower.encoder.layers.9.self_attn.k_proj.linear.weight": "model-00007-of-00007.safetensors",
2010
+ "vision_tower.encoder.layers.9.self_attn.o_proj.linear.weight": "model-00007-of-00007.safetensors",
2011
+ "vision_tower.encoder.layers.9.self_attn.q_norm.weight": "model-00007-of-00007.safetensors",
2012
+ "vision_tower.encoder.layers.9.self_attn.q_proj.linear.weight": "model-00007-of-00007.safetensors",
2013
+ "vision_tower.encoder.layers.9.self_attn.v_proj.linear.weight": "model-00007-of-00007.safetensors",
2014
+ "vision_tower.patch_embedder.input_proj.weight": "model-00007-of-00007.safetensors",
2015
+ "vision_tower.patch_embedder.position_embedding_table": "model-00007-of-00007.safetensors",
2016
+ "vision_tower.std_bias": "model-00007-of-00007.safetensors",
2017
+ "vision_tower.std_scale": "model-00007-of-00007.safetensors"
2018
  }
2019
  }
processor_config.json CHANGED
@@ -28,5 +28,15 @@
28
  }
29
  },
30
  "image_seq_length": 280,
31
- "processor_class": "Gemma4Processor"
32
- }
 
 
 
 
 
 
 
 
 
 
 
28
  }
29
  },
30
  "image_seq_length": 280,
31
+ "processor_class": "Gemma4Processor",
32
+ "feature_extractor": {
33
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
34
+ "sampling_rate": 16000,
35
+ "num_mel_filters": 128,
36
+ "fft_length": 512,
37
+ "hop_length": 160,
38
+ "chunk_duration": 8.0,
39
+ "overlap_duration": 1.0
40
+ },
41
+ "audio_ms_per_token": 40
42
+ }
tokenizer_config.json CHANGED
@@ -13,6 +13,9 @@
13
  "etc_token": "<tool_call|>",
14
  "etd_token": "<tool|>",
15
  "etr_token": "<tool_response|>",
 
 
 
16
  "image_token": "<|image|>",
17
  "is_local": true,
18
  "mask_token": "<mask>",
@@ -35,8 +38,7 @@
35
  "stc_token": "<|tool_call>",
36
  "std_token": "<|tool>",
37
  "str_token": "<|tool_response>",
38
- "think_token": "<|think|>",
39
- "video_token": "<|video|>"
40
  },
41
  "pad_token": "<pad>",
42
  "padding_side": "left",
@@ -89,6 +91,5 @@
89
  "str_token": "<|tool_response>",
90
  "think_token": "<|think|>",
91
  "tokenizer_class": "GemmaTokenizer",
92
- "unk_token": "<unk>",
93
- "video_token": "<|video|>"
94
  }
 
13
  "etc_token": "<tool_call|>",
14
  "etd_token": "<tool|>",
15
  "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
  "image_token": "<|image|>",
20
  "is_local": true,
21
  "mask_token": "<mask>",
 
38
  "stc_token": "<|tool_call>",
39
  "std_token": "<|tool>",
40
  "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
 
42
  },
43
  "pad_token": "<pad>",
44
  "padding_side": "left",
 
91
  "str_token": "<|tool_response>",
92
  "think_token": "<|think|>",
93
  "tokenizer_class": "GemmaTokenizer",
94
+ "unk_token": "<unk>"
 
95
  }