evergyu committed

Commit fe80455 · verified · 1 Parent(s): b7d709f

Upload finetuned model
added_tokens.json CHANGED
@@ -1,10 +1,38 @@
  {
- "</box>": 262151,
- "</quad>": 262147,
- "</ref>": 262149,
- "<IMG_CONTEXT>": 262145,
- "<box>": 262150,
- "<image_soft_token>": 262144,
- "<quad>": 262146,
- "<ref>": 262148
+ "</box>": 151677,
+ "</img>": 151670,
+ "</quad>": 151673,
+ "</ref>": 151675,
+ "</think>": 151668,
+ "</tool_call>": 151658,
+ "</tool_response>": 151666,
+ "<IMG_CONTEXT>": 151671,
+ "<box>": 151676,
+ "<img>": 151669,
+ "<quad>": 151672,
+ "<ref>": 151674,
+ "<think>": 151667,
+ "<tool_call>": 151657,
+ "<tool_response>": 151665,
+ "<video>": 151678,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
  }
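The special-token vocabulary moves from the Gemma ID range (262144+) to the Qwen range (151643+). A minimal sketch to confirm the remapped IDs, assuming this checkout loads with transformers' AutoTokenizer (the local path is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path: point this at the repo checkout or its Hub id.
tok = AutoTokenizer.from_pretrained("./finetuned-model")

# Entries in added_tokens.json should round-trip to the IDs declared above,
# e.g. <IMG_CONTEXT> -> 151671 (previously 262145) and <|im_start|> -> 151644.
for t in ["<IMG_CONTEXT>", "<|im_start|>", "<|im_end|>", "<video>"]:
    print(t, tok.convert_tokens_to_ids(t))
```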
chat_template.jinja CHANGED
@@ -1,47 +1,6 @@
- {{ bos_token }}
- {%- if messages[0]['role'] == 'system' -%}
- {%- if messages[0]['content'] is string -%}
- {%- set first_user_prefix = messages[0]['content'] + '
-
- ' -%}
- {%- else -%}
- {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
-
- ' -%}
- {%- endif -%}
- {%- set loop_messages = messages[1:] -%}
- {%- else -%}
- {%- set first_user_prefix = "" -%}
- {%- set loop_messages = messages -%}
- {%- endif -%}
- {%- for message in loop_messages -%}
- {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
- {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
- {%- endif -%}
- {%- if (message['role'] == 'assistant') -%}
- {%- set role = "model" -%}
- {%- else -%}
- {%- set role = message['role'] -%}
- {%- endif -%}
- {{ '<start_of_turn>' + role + '
- ' + (first_user_prefix if loop.first else "") }}
- {%- if message['content'] is string -%}
- {{ message['content'] | trim }}
- {%- elif message['content'] is iterable -%}
- {%- for item in message['content'] -%}
- {%- if item['type'] == 'image' -%}
- {{ '<start_of_image>' }}
- {%- elif item['type'] == 'text' -%}
- {{ item['text'] | trim }}
- {%- endif -%}
- {%- endfor -%}
- {%- else -%}
- {{ raise_exception("Invalid content type") }}
- {%- endif -%}
- {{ '<end_of_turn>
- ' }}
- {%- endfor -%}
- {%- if add_generation_prompt -%}
- {{'<start_of_turn>model
- '}}
- {%- endif -%}
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
+ '}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+ ' }}{% elif content['type'] == 'video' %}{{ '<video>
+ ' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+ '}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+ ' }}{% endif %}
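The Gemma turn markup (<start_of_turn>/<end_of_turn>) is replaced by the Qwen/ChatML convention. A minimal sketch of how the new template renders a multimodal turn, assuming the repo's tokenizer picks up chat_template.jinja (the path is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path: substitute the repo checkout or Hub id.
tok = AutoTokenizer.from_pretrained("./finetuned-model")

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]

# Renders: <|im_start|>user\n<IMG_CONTEXT>\nDescribe this image.<|im_end|>\n<|im_start|>assistant\n
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```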
config.json CHANGED
@@ -8,20 +8,18 @@
  "model_type": "internvl",
  "projector_hidden_act": "gelu",
  "text_config": {
- "_sliding_window_pattern": 6,
+ "_name_or_path": "/root/codespace/checkpoints/Qwen3-0.6B",
  "architectures": [
- "Gemma3ForCausalLM"
+ "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
- "attn_logit_softcapping": null,
  "bos_token_id": 2,
- "cache_implementation": "hybrid",
+ "debug": false,
  "eos_token_id": 1,
- "final_logit_softcapping": null,
+ "ep_size": 1,
  "head_dim": 256,
  "hidden_act": "silu",
- "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
@@ -63,23 +61,19 @@
  ],
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
+ "micro_forward": false,
  "model_type": "qwen3",
  "num_attention_heads": 8,
  "num_hidden_layers": 34,
  "num_key_value_heads": 4,
- "pad_token_id": 0,
- "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
- "rope_local_base_freq": 10000.0,
- "rope_scaling": {
- "factor": 8.0,
- "rope_type": "linear"
- },
- "rope_theta": 1000000.0,
+ "rope_scaling": null,
+ "rope_theta": 1000000,
+ "skip_checkpoint": false,
  "sliding_window": null,
- "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "use_cache": false,
+ "use_cache": true,
+ "use_deepep": false,
  "use_sliding_window": false,
  "vocab_size": 262152
  },
@@ -89,9 +83,8 @@
  "architectures": [
  "InternVisionModel"
  ],
- "attention_bias": false,
+ "attention_bias": true,
  "attention_dropout": 0.0,
- "drop_path_rate": 0.1,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
@@ -100,8 +93,8 @@
  448,
  448
  ],
- "initializer_factor": 1.0,
- "initializer_range": 0.02,
+ "initializer_factor": 0.1,
+ "initializer_range": 1e-10,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-06,
  "layer_scale_init_value": 0.1,
@@ -115,11 +108,8 @@
  14
  ],
  "projection_dropout": 0.0,
- "qk_normalization": false,
- "qkv_bias": true,
  "torch_dtype": "bfloat16",
  "use_absolute_position_embeddings": true,
- "use_flash_attn": true,
  "use_mask_token": false,
  "use_mean_pooling": true,
  "use_qk_norm": false
generation_config.json CHANGED
@@ -2,7 +2,5 @@
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 1,
- "pad_token_id": 0,
- "transformers_version": "4.55.0",
- "use_cache": false
+ "transformers_version": "4.55.0"
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d8778027066ad533f17902037e617a64b88a704507c5d448cb0fefd1eb285b98
- size 4951909536
+ oid sha256:c5774f23db9a6e31eda4de4f71c14ceae0d8b7357428245ed91ec45d5b3a524d
+ size 4952065960
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:915d374154c8a48ae93204f43a893a43b68bcbb0ab39ff48b5947a664f56315c
- size 3450059472
+ oid sha256:e1be1040a14a07e8092bfd5d0a48dccaf00499d7b6b3102625d4a891a8aba3eb
+ size 4792277960
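These are Git LFS pointer files; the shards themselves live in LFS storage. A minimal sketch to verify a downloaded shard against its pointer, using the oid/size from the updated pointer for model-00001-of-00002.safetensors:

```python
import hashlib

# Values from the updated LFS pointer above.
EXPECTED_OID = "c5774f23db9a6e31eda4de4f71c14ceae0d8b7357428245ed91ec45d5b3a524d"
EXPECTED_SIZE = 4952065960

h = hashlib.sha256()
size = 0
with open("model-00001-of-00002.safetensors", "rb") as f:
    # Hash in 1 MiB chunks so the ~5 GB shard never sits fully in memory.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size}"
assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("shard matches its LFS pointer")
```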
model.safetensors.index.json CHANGED
@@ -1,9 +1,10 @@
  {
  "metadata": {
- "total_parameters": 4200936960,
- "total_size": 8401873920
+ "total_parameters": 4872119808,
+ "total_size": 9744239616
  },
  "weight_map": {
+ "language_model.lm_head.weight": "model-00002-of-00002.safetensors",
  "language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
  "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
  "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
@@ -390,10 +391,13 @@
  "vision_tower.embeddings.patch_embeddings.projection.bias": "model-00001-of-00002.safetensors",
  "vision_tower.embeddings.patch_embeddings.projection.weight": "model-00001-of-00002.safetensors",
  "vision_tower.embeddings.position_embeddings": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.0.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.0.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.0.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.lambda_2": "model-00001-of-00002.safetensors",
@@ -405,10 +409,13 @@
  "vision_tower.encoder.layer.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.1.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.1.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.1.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.lambda_2": "model-00001-of-00002.safetensors",
@@ -420,10 +427,13 @@
  "vision_tower.encoder.layer.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.10.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.10.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.10.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.lambda_2": "model-00001-of-00002.safetensors",
@@ -435,10 +445,13 @@
  "vision_tower.encoder.layer.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.11.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.11.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.11.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.lambda_2": "model-00001-of-00002.safetensors",
@@ -450,10 +463,13 @@
  "vision_tower.encoder.layer.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.12.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.12.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.12.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.lambda_2": "model-00001-of-00002.safetensors",
@@ -465,10 +481,13 @@
  "vision_tower.encoder.layer.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.13.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.13.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.13.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.lambda_2": "model-00001-of-00002.safetensors",
@@ -480,10 +499,13 @@
  "vision_tower.encoder.layer.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.14.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.14.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.14.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.lambda_2": "model-00001-of-00002.safetensors",
@@ -495,10 +517,13 @@
  "vision_tower.encoder.layer.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.15.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.15.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.15.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.lambda_2": "model-00001-of-00002.safetensors",
@@ -510,10 +535,13 @@
  "vision_tower.encoder.layer.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.16.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.16.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.16.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.lambda_2": "model-00001-of-00002.safetensors",
@@ -525,10 +553,13 @@
  "vision_tower.encoder.layer.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.17.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.17.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.17.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.lambda_2": "model-00001-of-00002.safetensors",
@@ -540,10 +571,13 @@
  "vision_tower.encoder.layer.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.18.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.18.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.18.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.lambda_2": "model-00001-of-00002.safetensors",
@@ -555,10 +589,13 @@
  "vision_tower.encoder.layer.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.19.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.19.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.19.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.lambda_2": "model-00001-of-00002.safetensors",
@@ -570,10 +607,13 @@
  "vision_tower.encoder.layer.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.2.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.2.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.2.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.lambda_2": "model-00001-of-00002.safetensors",
@@ -585,10 +625,13 @@
  "vision_tower.encoder.layer.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.20.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.20.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.20.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.lambda_2": "model-00001-of-00002.safetensors",
@@ -600,10 +643,13 @@
  "vision_tower.encoder.layer.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.21.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.21.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.21.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.lambda_2": "model-00001-of-00002.safetensors",
@@ -615,10 +661,13 @@
  "vision_tower.encoder.layer.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.22.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.22.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.22.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.lambda_2": "model-00001-of-00002.safetensors",
@@ -630,10 +679,13 @@
  "vision_tower.encoder.layer.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.23.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.23.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.23.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.lambda_2": "model-00001-of-00002.safetensors",
@@ -645,10 +697,13 @@
  "vision_tower.encoder.layer.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.3.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.3.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.3.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.lambda_2": "model-00001-of-00002.safetensors",
@@ -660,10 +715,13 @@
  "vision_tower.encoder.layer.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.4.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.4.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.4.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.lambda_2": "model-00001-of-00002.safetensors",
@@ -675,10 +733,13 @@
  "vision_tower.encoder.layer.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.5.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.5.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.5.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.lambda_2": "model-00001-of-00002.safetensors",
@@ -690,10 +751,13 @@
  "vision_tower.encoder.layer.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.6.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.6.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.6.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.lambda_2": "model-00001-of-00002.safetensors",
@@ -705,10 +769,13 @@
  "vision_tower.encoder.layer.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.7.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.7.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.7.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.lambda_2": "model-00001-of-00002.safetensors",
@@ -720,10 +787,13 @@
  "vision_tower.encoder.layer.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.8.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.8.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.8.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.lambda_2": "model-00001-of-00002.safetensors",
@@ -735,10 +805,13 @@
  "vision_tower.encoder.layer.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.9.attention.k_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.attention.k_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.9.attention.q_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.attention.q_proj.weight": "model-00001-of-00002.safetensors",
+ "vision_tower.encoder.layer.9.attention.v_proj.bias": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.attention.v_proj.weight": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.lambda_1": "model-00001-of-00002.safetensors",
  "vision_tower.encoder.layer.9.lambda_2": "model-00001-of-00002.safetensors",
793
  "vision_tower.encoder.layer.8.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
794
+ "vision_tower.encoder.layer.8.attention.q_proj.bias": "model-00001-of-00002.safetensors",
795
  "vision_tower.encoder.layer.8.attention.q_proj.weight": "model-00001-of-00002.safetensors",
796
+ "vision_tower.encoder.layer.8.attention.v_proj.bias": "model-00001-of-00002.safetensors",
797
  "vision_tower.encoder.layer.8.attention.v_proj.weight": "model-00001-of-00002.safetensors",
798
  "vision_tower.encoder.layer.8.lambda_1": "model-00001-of-00002.safetensors",
799
  "vision_tower.encoder.layer.8.lambda_2": "model-00001-of-00002.safetensors",
 
805
  "vision_tower.encoder.layer.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
806
  "vision_tower.encoder.layer.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
807
  "vision_tower.encoder.layer.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
808
+ "vision_tower.encoder.layer.9.attention.k_proj.bias": "model-00001-of-00002.safetensors",
809
  "vision_tower.encoder.layer.9.attention.k_proj.weight": "model-00001-of-00002.safetensors",
810
  "vision_tower.encoder.layer.9.attention.projection_layer.bias": "model-00001-of-00002.safetensors",
811
  "vision_tower.encoder.layer.9.attention.projection_layer.weight": "model-00001-of-00002.safetensors",
812
+ "vision_tower.encoder.layer.9.attention.q_proj.bias": "model-00001-of-00002.safetensors",
813
  "vision_tower.encoder.layer.9.attention.q_proj.weight": "model-00001-of-00002.safetensors",
814
+ "vision_tower.encoder.layer.9.attention.v_proj.bias": "model-00001-of-00002.safetensors",
815
  "vision_tower.encoder.layer.9.attention.v_proj.weight": "model-00001-of-00002.safetensors",
816
  "vision_tower.encoder.layer.9.lambda_1": "model-00001-of-00002.safetensors",
817
  "vision_tower.encoder.layer.9.lambda_2": "model-00001-of-00002.safetensors",
special_tokens_map.json CHANGED
@@ -1,84 +1,44 @@
  {
  "additional_special_tokens": [
- {
- "content": "<IMG_CONTEXT>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "<quad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "</quad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "<ref>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "</ref>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "<box>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "</box>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
  ],
- "boi_token": "<start_of_image>",
- "bos_token": {
- "content": "<bos>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eoi_token": "<end_of_image>",
  "eos_token": {
- "content": "<eos>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
- "image_token": "<image_soft_token>",
  "pad_token": {
- "content": "<pad>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
- "unk_token": {
- "content": "<unk>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
  }

  {
  "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>",
+ "<img>",
+ "</img>",
+ "<IMG_CONTEXT>",
+ "<quad>",
+ "</quad>",
+ "<ref>",
+ "</ref>",
+ "<box>",
+ "</box>"
  ],
+ "context_image_token": "<IMG_CONTEXT>",
+ "end_image_token": "</img>",
  "eos_token": {
+ "content": "<|im_end|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
  "pad_token": {
+ "content": "<|endoftext|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
+ "start_image_token": "<img>",
+ "video_token": "<video>"
  }
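
The rewritten map replaces the `<bos>`/`<eos>`/`<image_soft_token>`-based entries with the Qwen/ChatML set: `<|im_end|>` becomes the EOS token, `<|endoftext|>` the padding token, and the image/region markers (`<img>`, `<IMG_CONTEXT>`, `<quad>`, `<ref>`, `<box>`, ...) are registered as additional special tokens alongside the new `context_image_token`, `start_image_token`, `end_image_token`, and `video_token` fields. A small verification sketch (an aside, assuming it runs from a local clone of this repo):

```python
# Sketch: confirm the updated special-token map is picked up by the tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # local clone assumed

assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
assert "<IMG_CONTEXT>" in tok.additional_special_tokens
print(len(tok))  # vocab size, including the added special tokens
```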
tokenizer_config.json CHANGED
@@ -336,4 +336,4 @@
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null,
  "video_token": "<video>"
- }

  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null,
  "video_token": "<video>"
+ }
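
Only the file's closing brace changes here (most likely a trailing-newline fix); the `Qwen2Tokenizer` class, the null `unk_token`, and the `<video>` token carry over unchanged. For completeness, a hedged usage sketch of the resulting chat formatting — the exact rendering depends on the repo's chat template, so only the ChatML markers registered above are assumed:

```python
# Sketch: render a chat turn with the repo's template (output shape assumed).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # local clone assumed
messages = [{"role": "user", "content": "Describe the image."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # expected to open with <|im_start|>user and close with <|im_start|>assistant
```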