chrisc36 committed
Commit ccfc55c · verified · 1 Parent(s): a9b8aa1

Upload folder using huggingface_hub

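The commit message indicates the folder was pushed with the huggingface_hub client. For reference, a minimal sketch of such an upload (the repo id and local folder path below are placeholders, not taken from this commit):

from huggingface_hub import upload_folder

# Placeholder values: the target repo and local folder are not named in this commit view.
upload_folder(
    folder_path="./converted_checkpoint",
    repo_id="your-username/your-model",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
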
convert_molmo_point_to_hf.py CHANGED
@@ -204,9 +204,9 @@ def convert_molmo2(
     new_state_dict = {}
     for key, val in state_dict.items():
         if key == "transformer.ff_out.new_weight":
-            new_key = "new_output_embeddings"
+            new_key = "lm_head.new_output_embeddings"
         elif key == "transformer.ff_out.weight":
-            new_key = "output_embeddings"
+            new_key = "lm_head.output_embeddings"
         else:
             new_key = f"{base_model_prefix}.{key}"
         new_state_dict[new_key] = val
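
For reference, a minimal self-contained sketch of the remapping this hunk introduces, run on a toy state dict (the values are placeholders, not weights from this repository):

# Toy state dict standing in for the converted Molmo checkpoint (placeholder values).
state_dict = {
    "transformer.ff_out.weight": "original output-embedding tensor",
    "transformer.ff_out.new_weight": "output-embedding tensor for added tokens",
    "transformer.blocks.0.attn_norm.weight": "some block weight",
}
base_model_prefix = "model"

new_state_dict = {}
for key, val in state_dict.items():
    if key == "transformer.ff_out.new_weight":
        new_key = "lm_head.new_output_embeddings"   # was "new_output_embeddings"
    elif key == "transformer.ff_out.weight":
        new_key = "lm_head.output_embeddings"       # was "output_embeddings"
    else:
        new_key = f"{base_model_prefix}.{key}"
    new_state_dict[new_key] = val

print(sorted(new_state_dict))
# ['lm_head.new_output_embeddings', 'lm_head.output_embeddings',
#  'model.transformer.blocks.0.attn_norm.weight']

The renamed keys match the new "lm_head.*" entries that appear in the model.safetensors.index.json diff below.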
model-00001-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f3e72190ef32e8a730642be569f673e3db747342d1233337b8839f805c02ce8
-size 4982833608
+oid sha256:19ab63a529d72b101000396a03df34f2dc32e8d744b0c5cec50f0542eea6b8db
+size 4974567112
model-00002-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:396c71733dec4db53150621ba67aea1c8444e3cbdc06410427ee52c9b34090ff
-size 4798510440
+oid sha256:19fbea500ac2cb32175131e3009b4aebfc3c373e48b07ea51783b1049df34761
+size 4630720272
model-00003-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a8d305edb19cd8bae64920bc893e636612e4722e986558338b097246b84efd6
-size 4630720272
+oid sha256:c66ec2e00cd832d67a5f24ace0e4330e13627b82444784986062df6a2a973ac4
+size 4630720296
model-00004-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e09b72e931382e8a0d7566ce791059f0d1e307358ff445aeab940083ae5b8ba2
+oid sha256:20b4298b55a145d9ccdef463ffe3c344cbfd90e7fa67d0778d50f66688dacd27
 size 4630720320
model-00005-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9d583e6ccfd62c1a710bd6405ce81ae965612fe814019004fd9357a4f2810f91
+oid sha256:eeac368f9670bad559e45aceea971ff78db86fafee604f17e28c1e58932a27de
 size 4630720320
model-00006-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:957fd289c1ce380f606ac7c66fcf88fd071e59359e1824b34c56e040f118a302
+oid sha256:3738d142536fdc8e9217e7a853240006aff3fe5c157d982cff3e52addb05f011
 size 4630720320
model-00007-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f14ba921030350d6277d794066529298e9db125b763f109c17037611c5f379cb
-size 4997804128
+oid sha256:30485f5c86cefbfccb5eef21be09adf1149e4ec6f9cc75072c162900e6972226
+size 4091924852
model-00008-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3727a21ae90c0f0fc3e59f9a42354db0edafa6bc8577baed9b7c362446b15dd4
-size 1409480580
+oid sha256:b87b59f8beb9a114e4867e522e3ec5116d05442f199901b48b1beb4fda6962d1
+size 2491416816
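
The *.safetensors entries above are Git LFS pointer files: each records only the SHA-256 (oid) and byte size of the shard it stands in for, so these diffs show the re-sharded weights without the binary payloads. A minimal sketch of checking a locally downloaded shard against its pointer, using the new values for the first shard (the local path is a placeholder):

import hashlib
import os

shard_path = "model-00001-of-00008.safetensors"  # placeholder local path to the downloaded shard
expected_oid = "19ab63a529d72b101000396a03df34f2dc32e8d744b0c5cec50f0542eea6b8db"
expected_size = 4974567112

# Stream the file through SHA-256 in 1 MiB chunks to avoid loading ~5 GB into memory.
sha = hashlib.sha256()
with open(shard_path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(shard_path) == expected_size, "size mismatch with LFS pointer"
assert sha.hexdigest() == expected_oid, "sha256 mismatch with LFS pointer"
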
model.safetensors.index.json CHANGED
@@ -4,44 +4,46 @@
4
  "total_size": 34711420260
5
  },
6
  "weight_map": {
7
- "model.add_no_point_class_embed.vector": "model-00008-of-00008.safetensors",
8
- "model.build_vit_embedding.bias": "model-00008-of-00008.safetensors",
9
- "model.build_vit_embedding.weight": "model-00008-of-00008.safetensors",
10
- "model.connector.image_pooling_2d.wk.bias": "model-00008-of-00008.safetensors",
11
- "model.connector.image_pooling_2d.wk.weight": "model-00008-of-00008.safetensors",
12
- "model.connector.image_pooling_2d.wq.bias": "model-00008-of-00008.safetensors",
13
- "model.connector.image_pooling_2d.wq.weight": "model-00008-of-00008.safetensors",
14
- "model.connector.image_pooling_2d.wv.bias": "model-00008-of-00008.safetensors",
15
- "model.connector.image_pooling_2d.wv.weight": "model-00008-of-00008.safetensors",
16
- "model.connector.image_projector.w1.weight": "model-00008-of-00008.safetensors",
17
- "model.connector.image_projector.w2.weight": "model-00008-of-00008.safetensors",
18
- "model.connector.image_projector.w3.weight": "model-00008-of-00008.safetensors",
19
- "model.patch_k.bias": "model-00008-of-00008.safetensors",
20
- "model.patch_k.weight": "model-00008-of-00008.safetensors",
21
- "model.patch_q.bias": "model-00008-of-00008.safetensors",
22
- "model.patch_q.weight": "model-00008-of-00008.safetensors",
23
- "model.subpatch_k.bias": "model-00008-of-00008.safetensors",
24
- "model.subpatch_k.weight": "model-00008-of-00008.safetensors",
25
- "model.subpatch_loc_k.bias": "model-00008-of-00008.safetensors",
26
- "model.subpatch_loc_k.weight": "model-00008-of-00008.safetensors",
27
- "model.subpatch_q.bias": "model-00008-of-00008.safetensors",
28
- "model.subpatch_q.weight": "model-00008-of-00008.safetensors",
29
- "model.transformer.blocks.0.attn_norm.weight": "model-00002-of-00008.safetensors",
30
- "model.transformer.blocks.0.ff_norm.weight": "model-00002-of-00008.safetensors",
31
- "model.transformer.blocks.0.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
32
- "model.transformer.blocks.0.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
33
- "model.transformer.blocks.0.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
34
- "model.transformer.blocks.0.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
35
- "model.transformer.blocks.0.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
36
- "model.transformer.blocks.0.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
37
- "model.transformer.blocks.1.attn_norm.weight": "model-00002-of-00008.safetensors",
38
- "model.transformer.blocks.1.ff_norm.weight": "model-00002-of-00008.safetensors",
39
- "model.transformer.blocks.1.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
40
- "model.transformer.blocks.1.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
41
- "model.transformer.blocks.1.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
42
- "model.transformer.blocks.1.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
43
- "model.transformer.blocks.1.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
44
- "model.transformer.blocks.1.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
 
 
45
  "model.transformer.blocks.10.attn_norm.weight": "model-00003-of-00008.safetensors",
46
  "model.transformer.blocks.10.ff_norm.weight": "model-00003-of-00008.safetensors",
47
  "model.transformer.blocks.10.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
@@ -59,37 +61,37 @@
59
  "model.transformer.blocks.11.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
60
  "model.transformer.blocks.11.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
61
  "model.transformer.blocks.12.attn_norm.weight": "model-00003-of-00008.safetensors",
62
- "model.transformer.blocks.12.ff_norm.weight": "model-00004-of-00008.safetensors",
63
- "model.transformer.blocks.12.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
64
- "model.transformer.blocks.12.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
65
  "model.transformer.blocks.12.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
66
  "model.transformer.blocks.12.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
67
  "model.transformer.blocks.12.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
68
  "model.transformer.blocks.12.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
69
- "model.transformer.blocks.13.attn_norm.weight": "model-00004-of-00008.safetensors",
70
- "model.transformer.blocks.13.ff_norm.weight": "model-00004-of-00008.safetensors",
71
- "model.transformer.blocks.13.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
72
- "model.transformer.blocks.13.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
73
- "model.transformer.blocks.13.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
74
- "model.transformer.blocks.13.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
75
- "model.transformer.blocks.13.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
76
- "model.transformer.blocks.13.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
77
- "model.transformer.blocks.14.attn_norm.weight": "model-00004-of-00008.safetensors",
78
- "model.transformer.blocks.14.ff_norm.weight": "model-00004-of-00008.safetensors",
79
- "model.transformer.blocks.14.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
80
- "model.transformer.blocks.14.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
81
- "model.transformer.blocks.14.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
82
- "model.transformer.blocks.14.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
83
- "model.transformer.blocks.14.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
84
- "model.transformer.blocks.14.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
85
- "model.transformer.blocks.15.attn_norm.weight": "model-00004-of-00008.safetensors",
86
  "model.transformer.blocks.15.ff_norm.weight": "model-00004-of-00008.safetensors",
87
  "model.transformer.blocks.15.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
88
  "model.transformer.blocks.15.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
89
- "model.transformer.blocks.15.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
90
- "model.transformer.blocks.15.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
91
- "model.transformer.blocks.15.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
92
- "model.transformer.blocks.15.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
93
  "model.transformer.blocks.16.attn_norm.weight": "model-00004-of-00008.safetensors",
94
  "model.transformer.blocks.16.ff_norm.weight": "model-00004-of-00008.safetensors",
95
  "model.transformer.blocks.16.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
@@ -107,45 +109,45 @@
107
  "model.transformer.blocks.17.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
108
  "model.transformer.blocks.17.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
109
  "model.transformer.blocks.18.attn_norm.weight": "model-00004-of-00008.safetensors",
110
- "model.transformer.blocks.18.ff_norm.weight": "model-00005-of-00008.safetensors",
111
- "model.transformer.blocks.18.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
112
- "model.transformer.blocks.18.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
113
  "model.transformer.blocks.18.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
114
  "model.transformer.blocks.18.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
115
  "model.transformer.blocks.18.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
116
  "model.transformer.blocks.18.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
117
- "model.transformer.blocks.19.attn_norm.weight": "model-00005-of-00008.safetensors",
118
- "model.transformer.blocks.19.ff_norm.weight": "model-00005-of-00008.safetensors",
119
- "model.transformer.blocks.19.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
120
- "model.transformer.blocks.19.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
121
- "model.transformer.blocks.19.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
122
- "model.transformer.blocks.19.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
123
- "model.transformer.blocks.19.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
124
- "model.transformer.blocks.19.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
125
- "model.transformer.blocks.2.attn_norm.weight": "model-00002-of-00008.safetensors",
126
- "model.transformer.blocks.2.ff_norm.weight": "model-00002-of-00008.safetensors",
127
- "model.transformer.blocks.2.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
128
- "model.transformer.blocks.2.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
129
- "model.transformer.blocks.2.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
130
- "model.transformer.blocks.2.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
131
- "model.transformer.blocks.2.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
132
- "model.transformer.blocks.2.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
133
- "model.transformer.blocks.20.attn_norm.weight": "model-00005-of-00008.safetensors",
134
- "model.transformer.blocks.20.ff_norm.weight": "model-00005-of-00008.safetensors",
135
- "model.transformer.blocks.20.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
136
- "model.transformer.blocks.20.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
137
- "model.transformer.blocks.20.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
138
- "model.transformer.blocks.20.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
139
- "model.transformer.blocks.20.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
140
- "model.transformer.blocks.20.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
141
- "model.transformer.blocks.21.attn_norm.weight": "model-00005-of-00008.safetensors",
142
  "model.transformer.blocks.21.ff_norm.weight": "model-00005-of-00008.safetensors",
143
  "model.transformer.blocks.21.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
144
  "model.transformer.blocks.21.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
145
- "model.transformer.blocks.21.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
146
- "model.transformer.blocks.21.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
147
- "model.transformer.blocks.21.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
148
- "model.transformer.blocks.21.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
149
  "model.transformer.blocks.22.attn_norm.weight": "model-00005-of-00008.safetensors",
150
  "model.transformer.blocks.22.ff_norm.weight": "model-00005-of-00008.safetensors",
151
  "model.transformer.blocks.22.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
@@ -163,37 +165,37 @@
163
  "model.transformer.blocks.23.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
164
  "model.transformer.blocks.23.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
165
  "model.transformer.blocks.24.attn_norm.weight": "model-00005-of-00008.safetensors",
166
- "model.transformer.blocks.24.ff_norm.weight": "model-00006-of-00008.safetensors",
167
- "model.transformer.blocks.24.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
168
- "model.transformer.blocks.24.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
169
  "model.transformer.blocks.24.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
170
  "model.transformer.blocks.24.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
171
  "model.transformer.blocks.24.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
172
  "model.transformer.blocks.24.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
173
- "model.transformer.blocks.25.attn_norm.weight": "model-00006-of-00008.safetensors",
174
- "model.transformer.blocks.25.ff_norm.weight": "model-00006-of-00008.safetensors",
175
- "model.transformer.blocks.25.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
176
- "model.transformer.blocks.25.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
177
- "model.transformer.blocks.25.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
178
- "model.transformer.blocks.25.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
179
- "model.transformer.blocks.25.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
180
- "model.transformer.blocks.25.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
181
- "model.transformer.blocks.26.attn_norm.weight": "model-00006-of-00008.safetensors",
182
- "model.transformer.blocks.26.ff_norm.weight": "model-00006-of-00008.safetensors",
183
- "model.transformer.blocks.26.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
184
- "model.transformer.blocks.26.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
185
- "model.transformer.blocks.26.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
186
- "model.transformer.blocks.26.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
187
- "model.transformer.blocks.26.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
188
- "model.transformer.blocks.26.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
189
- "model.transformer.blocks.27.attn_norm.weight": "model-00006-of-00008.safetensors",
190
  "model.transformer.blocks.27.ff_norm.weight": "model-00006-of-00008.safetensors",
191
  "model.transformer.blocks.27.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
192
  "model.transformer.blocks.27.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
193
- "model.transformer.blocks.27.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
194
- "model.transformer.blocks.27.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
195
- "model.transformer.blocks.27.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
196
- "model.transformer.blocks.27.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
197
  "model.transformer.blocks.28.attn_norm.weight": "model-00006-of-00008.safetensors",
198
  "model.transformer.blocks.28.ff_norm.weight": "model-00006-of-00008.safetensors",
199
  "model.transformer.blocks.28.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
@@ -210,46 +212,46 @@
210
  "model.transformer.blocks.29.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
211
  "model.transformer.blocks.29.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
212
  "model.transformer.blocks.29.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
213
- "model.transformer.blocks.3.attn_norm.weight": "model-00002-of-00008.safetensors",
214
  "model.transformer.blocks.3.ff_norm.weight": "model-00002-of-00008.safetensors",
215
  "model.transformer.blocks.3.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
216
  "model.transformer.blocks.3.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
217
- "model.transformer.blocks.3.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
218
- "model.transformer.blocks.3.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
219
- "model.transformer.blocks.3.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
220
- "model.transformer.blocks.3.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
221
  "model.transformer.blocks.30.attn_norm.weight": "model-00006-of-00008.safetensors",
222
- "model.transformer.blocks.30.ff_norm.weight": "model-00007-of-00008.safetensors",
223
- "model.transformer.blocks.30.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
224
- "model.transformer.blocks.30.mlp.ff_proj.weight": "model-00007-of-00008.safetensors",
225
  "model.transformer.blocks.30.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
226
  "model.transformer.blocks.30.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
227
  "model.transformer.blocks.30.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
228
  "model.transformer.blocks.30.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
229
- "model.transformer.blocks.31.attn_norm.weight": "model-00007-of-00008.safetensors",
230
- "model.transformer.blocks.31.ff_norm.weight": "model-00007-of-00008.safetensors",
231
- "model.transformer.blocks.31.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
232
- "model.transformer.blocks.31.mlp.ff_proj.weight": "model-00007-of-00008.safetensors",
233
- "model.transformer.blocks.31.self_attn.att_proj.weight": "model-00007-of-00008.safetensors",
234
- "model.transformer.blocks.31.self_attn.attn_out.weight": "model-00007-of-00008.safetensors",
235
- "model.transformer.blocks.31.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
236
- "model.transformer.blocks.31.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
237
- "model.transformer.blocks.32.attn_norm.weight": "model-00007-of-00008.safetensors",
238
- "model.transformer.blocks.32.ff_norm.weight": "model-00007-of-00008.safetensors",
239
- "model.transformer.blocks.32.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
240
- "model.transformer.blocks.32.mlp.ff_proj.weight": "model-00007-of-00008.safetensors",
241
- "model.transformer.blocks.32.self_attn.att_proj.weight": "model-00007-of-00008.safetensors",
242
- "model.transformer.blocks.32.self_attn.attn_out.weight": "model-00007-of-00008.safetensors",
243
- "model.transformer.blocks.32.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
244
- "model.transformer.blocks.32.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
245
- "model.transformer.blocks.33.attn_norm.weight": "model-00007-of-00008.safetensors",
246
  "model.transformer.blocks.33.ff_norm.weight": "model-00007-of-00008.safetensors",
247
  "model.transformer.blocks.33.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
248
  "model.transformer.blocks.33.mlp.ff_proj.weight": "model-00007-of-00008.safetensors",
249
- "model.transformer.blocks.33.self_attn.att_proj.weight": "model-00007-of-00008.safetensors",
250
- "model.transformer.blocks.33.self_attn.attn_out.weight": "model-00007-of-00008.safetensors",
251
- "model.transformer.blocks.33.self_attn.k_norm.weight": "model-00007-of-00008.safetensors",
252
- "model.transformer.blocks.33.self_attn.q_norm.weight": "model-00007-of-00008.safetensors",
253
  "model.transformer.blocks.34.attn_norm.weight": "model-00007-of-00008.safetensors",
254
  "model.transformer.blocks.34.ff_norm.weight": "model-00007-of-00008.safetensors",
255
  "model.transformer.blocks.34.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
@@ -283,37 +285,37 @@
283
  "model.transformer.blocks.5.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
284
  "model.transformer.blocks.5.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
285
  "model.transformer.blocks.6.attn_norm.weight": "model-00002-of-00008.safetensors",
286
- "model.transformer.blocks.6.ff_norm.weight": "model-00003-of-00008.safetensors",
287
- "model.transformer.blocks.6.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
288
- "model.transformer.blocks.6.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
289
  "model.transformer.blocks.6.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
290
  "model.transformer.blocks.6.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
291
  "model.transformer.blocks.6.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
292
  "model.transformer.blocks.6.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
293
- "model.transformer.blocks.7.attn_norm.weight": "model-00003-of-00008.safetensors",
294
- "model.transformer.blocks.7.ff_norm.weight": "model-00003-of-00008.safetensors",
295
- "model.transformer.blocks.7.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
296
- "model.transformer.blocks.7.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
297
- "model.transformer.blocks.7.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
298
- "model.transformer.blocks.7.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
299
- "model.transformer.blocks.7.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
300
- "model.transformer.blocks.7.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
301
- "model.transformer.blocks.8.attn_norm.weight": "model-00003-of-00008.safetensors",
302
- "model.transformer.blocks.8.ff_norm.weight": "model-00003-of-00008.safetensors",
303
- "model.transformer.blocks.8.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
304
- "model.transformer.blocks.8.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
305
- "model.transformer.blocks.8.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
306
- "model.transformer.blocks.8.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
307
- "model.transformer.blocks.8.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
308
- "model.transformer.blocks.8.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
309
- "model.transformer.blocks.9.attn_norm.weight": "model-00003-of-00008.safetensors",
310
  "model.transformer.blocks.9.ff_norm.weight": "model-00003-of-00008.safetensors",
311
  "model.transformer.blocks.9.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
312
  "model.transformer.blocks.9.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
313
- "model.transformer.blocks.9.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
314
- "model.transformer.blocks.9.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
315
- "model.transformer.blocks.9.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
316
- "model.transformer.blocks.9.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
317
  "model.transformer.ln_f.weight": "model-00007-of-00008.safetensors",
318
  "model.transformer.wte.embedding": "model-00001-of-00008.safetensors",
319
  "model.transformer.wte.new_embedding": "model-00001-of-00008.safetensors",
@@ -352,166 +354,166 @@
352
  "model.vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
353
  "model.vit.transformer.resblocks.1.ffn_norm.bias": "model-00007-of-00008.safetensors",
354
  "model.vit.transformer.resblocks.1.ffn_norm.weight": "model-00007-of-00008.safetensors",
355
- "model.vit.transformer.resblocks.10.attention.wk.bias": "model-00008-of-00008.safetensors",
356
- "model.vit.transformer.resblocks.10.attention.wk.weight": "model-00008-of-00008.safetensors",
357
- "model.vit.transformer.resblocks.10.attention.wo.bias": "model-00008-of-00008.safetensors",
358
- "model.vit.transformer.resblocks.10.attention.wo.weight": "model-00008-of-00008.safetensors",
359
- "model.vit.transformer.resblocks.10.attention.wq.bias": "model-00008-of-00008.safetensors",
360
- "model.vit.transformer.resblocks.10.attention.wq.weight": "model-00008-of-00008.safetensors",
361
- "model.vit.transformer.resblocks.10.attention.wv.bias": "model-00008-of-00008.safetensors",
362
- "model.vit.transformer.resblocks.10.attention.wv.weight": "model-00008-of-00008.safetensors",
363
- "model.vit.transformer.resblocks.10.attention_norm.bias": "model-00008-of-00008.safetensors",
364
- "model.vit.transformer.resblocks.10.attention_norm.weight": "model-00008-of-00008.safetensors",
365
- "model.vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
366
- "model.vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
367
- "model.vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
368
- "model.vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
369
- "model.vit.transformer.resblocks.10.ffn_norm.bias": "model-00008-of-00008.safetensors",
370
- "model.vit.transformer.resblocks.10.ffn_norm.weight": "model-00008-of-00008.safetensors",
371
- "model.vit.transformer.resblocks.11.attention.wk.bias": "model-00008-of-00008.safetensors",
372
- "model.vit.transformer.resblocks.11.attention.wk.weight": "model-00008-of-00008.safetensors",
373
- "model.vit.transformer.resblocks.11.attention.wo.bias": "model-00008-of-00008.safetensors",
374
- "model.vit.transformer.resblocks.11.attention.wo.weight": "model-00008-of-00008.safetensors",
375
- "model.vit.transformer.resblocks.11.attention.wq.bias": "model-00008-of-00008.safetensors",
376
- "model.vit.transformer.resblocks.11.attention.wq.weight": "model-00008-of-00008.safetensors",
377
- "model.vit.transformer.resblocks.11.attention.wv.bias": "model-00008-of-00008.safetensors",
378
- "model.vit.transformer.resblocks.11.attention.wv.weight": "model-00008-of-00008.safetensors",
379
- "model.vit.transformer.resblocks.11.attention_norm.bias": "model-00008-of-00008.safetensors",
380
- "model.vit.transformer.resblocks.11.attention_norm.weight": "model-00008-of-00008.safetensors",
381
- "model.vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
382
- "model.vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
383
- "model.vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
384
- "model.vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
385
- "model.vit.transformer.resblocks.11.ffn_norm.bias": "model-00008-of-00008.safetensors",
386
- "model.vit.transformer.resblocks.11.ffn_norm.weight": "model-00008-of-00008.safetensors",
387
- "model.vit.transformer.resblocks.12.attention.wk.bias": "model-00008-of-00008.safetensors",
388
- "model.vit.transformer.resblocks.12.attention.wk.weight": "model-00008-of-00008.safetensors",
389
- "model.vit.transformer.resblocks.12.attention.wo.bias": "model-00008-of-00008.safetensors",
390
- "model.vit.transformer.resblocks.12.attention.wo.weight": "model-00008-of-00008.safetensors",
391
- "model.vit.transformer.resblocks.12.attention.wq.bias": "model-00008-of-00008.safetensors",
392
- "model.vit.transformer.resblocks.12.attention.wq.weight": "model-00008-of-00008.safetensors",
393
- "model.vit.transformer.resblocks.12.attention.wv.bias": "model-00008-of-00008.safetensors",
394
- "model.vit.transformer.resblocks.12.attention.wv.weight": "model-00008-of-00008.safetensors",
395
- "model.vit.transformer.resblocks.12.attention_norm.bias": "model-00008-of-00008.safetensors",
396
- "model.vit.transformer.resblocks.12.attention_norm.weight": "model-00008-of-00008.safetensors",
397
- "model.vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
398
- "model.vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
399
- "model.vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
400
- "model.vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
401
- "model.vit.transformer.resblocks.12.ffn_norm.bias": "model-00008-of-00008.safetensors",
402
- "model.vit.transformer.resblocks.12.ffn_norm.weight": "model-00008-of-00008.safetensors",
403
- "model.vit.transformer.resblocks.13.attention.wk.bias": "model-00008-of-00008.safetensors",
404
- "model.vit.transformer.resblocks.13.attention.wk.weight": "model-00008-of-00008.safetensors",
405
- "model.vit.transformer.resblocks.13.attention.wo.bias": "model-00008-of-00008.safetensors",
406
- "model.vit.transformer.resblocks.13.attention.wo.weight": "model-00008-of-00008.safetensors",
407
- "model.vit.transformer.resblocks.13.attention.wq.bias": "model-00008-of-00008.safetensors",
408
- "model.vit.transformer.resblocks.13.attention.wq.weight": "model-00008-of-00008.safetensors",
409
- "model.vit.transformer.resblocks.13.attention.wv.bias": "model-00008-of-00008.safetensors",
410
- "model.vit.transformer.resblocks.13.attention.wv.weight": "model-00008-of-00008.safetensors",
411
- "model.vit.transformer.resblocks.13.attention_norm.bias": "model-00008-of-00008.safetensors",
412
- "model.vit.transformer.resblocks.13.attention_norm.weight": "model-00008-of-00008.safetensors",
413
- "model.vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
414
- "model.vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
415
- "model.vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
416
- "model.vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
417
- "model.vit.transformer.resblocks.13.ffn_norm.bias": "model-00008-of-00008.safetensors",
418
- "model.vit.transformer.resblocks.13.ffn_norm.weight": "model-00008-of-00008.safetensors",
419
- "model.vit.transformer.resblocks.14.attention.wk.bias": "model-00008-of-00008.safetensors",
420
- "model.vit.transformer.resblocks.14.attention.wk.weight": "model-00008-of-00008.safetensors",
421
- "model.vit.transformer.resblocks.14.attention.wo.bias": "model-00008-of-00008.safetensors",
422
- "model.vit.transformer.resblocks.14.attention.wo.weight": "model-00008-of-00008.safetensors",
423
- "model.vit.transformer.resblocks.14.attention.wq.bias": "model-00008-of-00008.safetensors",
424
- "model.vit.transformer.resblocks.14.attention.wq.weight": "model-00008-of-00008.safetensors",
425
- "model.vit.transformer.resblocks.14.attention.wv.bias": "model-00008-of-00008.safetensors",
426
- "model.vit.transformer.resblocks.14.attention.wv.weight": "model-00008-of-00008.safetensors",
427
- "model.vit.transformer.resblocks.14.attention_norm.bias": "model-00008-of-00008.safetensors",
428
- "model.vit.transformer.resblocks.14.attention_norm.weight": "model-00008-of-00008.safetensors",
429
- "model.vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
430
- "model.vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
431
- "model.vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
432
- "model.vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
433
- "model.vit.transformer.resblocks.14.ffn_norm.bias": "model-00008-of-00008.safetensors",
434
- "model.vit.transformer.resblocks.14.ffn_norm.weight": "model-00008-of-00008.safetensors",
435
- "model.vit.transformer.resblocks.15.attention.wk.bias": "model-00008-of-00008.safetensors",
436
- "model.vit.transformer.resblocks.15.attention.wk.weight": "model-00008-of-00008.safetensors",
437
- "model.vit.transformer.resblocks.15.attention.wo.bias": "model-00008-of-00008.safetensors",
438
- "model.vit.transformer.resblocks.15.attention.wo.weight": "model-00008-of-00008.safetensors",
439
- "model.vit.transformer.resblocks.15.attention.wq.bias": "model-00008-of-00008.safetensors",
440
- "model.vit.transformer.resblocks.15.attention.wq.weight": "model-00008-of-00008.safetensors",
441
- "model.vit.transformer.resblocks.15.attention.wv.bias": "model-00008-of-00008.safetensors",
442
- "model.vit.transformer.resblocks.15.attention.wv.weight": "model-00008-of-00008.safetensors",
443
- "model.vit.transformer.resblocks.15.attention_norm.bias": "model-00008-of-00008.safetensors",
444
- "model.vit.transformer.resblocks.15.attention_norm.weight": "model-00008-of-00008.safetensors",
445
- "model.vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
446
- "model.vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
447
- "model.vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
448
- "model.vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
449
- "model.vit.transformer.resblocks.15.ffn_norm.bias": "model-00008-of-00008.safetensors",
450
- "model.vit.transformer.resblocks.15.ffn_norm.weight": "model-00008-of-00008.safetensors",
451
- "model.vit.transformer.resblocks.16.attention.wk.bias": "model-00008-of-00008.safetensors",
452
- "model.vit.transformer.resblocks.16.attention.wk.weight": "model-00008-of-00008.safetensors",
453
- "model.vit.transformer.resblocks.16.attention.wo.bias": "model-00008-of-00008.safetensors",
454
- "model.vit.transformer.resblocks.16.attention.wo.weight": "model-00008-of-00008.safetensors",
455
- "model.vit.transformer.resblocks.16.attention.wq.bias": "model-00008-of-00008.safetensors",
456
- "model.vit.transformer.resblocks.16.attention.wq.weight": "model-00008-of-00008.safetensors",
457
- "model.vit.transformer.resblocks.16.attention.wv.bias": "model-00008-of-00008.safetensors",
458
- "model.vit.transformer.resblocks.16.attention.wv.weight": "model-00008-of-00008.safetensors",
459
- "model.vit.transformer.resblocks.16.attention_norm.bias": "model-00008-of-00008.safetensors",
460
- "model.vit.transformer.resblocks.16.attention_norm.weight": "model-00008-of-00008.safetensors",
461
- "model.vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
462
- "model.vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
463
- "model.vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
464
- "model.vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
465
- "model.vit.transformer.resblocks.16.ffn_norm.bias": "model-00008-of-00008.safetensors",
466
- "model.vit.transformer.resblocks.16.ffn_norm.weight": "model-00008-of-00008.safetensors",
467
- "model.vit.transformer.resblocks.17.attention.wk.bias": "model-00008-of-00008.safetensors",
468
- "model.vit.transformer.resblocks.17.attention.wk.weight": "model-00008-of-00008.safetensors",
469
- "model.vit.transformer.resblocks.17.attention.wo.bias": "model-00008-of-00008.safetensors",
470
- "model.vit.transformer.resblocks.17.attention.wo.weight": "model-00008-of-00008.safetensors",
471
- "model.vit.transformer.resblocks.17.attention.wq.bias": "model-00008-of-00008.safetensors",
472
- "model.vit.transformer.resblocks.17.attention.wq.weight": "model-00008-of-00008.safetensors",
473
- "model.vit.transformer.resblocks.17.attention.wv.bias": "model-00008-of-00008.safetensors",
474
- "model.vit.transformer.resblocks.17.attention.wv.weight": "model-00008-of-00008.safetensors",
475
- "model.vit.transformer.resblocks.17.attention_norm.bias": "model-00008-of-00008.safetensors",
476
- "model.vit.transformer.resblocks.17.attention_norm.weight": "model-00008-of-00008.safetensors",
477
- "model.vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
478
- "model.vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
479
- "model.vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
480
- "model.vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
481
- "model.vit.transformer.resblocks.17.ffn_norm.bias": "model-00008-of-00008.safetensors",
482
- "model.vit.transformer.resblocks.17.ffn_norm.weight": "model-00008-of-00008.safetensors",
483
- "model.vit.transformer.resblocks.18.attention.wk.bias": "model-00008-of-00008.safetensors",
484
- "model.vit.transformer.resblocks.18.attention.wk.weight": "model-00008-of-00008.safetensors",
485
- "model.vit.transformer.resblocks.18.attention.wo.bias": "model-00008-of-00008.safetensors",
486
- "model.vit.transformer.resblocks.18.attention.wo.weight": "model-00008-of-00008.safetensors",
487
- "model.vit.transformer.resblocks.18.attention.wq.bias": "model-00008-of-00008.safetensors",
488
- "model.vit.transformer.resblocks.18.attention.wq.weight": "model-00008-of-00008.safetensors",
489
- "model.vit.transformer.resblocks.18.attention.wv.bias": "model-00008-of-00008.safetensors",
490
- "model.vit.transformer.resblocks.18.attention.wv.weight": "model-00008-of-00008.safetensors",
491
- "model.vit.transformer.resblocks.18.attention_norm.bias": "model-00008-of-00008.safetensors",
492
- "model.vit.transformer.resblocks.18.attention_norm.weight": "model-00008-of-00008.safetensors",
493
- "model.vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
494
- "model.vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
495
- "model.vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
496
- "model.vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
497
- "model.vit.transformer.resblocks.18.ffn_norm.bias": "model-00008-of-00008.safetensors",
498
- "model.vit.transformer.resblocks.18.ffn_norm.weight": "model-00008-of-00008.safetensors",
499
- "model.vit.transformer.resblocks.19.attention.wk.bias": "model-00008-of-00008.safetensors",
500
- "model.vit.transformer.resblocks.19.attention.wk.weight": "model-00008-of-00008.safetensors",
501
- "model.vit.transformer.resblocks.19.attention.wo.bias": "model-00008-of-00008.safetensors",
502
- "model.vit.transformer.resblocks.19.attention.wo.weight": "model-00008-of-00008.safetensors",
503
- "model.vit.transformer.resblocks.19.attention.wq.bias": "model-00008-of-00008.safetensors",
504
- "model.vit.transformer.resblocks.19.attention.wq.weight": "model-00008-of-00008.safetensors",
505
- "model.vit.transformer.resblocks.19.attention.wv.bias": "model-00008-of-00008.safetensors",
506
- "model.vit.transformer.resblocks.19.attention.wv.weight": "model-00008-of-00008.safetensors",
507
- "model.vit.transformer.resblocks.19.attention_norm.bias": "model-00008-of-00008.safetensors",
508
- "model.vit.transformer.resblocks.19.attention_norm.weight": "model-00008-of-00008.safetensors",
509
- "model.vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
510
- "model.vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
511
- "model.vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
512
- "model.vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
513
- "model.vit.transformer.resblocks.19.ffn_norm.bias": "model-00008-of-00008.safetensors",
514
- "model.vit.transformer.resblocks.19.ffn_norm.weight": "model-00008-of-00008.safetensors",
515
  "model.vit.transformer.resblocks.2.attention.wk.bias": "model-00007-of-00008.safetensors",
516
  "model.vit.transformer.resblocks.2.attention.wk.weight": "model-00007-of-00008.safetensors",
517
  "model.vit.transformer.resblocks.2.attention.wo.bias": "model-00007-of-00008.safetensors",
@@ -528,86 +530,86 @@
528
  "model.vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
529
  "model.vit.transformer.resblocks.2.ffn_norm.bias": "model-00007-of-00008.safetensors",
530
  "model.vit.transformer.resblocks.2.ffn_norm.weight": "model-00007-of-00008.safetensors",
531
- "model.vit.transformer.resblocks.20.attention.wk.bias": "model-00008-of-00008.safetensors",
532
- "model.vit.transformer.resblocks.20.attention.wk.weight": "model-00008-of-00008.safetensors",
533
- "model.vit.transformer.resblocks.20.attention.wo.bias": "model-00008-of-00008.safetensors",
534
- "model.vit.transformer.resblocks.20.attention.wo.weight": "model-00008-of-00008.safetensors",
535
- "model.vit.transformer.resblocks.20.attention.wq.bias": "model-00008-of-00008.safetensors",
536
- "model.vit.transformer.resblocks.20.attention.wq.weight": "model-00008-of-00008.safetensors",
537
- "model.vit.transformer.resblocks.20.attention.wv.bias": "model-00008-of-00008.safetensors",
538
- "model.vit.transformer.resblocks.20.attention.wv.weight": "model-00008-of-00008.safetensors",
539
- "model.vit.transformer.resblocks.20.attention_norm.bias": "model-00008-of-00008.safetensors",
540
- "model.vit.transformer.resblocks.20.attention_norm.weight": "model-00008-of-00008.safetensors",
541
- "model.vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
542
- "model.vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
543
- "model.vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
544
- "model.vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
545
- "model.vit.transformer.resblocks.20.ffn_norm.bias": "model-00008-of-00008.safetensors",
546
- "model.vit.transformer.resblocks.20.ffn_norm.weight": "model-00008-of-00008.safetensors",
547
- "model.vit.transformer.resblocks.21.attention.wk.bias": "model-00008-of-00008.safetensors",
548
- "model.vit.transformer.resblocks.21.attention.wk.weight": "model-00008-of-00008.safetensors",
549
- "model.vit.transformer.resblocks.21.attention.wo.bias": "model-00008-of-00008.safetensors",
550
- "model.vit.transformer.resblocks.21.attention.wo.weight": "model-00008-of-00008.safetensors",
551
- "model.vit.transformer.resblocks.21.attention.wq.bias": "model-00008-of-00008.safetensors",
552
- "model.vit.transformer.resblocks.21.attention.wq.weight": "model-00008-of-00008.safetensors",
553
- "model.vit.transformer.resblocks.21.attention.wv.bias": "model-00008-of-00008.safetensors",
554
- "model.vit.transformer.resblocks.21.attention.wv.weight": "model-00008-of-00008.safetensors",
555
- "model.vit.transformer.resblocks.21.attention_norm.bias": "model-00008-of-00008.safetensors",
556
- "model.vit.transformer.resblocks.21.attention_norm.weight": "model-00008-of-00008.safetensors",
557
- "model.vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
558
- "model.vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
559
- "model.vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
560
- "model.vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
561
- "model.vit.transformer.resblocks.21.ffn_norm.bias": "model-00008-of-00008.safetensors",
562
- "model.vit.transformer.resblocks.21.ffn_norm.weight": "model-00008-of-00008.safetensors",
563
- "model.vit.transformer.resblocks.22.attention.wk.bias": "model-00008-of-00008.safetensors",
564
- "model.vit.transformer.resblocks.22.attention.wk.weight": "model-00008-of-00008.safetensors",
565
- "model.vit.transformer.resblocks.22.attention.wo.bias": "model-00008-of-00008.safetensors",
566
- "model.vit.transformer.resblocks.22.attention.wo.weight": "model-00008-of-00008.safetensors",
567
- "model.vit.transformer.resblocks.22.attention.wq.bias": "model-00008-of-00008.safetensors",
568
- "model.vit.transformer.resblocks.22.attention.wq.weight": "model-00008-of-00008.safetensors",
569
- "model.vit.transformer.resblocks.22.attention.wv.bias": "model-00008-of-00008.safetensors",
570
- "model.vit.transformer.resblocks.22.attention.wv.weight": "model-00008-of-00008.safetensors",
571
- "model.vit.transformer.resblocks.22.attention_norm.bias": "model-00008-of-00008.safetensors",
572
- "model.vit.transformer.resblocks.22.attention_norm.weight": "model-00008-of-00008.safetensors",
573
- "model.vit.transformer.resblocks.22.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
574
- "model.vit.transformer.resblocks.22.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
575
- "model.vit.transformer.resblocks.22.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
576
- "model.vit.transformer.resblocks.22.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
577
- "model.vit.transformer.resblocks.22.ffn_norm.bias": "model-00008-of-00008.safetensors",
578
- "model.vit.transformer.resblocks.22.ffn_norm.weight": "model-00008-of-00008.safetensors",
579
- "model.vit.transformer.resblocks.23.attention.wk.bias": "model-00008-of-00008.safetensors",
580
- "model.vit.transformer.resblocks.23.attention.wk.weight": "model-00008-of-00008.safetensors",
581
- "model.vit.transformer.resblocks.23.attention.wo.bias": "model-00008-of-00008.safetensors",
582
- "model.vit.transformer.resblocks.23.attention.wo.weight": "model-00008-of-00008.safetensors",
583
- "model.vit.transformer.resblocks.23.attention.wq.bias": "model-00008-of-00008.safetensors",
584
- "model.vit.transformer.resblocks.23.attention.wq.weight": "model-00008-of-00008.safetensors",
585
- "model.vit.transformer.resblocks.23.attention.wv.bias": "model-00008-of-00008.safetensors",
586
- "model.vit.transformer.resblocks.23.attention.wv.weight": "model-00008-of-00008.safetensors",
587
- "model.vit.transformer.resblocks.23.attention_norm.bias": "model-00008-of-00008.safetensors",
588
- "model.vit.transformer.resblocks.23.attention_norm.weight": "model-00008-of-00008.safetensors",
589
- "model.vit.transformer.resblocks.23.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
590
- "model.vit.transformer.resblocks.23.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
591
- "model.vit.transformer.resblocks.23.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
592
- "model.vit.transformer.resblocks.23.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
593
- "model.vit.transformer.resblocks.23.ffn_norm.bias": "model-00008-of-00008.safetensors",
594
- "model.vit.transformer.resblocks.23.ffn_norm.weight": "model-00008-of-00008.safetensors",
595
- "model.vit.transformer.resblocks.24.attention.wk.bias": "model-00008-of-00008.safetensors",
596
- "model.vit.transformer.resblocks.24.attention.wk.weight": "model-00008-of-00008.safetensors",
597
- "model.vit.transformer.resblocks.24.attention.wo.bias": "model-00008-of-00008.safetensors",
598
- "model.vit.transformer.resblocks.24.attention.wo.weight": "model-00008-of-00008.safetensors",
599
- "model.vit.transformer.resblocks.24.attention.wq.bias": "model-00008-of-00008.safetensors",
600
- "model.vit.transformer.resblocks.24.attention.wq.weight": "model-00008-of-00008.safetensors",
601
- "model.vit.transformer.resblocks.24.attention.wv.bias": "model-00008-of-00008.safetensors",
602
- "model.vit.transformer.resblocks.24.attention.wv.weight": "model-00008-of-00008.safetensors",
603
- "model.vit.transformer.resblocks.24.attention_norm.bias": "model-00008-of-00008.safetensors",
604
- "model.vit.transformer.resblocks.24.attention_norm.weight": "model-00008-of-00008.safetensors",
605
- "model.vit.transformer.resblocks.24.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
606
- "model.vit.transformer.resblocks.24.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
607
- "model.vit.transformer.resblocks.24.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
608
- "model.vit.transformer.resblocks.24.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
609
- "model.vit.transformer.resblocks.24.ffn_norm.bias": "model-00008-of-00008.safetensors",
610
- "model.vit.transformer.resblocks.24.ffn_norm.weight": "model-00008-of-00008.safetensors",
611
  "model.vit.transformer.resblocks.3.attention.wk.bias": "model-00007-of-00008.safetensors",
612
  "model.vit.transformer.resblocks.3.attention.wk.weight": "model-00007-of-00008.safetensors",
613
  "model.vit.transformer.resblocks.3.attention.wo.bias": "model-00007-of-00008.safetensors",
@@ -696,32 +698,30 @@
696
  "model.vit.transformer.resblocks.8.attention.wq.weight": "model-00007-of-00008.safetensors",
697
  "model.vit.transformer.resblocks.8.attention.wv.bias": "model-00007-of-00008.safetensors",
698
  "model.vit.transformer.resblocks.8.attention.wv.weight": "model-00007-of-00008.safetensors",
699
- "model.vit.transformer.resblocks.8.attention_norm.bias": "model-00008-of-00008.safetensors",
700
- "model.vit.transformer.resblocks.8.attention_norm.weight": "model-00008-of-00008.safetensors",
701
  "model.vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
702
  "model.vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
703
- "model.vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
704
- "model.vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
705
- "model.vit.transformer.resblocks.8.ffn_norm.bias": "model-00008-of-00008.safetensors",
706
- "model.vit.transformer.resblocks.8.ffn_norm.weight": "model-00008-of-00008.safetensors",
707
- "model.vit.transformer.resblocks.9.attention.wk.bias": "model-00008-of-00008.safetensors",
708
- "model.vit.transformer.resblocks.9.attention.wk.weight": "model-00008-of-00008.safetensors",
709
- "model.vit.transformer.resblocks.9.attention.wo.bias": "model-00008-of-00008.safetensors",
710
- "model.vit.transformer.resblocks.9.attention.wo.weight": "model-00008-of-00008.safetensors",
711
- "model.vit.transformer.resblocks.9.attention.wq.bias": "model-00008-of-00008.safetensors",
712
- "model.vit.transformer.resblocks.9.attention.wq.weight": "model-00008-of-00008.safetensors",
713
- "model.vit.transformer.resblocks.9.attention.wv.bias": "model-00008-of-00008.safetensors",
714
- "model.vit.transformer.resblocks.9.attention.wv.weight": "model-00008-of-00008.safetensors",
715
- "model.vit.transformer.resblocks.9.attention_norm.bias": "model-00008-of-00008.safetensors",
716
- "model.vit.transformer.resblocks.9.attention_norm.weight": "model-00008-of-00008.safetensors",
717
- "model.vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00008-of-00008.safetensors",
718
- "model.vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
719
- "model.vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00008-of-00008.safetensors",
720
- "model.vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
721
- "model.vit.transformer.resblocks.9.ffn_norm.bias": "model-00008-of-00008.safetensors",
722
- "model.vit.transformer.resblocks.9.ffn_norm.weight": "model-00008-of-00008.safetensors",
723
- "model.x_norm.weight": "model-00008-of-00008.safetensors",
724
- "new_output_embeddings": "model-00001-of-00008.safetensors",
725
- "output_embeddings": "model-00001-of-00008.safetensors"
726
  }
727
  }
 
4
  "total_size": 34711420260
5
  },
6
  "weight_map": {
7
+ "lm_head.new_output_embeddings": "model-00008-of-00008.safetensors",
8
+ "lm_head.output_embeddings": "model-00008-of-00008.safetensors",
9
+ "model.add_no_point_class_embed.vector": "model-00007-of-00008.safetensors",
10
+ "model.build_vit_embedding.bias": "model-00007-of-00008.safetensors",
11
+ "model.build_vit_embedding.weight": "model-00007-of-00008.safetensors",
12
+ "model.connector.image_pooling_2d.wk.bias": "model-00007-of-00008.safetensors",
13
+ "model.connector.image_pooling_2d.wk.weight": "model-00007-of-00008.safetensors",
14
+ "model.connector.image_pooling_2d.wq.bias": "model-00007-of-00008.safetensors",
15
+ "model.connector.image_pooling_2d.wq.weight": "model-00007-of-00008.safetensors",
16
+ "model.connector.image_pooling_2d.wv.bias": "model-00007-of-00008.safetensors",
17
+ "model.connector.image_pooling_2d.wv.weight": "model-00007-of-00008.safetensors",
18
+ "model.connector.image_projector.w1.weight": "model-00007-of-00008.safetensors",
19
+ "model.connector.image_projector.w2.weight": "model-00007-of-00008.safetensors",
20
+ "model.connector.image_projector.w3.weight": "model-00007-of-00008.safetensors",
21
+ "model.patch_k.bias": "model-00007-of-00008.safetensors",
22
+ "model.patch_k.weight": "model-00007-of-00008.safetensors",
23
+ "model.patch_q.bias": "model-00007-of-00008.safetensors",
24
+ "model.patch_q.weight": "model-00007-of-00008.safetensors",
25
+ "model.subpatch_k.bias": "model-00007-of-00008.safetensors",
26
+ "model.subpatch_k.weight": "model-00007-of-00008.safetensors",
27
+ "model.subpatch_loc_k.bias": "model-00007-of-00008.safetensors",
28
+ "model.subpatch_loc_k.weight": "model-00007-of-00008.safetensors",
29
+ "model.subpatch_q.bias": "model-00007-of-00008.safetensors",
30
+ "model.subpatch_q.weight": "model-00007-of-00008.safetensors",
31
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00008.safetensors",
32
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00008.safetensors",
33
+ "model.transformer.blocks.0.mlp.ff_out.weight": "model-00001-of-00008.safetensors",
34
+ "model.transformer.blocks.0.mlp.ff_proj.weight": "model-00001-of-00008.safetensors",
35
+ "model.transformer.blocks.0.self_attn.att_proj.weight": "model-00001-of-00008.safetensors",
36
+ "model.transformer.blocks.0.self_attn.attn_out.weight": "model-00001-of-00008.safetensors",
37
+ "model.transformer.blocks.0.self_attn.k_norm.weight": "model-00001-of-00008.safetensors",
38
+ "model.transformer.blocks.0.self_attn.q_norm.weight": "model-00001-of-00008.safetensors",
39
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00008.safetensors",
40
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00008.safetensors",
41
+ "model.transformer.blocks.1.mlp.ff_out.weight": "model-00001-of-00008.safetensors",
42
+ "model.transformer.blocks.1.mlp.ff_proj.weight": "model-00001-of-00008.safetensors",
43
+ "model.transformer.blocks.1.self_attn.att_proj.weight": "model-00001-of-00008.safetensors",
44
+ "model.transformer.blocks.1.self_attn.attn_out.weight": "model-00001-of-00008.safetensors",
45
+ "model.transformer.blocks.1.self_attn.k_norm.weight": "model-00001-of-00008.safetensors",
46
+ "model.transformer.blocks.1.self_attn.q_norm.weight": "model-00001-of-00008.safetensors",
47
  "model.transformer.blocks.10.attn_norm.weight": "model-00003-of-00008.safetensors",
48
  "model.transformer.blocks.10.ff_norm.weight": "model-00003-of-00008.safetensors",
49
  "model.transformer.blocks.10.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
 
61
  "model.transformer.blocks.11.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
62
  "model.transformer.blocks.11.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
63
  "model.transformer.blocks.12.attn_norm.weight": "model-00003-of-00008.safetensors",
64
+ "model.transformer.blocks.12.ff_norm.weight": "model-00003-of-00008.safetensors",
65
+ "model.transformer.blocks.12.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
66
+ "model.transformer.blocks.12.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
67
  "model.transformer.blocks.12.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
68
  "model.transformer.blocks.12.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
69
  "model.transformer.blocks.12.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
70
  "model.transformer.blocks.12.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
71
+ "model.transformer.blocks.13.attn_norm.weight": "model-00003-of-00008.safetensors",
72
+ "model.transformer.blocks.13.ff_norm.weight": "model-00003-of-00008.safetensors",
73
+ "model.transformer.blocks.13.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
74
+ "model.transformer.blocks.13.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
75
+ "model.transformer.blocks.13.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
76
+ "model.transformer.blocks.13.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
77
+ "model.transformer.blocks.13.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
78
+ "model.transformer.blocks.13.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
79
+ "model.transformer.blocks.14.attn_norm.weight": "model-00003-of-00008.safetensors",
80
+ "model.transformer.blocks.14.ff_norm.weight": "model-00003-of-00008.safetensors",
81
+ "model.transformer.blocks.14.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
82
+ "model.transformer.blocks.14.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
83
+ "model.transformer.blocks.14.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
84
+ "model.transformer.blocks.14.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
85
+ "model.transformer.blocks.14.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
86
+ "model.transformer.blocks.14.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
87
+ "model.transformer.blocks.15.attn_norm.weight": "model-00003-of-00008.safetensors",
88
  "model.transformer.blocks.15.ff_norm.weight": "model-00004-of-00008.safetensors",
89
  "model.transformer.blocks.15.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
90
  "model.transformer.blocks.15.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
91
+ "model.transformer.blocks.15.self_attn.att_proj.weight": "model-00003-of-00008.safetensors",
92
+ "model.transformer.blocks.15.self_attn.attn_out.weight": "model-00003-of-00008.safetensors",
93
+ "model.transformer.blocks.15.self_attn.k_norm.weight": "model-00003-of-00008.safetensors",
94
+ "model.transformer.blocks.15.self_attn.q_norm.weight": "model-00003-of-00008.safetensors",
95
  "model.transformer.blocks.16.attn_norm.weight": "model-00004-of-00008.safetensors",
96
  "model.transformer.blocks.16.ff_norm.weight": "model-00004-of-00008.safetensors",
97
  "model.transformer.blocks.16.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
 
109
  "model.transformer.blocks.17.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
110
  "model.transformer.blocks.17.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
111
  "model.transformer.blocks.18.attn_norm.weight": "model-00004-of-00008.safetensors",
112
+ "model.transformer.blocks.18.ff_norm.weight": "model-00004-of-00008.safetensors",
113
+ "model.transformer.blocks.18.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
114
+ "model.transformer.blocks.18.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
115
  "model.transformer.blocks.18.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
116
  "model.transformer.blocks.18.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
117
  "model.transformer.blocks.18.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
118
  "model.transformer.blocks.18.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
119
+ "model.transformer.blocks.19.attn_norm.weight": "model-00004-of-00008.safetensors",
120
+ "model.transformer.blocks.19.ff_norm.weight": "model-00004-of-00008.safetensors",
121
+ "model.transformer.blocks.19.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
122
+ "model.transformer.blocks.19.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
123
+ "model.transformer.blocks.19.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
124
+ "model.transformer.blocks.19.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
125
+ "model.transformer.blocks.19.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
126
+ "model.transformer.blocks.19.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
127
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00008.safetensors",
128
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00008.safetensors",
129
+ "model.transformer.blocks.2.mlp.ff_out.weight": "model-00001-of-00008.safetensors",
130
+ "model.transformer.blocks.2.mlp.ff_proj.weight": "model-00001-of-00008.safetensors",
131
+ "model.transformer.blocks.2.self_attn.att_proj.weight": "model-00001-of-00008.safetensors",
132
+ "model.transformer.blocks.2.self_attn.attn_out.weight": "model-00001-of-00008.safetensors",
133
+ "model.transformer.blocks.2.self_attn.k_norm.weight": "model-00001-of-00008.safetensors",
134
+ "model.transformer.blocks.2.self_attn.q_norm.weight": "model-00001-of-00008.safetensors",
135
+ "model.transformer.blocks.20.attn_norm.weight": "model-00004-of-00008.safetensors",
136
+ "model.transformer.blocks.20.ff_norm.weight": "model-00004-of-00008.safetensors",
137
+ "model.transformer.blocks.20.mlp.ff_out.weight": "model-00004-of-00008.safetensors",
138
+ "model.transformer.blocks.20.mlp.ff_proj.weight": "model-00004-of-00008.safetensors",
139
+ "model.transformer.blocks.20.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
140
+ "model.transformer.blocks.20.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
141
+ "model.transformer.blocks.20.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
142
+ "model.transformer.blocks.20.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
143
+ "model.transformer.blocks.21.attn_norm.weight": "model-00004-of-00008.safetensors",
144
  "model.transformer.blocks.21.ff_norm.weight": "model-00005-of-00008.safetensors",
145
  "model.transformer.blocks.21.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
146
  "model.transformer.blocks.21.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
147
+ "model.transformer.blocks.21.self_attn.att_proj.weight": "model-00004-of-00008.safetensors",
148
+ "model.transformer.blocks.21.self_attn.attn_out.weight": "model-00004-of-00008.safetensors",
149
+ "model.transformer.blocks.21.self_attn.k_norm.weight": "model-00004-of-00008.safetensors",
150
+ "model.transformer.blocks.21.self_attn.q_norm.weight": "model-00004-of-00008.safetensors",
151
  "model.transformer.blocks.22.attn_norm.weight": "model-00005-of-00008.safetensors",
152
  "model.transformer.blocks.22.ff_norm.weight": "model-00005-of-00008.safetensors",
153
  "model.transformer.blocks.22.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
 
165
  "model.transformer.blocks.23.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
166
  "model.transformer.blocks.23.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
167
  "model.transformer.blocks.24.attn_norm.weight": "model-00005-of-00008.safetensors",
168
+ "model.transformer.blocks.24.ff_norm.weight": "model-00005-of-00008.safetensors",
169
+ "model.transformer.blocks.24.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
170
+ "model.transformer.blocks.24.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
171
  "model.transformer.blocks.24.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
172
  "model.transformer.blocks.24.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
173
  "model.transformer.blocks.24.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
174
  "model.transformer.blocks.24.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
175
+ "model.transformer.blocks.25.attn_norm.weight": "model-00005-of-00008.safetensors",
176
+ "model.transformer.blocks.25.ff_norm.weight": "model-00005-of-00008.safetensors",
177
+ "model.transformer.blocks.25.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
178
+ "model.transformer.blocks.25.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
179
+ "model.transformer.blocks.25.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
180
+ "model.transformer.blocks.25.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
181
+ "model.transformer.blocks.25.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
182
+ "model.transformer.blocks.25.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
183
+ "model.transformer.blocks.26.attn_norm.weight": "model-00005-of-00008.safetensors",
184
+ "model.transformer.blocks.26.ff_norm.weight": "model-00005-of-00008.safetensors",
185
+ "model.transformer.blocks.26.mlp.ff_out.weight": "model-00005-of-00008.safetensors",
186
+ "model.transformer.blocks.26.mlp.ff_proj.weight": "model-00005-of-00008.safetensors",
187
+ "model.transformer.blocks.26.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
188
+ "model.transformer.blocks.26.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
189
+ "model.transformer.blocks.26.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
190
+ "model.transformer.blocks.26.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
191
+ "model.transformer.blocks.27.attn_norm.weight": "model-00005-of-00008.safetensors",
192
  "model.transformer.blocks.27.ff_norm.weight": "model-00006-of-00008.safetensors",
193
  "model.transformer.blocks.27.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
194
  "model.transformer.blocks.27.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
195
+ "model.transformer.blocks.27.self_attn.att_proj.weight": "model-00005-of-00008.safetensors",
196
+ "model.transformer.blocks.27.self_attn.attn_out.weight": "model-00005-of-00008.safetensors",
197
+ "model.transformer.blocks.27.self_attn.k_norm.weight": "model-00005-of-00008.safetensors",
198
+ "model.transformer.blocks.27.self_attn.q_norm.weight": "model-00005-of-00008.safetensors",
199
  "model.transformer.blocks.28.attn_norm.weight": "model-00006-of-00008.safetensors",
200
  "model.transformer.blocks.28.ff_norm.weight": "model-00006-of-00008.safetensors",
201
  "model.transformer.blocks.28.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
 
212
  "model.transformer.blocks.29.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
213
  "model.transformer.blocks.29.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
214
  "model.transformer.blocks.29.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
215
+ "model.transformer.blocks.3.attn_norm.weight": "model-00001-of-00008.safetensors",
216
  "model.transformer.blocks.3.ff_norm.weight": "model-00002-of-00008.safetensors",
217
  "model.transformer.blocks.3.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
218
  "model.transformer.blocks.3.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
219
+ "model.transformer.blocks.3.self_attn.att_proj.weight": "model-00001-of-00008.safetensors",
220
+ "model.transformer.blocks.3.self_attn.attn_out.weight": "model-00001-of-00008.safetensors",
221
+ "model.transformer.blocks.3.self_attn.k_norm.weight": "model-00001-of-00008.safetensors",
222
+ "model.transformer.blocks.3.self_attn.q_norm.weight": "model-00001-of-00008.safetensors",
223
  "model.transformer.blocks.30.attn_norm.weight": "model-00006-of-00008.safetensors",
224
+ "model.transformer.blocks.30.ff_norm.weight": "model-00006-of-00008.safetensors",
225
+ "model.transformer.blocks.30.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
226
+ "model.transformer.blocks.30.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
227
  "model.transformer.blocks.30.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
228
  "model.transformer.blocks.30.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
229
  "model.transformer.blocks.30.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
230
  "model.transformer.blocks.30.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
231
+ "model.transformer.blocks.31.attn_norm.weight": "model-00006-of-00008.safetensors",
232
+ "model.transformer.blocks.31.ff_norm.weight": "model-00006-of-00008.safetensors",
233
+ "model.transformer.blocks.31.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
234
+ "model.transformer.blocks.31.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
235
+ "model.transformer.blocks.31.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
236
+ "model.transformer.blocks.31.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
237
+ "model.transformer.blocks.31.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
238
+ "model.transformer.blocks.31.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
239
+ "model.transformer.blocks.32.attn_norm.weight": "model-00006-of-00008.safetensors",
240
+ "model.transformer.blocks.32.ff_norm.weight": "model-00006-of-00008.safetensors",
241
+ "model.transformer.blocks.32.mlp.ff_out.weight": "model-00006-of-00008.safetensors",
242
+ "model.transformer.blocks.32.mlp.ff_proj.weight": "model-00006-of-00008.safetensors",
243
+ "model.transformer.blocks.32.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
244
+ "model.transformer.blocks.32.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
245
+ "model.transformer.blocks.32.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
246
+ "model.transformer.blocks.32.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
247
+ "model.transformer.blocks.33.attn_norm.weight": "model-00006-of-00008.safetensors",
248
  "model.transformer.blocks.33.ff_norm.weight": "model-00007-of-00008.safetensors",
249
  "model.transformer.blocks.33.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
250
  "model.transformer.blocks.33.mlp.ff_proj.weight": "model-00007-of-00008.safetensors",
251
+ "model.transformer.blocks.33.self_attn.att_proj.weight": "model-00006-of-00008.safetensors",
252
+ "model.transformer.blocks.33.self_attn.attn_out.weight": "model-00006-of-00008.safetensors",
253
+ "model.transformer.blocks.33.self_attn.k_norm.weight": "model-00006-of-00008.safetensors",
254
+ "model.transformer.blocks.33.self_attn.q_norm.weight": "model-00006-of-00008.safetensors",
255
  "model.transformer.blocks.34.attn_norm.weight": "model-00007-of-00008.safetensors",
256
  "model.transformer.blocks.34.ff_norm.weight": "model-00007-of-00008.safetensors",
257
  "model.transformer.blocks.34.mlp.ff_out.weight": "model-00007-of-00008.safetensors",
 
285
  "model.transformer.blocks.5.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
286
  "model.transformer.blocks.5.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
287
  "model.transformer.blocks.6.attn_norm.weight": "model-00002-of-00008.safetensors",
288
+ "model.transformer.blocks.6.ff_norm.weight": "model-00002-of-00008.safetensors",
289
+ "model.transformer.blocks.6.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
290
+ "model.transformer.blocks.6.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
291
  "model.transformer.blocks.6.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
292
  "model.transformer.blocks.6.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
293
  "model.transformer.blocks.6.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
294
  "model.transformer.blocks.6.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
295
+ "model.transformer.blocks.7.attn_norm.weight": "model-00002-of-00008.safetensors",
296
+ "model.transformer.blocks.7.ff_norm.weight": "model-00002-of-00008.safetensors",
297
+ "model.transformer.blocks.7.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
298
+ "model.transformer.blocks.7.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
299
+ "model.transformer.blocks.7.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
300
+ "model.transformer.blocks.7.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
301
+ "model.transformer.blocks.7.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
302
+ "model.transformer.blocks.7.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
303
+ "model.transformer.blocks.8.attn_norm.weight": "model-00002-of-00008.safetensors",
304
+ "model.transformer.blocks.8.ff_norm.weight": "model-00002-of-00008.safetensors",
305
+ "model.transformer.blocks.8.mlp.ff_out.weight": "model-00002-of-00008.safetensors",
306
+ "model.transformer.blocks.8.mlp.ff_proj.weight": "model-00002-of-00008.safetensors",
307
+ "model.transformer.blocks.8.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
308
+ "model.transformer.blocks.8.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
309
+ "model.transformer.blocks.8.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
310
+ "model.transformer.blocks.8.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
311
+ "model.transformer.blocks.9.attn_norm.weight": "model-00002-of-00008.safetensors",
312
  "model.transformer.blocks.9.ff_norm.weight": "model-00003-of-00008.safetensors",
313
  "model.transformer.blocks.9.mlp.ff_out.weight": "model-00003-of-00008.safetensors",
314
  "model.transformer.blocks.9.mlp.ff_proj.weight": "model-00003-of-00008.safetensors",
315
+ "model.transformer.blocks.9.self_attn.att_proj.weight": "model-00002-of-00008.safetensors",
316
+ "model.transformer.blocks.9.self_attn.attn_out.weight": "model-00002-of-00008.safetensors",
317
+ "model.transformer.blocks.9.self_attn.k_norm.weight": "model-00002-of-00008.safetensors",
318
+ "model.transformer.blocks.9.self_attn.q_norm.weight": "model-00002-of-00008.safetensors",
319
  "model.transformer.ln_f.weight": "model-00007-of-00008.safetensors",
320
  "model.transformer.wte.embedding": "model-00001-of-00008.safetensors",
321
  "model.transformer.wte.new_embedding": "model-00001-of-00008.safetensors",
 
354
  "model.vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
355
  "model.vit.transformer.resblocks.1.ffn_norm.bias": "model-00007-of-00008.safetensors",
356
  "model.vit.transformer.resblocks.1.ffn_norm.weight": "model-00007-of-00008.safetensors",
357
+ "model.vit.transformer.resblocks.10.attention.wk.bias": "model-00007-of-00008.safetensors",
358
+ "model.vit.transformer.resblocks.10.attention.wk.weight": "model-00007-of-00008.safetensors",
359
+ "model.vit.transformer.resblocks.10.attention.wo.bias": "model-00007-of-00008.safetensors",
360
+ "model.vit.transformer.resblocks.10.attention.wo.weight": "model-00007-of-00008.safetensors",
361
+ "model.vit.transformer.resblocks.10.attention.wq.bias": "model-00007-of-00008.safetensors",
362
+ "model.vit.transformer.resblocks.10.attention.wq.weight": "model-00007-of-00008.safetensors",
363
+ "model.vit.transformer.resblocks.10.attention.wv.bias": "model-00007-of-00008.safetensors",
364
+ "model.vit.transformer.resblocks.10.attention.wv.weight": "model-00007-of-00008.safetensors",
365
+ "model.vit.transformer.resblocks.10.attention_norm.bias": "model-00007-of-00008.safetensors",
366
+ "model.vit.transformer.resblocks.10.attention_norm.weight": "model-00007-of-00008.safetensors",
367
+ "model.vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
368
+ "model.vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
369
+ "model.vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
370
+ "model.vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
371
+ "model.vit.transformer.resblocks.10.ffn_norm.bias": "model-00007-of-00008.safetensors",
372
+ "model.vit.transformer.resblocks.10.ffn_norm.weight": "model-00007-of-00008.safetensors",
373
+ "model.vit.transformer.resblocks.11.attention.wk.bias": "model-00007-of-00008.safetensors",
374
+ "model.vit.transformer.resblocks.11.attention.wk.weight": "model-00007-of-00008.safetensors",
375
+ "model.vit.transformer.resblocks.11.attention.wo.bias": "model-00007-of-00008.safetensors",
376
+ "model.vit.transformer.resblocks.11.attention.wo.weight": "model-00007-of-00008.safetensors",
377
+ "model.vit.transformer.resblocks.11.attention.wq.bias": "model-00007-of-00008.safetensors",
378
+ "model.vit.transformer.resblocks.11.attention.wq.weight": "model-00007-of-00008.safetensors",
379
+ "model.vit.transformer.resblocks.11.attention.wv.bias": "model-00007-of-00008.safetensors",
380
+ "model.vit.transformer.resblocks.11.attention.wv.weight": "model-00007-of-00008.safetensors",
381
+ "model.vit.transformer.resblocks.11.attention_norm.bias": "model-00007-of-00008.safetensors",
382
+ "model.vit.transformer.resblocks.11.attention_norm.weight": "model-00007-of-00008.safetensors",
383
+ "model.vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
384
+ "model.vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
385
+ "model.vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
386
+ "model.vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
387
+ "model.vit.transformer.resblocks.11.ffn_norm.bias": "model-00007-of-00008.safetensors",
388
+ "model.vit.transformer.resblocks.11.ffn_norm.weight": "model-00007-of-00008.safetensors",
389
+ "model.vit.transformer.resblocks.12.attention.wk.bias": "model-00007-of-00008.safetensors",
390
+ "model.vit.transformer.resblocks.12.attention.wk.weight": "model-00007-of-00008.safetensors",
391
+ "model.vit.transformer.resblocks.12.attention.wo.bias": "model-00007-of-00008.safetensors",
392
+ "model.vit.transformer.resblocks.12.attention.wo.weight": "model-00007-of-00008.safetensors",
393
+ "model.vit.transformer.resblocks.12.attention.wq.bias": "model-00007-of-00008.safetensors",
394
+ "model.vit.transformer.resblocks.12.attention.wq.weight": "model-00007-of-00008.safetensors",
395
+ "model.vit.transformer.resblocks.12.attention.wv.bias": "model-00007-of-00008.safetensors",
396
+ "model.vit.transformer.resblocks.12.attention.wv.weight": "model-00007-of-00008.safetensors",
397
+ "model.vit.transformer.resblocks.12.attention_norm.bias": "model-00007-of-00008.safetensors",
398
+ "model.vit.transformer.resblocks.12.attention_norm.weight": "model-00007-of-00008.safetensors",
399
+ "model.vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
400
+ "model.vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
401
+ "model.vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
402
+ "model.vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
403
+ "model.vit.transformer.resblocks.12.ffn_norm.bias": "model-00007-of-00008.safetensors",
404
+ "model.vit.transformer.resblocks.12.ffn_norm.weight": "model-00007-of-00008.safetensors",
405
+ "model.vit.transformer.resblocks.13.attention.wk.bias": "model-00007-of-00008.safetensors",
406
+ "model.vit.transformer.resblocks.13.attention.wk.weight": "model-00007-of-00008.safetensors",
407
+ "model.vit.transformer.resblocks.13.attention.wo.bias": "model-00007-of-00008.safetensors",
408
+ "model.vit.transformer.resblocks.13.attention.wo.weight": "model-00007-of-00008.safetensors",
409
+ "model.vit.transformer.resblocks.13.attention.wq.bias": "model-00007-of-00008.safetensors",
410
+ "model.vit.transformer.resblocks.13.attention.wq.weight": "model-00007-of-00008.safetensors",
411
+ "model.vit.transformer.resblocks.13.attention.wv.bias": "model-00007-of-00008.safetensors",
412
+ "model.vit.transformer.resblocks.13.attention.wv.weight": "model-00007-of-00008.safetensors",
413
+ "model.vit.transformer.resblocks.13.attention_norm.bias": "model-00007-of-00008.safetensors",
414
+ "model.vit.transformer.resblocks.13.attention_norm.weight": "model-00007-of-00008.safetensors",
415
+ "model.vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
416
+ "model.vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
417
+ "model.vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
418
+ "model.vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
419
+ "model.vit.transformer.resblocks.13.ffn_norm.bias": "model-00007-of-00008.safetensors",
420
+ "model.vit.transformer.resblocks.13.ffn_norm.weight": "model-00007-of-00008.safetensors",
421
+ "model.vit.transformer.resblocks.14.attention.wk.bias": "model-00007-of-00008.safetensors",
422
+ "model.vit.transformer.resblocks.14.attention.wk.weight": "model-00007-of-00008.safetensors",
423
+ "model.vit.transformer.resblocks.14.attention.wo.bias": "model-00007-of-00008.safetensors",
424
+ "model.vit.transformer.resblocks.14.attention.wo.weight": "model-00007-of-00008.safetensors",
425
+ "model.vit.transformer.resblocks.14.attention.wq.bias": "model-00007-of-00008.safetensors",
426
+ "model.vit.transformer.resblocks.14.attention.wq.weight": "model-00007-of-00008.safetensors",
427
+ "model.vit.transformer.resblocks.14.attention.wv.bias": "model-00007-of-00008.safetensors",
428
+ "model.vit.transformer.resblocks.14.attention.wv.weight": "model-00007-of-00008.safetensors",
429
+ "model.vit.transformer.resblocks.14.attention_norm.bias": "model-00007-of-00008.safetensors",
430
+ "model.vit.transformer.resblocks.14.attention_norm.weight": "model-00007-of-00008.safetensors",
431
+ "model.vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
432
+ "model.vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
433
+ "model.vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
434
+ "model.vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
435
+ "model.vit.transformer.resblocks.14.ffn_norm.bias": "model-00007-of-00008.safetensors",
436
+ "model.vit.transformer.resblocks.14.ffn_norm.weight": "model-00007-of-00008.safetensors",
437
+ "model.vit.transformer.resblocks.15.attention.wk.bias": "model-00007-of-00008.safetensors",
438
+ "model.vit.transformer.resblocks.15.attention.wk.weight": "model-00007-of-00008.safetensors",
439
+ "model.vit.transformer.resblocks.15.attention.wo.bias": "model-00007-of-00008.safetensors",
440
+ "model.vit.transformer.resblocks.15.attention.wo.weight": "model-00007-of-00008.safetensors",
441
+ "model.vit.transformer.resblocks.15.attention.wq.bias": "model-00007-of-00008.safetensors",
442
+ "model.vit.transformer.resblocks.15.attention.wq.weight": "model-00007-of-00008.safetensors",
443
+ "model.vit.transformer.resblocks.15.attention.wv.bias": "model-00007-of-00008.safetensors",
444
+ "model.vit.transformer.resblocks.15.attention.wv.weight": "model-00007-of-00008.safetensors",
445
+ "model.vit.transformer.resblocks.15.attention_norm.bias": "model-00007-of-00008.safetensors",
446
+ "model.vit.transformer.resblocks.15.attention_norm.weight": "model-00007-of-00008.safetensors",
447
+ "model.vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
448
+ "model.vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
449
+ "model.vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
450
+ "model.vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
451
+ "model.vit.transformer.resblocks.15.ffn_norm.bias": "model-00007-of-00008.safetensors",
452
+ "model.vit.transformer.resblocks.15.ffn_norm.weight": "model-00007-of-00008.safetensors",
453
+ "model.vit.transformer.resblocks.16.attention.wk.bias": "model-00007-of-00008.safetensors",
454
+ "model.vit.transformer.resblocks.16.attention.wk.weight": "model-00007-of-00008.safetensors",
455
+ "model.vit.transformer.resblocks.16.attention.wo.bias": "model-00007-of-00008.safetensors",
456
+ "model.vit.transformer.resblocks.16.attention.wo.weight": "model-00007-of-00008.safetensors",
457
+ "model.vit.transformer.resblocks.16.attention.wq.bias": "model-00007-of-00008.safetensors",
458
+ "model.vit.transformer.resblocks.16.attention.wq.weight": "model-00007-of-00008.safetensors",
459
+ "model.vit.transformer.resblocks.16.attention.wv.bias": "model-00007-of-00008.safetensors",
460
+ "model.vit.transformer.resblocks.16.attention.wv.weight": "model-00007-of-00008.safetensors",
461
+ "model.vit.transformer.resblocks.16.attention_norm.bias": "model-00007-of-00008.safetensors",
462
+ "model.vit.transformer.resblocks.16.attention_norm.weight": "model-00007-of-00008.safetensors",
463
+ "model.vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
464
+ "model.vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
465
+ "model.vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
466
+ "model.vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
467
+ "model.vit.transformer.resblocks.16.ffn_norm.bias": "model-00007-of-00008.safetensors",
468
+ "model.vit.transformer.resblocks.16.ffn_norm.weight": "model-00007-of-00008.safetensors",
469
+ "model.vit.transformer.resblocks.17.attention.wk.bias": "model-00007-of-00008.safetensors",
470
+ "model.vit.transformer.resblocks.17.attention.wk.weight": "model-00007-of-00008.safetensors",
471
+ "model.vit.transformer.resblocks.17.attention.wo.bias": "model-00007-of-00008.safetensors",
472
+ "model.vit.transformer.resblocks.17.attention.wo.weight": "model-00007-of-00008.safetensors",
473
+ "model.vit.transformer.resblocks.17.attention.wq.bias": "model-00007-of-00008.safetensors",
474
+ "model.vit.transformer.resblocks.17.attention.wq.weight": "model-00007-of-00008.safetensors",
475
+ "model.vit.transformer.resblocks.17.attention.wv.bias": "model-00007-of-00008.safetensors",
476
+ "model.vit.transformer.resblocks.17.attention.wv.weight": "model-00007-of-00008.safetensors",
477
+ "model.vit.transformer.resblocks.17.attention_norm.bias": "model-00007-of-00008.safetensors",
478
+ "model.vit.transformer.resblocks.17.attention_norm.weight": "model-00007-of-00008.safetensors",
479
+ "model.vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
480
+ "model.vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
481
+ "model.vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
482
+ "model.vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
483
+ "model.vit.transformer.resblocks.17.ffn_norm.bias": "model-00007-of-00008.safetensors",
484
+ "model.vit.transformer.resblocks.17.ffn_norm.weight": "model-00007-of-00008.safetensors",
485
+ "model.vit.transformer.resblocks.18.attention.wk.bias": "model-00007-of-00008.safetensors",
486
+ "model.vit.transformer.resblocks.18.attention.wk.weight": "model-00007-of-00008.safetensors",
487
+ "model.vit.transformer.resblocks.18.attention.wo.bias": "model-00007-of-00008.safetensors",
488
+ "model.vit.transformer.resblocks.18.attention.wo.weight": "model-00007-of-00008.safetensors",
489
+ "model.vit.transformer.resblocks.18.attention.wq.bias": "model-00007-of-00008.safetensors",
490
+ "model.vit.transformer.resblocks.18.attention.wq.weight": "model-00007-of-00008.safetensors",
491
+ "model.vit.transformer.resblocks.18.attention.wv.bias": "model-00007-of-00008.safetensors",
492
+ "model.vit.transformer.resblocks.18.attention.wv.weight": "model-00007-of-00008.safetensors",
493
+ "model.vit.transformer.resblocks.18.attention_norm.bias": "model-00007-of-00008.safetensors",
494
+ "model.vit.transformer.resblocks.18.attention_norm.weight": "model-00007-of-00008.safetensors",
495
+ "model.vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
496
+ "model.vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
497
+ "model.vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
498
+ "model.vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
499
+ "model.vit.transformer.resblocks.18.ffn_norm.bias": "model-00007-of-00008.safetensors",
500
+ "model.vit.transformer.resblocks.18.ffn_norm.weight": "model-00007-of-00008.safetensors",
501
+ "model.vit.transformer.resblocks.19.attention.wk.bias": "model-00007-of-00008.safetensors",
502
+ "model.vit.transformer.resblocks.19.attention.wk.weight": "model-00007-of-00008.safetensors",
503
+ "model.vit.transformer.resblocks.19.attention.wo.bias": "model-00007-of-00008.safetensors",
504
+ "model.vit.transformer.resblocks.19.attention.wo.weight": "model-00007-of-00008.safetensors",
505
+ "model.vit.transformer.resblocks.19.attention.wq.bias": "model-00007-of-00008.safetensors",
506
+ "model.vit.transformer.resblocks.19.attention.wq.weight": "model-00007-of-00008.safetensors",
507
+ "model.vit.transformer.resblocks.19.attention.wv.bias": "model-00007-of-00008.safetensors",
508
+ "model.vit.transformer.resblocks.19.attention.wv.weight": "model-00007-of-00008.safetensors",
509
+ "model.vit.transformer.resblocks.19.attention_norm.bias": "model-00007-of-00008.safetensors",
510
+ "model.vit.transformer.resblocks.19.attention_norm.weight": "model-00007-of-00008.safetensors",
511
+ "model.vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
512
+ "model.vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
513
+ "model.vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
514
+ "model.vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
515
+ "model.vit.transformer.resblocks.19.ffn_norm.bias": "model-00007-of-00008.safetensors",
516
+ "model.vit.transformer.resblocks.19.ffn_norm.weight": "model-00007-of-00008.safetensors",
517
  "model.vit.transformer.resblocks.2.attention.wk.bias": "model-00007-of-00008.safetensors",
518
  "model.vit.transformer.resblocks.2.attention.wk.weight": "model-00007-of-00008.safetensors",
519
  "model.vit.transformer.resblocks.2.attention.wo.bias": "model-00007-of-00008.safetensors",
 
530
  "model.vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
531
  "model.vit.transformer.resblocks.2.ffn_norm.bias": "model-00007-of-00008.safetensors",
532
  "model.vit.transformer.resblocks.2.ffn_norm.weight": "model-00007-of-00008.safetensors",
533
+ "model.vit.transformer.resblocks.20.attention.wk.bias": "model-00007-of-00008.safetensors",
534
+ "model.vit.transformer.resblocks.20.attention.wk.weight": "model-00007-of-00008.safetensors",
535
+ "model.vit.transformer.resblocks.20.attention.wo.bias": "model-00007-of-00008.safetensors",
536
+ "model.vit.transformer.resblocks.20.attention.wo.weight": "model-00007-of-00008.safetensors",
537
+ "model.vit.transformer.resblocks.20.attention.wq.bias": "model-00007-of-00008.safetensors",
538
+ "model.vit.transformer.resblocks.20.attention.wq.weight": "model-00007-of-00008.safetensors",
539
+ "model.vit.transformer.resblocks.20.attention.wv.bias": "model-00007-of-00008.safetensors",
540
+ "model.vit.transformer.resblocks.20.attention.wv.weight": "model-00007-of-00008.safetensors",
541
+ "model.vit.transformer.resblocks.20.attention_norm.bias": "model-00007-of-00008.safetensors",
542
+ "model.vit.transformer.resblocks.20.attention_norm.weight": "model-00007-of-00008.safetensors",
543
+ "model.vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
544
+ "model.vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
545
+ "model.vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
546
+ "model.vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
547
+ "model.vit.transformer.resblocks.20.ffn_norm.bias": "model-00007-of-00008.safetensors",
548
+ "model.vit.transformer.resblocks.20.ffn_norm.weight": "model-00007-of-00008.safetensors",
549
+ "model.vit.transformer.resblocks.21.attention.wk.bias": "model-00007-of-00008.safetensors",
550
+ "model.vit.transformer.resblocks.21.attention.wk.weight": "model-00007-of-00008.safetensors",
551
+ "model.vit.transformer.resblocks.21.attention.wo.bias": "model-00007-of-00008.safetensors",
552
+ "model.vit.transformer.resblocks.21.attention.wo.weight": "model-00007-of-00008.safetensors",
553
+ "model.vit.transformer.resblocks.21.attention.wq.bias": "model-00007-of-00008.safetensors",
554
+ "model.vit.transformer.resblocks.21.attention.wq.weight": "model-00007-of-00008.safetensors",
555
+ "model.vit.transformer.resblocks.21.attention.wv.bias": "model-00007-of-00008.safetensors",
556
+ "model.vit.transformer.resblocks.21.attention.wv.weight": "model-00007-of-00008.safetensors",
557
+ "model.vit.transformer.resblocks.21.attention_norm.bias": "model-00007-of-00008.safetensors",
558
+ "model.vit.transformer.resblocks.21.attention_norm.weight": "model-00007-of-00008.safetensors",
559
+ "model.vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
560
+ "model.vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
561
+ "model.vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
562
+ "model.vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
563
+ "model.vit.transformer.resblocks.21.ffn_norm.bias": "model-00007-of-00008.safetensors",
564
+ "model.vit.transformer.resblocks.21.ffn_norm.weight": "model-00007-of-00008.safetensors",
565
+ "model.vit.transformer.resblocks.22.attention.wk.bias": "model-00007-of-00008.safetensors",
566
+ "model.vit.transformer.resblocks.22.attention.wk.weight": "model-00007-of-00008.safetensors",
567
+ "model.vit.transformer.resblocks.22.attention.wo.bias": "model-00007-of-00008.safetensors",
568
+ "model.vit.transformer.resblocks.22.attention.wo.weight": "model-00007-of-00008.safetensors",
569
+ "model.vit.transformer.resblocks.22.attention.wq.bias": "model-00007-of-00008.safetensors",
570
+ "model.vit.transformer.resblocks.22.attention.wq.weight": "model-00007-of-00008.safetensors",
571
+ "model.vit.transformer.resblocks.22.attention.wv.bias": "model-00007-of-00008.safetensors",
572
+ "model.vit.transformer.resblocks.22.attention.wv.weight": "model-00007-of-00008.safetensors",
573
+ "model.vit.transformer.resblocks.22.attention_norm.bias": "model-00007-of-00008.safetensors",
574
+ "model.vit.transformer.resblocks.22.attention_norm.weight": "model-00007-of-00008.safetensors",
575
+ "model.vit.transformer.resblocks.22.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
576
+ "model.vit.transformer.resblocks.22.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
577
+ "model.vit.transformer.resblocks.22.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
578
+ "model.vit.transformer.resblocks.22.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
579
+ "model.vit.transformer.resblocks.22.ffn_norm.bias": "model-00007-of-00008.safetensors",
580
+ "model.vit.transformer.resblocks.22.ffn_norm.weight": "model-00007-of-00008.safetensors",
581
+ "model.vit.transformer.resblocks.23.attention.wk.bias": "model-00007-of-00008.safetensors",
582
+ "model.vit.transformer.resblocks.23.attention.wk.weight": "model-00007-of-00008.safetensors",
583
+ "model.vit.transformer.resblocks.23.attention.wo.bias": "model-00007-of-00008.safetensors",
584
+ "model.vit.transformer.resblocks.23.attention.wo.weight": "model-00007-of-00008.safetensors",
585
+ "model.vit.transformer.resblocks.23.attention.wq.bias": "model-00007-of-00008.safetensors",
586
+ "model.vit.transformer.resblocks.23.attention.wq.weight": "model-00007-of-00008.safetensors",
587
+ "model.vit.transformer.resblocks.23.attention.wv.bias": "model-00007-of-00008.safetensors",
588
+ "model.vit.transformer.resblocks.23.attention.wv.weight": "model-00007-of-00008.safetensors",
589
+ "model.vit.transformer.resblocks.23.attention_norm.bias": "model-00007-of-00008.safetensors",
590
+ "model.vit.transformer.resblocks.23.attention_norm.weight": "model-00007-of-00008.safetensors",
591
+ "model.vit.transformer.resblocks.23.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
592
+ "model.vit.transformer.resblocks.23.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
593
+ "model.vit.transformer.resblocks.23.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
594
+ "model.vit.transformer.resblocks.23.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
595
+ "model.vit.transformer.resblocks.23.ffn_norm.bias": "model-00007-of-00008.safetensors",
596
+ "model.vit.transformer.resblocks.23.ffn_norm.weight": "model-00007-of-00008.safetensors",
597
+ "model.vit.transformer.resblocks.24.attention.wk.bias": "model-00007-of-00008.safetensors",
598
+ "model.vit.transformer.resblocks.24.attention.wk.weight": "model-00007-of-00008.safetensors",
599
+ "model.vit.transformer.resblocks.24.attention.wo.bias": "model-00007-of-00008.safetensors",
600
+ "model.vit.transformer.resblocks.24.attention.wo.weight": "model-00007-of-00008.safetensors",
601
+ "model.vit.transformer.resblocks.24.attention.wq.bias": "model-00007-of-00008.safetensors",
602
+ "model.vit.transformer.resblocks.24.attention.wq.weight": "model-00007-of-00008.safetensors",
603
+ "model.vit.transformer.resblocks.24.attention.wv.bias": "model-00007-of-00008.safetensors",
604
+ "model.vit.transformer.resblocks.24.attention.wv.weight": "model-00007-of-00008.safetensors",
605
+ "model.vit.transformer.resblocks.24.attention_norm.bias": "model-00007-of-00008.safetensors",
606
+ "model.vit.transformer.resblocks.24.attention_norm.weight": "model-00007-of-00008.safetensors",
607
+ "model.vit.transformer.resblocks.24.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
608
+ "model.vit.transformer.resblocks.24.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
609
+ "model.vit.transformer.resblocks.24.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
610
+ "model.vit.transformer.resblocks.24.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
611
+ "model.vit.transformer.resblocks.24.ffn_norm.bias": "model-00007-of-00008.safetensors",
612
+ "model.vit.transformer.resblocks.24.ffn_norm.weight": "model-00007-of-00008.safetensors",
613
  "model.vit.transformer.resblocks.3.attention.wk.bias": "model-00007-of-00008.safetensors",
614
  "model.vit.transformer.resblocks.3.attention.wk.weight": "model-00007-of-00008.safetensors",
615
  "model.vit.transformer.resblocks.3.attention.wo.bias": "model-00007-of-00008.safetensors",
 
698
  "model.vit.transformer.resblocks.8.attention.wq.weight": "model-00007-of-00008.safetensors",
699
  "model.vit.transformer.resblocks.8.attention.wv.bias": "model-00007-of-00008.safetensors",
700
  "model.vit.transformer.resblocks.8.attention.wv.weight": "model-00007-of-00008.safetensors",
701
+ "model.vit.transformer.resblocks.8.attention_norm.bias": "model-00007-of-00008.safetensors",
702
+ "model.vit.transformer.resblocks.8.attention_norm.weight": "model-00007-of-00008.safetensors",
703
  "model.vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
704
  "model.vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
705
+ "model.vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
706
+ "model.vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
707
+ "model.vit.transformer.resblocks.8.ffn_norm.bias": "model-00007-of-00008.safetensors",
708
+ "model.vit.transformer.resblocks.8.ffn_norm.weight": "model-00007-of-00008.safetensors",
709
+ "model.vit.transformer.resblocks.9.attention.wk.bias": "model-00007-of-00008.safetensors",
710
+ "model.vit.transformer.resblocks.9.attention.wk.weight": "model-00007-of-00008.safetensors",
711
+ "model.vit.transformer.resblocks.9.attention.wo.bias": "model-00007-of-00008.safetensors",
712
+ "model.vit.transformer.resblocks.9.attention.wo.weight": "model-00007-of-00008.safetensors",
713
+ "model.vit.transformer.resblocks.9.attention.wq.bias": "model-00007-of-00008.safetensors",
714
+ "model.vit.transformer.resblocks.9.attention.wq.weight": "model-00007-of-00008.safetensors",
715
+ "model.vit.transformer.resblocks.9.attention.wv.bias": "model-00007-of-00008.safetensors",
716
+ "model.vit.transformer.resblocks.9.attention.wv.weight": "model-00007-of-00008.safetensors",
717
+ "model.vit.transformer.resblocks.9.attention_norm.bias": "model-00007-of-00008.safetensors",
718
+ "model.vit.transformer.resblocks.9.attention_norm.weight": "model-00007-of-00008.safetensors",
719
+ "model.vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00007-of-00008.safetensors",
720
+ "model.vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
721
+ "model.vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
722
+ "model.vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
723
+ "model.vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00008.safetensors",
724
+ "model.vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00008.safetensors",
725
+ "model.x_norm.weight": "model-00007-of-00008.safetensors"
726
  }
727
  }
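
The weight_map above follows the standard Hugging Face sharded-checkpoint index layout: every parameter name points at the shard file that actually stores it, so a loader only needs to open the shards it cares about. As a minimal sketch, assuming the index is saved under the usual "model.safetensors.index.json" name and the safetensors library is available (the load_tensor helper below is illustrative only, not part of this repo), a single remapped tensor such as "lm_head.output_embeddings" could be located and read like this:

import json
from safetensors import safe_open

def load_tensor(checkpoint_dir, name):
    # Look up which shard holds "name" via the index's weight_map,
    # then read only that tensor from the shard.
    with open(f"{checkpoint_dir}/model.safetensors.index.json") as f:
        weight_map = json.load(f)["weight_map"]
    shard_path = f"{checkpoint_dir}/{weight_map[name]}"
    with safe_open(shard_path, framework="pt") as shard:
        return shard.get_tensor(name)

# Per the map above this resolves to model-00008-of-00008.safetensors:
# emb = load_tensor(".", "lm_head.output_embeddings")
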
modeling_molmo_point.py CHANGED
@@ -1307,9 +1307,10 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1307
  input_patch_ids = None
1308
  can_point = False
1309
 
1310
- x = self.transformer.wte(input_ids)
 
1311
  batch_size, _, dim = x.shape
1312
- batch_idx = torch.arange(batch_size, device=self.device)
1313
 
1314
  # TODO update embeddings for patch/subpatch tokens
1315
  vit_features_flat: Optional[torch.FloatTensor] = None
@@ -1326,7 +1327,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1326
  features = []
1327
  for layer in self.vit_layers:
1328
  features.append(vit_image_features[layer])
1329
- vit_features = torch.cat(features, dim=-1)
1330
  vit_feature_dim = vit_features.shape[-1]
1331
 
1332
  # Gather the features that should be pooled to build patch embeddings
@@ -1342,7 +1343,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1342
  vit_features_to_flat_mask = vit_features_mask.view(-1, token_pooling.shape[-1])[image_features_mask.view(-1)]
1343
 
1344
  # Finally apply the connector and add to input embeddings
1345
- image_features = self.connector(vit_features_flat, vit_features_to_flat_mask)
1346
  x = x.clone()
1347
  x.view(-1, dim)[is_image_token.view(-1)] += image_features.view(-1, dim)
1348
 
@@ -1350,7 +1351,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1350
  # embeddings
1351
  image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
1352
  image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
1353
- image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=self.device)
1354
  image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
1355
  max_image_pos_id = image_pos_ids_flat.max() + 1
1356
  elif image_data is not None:
@@ -1374,7 +1375,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1374
  assert last_predicted_patch_id is not None, "Patch should always be generated before a subpatch"
1375
  for_patches = (last_predicted_patch_id.view(batch_size) + image_token_offset)[input_subpatch_ids.view(batch_size) >= 0]
1376
  vit_features_to_embed = vit_features_flat[for_patches, input_subpatch_ids]
1377
- x.view(-1, dim)[is_subpatch.view(-1)] = self.build_vit_embedding(vit_features_to_embed).to(x.dtype)
1378
 
1379
  # shape: (batch_size, seq_len, d_model)
1380
  x = self.transformer.emb_drop(x) # type: ignore
@@ -1438,7 +1439,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1438
 
1439
  if images is not None or image_data is not None:
1440
  if self.x_norm:
1441
- x_norm = self.x_norm(x)
1442
  elif self.config.norm_x:
1443
  x_norm = x / math.sqrt(dim)
1444
  else:
@@ -1452,7 +1453,8 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1452
  patch_k_flat = self.patch_k(x_norm.view(-1, dim)[is_image_token.view(-1)])
1453
  if self.patch_rotary is not None:
1454
  patch_k_flat = self.patch_rotary(patch_k_flat, image_pos_ids_flat)
1455
- patch_k = torch.zeros([batch_size, image_features_mask.shape[1], patch_k_flat.shape[-1]], dtype=x.dtype, device=self.device)
 
1456
  patch_k.view(-1, patch_k_flat.shape[-1])[image_features_mask.flatten()] = patch_k_flat.to(dtype=x.dtype)
1457
 
1458
  patch_k_mask = image_features_mask.clone()
@@ -1460,14 +1462,14 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1460
  is_indexable_image_token.view(-1)[is_image_token.view(-1)])
1461
 
1462
  if self.config.no_more_points_class:
1463
- patch_k = self.add_no_point_class_embed(patch_k)
1464
  patch_k_mask = F.pad(patch_k_mask, (0, 1), value=True)
1465
 
1466
- subpatch_k = self.subpatch_k(vit_features)
1467
 
1468
  # Predict patch locations
1469
  if can_point:
1470
- image_q = self.patch_q(x_norm)
1471
  if self.patch_rotary is not None and last_predicted_patch_id is not None:
1472
  rotate_by = image_pos_ids[batch_idx, last_predicted_patch_id]
1473
  rotate_by = torch.where(last_predicted_patch_id >= 0, rotate_by, 0)
@@ -1475,7 +1477,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1475
  image_q = self.patch_rotary(
1476
  image_q.view(-1, image_q.shape[-1]),
1477
  torch.clamp(rotate_by, min=0),
1478
- ).reshape(batch_size, -1, image_q.shape[-1])
1479
 
1480
  dots = torch.matmul(image_q, patch_k.transpose(1, 2)) # [batch, 1, num_images]
1481
  if self.config.norm_logits:
@@ -1487,7 +1489,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1487
  if can_point and torch.any(is_patch):
1488
  if x_norm.shape[1] != 1:
1489
  raise NotImplementedError()
1490
- subpatch_point_q = self.subpatch_q(x_norm.squeeze(1))
1491
  subpatch_k = subpatch_k[batch_idx, input_patch_ids.squeeze(1)]
1492
  subpatch_logits = torch.einsum("pd,pcd->pc", subpatch_point_q, subpatch_k)
1493
  if self.config.norm_logits:
@@ -1497,7 +1499,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1497
  subpatch_logits = subpatch_logits[:, None, :]
1498
 
1499
  if can_point and torch.any(is_subpatch):
1500
- location_logits = self.subpatch_loc_k(x)
1501
 
1502
  if is_prefill:
1503
  num_image_tokens = is_image_token.sum(-1)
@@ -1534,6 +1536,17 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
1534
  )
1535
 
1536
 
1537
  class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMixin):
1538
  _checkpoint_conversion_mapping = {}
1539
  _tied_weights_keys = [] # Weights are not tied
@@ -1545,8 +1558,7 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
1545
  super().__init__(config)
1546
 
1547
  self.model = MolmoPointModel(config)
1548
- self.output_embeddings = nn.Parameter(torch.zeros([config.vocab_size, config.hidden_size]))
1549
- self.new_output_embeddings = nn.Parameter(torch.zeros([128, config.hidden_size]))
1550
  self.vocab_size = config.vocab_size
1551
 
1552
  # Initialize weights and apply final processing
@@ -1675,8 +1687,7 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
1675
  hidden_states = outputs.last_hidden_state
1676
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1677
  slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1678
- lm_head = torch.concatenate([self.output_embeddings, self.new_output_embeddings], dim=0)
1679
- logits = F.linear(hidden_states[:, slice_indices, :], lm_head)
1680
 
1681
  loss = None
1682
  if labels is not None:
@@ -1697,6 +1708,7 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
1697
  # process is hard to emulate in generation frameworks
1698
  # Our hack here is to assume that, if we generate a TOKEN, we always select the argmax
1699
  # patch. Then we can use PATCH_TOKEN scores as the argmax's patch scores
 
1700
  predicted_tokens = torch.argmax(logits[:, -1], dim=-1)
1701
  patch_token_logits = torch.clone(logits[:, :, self.config.patch_token_id])
1702
  logits[:, :, self.config.patch_token_id] = small_val
@@ -1705,8 +1717,8 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
1705
  if outputs.patch_logits is not None:
1706
  selected_patches = torch.argmax(outputs.patch_logits, -1)
1707
  bs, seq, n_patches = outputs.patch_logits.shape
1708
- batch_idx = torch.arange(outputs.patch_logits.shape[0], device=self.device)
1709
- seq_ix = torch.arange(outputs.patch_logits.shape[1], device=self.device)
1710
  argmax_patch_logits[batch_idx.view(-1, 1, 1), seq_ix.view(1, -1, 1), selected_patches] = patch_token_logits
1711
 
1712
  logits[:, :, self.config.subpatch_token_id] = small_val
@@ -1722,7 +1734,11 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
1722
  location_logits = torch.full([bs, seq, 9], small_val, dtype=logits.dtype, device=logits.device)
1723
 
1724
  logits = torch.concatenate([
1725
- logits, argmax_patch_logits, subpatch_logits, location_logits], -1)
1726
 
1727
  return MolmoPointCausalLMOutputWithPast(
1728
  loss=loss,
 
1307
  input_patch_ids = None
1308
  can_point = False
1309
 
1310
+ device = input_ids.device
1311
+ x = self.transformer.wte(input_ids).to(device=device)
1312
  batch_size, _, dim = x.shape
1313
+ batch_idx = torch.arange(batch_size, device=device)
1314
 
1315
  # TODO update embeddings for patch/subpatch tokens
1316
  vit_features_flat: Optional[torch.FloatTensor] = None
 
1327
  features = []
1328
  for layer in self.vit_layers:
1329
  features.append(vit_image_features[layer])
1330
+ vit_features = torch.cat(features, dim=-1).to(device=device)
1331
  vit_feature_dim = vit_features.shape[-1]
1332
 
1333
  # Gather the features that should be pooled to build patch embeddings
 
1343
  vit_features_to_flat_mask = vit_features_mask.view(-1, token_pooling.shape[-1])[image_features_mask.view(-1)]
1344
 
1345
  # Finally apply the connector and add to input embeddings
1346
+ image_features = self.connector(vit_features_flat, vit_features_to_flat_mask).to(device=device)
1347
  x = x.clone()
1348
  x.view(-1, dim)[is_image_token.view(-1)] += image_features.view(-1, dim)
1349
 
 
1351
  # embeddings
1352
  image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
1353
  image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
1354
+ image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=device)
1355
  image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
1356
  max_image_pos_id = image_pos_ids_flat.max() + 1
1357
  elif image_data is not None:
 
1375
  assert last_predicted_patch_id is not None, "Patch should always be generated before a subpatch"
1376
  for_patches = (last_predicted_patch_id.view(batch_size) + image_token_offset)[input_subpatch_ids.view(batch_size) >= 0]
1377
  vit_features_to_embed = vit_features_flat[for_patches, input_subpatch_ids]
1378
+ x.view(-1, dim)[is_subpatch.view(-1)] = self.build_vit_embedding(vit_features_to_embed).to(device=device)
1379
 
1380
  # shape: (batch_size, seq_len, d_model)
1381
  x = self.transformer.emb_drop(x) # type: ignore
 
1439
 
1440
  if images is not None or image_data is not None:
1441
  if self.x_norm:
1442
+ x_norm = self.x_norm(x).to(device=device)
1443
  elif self.config.norm_x:
1444
  x_norm = x / math.sqrt(dim)
1445
  else:
 
1453
  patch_k_flat = self.patch_k(x_norm.view(-1, dim)[is_image_token.view(-1)])
1454
  if self.patch_rotary is not None:
1455
  patch_k_flat = self.patch_rotary(patch_k_flat, image_pos_ids_flat)
1456
+ patch_k_flat = patch_k_flat.to(device=device)
1457
+ patch_k = torch.zeros([batch_size, image_features_mask.shape[1], patch_k_flat.shape[-1]], dtype=x.dtype, device=device)
1458
  patch_k.view(-1, patch_k_flat.shape[-1])[image_features_mask.flatten()] = patch_k_flat.to(dtype=x.dtype)
1459
 
1460
  patch_k_mask = image_features_mask.clone()
 
1462
  is_indexable_image_token.view(-1)[is_image_token.view(-1)])
1463
 
1464
  if self.config.no_more_points_class:
1465
+ patch_k = self.add_no_point_class_embed(patch_k).to(device=device)
1466
  patch_k_mask = F.pad(patch_k_mask, (0, 1), value=True)
1467
 
1468
+ subpatch_k = self.subpatch_k(vit_features).to(device=device)
1469
 
1470
  # Predict patch locations
1471
  if can_point:
1472
+ image_q = self.patch_q(x_norm).to(device=device)
1473
  if self.patch_rotary is not None and last_predicted_patch_id is not None:
1474
  rotate_by = image_pos_ids[batch_idx, last_predicted_patch_id]
1475
  rotate_by = torch.where(last_predicted_patch_id >= 0, rotate_by, 0)
 
1477
  image_q = self.patch_rotary(
1478
  image_q.view(-1, image_q.shape[-1]),
1479
  torch.clamp(rotate_by, min=0),
1480
+ ).reshape(batch_size, -1, image_q.shape[-1]).to(device=device)
1481
 
1482
  dots = torch.matmul(image_q, patch_k.transpose(1, 2)) # [batch, 1, num_images]
1483
  if self.config.norm_logits:
 
1489
  if can_point and torch.any(is_patch):
1490
  if x_norm.shape[1] != 1:
1491
  raise NotImplementedError()
1492
+ subpatch_point_q = self.subpatch_q(x_norm.squeeze(1)).to(device=device)
1493
  subpatch_k = subpatch_k[batch_idx, input_patch_ids.squeeze(1)]
1494
  subpatch_logits = torch.einsum("pd,pcd->pc", subpatch_point_q, subpatch_k)
1495
  if self.config.norm_logits:
 
1499
  subpatch_logits = subpatch_logits[:, None, :]
1500
 
1501
  if can_point and torch.any(is_subpatch):
1502
+ location_logits = self.subpatch_loc_k(x).to(device=device)
1503
 
1504
  if is_prefill:
1505
  num_image_tokens = is_image_token.sum(-1)
 
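The patch and subpatch scoring kept above is plain dot-product attention over image features. The following shape sketch only illustrates the matmul/einsum contracts; every size in it is a hypothetical placeholder.

```python
import torch

# Shape sketch of the patch / subpatch scoring shown above; batch, seq, dim
# and patch counts are assumptions, not values fixed by the model.
batch, seq, dim = 2, 1, 3584          # a single decode step
n_patches, n_subpatches = 576, 25

image_q = torch.zeros(batch, seq, dim)                  # patch_q(x_norm)
patch_k = torch.zeros(batch, n_patches, dim)            # keys built from image tokens
dots = torch.matmul(image_q, patch_k.transpose(1, 2))   # [batch, seq, n_patches]

subpatch_point_q = torch.zeros(batch, dim)              # subpatch_q(x_norm.squeeze(1))
subpatch_k = torch.zeros(batch, n_subpatches, dim)      # keys for the selected patch
subpatch_logits = torch.einsum("pd,pcd->pc", subpatch_point_q, subpatch_k)  # [batch, n_subpatches]
```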
1536
  )
1537
 
1538
 
1539
+ class ExtendedLmHead(nn.Module):
1540
+ def __init__(self, config):
1541
+ super().__init__()
1542
+ self.output_embeddings = nn.Parameter(torch.zeros([config.vocab_size, config.hidden_size]))
1543
+ self.new_output_embeddings = nn.Parameter(torch.zeros([128, config.hidden_size]))
1544
+
1545
+ def __call__(self, hidden_states, slice_indices=None):
1546
+ lm_head = torch.concatenate([self.output_embeddings, self.new_output_embeddings], dim=0)
1547
+ return F.linear(hidden_states[:, slice_indices, :], lm_head)
1548
+
1549
+
1550
  class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMixin):
1551
  _checkpoint_conversion_mapping = {}
1552
  _tied_weights_keys = [] # Weights are not tied
 
1558
  super().__init__(config)
1559
 
1560
  self.model = MolmoPointModel(config)
1561
+ self.lm_head = ExtendedLmHead(config)
1562
  self.vocab_size = config.vocab_size
1563
 
1564
  # Initialize weights and apply final processing
 
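With the two output-embedding tables pulled into ExtendedLmHead, the logits computation becomes a single call on the head module. A minimal sketch of its behaviour follows; the class body is copied from the diff above, while the config values are placeholders chosen only for illustration.

```python
import torch
import torch.nn.functional as F
from torch import nn

class ExtendedLmHead(nn.Module):  # as added in the diff above
    def __init__(self, config):
        super().__init__()
        self.output_embeddings = nn.Parameter(torch.zeros([config.vocab_size, config.hidden_size]))
        self.new_output_embeddings = nn.Parameter(torch.zeros([128, config.hidden_size]))

    def __call__(self, hidden_states, slice_indices=None):
        # base vocabulary rows plus 128 extra rows, applied as one projection
        lm_head = torch.concatenate([self.output_embeddings, self.new_output_embeddings], dim=0)
        return F.linear(hidden_states[:, slice_indices, :], lm_head)

class _Cfg:  # placeholder values, purely for illustration
    vocab_size = 1000
    hidden_size = 64

head = ExtendedLmHead(_Cfg())
hidden = torch.zeros(2, 16, _Cfg.hidden_size)
logits = head(hidden, slice_indices=slice(-1, None))
# -> shape [2, 1, vocab_size + 128]; the last 128 columns come from new_output_embeddings
```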
1687
  hidden_states = outputs.last_hidden_state
1688
  # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1689
  slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1690
+ logits = self.lm_head(hidden_states, slice_indices=slice_indices)
1691
 
1692
  loss = None
1693
  if labels is not None:
 
1708
  # process is hard to emulate in generation frameworks
1709
  # Our hack here is to assume that, if we generate a TOKEN, we always select the argmax
1710
  # patch. Then we can use PATCH_TOKEN scores as the argmax's patch scores
1711
+ device = logits.device
1712
  predicted_tokens = torch.argmax(logits[:, -1], dim=-1)
1713
  patch_token_logits = torch.clone(logits[:, :, self.config.patch_token_id])
1714
  logits[:, :, self.config.patch_token_id] = small_val
 
1717
  if outputs.patch_logits is not None:
1718
  selected_patches = torch.argmax(outputs.patch_logits, -1)
1719
  bs, seq, n_patches = outputs.patch_logits.shape
1720
+ batch_idx = torch.arange(outputs.patch_logits.shape[0], device=device)
1721
+ seq_ix = torch.arange(outputs.patch_logits.shape[1], device=device)
1722
  argmax_patch_logits[batch_idx.view(-1, 1, 1), seq_ix.view(1, -1, 1), selected_patches] = patch_token_logits
1723
 
1724
  logits[:, :, self.config.subpatch_token_id] = small_val
 
1734
  location_logits = torch.full([bs, seq, 9], small_val, dtype=logits.dtype, device=logits.device)
1735
 
1736
  logits = torch.concatenate([
1737
+ logits,
1738
+ argmax_patch_logits,
1739
+ subpatch_logits.to(device=device),
1740
+ location_logits.to(device=device)
1741
+ ], -1)
1742
 
1743
  return MolmoPointCausalLMOutputWithPast(
1744
  loss=loss,
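In the generation path above, the logits that are returned are the token logits with the patch, subpatch, and location scores appended along the last axis, in that order. A small helper sketch for taking them apart is below; apart from the trailing 9 location classes, the individual widths are assumptions the caller has to supply.

```python
import torch

def split_extended_logits(logits: torch.Tensor, n_vocab: int, n_patches: int, n_subpatches: int):
    """Undo the concatenation above: [token | patch | subpatch | location(9)].

    n_vocab is the full token-logit width (base vocabulary plus the extra
    output embeddings); all sizes are caller-supplied assumptions.
    """
    token = logits[..., :n_vocab]
    patch = logits[..., n_vocab:n_vocab + n_patches]
    subpatch = logits[..., n_vocab + n_patches:n_vocab + n_patches + n_subpatches]
    location = logits[..., -9:]
    return token, patch, subpatch, location
```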