seconds-0 committed on
Commit
3558023
·
verified ·
1 Parent(s): 883d17e

NSA 117M initial export

Browse files
logs/logs_extra_keys.txt CHANGED
@@ -1 +1,48 @@
1
- norm_f.weight
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ blocks.0.attn.gate.fc1.bias
2
+ blocks.0.attn.gate.fc1.weight
3
+ blocks.0.attn.gate.fc2.bias
4
+ blocks.0.attn.gate.fc2.weight
5
+ blocks.1.attn.gate.fc1.bias
6
+ blocks.1.attn.gate.fc1.weight
7
+ blocks.1.attn.gate.fc2.bias
8
+ blocks.1.attn.gate.fc2.weight
9
+ blocks.10.attn.gate.fc1.bias
10
+ blocks.10.attn.gate.fc1.weight
11
+ blocks.10.attn.gate.fc2.bias
12
+ blocks.10.attn.gate.fc2.weight
13
+ blocks.11.attn.gate.fc1.bias
14
+ blocks.11.attn.gate.fc1.weight
15
+ blocks.11.attn.gate.fc2.bias
16
+ blocks.11.attn.gate.fc2.weight
17
+ blocks.2.attn.gate.fc1.bias
18
+ blocks.2.attn.gate.fc1.weight
19
+ blocks.2.attn.gate.fc2.bias
20
+ blocks.2.attn.gate.fc2.weight
21
+ blocks.3.attn.gate.fc1.bias
22
+ blocks.3.attn.gate.fc1.weight
23
+ blocks.3.attn.gate.fc2.bias
24
+ blocks.3.attn.gate.fc2.weight
25
+ blocks.4.attn.gate.fc1.bias
26
+ blocks.4.attn.gate.fc1.weight
27
+ blocks.4.attn.gate.fc2.bias
28
+ blocks.4.attn.gate.fc2.weight
29
+ blocks.5.attn.gate.fc1.bias
30
+ blocks.5.attn.gate.fc1.weight
31
+ blocks.5.attn.gate.fc2.bias
32
+ blocks.5.attn.gate.fc2.weight
33
+ blocks.6.attn.gate.fc1.bias
34
+ blocks.6.attn.gate.fc1.weight
35
+ blocks.6.attn.gate.fc2.bias
36
+ blocks.6.attn.gate.fc2.weight
37
+ blocks.7.attn.gate.fc1.bias
38
+ blocks.7.attn.gate.fc1.weight
39
+ blocks.7.attn.gate.fc2.bias
40
+ blocks.7.attn.gate.fc2.weight
41
+ blocks.8.attn.gate.fc1.bias
42
+ blocks.8.attn.gate.fc1.weight
43
+ blocks.8.attn.gate.fc2.bias
44
+ blocks.8.attn.gate.fc2.weight
45
+ blocks.9.attn.gate.fc1.bias
46
+ blocks.9.attn.gate.fc1.weight
47
+ blocks.9.attn.gate.fc2.bias
48
+ blocks.9.attn.gate.fc2.weight
logs/logs_mapping.json CHANGED
@@ -7,10 +7,6 @@
7
  "model.blocks.0.attn.W_V_cmp.weight",
8
  "model.blocks.0.attn.W_V_sel.weight",
9
  "model.blocks.0.attn.W_V_win.weight",
10
- "model.blocks.0.attn.gate.fc1.bias",
11
- "model.blocks.0.attn.gate.fc1.weight",
12
- "model.blocks.0.attn.gate.fc2.bias",
13
- "model.blocks.0.attn.gate.fc2.weight",
14
  "model.blocks.0.attn.out.weight",
15
  "model.blocks.0.mlp.fc1.weight",
16
  "model.blocks.0.mlp.fc2.weight",
@@ -23,10 +19,6 @@
23
  "model.blocks.1.attn.W_V_cmp.weight",
24
  "model.blocks.1.attn.W_V_sel.weight",
25
  "model.blocks.1.attn.W_V_win.weight",
26
- "model.blocks.1.attn.gate.fc1.bias",
27
- "model.blocks.1.attn.gate.fc1.weight",
28
- "model.blocks.1.attn.gate.fc2.bias",
29
- "model.blocks.1.attn.gate.fc2.weight",
30
  "model.blocks.1.attn.out.weight",
31
  "model.blocks.1.mlp.fc1.weight",
32
  "model.blocks.1.mlp.fc2.weight",
@@ -39,10 +31,6 @@
39
  "model.blocks.10.attn.W_V_cmp.weight",
40
  "model.blocks.10.attn.W_V_sel.weight",
41
  "model.blocks.10.attn.W_V_win.weight",
42
- "model.blocks.10.attn.gate.fc1.bias",
43
- "model.blocks.10.attn.gate.fc1.weight",
44
- "model.blocks.10.attn.gate.fc2.bias",
45
- "model.blocks.10.attn.gate.fc2.weight",
46
  "model.blocks.10.attn.out.weight",
47
  "model.blocks.10.mlp.fc1.weight",
48
  "model.blocks.10.mlp.fc2.weight",
@@ -55,10 +43,6 @@
55
  "model.blocks.11.attn.W_V_cmp.weight",
56
  "model.blocks.11.attn.W_V_sel.weight",
57
  "model.blocks.11.attn.W_V_win.weight",
58
- "model.blocks.11.attn.gate.fc1.bias",
59
- "model.blocks.11.attn.gate.fc1.weight",
60
- "model.blocks.11.attn.gate.fc2.bias",
61
- "model.blocks.11.attn.gate.fc2.weight",
62
  "model.blocks.11.attn.out.weight",
63
  "model.blocks.11.mlp.fc1.weight",
64
  "model.blocks.11.mlp.fc2.weight",
@@ -71,10 +55,6 @@
71
  "model.blocks.2.attn.W_V_cmp.weight",
72
  "model.blocks.2.attn.W_V_sel.weight",
73
  "model.blocks.2.attn.W_V_win.weight",
74
- "model.blocks.2.attn.gate.fc1.bias",
75
- "model.blocks.2.attn.gate.fc1.weight",
76
- "model.blocks.2.attn.gate.fc2.bias",
77
- "model.blocks.2.attn.gate.fc2.weight",
78
  "model.blocks.2.attn.out.weight",
79
  "model.blocks.2.mlp.fc1.weight",
80
  "model.blocks.2.mlp.fc2.weight",
@@ -87,10 +67,6 @@
87
  "model.blocks.3.attn.W_V_cmp.weight",
88
  "model.blocks.3.attn.W_V_sel.weight",
89
  "model.blocks.3.attn.W_V_win.weight",
90
- "model.blocks.3.attn.gate.fc1.bias",
91
- "model.blocks.3.attn.gate.fc1.weight",
92
- "model.blocks.3.attn.gate.fc2.bias",
93
- "model.blocks.3.attn.gate.fc2.weight",
94
  "model.blocks.3.attn.out.weight",
95
  "model.blocks.3.mlp.fc1.weight",
96
  "model.blocks.3.mlp.fc2.weight",
@@ -103,10 +79,6 @@
103
  "model.blocks.4.attn.W_V_cmp.weight",
104
  "model.blocks.4.attn.W_V_sel.weight",
105
  "model.blocks.4.attn.W_V_win.weight",
106
- "model.blocks.4.attn.gate.fc1.bias",
107
- "model.blocks.4.attn.gate.fc1.weight",
108
- "model.blocks.4.attn.gate.fc2.bias",
109
- "model.blocks.4.attn.gate.fc2.weight",
110
  "model.blocks.4.attn.out.weight",
111
  "model.blocks.4.mlp.fc1.weight",
112
  "model.blocks.4.mlp.fc2.weight",
@@ -119,10 +91,6 @@
119
  "model.blocks.5.attn.W_V_cmp.weight",
120
  "model.blocks.5.attn.W_V_sel.weight",
121
  "model.blocks.5.attn.W_V_win.weight",
122
- "model.blocks.5.attn.gate.fc1.bias",
123
- "model.blocks.5.attn.gate.fc1.weight",
124
- "model.blocks.5.attn.gate.fc2.bias",
125
- "model.blocks.5.attn.gate.fc2.weight",
126
  "model.blocks.5.attn.out.weight",
127
  "model.blocks.5.mlp.fc1.weight",
128
  "model.blocks.5.mlp.fc2.weight",
@@ -135,10 +103,6 @@
135
  "model.blocks.6.attn.W_V_cmp.weight",
136
  "model.blocks.6.attn.W_V_sel.weight",
137
  "model.blocks.6.attn.W_V_win.weight",
138
- "model.blocks.6.attn.gate.fc1.bias",
139
- "model.blocks.6.attn.gate.fc1.weight",
140
- "model.blocks.6.attn.gate.fc2.bias",
141
- "model.blocks.6.attn.gate.fc2.weight",
142
  "model.blocks.6.attn.out.weight",
143
  "model.blocks.6.mlp.fc1.weight",
144
  "model.blocks.6.mlp.fc2.weight",
@@ -151,10 +115,6 @@
151
  "model.blocks.7.attn.W_V_cmp.weight",
152
  "model.blocks.7.attn.W_V_sel.weight",
153
  "model.blocks.7.attn.W_V_win.weight",
154
- "model.blocks.7.attn.gate.fc1.bias",
155
- "model.blocks.7.attn.gate.fc1.weight",
156
- "model.blocks.7.attn.gate.fc2.bias",
157
- "model.blocks.7.attn.gate.fc2.weight",
158
  "model.blocks.7.attn.out.weight",
159
  "model.blocks.7.mlp.fc1.weight",
160
  "model.blocks.7.mlp.fc2.weight",
@@ -167,10 +127,6 @@
167
  "model.blocks.8.attn.W_V_cmp.weight",
168
  "model.blocks.8.attn.W_V_sel.weight",
169
  "model.blocks.8.attn.W_V_win.weight",
170
- "model.blocks.8.attn.gate.fc1.bias",
171
- "model.blocks.8.attn.gate.fc1.weight",
172
- "model.blocks.8.attn.gate.fc2.bias",
173
- "model.blocks.8.attn.gate.fc2.weight",
174
  "model.blocks.8.attn.out.weight",
175
  "model.blocks.8.mlp.fc1.weight",
176
  "model.blocks.8.mlp.fc2.weight",
@@ -183,23 +139,114 @@
183
  "model.blocks.9.attn.W_V_cmp.weight",
184
  "model.blocks.9.attn.W_V_sel.weight",
185
  "model.blocks.9.attn.W_V_win.weight",
186
- "model.blocks.9.attn.gate.fc1.bias",
187
- "model.blocks.9.attn.gate.fc1.weight",
188
- "model.blocks.9.attn.gate.fc2.bias",
189
- "model.blocks.9.attn.gate.fc2.weight",
190
  "model.blocks.9.attn.out.weight",
191
  "model.blocks.9.mlp.fc1.weight",
192
  "model.blocks.9.mlp.fc2.weight",
193
  "model.blocks.9.norm1.weight",
194
  "model.blocks.9.norm2.weight",
195
  "model.embed.weight",
196
- "model.lm_head.weight"
 
197
  ],
198
  "missing": [
199
- "model.norm.bias",
200
- "model.norm.weight"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  ],
202
  "extra": [
203
- "norm_f.weight"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  ]
205
  }
 
7
  "model.blocks.0.attn.W_V_cmp.weight",
8
  "model.blocks.0.attn.W_V_sel.weight",
9
  "model.blocks.0.attn.W_V_win.weight",
 
 
 
 
10
  "model.blocks.0.attn.out.weight",
11
  "model.blocks.0.mlp.fc1.weight",
12
  "model.blocks.0.mlp.fc2.weight",
 
19
  "model.blocks.1.attn.W_V_cmp.weight",
20
  "model.blocks.1.attn.W_V_sel.weight",
21
  "model.blocks.1.attn.W_V_win.weight",
 
 
 
 
22
  "model.blocks.1.attn.out.weight",
23
  "model.blocks.1.mlp.fc1.weight",
24
  "model.blocks.1.mlp.fc2.weight",
 
31
  "model.blocks.10.attn.W_V_cmp.weight",
32
  "model.blocks.10.attn.W_V_sel.weight",
33
  "model.blocks.10.attn.W_V_win.weight",
 
 
 
 
34
  "model.blocks.10.attn.out.weight",
35
  "model.blocks.10.mlp.fc1.weight",
36
  "model.blocks.10.mlp.fc2.weight",
 
43
  "model.blocks.11.attn.W_V_cmp.weight",
44
  "model.blocks.11.attn.W_V_sel.weight",
45
  "model.blocks.11.attn.W_V_win.weight",
 
 
 
 
46
  "model.blocks.11.attn.out.weight",
47
  "model.blocks.11.mlp.fc1.weight",
48
  "model.blocks.11.mlp.fc2.weight",
 
55
  "model.blocks.2.attn.W_V_cmp.weight",
56
  "model.blocks.2.attn.W_V_sel.weight",
57
  "model.blocks.2.attn.W_V_win.weight",
 
 
 
 
58
  "model.blocks.2.attn.out.weight",
59
  "model.blocks.2.mlp.fc1.weight",
60
  "model.blocks.2.mlp.fc2.weight",
 
67
  "model.blocks.3.attn.W_V_cmp.weight",
68
  "model.blocks.3.attn.W_V_sel.weight",
69
  "model.blocks.3.attn.W_V_win.weight",
 
 
 
 
70
  "model.blocks.3.attn.out.weight",
71
  "model.blocks.3.mlp.fc1.weight",
72
  "model.blocks.3.mlp.fc2.weight",
 
79
  "model.blocks.4.attn.W_V_cmp.weight",
80
  "model.blocks.4.attn.W_V_sel.weight",
81
  "model.blocks.4.attn.W_V_win.weight",
 
 
 
 
82
  "model.blocks.4.attn.out.weight",
83
  "model.blocks.4.mlp.fc1.weight",
84
  "model.blocks.4.mlp.fc2.weight",
 
91
  "model.blocks.5.attn.W_V_cmp.weight",
92
  "model.blocks.5.attn.W_V_sel.weight",
93
  "model.blocks.5.attn.W_V_win.weight",
 
 
 
 
94
  "model.blocks.5.attn.out.weight",
95
  "model.blocks.5.mlp.fc1.weight",
96
  "model.blocks.5.mlp.fc2.weight",
 
103
  "model.blocks.6.attn.W_V_cmp.weight",
104
  "model.blocks.6.attn.W_V_sel.weight",
105
  "model.blocks.6.attn.W_V_win.weight",
 
 
 
 
106
  "model.blocks.6.attn.out.weight",
107
  "model.blocks.6.mlp.fc1.weight",
108
  "model.blocks.6.mlp.fc2.weight",
 
115
  "model.blocks.7.attn.W_V_cmp.weight",
116
  "model.blocks.7.attn.W_V_sel.weight",
117
  "model.blocks.7.attn.W_V_win.weight",
 
 
 
 
118
  "model.blocks.7.attn.out.weight",
119
  "model.blocks.7.mlp.fc1.weight",
120
  "model.blocks.7.mlp.fc2.weight",
 
127
  "model.blocks.8.attn.W_V_cmp.weight",
128
  "model.blocks.8.attn.W_V_sel.weight",
129
  "model.blocks.8.attn.W_V_win.weight",
 
 
 
 
130
  "model.blocks.8.attn.out.weight",
131
  "model.blocks.8.mlp.fc1.weight",
132
  "model.blocks.8.mlp.fc2.weight",
 
139
  "model.blocks.9.attn.W_V_cmp.weight",
140
  "model.blocks.9.attn.W_V_sel.weight",
141
  "model.blocks.9.attn.W_V_win.weight",
 
 
 
 
142
  "model.blocks.9.attn.out.weight",
143
  "model.blocks.9.mlp.fc1.weight",
144
  "model.blocks.9.mlp.fc2.weight",
145
  "model.blocks.9.norm1.weight",
146
  "model.blocks.9.norm2.weight",
147
  "model.embed.weight",
148
+ "model.lm_head.weight",
149
+ "model.norm.weight"
150
  ],
151
  "missing": [
152
+ "model.blocks.0.attn.gate_fc1.bias",
153
+ "model.blocks.0.attn.gate_fc1.weight",
154
+ "model.blocks.0.attn.gate_fc2.bias",
155
+ "model.blocks.0.attn.gate_fc2.weight",
156
+ "model.blocks.1.attn.gate_fc1.bias",
157
+ "model.blocks.1.attn.gate_fc1.weight",
158
+ "model.blocks.1.attn.gate_fc2.bias",
159
+ "model.blocks.1.attn.gate_fc2.weight",
160
+ "model.blocks.10.attn.gate_fc1.bias",
161
+ "model.blocks.10.attn.gate_fc1.weight",
162
+ "model.blocks.10.attn.gate_fc2.bias",
163
+ "model.blocks.10.attn.gate_fc2.weight",
164
+ "model.blocks.11.attn.gate_fc1.bias",
165
+ "model.blocks.11.attn.gate_fc1.weight",
166
+ "model.blocks.11.attn.gate_fc2.bias",
167
+ "model.blocks.11.attn.gate_fc2.weight",
168
+ "model.blocks.2.attn.gate_fc1.bias",
169
+ "model.blocks.2.attn.gate_fc1.weight",
170
+ "model.blocks.2.attn.gate_fc2.bias",
171
+ "model.blocks.2.attn.gate_fc2.weight",
172
+ "model.blocks.3.attn.gate_fc1.bias",
173
+ "model.blocks.3.attn.gate_fc1.weight",
174
+ "model.blocks.3.attn.gate_fc2.bias",
175
+ "model.blocks.3.attn.gate_fc2.weight",
176
+ "model.blocks.4.attn.gate_fc1.bias",
177
+ "model.blocks.4.attn.gate_fc1.weight",
178
+ "model.blocks.4.attn.gate_fc2.bias",
179
+ "model.blocks.4.attn.gate_fc2.weight",
180
+ "model.blocks.5.attn.gate_fc1.bias",
181
+ "model.blocks.5.attn.gate_fc1.weight",
182
+ "model.blocks.5.attn.gate_fc2.bias",
183
+ "model.blocks.5.attn.gate_fc2.weight",
184
+ "model.blocks.6.attn.gate_fc1.bias",
185
+ "model.blocks.6.attn.gate_fc1.weight",
186
+ "model.blocks.6.attn.gate_fc2.bias",
187
+ "model.blocks.6.attn.gate_fc2.weight",
188
+ "model.blocks.7.attn.gate_fc1.bias",
189
+ "model.blocks.7.attn.gate_fc1.weight",
190
+ "model.blocks.7.attn.gate_fc2.bias",
191
+ "model.blocks.7.attn.gate_fc2.weight",
192
+ "model.blocks.8.attn.gate_fc1.bias",
193
+ "model.blocks.8.attn.gate_fc1.weight",
194
+ "model.blocks.8.attn.gate_fc2.bias",
195
+ "model.blocks.8.attn.gate_fc2.weight",
196
+ "model.blocks.9.attn.gate_fc1.bias",
197
+ "model.blocks.9.attn.gate_fc1.weight",
198
+ "model.blocks.9.attn.gate_fc2.bias",
199
+ "model.blocks.9.attn.gate_fc2.weight",
200
+ "model.norm.bias"
201
  ],
202
  "extra": [
203
+ "blocks.0.attn.gate.fc1.bias",
204
+ "blocks.0.attn.gate.fc1.weight",
205
+ "blocks.0.attn.gate.fc2.bias",
206
+ "blocks.0.attn.gate.fc2.weight",
207
+ "blocks.1.attn.gate.fc1.bias",
208
+ "blocks.1.attn.gate.fc1.weight",
209
+ "blocks.1.attn.gate.fc2.bias",
210
+ "blocks.1.attn.gate.fc2.weight",
211
+ "blocks.10.attn.gate.fc1.bias",
212
+ "blocks.10.attn.gate.fc1.weight",
213
+ "blocks.10.attn.gate.fc2.bias",
214
+ "blocks.10.attn.gate.fc2.weight",
215
+ "blocks.11.attn.gate.fc1.bias",
216
+ "blocks.11.attn.gate.fc1.weight",
217
+ "blocks.11.attn.gate.fc2.bias",
218
+ "blocks.11.attn.gate.fc2.weight",
219
+ "blocks.2.attn.gate.fc1.bias",
220
+ "blocks.2.attn.gate.fc1.weight",
221
+ "blocks.2.attn.gate.fc2.bias",
222
+ "blocks.2.attn.gate.fc2.weight",
223
+ "blocks.3.attn.gate.fc1.bias",
224
+ "blocks.3.attn.gate.fc1.weight",
225
+ "blocks.3.attn.gate.fc2.bias",
226
+ "blocks.3.attn.gate.fc2.weight",
227
+ "blocks.4.attn.gate.fc1.bias",
228
+ "blocks.4.attn.gate.fc1.weight",
229
+ "blocks.4.attn.gate.fc2.bias",
230
+ "blocks.4.attn.gate.fc2.weight",
231
+ "blocks.5.attn.gate.fc1.bias",
232
+ "blocks.5.attn.gate.fc1.weight",
233
+ "blocks.5.attn.gate.fc2.bias",
234
+ "blocks.5.attn.gate.fc2.weight",
235
+ "blocks.6.attn.gate.fc1.bias",
236
+ "blocks.6.attn.gate.fc1.weight",
237
+ "blocks.6.attn.gate.fc2.bias",
238
+ "blocks.6.attn.gate.fc2.weight",
239
+ "blocks.7.attn.gate.fc1.bias",
240
+ "blocks.7.attn.gate.fc1.weight",
241
+ "blocks.7.attn.gate.fc2.bias",
242
+ "blocks.7.attn.gate.fc2.weight",
243
+ "blocks.8.attn.gate.fc1.bias",
244
+ "blocks.8.attn.gate.fc1.weight",
245
+ "blocks.8.attn.gate.fc2.bias",
246
+ "blocks.8.attn.gate.fc2.weight",
247
+ "blocks.9.attn.gate.fc1.bias",
248
+ "blocks.9.attn.gate.fc1.weight",
249
+ "blocks.9.attn.gate.fc2.bias",
250
+ "blocks.9.attn.gate.fc2.weight"
251
  ]
252
  }
logs/logs_missing_keys.txt CHANGED
@@ -1,2 +1,49 @@
1
- model.norm.bias
2
- model.norm.weight
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model.blocks.0.attn.gate_fc1.bias
2
+ model.blocks.0.attn.gate_fc1.weight
3
+ model.blocks.0.attn.gate_fc2.bias
4
+ model.blocks.0.attn.gate_fc2.weight
5
+ model.blocks.1.attn.gate_fc1.bias
6
+ model.blocks.1.attn.gate_fc1.weight
7
+ model.blocks.1.attn.gate_fc2.bias
8
+ model.blocks.1.attn.gate_fc2.weight
9
+ model.blocks.10.attn.gate_fc1.bias
10
+ model.blocks.10.attn.gate_fc1.weight
11
+ model.blocks.10.attn.gate_fc2.bias
12
+ model.blocks.10.attn.gate_fc2.weight
13
+ model.blocks.11.attn.gate_fc1.bias
14
+ model.blocks.11.attn.gate_fc1.weight
15
+ model.blocks.11.attn.gate_fc2.bias
16
+ model.blocks.11.attn.gate_fc2.weight
17
+ model.blocks.2.attn.gate_fc1.bias
18
+ model.blocks.2.attn.gate_fc1.weight
19
+ model.blocks.2.attn.gate_fc2.bias
20
+ model.blocks.2.attn.gate_fc2.weight
21
+ model.blocks.3.attn.gate_fc1.bias
22
+ model.blocks.3.attn.gate_fc1.weight
23
+ model.blocks.3.attn.gate_fc2.bias
24
+ model.blocks.3.attn.gate_fc2.weight
25
+ model.blocks.4.attn.gate_fc1.bias
26
+ model.blocks.4.attn.gate_fc1.weight
27
+ model.blocks.4.attn.gate_fc2.bias
28
+ model.blocks.4.attn.gate_fc2.weight
29
+ model.blocks.5.attn.gate_fc1.bias
30
+ model.blocks.5.attn.gate_fc1.weight
31
+ model.blocks.5.attn.gate_fc2.bias
32
+ model.blocks.5.attn.gate_fc2.weight
33
+ model.blocks.6.attn.gate_fc1.bias
34
+ model.blocks.6.attn.gate_fc1.weight
35
+ model.blocks.6.attn.gate_fc2.bias
36
+ model.blocks.6.attn.gate_fc2.weight
37
+ model.blocks.7.attn.gate_fc1.bias
38
+ model.blocks.7.attn.gate_fc1.weight
39
+ model.blocks.7.attn.gate_fc2.bias
40
+ model.blocks.7.attn.gate_fc2.weight
41
+ model.blocks.8.attn.gate_fc1.bias
42
+ model.blocks.8.attn.gate_fc1.weight
43
+ model.blocks.8.attn.gate_fc2.bias
44
+ model.blocks.8.attn.gate_fc2.weight
45
+ model.blocks.9.attn.gate_fc1.bias
46
+ model.blocks.9.attn.gate_fc1.weight
47
+ model.blocks.9.attn.gate_fc2.bias
48
+ model.blocks.9.attn.gate_fc2.weight
49
+ model.norm.bias
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21d3ac54cadd49cc11ea0e88d37874aa3a9391e7b47a2704b6557a0e9229640c
3
  size 313204736
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973adf1381893bfed4aec91d2329a3f1548afc8e235f4858115d4cf75bc996c5
3
  size 313204736
modeling_nsa.py CHANGED
@@ -9,7 +9,7 @@ from transformers.generation.utils import GenerationMixin
9
  from transformers.modeling_outputs import CausalLMOutput
10
 
11
  from .configuration_nsa import NSAConfig
12
- _HAS_NSA = False # avoid nested vendor imports in HF dynamic loader
13
 
14
 
15
  class RMSNorm(nn.Module):
 
9
  from transformers.modeling_outputs import CausalLMOutput
10
 
11
  from .configuration_nsa import NSAConfig
12
+ _HAS_NSA = False # Do not attempt nested vendor import in HF dynamic loader
13
 
14
 
15
  class RMSNorm(nn.Module):