aklein4 committed on
Commit
16b2354
·
verified ·
1 Parent(s): 7745dcf

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. 000000012000/config.json +373 -0
  2. 000000012000/model.pt +3 -0
000000012000/config.json ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "zlm.ZLMModel",
3
+ "pretrained_url": "aklein4/ZEBRA_ar-1p7b-kernel",
4
+ "pretrained_step": 7000,
5
+ "pretrained_strict": true,
6
+ "torch_dtype": "float32",
7
+ "vocab_size": 49152,
8
+ "bos_token_id": 0,
9
+ "eos_token_id": 0,
10
+ "pad_token_id": 49152,
11
+ "hidden_size": 2048,
12
+ "num_hidden_layers": 24,
13
+ "num_attention_heads": 32,
14
+ "num_key_value_heads": 32,
15
+ "intermediate_size": 8192,
16
+ "hidden_act": "silu",
17
+ "max_position_embeddings": 8192,
18
+ "rope_theta": 130000,
19
+ "initializer_range": 0.02,
20
+ "attention_dropout": false,
21
+ "attention_bias": false,
22
+ "rms_norm_eps": 1e-05,
23
+ "pad_attention_bias_value": -100.0,
24
+ "attention_kernel": "flash_attention",
25
+ "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
26
+ "input_length": 256,
27
+ "output_length": 512,
28
+ "z_length": 384,
29
+ "latent_size": 64,
30
+ "z_ar_steps": 16,
31
+ "head_intermediate_size": 8192,
32
+ "lm_loss_ema_beta": 0.75,
33
+ "pure_modules": [],
34
+ "sharding": {
35
+ "embed_tokens.weight": [
36
+ "fsdp",
37
+ null
38
+ ],
39
+ "lm_head.weight": [
40
+ "fsdp",
41
+ null
42
+ ],
43
+ "encoder_model.layers.*.self_attn.q_proj.weight": [
44
+ "fsdp",
45
+ null
46
+ ],
47
+ "encoder_model.layers.*.self_attn.k_proj.weight": [
48
+ "fsdp",
49
+ null
50
+ ],
51
+ "encoder_model.layers.*.self_attn.v_proj.weight": [
52
+ "fsdp",
53
+ null
54
+ ],
55
+ "encoder_model.layers.*.self_attn.o_proj.weight": [
56
+ "fsdp",
57
+ null
58
+ ],
59
+ "encoder_model.layers.*.mlp.gate_proj.weight": [
60
+ "fsdp",
61
+ null
62
+ ],
63
+ "encoder_model.layers.*.mlp.up_proj.weight": [
64
+ "fsdp",
65
+ null
66
+ ],
67
+ "encoder_model.layers.*.mlp.down_proj.weight": [
68
+ null,
69
+ "fsdp"
70
+ ],
71
+ "encoder_model.layers.*.input_layernorm.weight": [
72
+ "fsdp"
73
+ ],
74
+ "encoder_model.layers.*.post_attention_layernorm.weight": [
75
+ "fsdp"
76
+ ],
77
+ "encoder_model.norm.weight": [
78
+ "fsdp"
79
+ ],
80
+ "decoder_model.layers.*.self_attn.q_proj.weight": [
81
+ "fsdp",
82
+ null
83
+ ],
84
+ "decoder_model.layers.*.self_attn.k_proj.weight": [
85
+ "fsdp",
86
+ null
87
+ ],
88
+ "decoder_model.layers.*.self_attn.v_proj.weight": [
89
+ "fsdp",
90
+ null
91
+ ],
92
+ "decoder_model.layers.*.self_attn.o_proj.weight": [
93
+ "fsdp",
94
+ null
95
+ ],
96
+ "decoder_model.layers.*.mlp.gate_proj.weight": [
97
+ "fsdp",
98
+ null
99
+ ],
100
+ "decoder_model.layers.*.mlp.up_proj.weight": [
101
+ "fsdp",
102
+ null
103
+ ],
104
+ "decoder_model.layers.*.mlp.down_proj.weight": [
105
+ null,
106
+ "fsdp"
107
+ ],
108
+ "decoder_model.layers.*.input_layernorm.weight": [
109
+ "fsdp"
110
+ ],
111
+ "decoder_model.layers.*.post_attention_layernorm.weight": [
112
+ "fsdp"
113
+ ],
114
+ "decoder_model.norm.weight": [
115
+ "fsdp"
116
+ ],
117
+ "encoder_head.states_gate_proj.weight": [
118
+ "fsdp",
119
+ null
120
+ ],
121
+ "encoder_head.states_up_proj.weight": [
122
+ "fsdp",
123
+ null
124
+ ],
125
+ "encoder_head.z_gate_proj.weight": [
126
+ "fsdp",
127
+ null
128
+ ],
129
+ "encoder_head.z_gate_proj.mask": [
130
+ "fsdp",
131
+ null
132
+ ],
133
+ "encoder_head.z_up_proj.weight": [
134
+ "fsdp",
135
+ null
136
+ ],
137
+ "encoder_head.z_up_proj.mask": [
138
+ "fsdp",
139
+ null
140
+ ],
141
+ "encoder_head.down_proj.weight": [
142
+ null,
143
+ "fsdp"
144
+ ],
145
+ "encoder_head.down_proj.mask": [
146
+ null,
147
+ "fsdp"
148
+ ],
149
+ "encoder_head.cross_proj.weight": [
150
+ null,
151
+ "fsdp"
152
+ ],
153
+ "decoder_head.states_gate_proj.weight": [
154
+ "fsdp",
155
+ null
156
+ ],
157
+ "decoder_head.states_up_proj.weight": [
158
+ "fsdp",
159
+ null
160
+ ],
161
+ "decoder_head.z_gate_proj.weight": [
162
+ "fsdp",
163
+ null
164
+ ],
165
+ "decoder_head.z_gate_proj.mask": [
166
+ "fsdp",
167
+ null
168
+ ],
169
+ "decoder_head.z_up_proj.weight": [
170
+ "fsdp",
171
+ null
172
+ ],
173
+ "decoder_head.z_up_proj.mask": [
174
+ "fsdp",
175
+ null
176
+ ],
177
+ "decoder_head.down_proj.weight": [
178
+ null,
179
+ "fsdp"
180
+ ],
181
+ "decoder_head.down_proj.mask": [
182
+ null,
183
+ "fsdp"
184
+ ],
185
+ "decoder_head.cross_proj.weight": [
186
+ null,
187
+ "fsdp"
188
+ ],
189
+ "uncond_decoder_head.states_gate_proj.weight": [
190
+ "fsdp",
191
+ null
192
+ ],
193
+ "uncond_decoder_head.states_up_proj.weight": [
194
+ "fsdp",
195
+ null
196
+ ],
197
+ "uncond_decoder_head.z_gate_proj.weight": [
198
+ "fsdp",
199
+ null
200
+ ],
201
+ "uncond_decoder_head.z_gate_proj.mask": [
202
+ "fsdp",
203
+ null
204
+ ],
205
+ "uncond_decoder_head.z_up_proj.weight": [
206
+ "fsdp",
207
+ null
208
+ ],
209
+ "uncond_decoder_head.z_up_proj.mask": [
210
+ "fsdp",
211
+ null
212
+ ],
213
+ "uncond_decoder_head.down_proj.weight": [
214
+ null,
215
+ "fsdp"
216
+ ],
217
+ "uncond_decoder_head.down_proj.mask": [
218
+ null,
219
+ "fsdp"
220
+ ],
221
+ "uncond_decoder_head.cross_proj.weight": [
222
+ null,
223
+ "fsdp"
224
+ ],
225
+ "uncond_tokens": [
226
+ null,
227
+ "fsdp"
228
+ ],
229
+ "encoder_sep_token": [
230
+ null,
231
+ "fsdp"
232
+ ],
233
+ "encoder_z_tokens": [
234
+ null,
235
+ "fsdp"
236
+ ],
237
+ "decoder_z_tokens": [
238
+ null,
239
+ "fsdp"
240
+ ],
241
+ "decoder_start_output_token": [
242
+ null,
243
+ "fsdp"
244
+ ],
245
+ "encoder_input_embeddings": [
246
+ "fsdp"
247
+ ],
248
+ "encoder_output_embeddings": [
249
+ "fsdp"
250
+ ],
251
+ "decoder_input_embeddings": [
252
+ "fsdp"
253
+ ],
254
+ "decoder_output_embeddings": [
255
+ "fsdp"
256
+ ],
257
+ "encoder_noise_proj_in.weight": [
258
+ "fsdp",
259
+ null
260
+ ],
261
+ "decoder_z_proj_in.weight": [
262
+ "fsdp",
263
+ null
264
+ ],
265
+ "lm_loss_ema.num_updates": [
266
+ null
267
+ ],
268
+ "lm_loss_ema.weight": [
269
+ null
270
+ ],
271
+ "embed_tokens": [
272
+ [
273
+ "data",
274
+ "fsdp"
275
+ ],
276
+ null,
277
+ null
278
+ ],
279
+ "encoder_model.layers.*": [
280
+ [
281
+ "data",
282
+ "fsdp"
283
+ ],
284
+ null,
285
+ null
286
+ ],
287
+ "decoder_model.layers.*": [
288
+ [
289
+ "data",
290
+ "fsdp"
291
+ ],
292
+ null,
293
+ null
294
+ ],
295
+ "encoder_head": [
296
+ [
297
+ "data",
298
+ "fsdp"
299
+ ],
300
+ null,
301
+ null
302
+ ],
303
+ "decoder_head": [
304
+ [
305
+ "data",
306
+ "fsdp"
307
+ ],
308
+ null,
309
+ null
310
+ ],
311
+ "uncond_decoder_head": [
312
+ [
313
+ "data",
314
+ "fsdp"
315
+ ],
316
+ null,
317
+ null
318
+ ],
319
+ "lm_head": [
320
+ [
321
+ "data",
322
+ "fsdp"
323
+ ],
324
+ null,
325
+ null
326
+ ]
327
+ },
328
+ "remat": {
329
+ "advanced": [
330
+ {
331
+ "name": "self",
332
+ "settings": {
333
+ "activation_checkpoint_layers": [
334
+ "ARHead"
335
+ ],
336
+ "optimization_barrier_layers": [
337
+ "ARHead"
338
+ ]
339
+ }
340
+ },
341
+ {
342
+ "name": "encoder_model",
343
+ "settings": {
344
+ "activation_checkpoint_layers": [
345
+ "EncoderModelLayer"
346
+ ],
347
+ "optimization_barrier_layers": [
348
+ "EncoderModelLayer"
349
+ ],
350
+ "scan_layers": "layers",
351
+ "offload_tensors": [
352
+ "encoder_model_input"
353
+ ]
354
+ }
355
+ },
356
+ {
357
+ "name": "decoder_model",
358
+ "settings": {
359
+ "activation_checkpoint_layers": [
360
+ "DecoderModelLayer"
361
+ ],
362
+ "optimization_barrier_layers": [
363
+ "DecoderModelLayer"
364
+ ],
365
+ "scan_layers": "layers",
366
+ "offload_tensors": [
367
+ "decoder_model_input"
368
+ ]
369
+ }
370
+ }
371
+ ]
372
+ }
373
+ }
000000012000/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2cb166983b7ad7df084d83709da0ba88c274d3098f758b744dcc49a7b2c63a3
3
+ size 14143688111