aklein4 committed on
Commit
d5b3bba
·
verified ·
1 Parent(s): b906304

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. 000000001000/config.json +369 -0
  2. 000000001000/model.pt +3 -0
000000001000/config.json ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pure_modules": [],
3
+ "remat": {
4
+ "activation_checkpoint_layers": [],
5
+ "scan_layers": null,
6
+ "offload_tensors": [],
7
+ "advanced": [
8
+ {
9
+ "name": "self",
10
+ "settings": {
11
+ "activation_checkpoint_layers": [
12
+ "DiffusionHead"
13
+ ],
14
+ "optimization_barrier_layers": [
15
+ "DiffusionHead"
16
+ ]
17
+ }
18
+ },
19
+ {
20
+ "name": "encoder_model",
21
+ "settings": {
22
+ "activation_checkpoint_layers": [
23
+ "EncoderModelLayer"
24
+ ],
25
+ "optimization_barrier_layers": [
26
+ "EncoderModelLayer"
27
+ ],
28
+ "scan_layers": "layers",
29
+ "offload_tensors": [
30
+ "encoder_model_input"
31
+ ]
32
+ }
33
+ },
34
+ {
35
+ "name": "decoder_model",
36
+ "settings": {
37
+ "activation_checkpoint_layers": [
38
+ "DecoderModelLayer"
39
+ ],
40
+ "optimization_barrier_layers": [
41
+ "DecoderModelLayer"
42
+ ],
43
+ "scan_layers": "layers",
44
+ "offload_tensors": [
45
+ "decoder_model_input"
46
+ ]
47
+ }
48
+ }
49
+ ]
50
+ },
51
+ "type": "zlm.ZLMModel",
52
+ "pretrained_url": null,
53
+ "pretrained_step": null,
54
+ "pretrained_strict": null,
55
+ "torch_dtype": "float32",
56
+ "vocab_size": 49152,
57
+ "bos_token_id": 0,
58
+ "eos_token_id": 0,
59
+ "pad_token_id": 49152,
60
+ "hidden_size": 2048,
61
+ "num_hidden_layers": 24,
62
+ "num_attention_heads": 32,
63
+ "num_key_value_heads": 32,
64
+ "intermediate_size": 8192,
65
+ "hidden_act": "silu",
66
+ "max_position_embeddings": 8192,
67
+ "rope_theta": 130000,
68
+ "attention_dropout": false,
69
+ "attention_bias": false,
70
+ "initializer_range": 0.02,
71
+ "rms_norm_eps": 1e-05,
72
+ "pad_attention_bias_value": -100.0,
73
+ "attention_kernel": "nan_safe_flash_attention",
74
+ "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
75
+ "input_length": 256,
76
+ "output_length": 512,
77
+ "z_length": 384,
78
+ "latent_size": 64,
79
+ "minimum_diffusion_timestep": 0.5,
80
+ "num_diffusion_timesteps": 16,
81
+ "diffusion_in_proj": false,
82
+ "num_diffusion_head_layers": 2,
83
+ "diffusion_mlp_size": 3072,
84
+ "diffusion_output_init_scale": 0.1,
85
+ "sharding": {
86
+ "embed_tokens.weight": [
87
+ "fsdp",
88
+ null
89
+ ],
90
+ "lm_head.weight": [
91
+ "fsdp",
92
+ null
93
+ ],
94
+ "encoder_model.layers.*.self_attn.q_proj.weight": [
95
+ "fsdp",
96
+ null
97
+ ],
98
+ "encoder_model.layers.*.self_attn.k_proj.weight": [
99
+ "fsdp",
100
+ null
101
+ ],
102
+ "encoder_model.layers.*.self_attn.v_proj.weight": [
103
+ "fsdp",
104
+ null
105
+ ],
106
+ "encoder_model.layers.*.self_attn.o_proj.weight": [
107
+ "fsdp",
108
+ null
109
+ ],
110
+ "encoder_model.layers.*.mlp.gate_proj.weight": [
111
+ "fsdp",
112
+ null
113
+ ],
114
+ "encoder_model.layers.*.mlp.up_proj.weight": [
115
+ "fsdp",
116
+ null
117
+ ],
118
+ "encoder_model.layers.*.mlp.down_proj.weight": [
119
+ null,
120
+ "fsdp"
121
+ ],
122
+ "encoder_model.layers.*.input_layernorm.weight": [
123
+ "fsdp"
124
+ ],
125
+ "encoder_model.layers.*.post_attention_layernorm.weight": [
126
+ "fsdp"
127
+ ],
128
+ "encoder_model.norm.weight": [
129
+ "fsdp"
130
+ ],
131
+ "decoder_model.layers.*.self_attn.q_proj.weight": [
132
+ "fsdp",
133
+ null
134
+ ],
135
+ "decoder_model.layers.*.self_attn.k_proj.weight": [
136
+ "fsdp",
137
+ null
138
+ ],
139
+ "decoder_model.layers.*.self_attn.v_proj.weight": [
140
+ "fsdp",
141
+ null
142
+ ],
143
+ "decoder_model.layers.*.self_attn.o_proj.weight": [
144
+ "fsdp",
145
+ null
146
+ ],
147
+ "decoder_model.layers.*.mlp.gate_proj.weight": [
148
+ "fsdp",
149
+ null
150
+ ],
151
+ "decoder_model.layers.*.mlp.up_proj.weight": [
152
+ "fsdp",
153
+ null
154
+ ],
155
+ "decoder_model.layers.*.mlp.down_proj.weight": [
156
+ null,
157
+ "fsdp"
158
+ ],
159
+ "decoder_model.layers.*.input_layernorm.weight": [
160
+ "fsdp"
161
+ ],
162
+ "decoder_model.layers.*.post_attention_layernorm.weight": [
163
+ "fsdp"
164
+ ],
165
+ "decoder_model.norm.weight": [
166
+ "fsdp"
167
+ ],
168
+ "scheduler.timesteps": [
169
+ null
170
+ ],
171
+ "scheduler.a": [
172
+ null
173
+ ],
174
+ "scheduler.b": [
175
+ null
176
+ ],
177
+ "diffusion_head.x_t_in_proj.weight": [
178
+ "fsdp",
179
+ null
180
+ ],
181
+ "diffusion_head.layers.*.norm.embed.weight": [
182
+ null,
183
+ "fsdp"
184
+ ],
185
+ "diffusion_head.layers.*.mlp.gate_proj.weight": [
186
+ "fsdp",
187
+ null
188
+ ],
189
+ "diffusion_head.layers.*.mlp.up_proj.weight": [
190
+ "fsdp",
191
+ null
192
+ ],
193
+ "diffusion_head.layers.*.mlp.down_proj.weight": [
194
+ null,
195
+ "fsdp"
196
+ ],
197
+ "diffusion_head.layers.*.out_scale.embed.weight": [
198
+ null,
199
+ "fsdp"
200
+ ],
201
+ "diffusion_head.out_norm.embed.weight": [
202
+ null,
203
+ "fsdp"
204
+ ],
205
+ "diffusion_head.out_proj.weight": [
206
+ null,
207
+ "fsdp"
208
+ ],
209
+ "uncond_diffusion_head.x_t_in_proj.weight": [
210
+ "fsdp",
211
+ null
212
+ ],
213
+ "uncond_diffusion_head.layers.*.norm.embed.weight": [
214
+ null,
215
+ "fsdp"
216
+ ],
217
+ "uncond_diffusion_head.layers.*.mlp.gate_proj.weight": [
218
+ "fsdp",
219
+ null
220
+ ],
221
+ "uncond_diffusion_head.layers.*.mlp.up_proj.weight": [
222
+ "fsdp",
223
+ null
224
+ ],
225
+ "uncond_diffusion_head.layers.*.mlp.down_proj.weight": [
226
+ null,
227
+ "fsdp"
228
+ ],
229
+ "uncond_diffusion_head.layers.*.out_scale.embed.weight": [
230
+ null,
231
+ "fsdp"
232
+ ],
233
+ "uncond_diffusion_head.out_norm.embed.weight": [
234
+ null,
235
+ "fsdp"
236
+ ],
237
+ "uncond_diffusion_head.out_proj.weight": [
238
+ null,
239
+ "fsdp"
240
+ ],
241
+ "mu_initial_batch_norm.initialized": [
242
+ null
243
+ ],
244
+ "mu_initial_batch_norm.shift": [
245
+ null,
246
+ null
247
+ ],
248
+ "mu_initial_batch_norm.scale": [
249
+ null,
250
+ null
251
+ ],
252
+ "encoder_sep_token": [
253
+ null,
254
+ "fsdp"
255
+ ],
256
+ "encoder_z_tokens": [
257
+ null,
258
+ "fsdp"
259
+ ],
260
+ "decoder_z_tokens": [
261
+ null,
262
+ "fsdp"
263
+ ],
264
+ "decoder_start_output_token": [
265
+ null,
266
+ "fsdp"
267
+ ],
268
+ "encoder_input_embeddings": [
269
+ "fsdp"
270
+ ],
271
+ "encoder_output_embeddings": [
272
+ "fsdp"
273
+ ],
274
+ "decoder_input_embeddings": [
275
+ "fsdp"
276
+ ],
277
+ "decoder_output_embeddings": [
278
+ "fsdp"
279
+ ],
280
+ "encoder_noise_proj_in.weight": [
281
+ "fsdp",
282
+ null
283
+ ],
284
+ "decoder_z_proj_in.weight": [
285
+ "fsdp",
286
+ null
287
+ ],
288
+ "encoder_mu_proj_out.weight": [
289
+ null,
290
+ "fsdp"
291
+ ],
292
+ "uncond_tokens": [
293
+ null,
294
+ "fsdp"
295
+ ],
296
+ "embed_tokens": [
297
+ [
298
+ "data",
299
+ "fsdp"
300
+ ],
301
+ null,
302
+ null
303
+ ],
304
+ "encoder_model.layers.*": [
305
+ [
306
+ "data",
307
+ "fsdp"
308
+ ],
309
+ null,
310
+ null
311
+ ],
312
+ "decoder_model.layers.*": [
313
+ [
314
+ "data",
315
+ "fsdp"
316
+ ],
317
+ null,
318
+ null
319
+ ],
320
+ "diffusion_head.layers.*": [
321
+ [
322
+ "data",
323
+ "fsdp"
324
+ ],
325
+ null,
326
+ null
327
+ ],
328
+ "diffusion_head.out_proj": [
329
+ [
330
+ "data",
331
+ "fsdp"
332
+ ],
333
+ null,
334
+ null
335
+ ],
336
+ "uncond_diffusion_head.layers.*": [
337
+ [
338
+ "data",
339
+ "fsdp"
340
+ ],
341
+ null,
342
+ null
343
+ ],
344
+ "uncond_diffusion_head.out_proj": [
345
+ [
346
+ "data",
347
+ "fsdp"
348
+ ],
349
+ null,
350
+ null
351
+ ],
352
+ "encoder_mu_proj_out": [
353
+ [
354
+ "data",
355
+ "fsdp"
356
+ ],
357
+ null,
358
+ null
359
+ ],
360
+ "lm_head": [
361
+ [
362
+ "data",
363
+ "fsdp"
364
+ ],
365
+ null,
366
+ null
367
+ ]
368
+ }
369
+ }
000000001000/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfd26f0e0c78ff123e52fc93e21c454f88e78bf6520e4945486946639c9f387b
3
+ size 14007751759