zebra-1.7b-preview / config.json
aklein4's picture
Upload folder using huggingface_hub
b6c7e0f verified
{
"pure_modules": [],
"remat": {
"activation_checkpoint_layers": [],
"scan_layers": null,
"offload_tensors": [],
"advanced": [
{
"name": "self",
"settings": {
"activation_checkpoint_layers": [
"DiffusionHead"
],
"optimization_barrier_layers": [
"DiffusionHead"
]
}
},
{
"name": "encoder_model",
"settings": {
"activation_checkpoint_layers": [
"EncoderModelLayer"
],
"optimization_barrier_layers": [
"EncoderModelLayer"
],
"scan_layers": "layers",
"offload_tensors": [
"encoder_model_input"
]
}
},
{
"name": "decoder_model",
"settings": {
"activation_checkpoint_layers": [
"DecoderModelLayer"
],
"optimization_barrier_layers": [
"DecoderModelLayer"
],
"scan_layers": "layers",
"offload_tensors": [
"decoder_model_input"
]
}
}
]
},
"type": "zlm.ZLMModel",
"pretrained_url": "aklein4/ZLM-v2_zlm-large-once-norm",
"pretrained_step": 19500,
"pretrained_strict": true,
"torch_dtype": "float32",
"vocab_size": 49152,
"bos_token_id": 0,
"eos_token_id": 0,
"pad_token_id": 49152,
"hidden_size": 2048,
"num_hidden_layers": 24,
"num_attention_heads": 32,
"num_key_value_heads": 32,
"intermediate_size": 8192,
"hidden_act": "silu",
"max_position_embeddings": 8192,
"rope_theta": 130000,
"attention_dropout": false,
"attention_bias": false,
"initializer_range": 0.02,
"rms_norm_eps": 1e-05,
"pad_attention_bias_value": -100.0,
"attention_kernel": "flash_attention",
"pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
"input_length": 256,
"output_length": 512,
"z_length": 384,
"latent_size": 32,
"batch_norm_beta": 0.95,
"once_norm": true,
"minimum_diffusion_timestep": 0.5,
"num_diffusion_timesteps": 16,
"diffusion_in_proj": false,
"num_diffusion_head_layers": 2,
"diffusion_mlp_size": 3072,
"init_sep_token_id": 198,
"init_z_token_message": [
16339,
1103,
253,
1732,
876,
1129,
3073,
827,
15852,
6223,
42,
3039,
1833,
411,
1833,
284,
33748,
732,
346,
699,
588,
1869,
30,
5970,
29,
1717,
29,
5528,
3039,
314,
260,
2327,
338,
7488,
346,
3703,
43,
33748,
314,
260,
38354,
338,
6200,
346,
1991,
346,
359,
1361,
335,
260,
1048,
3398,
30,
16269,
1592,
28,
502,
724,
253,
1055,
2951,
6554,
28,
6063,
9101,
1584,
28,
284,
919,
11621,
2894,
908,
645,
260,
1732,
7422,
6991,
30,
198,
198,
3512,
29,
1717,
29,
5528,
3039,
5909,
411,
6939,
253,
21765,
3775,
618,
253,
2437,
2370,
30,
378,
1055,
2496,
732,
260,
1732,
314,
5617,
28,
14316,
260,
619,
28728,
573,
1096,
28,
284,
3834,
260,
6552,
99,
30,
1069,
654,
34013,
260,
1962,
281,
480,
1038,
1924,
28,
975,
29532,
260,
3491,
314,
260,
17938,
970,
288,
3368,
3867,
30,
9528,
28,
502,
3525,
253,
1252,
30,
1848,
1252,
1124,
325,
253,
1343,
1341,
365,
2579,
8659,
253,
3856,
618,
3764,
2391,
28,
5510,
253,
11262,
28,
15656,
10195,
28,
355,
3728,
3480,
643,
355,
357,
1124,
325,
253,
2232,
3062,
715,
347,
619,
7298,
339,
417,
764,
4960,
5307,
28,
965,
339,
417,
764,
6811,
3416,
1184,
1814,
971,
1833,
28,
502,
1188,
582,
1165,
2353,
284,
2545,
732,
357,
7124,
30,
1094,
260,
2353,
1072,
441,
2369,
10443,
28,
502,
4211,
42,
1576,
253,
896,
2424,
28,
1492,
327,
6782,
1096,
28,
355,
1532,
368,
260,
1732,
1163,
30,
198,
198,
1348,
314,
837,
33748,
3207,
1895,
30,
2550,
11915,
253,
6579,
20430,
282,
746,
1265,
16593,
253,
3246,
28,
17085,
253,
1685,
28,
3728,
253,
1671,
28,
355,
7782,
253,
9704,
1265,
1195,
1055,
43216,
284,
30557,
732,
502,
457,
2269,
588,
1869,
30,
330,
1123,
7127,
314,
1890,
28,
7257,
28,
284,
1028,
515,
30,
657,
1124,
2139,
702,
42,
619,
2931,
1869,
28,
260,
1646,
24653,
314,
2273,
1447,
355,
619,
504,
940,
4567,
330,
314,
2474,
645,
389,
12191,
1447,
355,
619,
57,
417,
307,
15129,
827,
3416,
975,
502,
26599,
260,
4292,
1184,
6567,
93,
2372,
1746,
260,
1055,
429,
7980,
1130,
29025,
22912,
30,
1069,
1607,
13490,
6826,
618,
253,
1165,
932,
282,
5307,
338,
416,
3948,
260,
1867,
1833,
30,
198,
198,
2696,
392,
359,
4421,
288,
1956,
2385,
653,
3403,
3564,
1535,
30
],
"z_proj_in_init_scale": 0.5,
"sharding": {
"embed_tokens.weight": [
"fsdp",
null
],
"lm_head.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.q_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.k_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.v_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.o_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"encoder_model.layers.*.input_layernorm.weight": [
"fsdp"
],
"encoder_model.layers.*.post_attention_layernorm.weight": [
"fsdp"
],
"encoder_model.norm.weight": [
"fsdp"
],
"decoder_model.layers.*.self_attn.q_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.k_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.v_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.o_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"decoder_model.layers.*.input_layernorm.weight": [
"fsdp"
],
"decoder_model.layers.*.post_attention_layernorm.weight": [
"fsdp"
],
"decoder_model.norm.weight": [
"fsdp"
],
"mu_out_norm.cov_tracker.num_updates": [
null
],
"mu_out_norm.cov_tracker.weight": [
null,
null,
null
],
"mu_out_norm.mean_tracker.num_updates": [
null
],
"mu_out_norm.mean_tracker.weight": [
null,
null
],
"scheduler.timesteps": [
null
],
"scheduler.a": [
null
],
"scheduler.b": [
null
],
"diffusion_head.x_t_in_proj.weight": [
"fsdp",
null
],
"diffusion_head.layers.*.norm.embed.weight": [
null,
"fsdp"
],
"diffusion_head.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"diffusion_head.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"diffusion_head.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"diffusion_head.layers.*.out_scale.embed.weight": [
null,
"fsdp"
],
"diffusion_head.out_norm.embed.weight": [
null,
"fsdp"
],
"diffusion_head.out_proj.weight": [
null,
"fsdp"
],
"uncond_diffusion_head.x_t_in_proj.weight": [
"fsdp",
null
],
"uncond_diffusion_head.layers.*.norm.embed.weight": [
null,
"fsdp"
],
"uncond_diffusion_head.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"uncond_diffusion_head.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"uncond_diffusion_head.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"uncond_diffusion_head.layers.*.out_scale.embed.weight": [
null,
"fsdp"
],
"uncond_diffusion_head.out_norm.embed.weight": [
null,
"fsdp"
],
"uncond_diffusion_head.out_proj.weight": [
null,
"fsdp"
],
"uncond_tokens": [
null,
"fsdp"
],
"encoder_highway_out_proj.weight": [
"fsdp",
null
],
"highway_out_norm.mean_tracker.num_updates": [
null
],
"highway_out_norm.mean_tracker.weight": [
null,
null
],
"highway_out_norm.var_tracker.num_updates": [
null
],
"highway_out_norm.var_tracker.weight": [
null,
null
],
"decoder_highway_in_proj.weight": [
"fsdp",
null
],
"encoder_sep_token": [
null,
"fsdp"
],
"encoder_z_tokens": [
null,
"fsdp"
],
"decoder_z_tokens": [
null,
"fsdp"
],
"decoder_start_output_token": [
null,
"fsdp"
],
"encoder_input_embeddings": [
"fsdp"
],
"encoder_output_embeddings": [
"fsdp"
],
"decoder_input_embeddings": [
"fsdp"
],
"decoder_output_embeddings": [
"fsdp"
],
"encoder_noise_proj_in.weight": [
"fsdp",
null
],
"decoder_z_proj_in.weight": [
"fsdp",
null
],
"encoder_mu_proj_out.weight": [
null,
"fsdp"
],
"embed_tokens": [
[
"data",
"fsdp"
],
null,
null
],
"encoder_model.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"decoder_model.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"diffusion_head.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"diffusion_head.out_proj": [
[
"data",
"fsdp"
],
null,
null
],
"uncond_diffusion_head.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"uncond_diffusion_head.out_proj": [
[
"data",
"fsdp"
],
null,
null
],
"encoder_mu_proj_out": [
[
"data",
"fsdp"
],
null,
null
],
"lm_head": [
[
"data",
"fsdp"
],
null,
null
]
}
}