| { |
| "pure_modules": [], |
| "remat": { |
| "activation_checkpoint_layers": [], |
| "scan_layers": null, |
| "offload_tensors": [], |
| "advanced": [ |
| { |
| "name": "self", |
| "settings": { |
| "activation_checkpoint_layers": [ |
| "DiffusionHead" |
| ], |
| "optimization_barrier_layers": [ |
| "DiffusionHead" |
| ] |
| } |
| }, |
| { |
| "name": "encoder_model", |
| "settings": { |
| "activation_checkpoint_layers": [ |
| "EncoderModelLayer" |
| ], |
| "optimization_barrier_layers": [ |
| "EncoderModelLayer" |
| ], |
| "scan_layers": "layers", |
| "offload_tensors": [ |
| "encoder_model_input" |
| ] |
| } |
| }, |
| { |
| "name": "decoder_model", |
| "settings": { |
| "activation_checkpoint_layers": [ |
| "DecoderModelLayer" |
| ], |
| "optimization_barrier_layers": [ |
| "DecoderModelLayer" |
| ], |
| "scan_layers": "layers", |
| "offload_tensors": [ |
| "decoder_model_input" |
| ] |
| } |
| } |
| ] |
| }, |
| "type": "zlm.ZLMModel", |
| "pretrained_url": "aklein4/ZLM-v2_zlm-large-once-norm", |
| "pretrained_step": 19500, |
| "pretrained_strict": true, |
| "torch_dtype": "float32", |
| "vocab_size": 49152, |
| "bos_token_id": 0, |
| "eos_token_id": 0, |
| "pad_token_id": 49152, |
| "hidden_size": 2048, |
| "num_hidden_layers": 24, |
| "num_attention_heads": 32, |
| "num_key_value_heads": 32, |
| "intermediate_size": 8192, |
| "hidden_act": "silu", |
| "max_position_embeddings": 8192, |
| "rope_theta": 130000, |
| "attention_dropout": false, |
| "attention_bias": false, |
| "initializer_range": 0.02, |
| "rms_norm_eps": 1e-05, |
| "pad_attention_bias_value": -100.0, |
| "attention_kernel": "flash_attention", |
| "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU", |
| "input_length": 256, |
| "output_length": 512, |
| "z_length": 384, |
| "latent_size": 32, |
| "batch_norm_beta": 0.95, |
| "once_norm": true, |
| "minimum_diffusion_timestep": 0.5, |
| "num_diffusion_timesteps": 16, |
| "diffusion_in_proj": false, |
| "num_diffusion_head_layers": 2, |
| "diffusion_mlp_size": 3072, |
| "init_sep_token_id": 198, |
| "init_z_token_message": [ |
| 16339, |
| 1103, |
| 253, |
| 1732, |
| 876, |
| 1129, |
| 3073, |
| 827, |
| 15852, |
| 6223, |
| 42, |
| 3039, |
| 1833, |
| 411, |
| 1833, |
| 284, |
| 33748, |
| 732, |
| 346, |
| 699, |
| 588, |
| 1869, |
| 30, |
| 5970, |
| 29, |
| 1717, |
| 29, |
| 5528, |
| 3039, |
| 314, |
| 260, |
| 2327, |
| 338, |
| 7488, |
| 346, |
| 3703, |
| 43, |
| 33748, |
| 314, |
| 260, |
| 38354, |
| 338, |
| 6200, |
| 346, |
| 1991, |
| 346, |
| 359, |
| 1361, |
| 335, |
| 260, |
| 1048, |
| 3398, |
| 30, |
| 16269, |
| 1592, |
| 28, |
| 502, |
| 724, |
| 253, |
| 1055, |
| 2951, |
| 6554, |
| 28, |
| 6063, |
| 9101, |
| 1584, |
| 28, |
| 284, |
| 919, |
| 11621, |
| 2894, |
| 908, |
| 645, |
| 260, |
| 1732, |
| 7422, |
| 6991, |
| 30, |
| 198, |
| 198, |
| 3512, |
| 29, |
| 1717, |
| 29, |
| 5528, |
| 3039, |
| 5909, |
| 411, |
| 6939, |
| 253, |
| 21765, |
| 3775, |
| 618, |
| 253, |
| 2437, |
| 2370, |
| 30, |
| 378, |
| 1055, |
| 2496, |
| 732, |
| 260, |
| 1732, |
| 314, |
| 5617, |
| 28, |
| 14316, |
| 260, |
| 619, |
| 28728, |
| 573, |
| 1096, |
| 28, |
| 284, |
| 3834, |
| 260, |
| 6552, |
| 99, |
| 30, |
| 1069, |
| 654, |
| 34013, |
| 260, |
| 1962, |
| 281, |
| 480, |
| 1038, |
| 1924, |
| 28, |
| 975, |
| 29532, |
| 260, |
| 3491, |
| 314, |
| 260, |
| 17938, |
| 970, |
| 288, |
| 3368, |
| 3867, |
| 30, |
| 9528, |
| 28, |
| 502, |
| 3525, |
| 253, |
| 1252, |
| 30, |
| 1848, |
| 1252, |
| 1124, |
| 325, |
| 253, |
| 1343, |
| 1341, |
| 365, |
| 2579, |
| 8659, |
| 253, |
| 3856, |
| 618, |
| 3764, |
| 2391, |
| 28, |
| 5510, |
| 253, |
| 11262, |
| 28, |
| 15656, |
| 10195, |
| 28, |
| 355, |
| 3728, |
| 3480, |
| 643, |
| 355, |
| 357, |
| 1124, |
| 325, |
| 253, |
| 2232, |
| 3062, |
| 715, |
| 347, |
| 619, |
| 7298, |
| 339, |
| 417, |
| 764, |
| 4960, |
| 5307, |
| 28, |
| 965, |
| 339, |
| 417, |
| 764, |
| 6811, |
| 3416, |
| 1184, |
| 1814, |
| 971, |
| 1833, |
| 28, |
| 502, |
| 1188, |
| 582, |
| 1165, |
| 2353, |
| 284, |
| 2545, |
| 732, |
| 357, |
| 7124, |
| 30, |
| 1094, |
| 260, |
| 2353, |
| 1072, |
| 441, |
| 2369, |
| 10443, |
| 28, |
| 502, |
| 4211, |
| 42, |
| 1576, |
| 253, |
| 896, |
| 2424, |
| 28, |
| 1492, |
| 327, |
| 6782, |
| 1096, |
| 28, |
| 355, |
| 1532, |
| 368, |
| 260, |
| 1732, |
| 1163, |
| 30, |
| 198, |
| 198, |
| 1348, |
| 314, |
| 837, |
| 33748, |
| 3207, |
| 1895, |
| 30, |
| 2550, |
| 11915, |
| 253, |
| 6579, |
| 20430, |
| 282, |
| 746, |
| 1265, |
| 16593, |
| 253, |
| 3246, |
| 28, |
| 17085, |
| 253, |
| 1685, |
| 28, |
| 3728, |
| 253, |
| 1671, |
| 28, |
| 355, |
| 7782, |
| 253, |
| 9704, |
| 1265, |
| 1195, |
| 1055, |
| 43216, |
| 284, |
| 30557, |
| 732, |
| 502, |
| 457, |
| 2269, |
| 588, |
| 1869, |
| 30, |
| 330, |
| 1123, |
| 7127, |
| 314, |
| 1890, |
| 28, |
| 7257, |
| 28, |
| 284, |
| 1028, |
| 515, |
| 30, |
| 657, |
| 1124, |
| 2139, |
| 702, |
| 42, |
| 619, |
| 2931, |
| 1869, |
| 28, |
| 260, |
| 1646, |
| 24653, |
| 314, |
| 2273, |
| 1447, |
| 355, |
| 619, |
| 504, |
| 940, |
| 4567, |
| 330, |
| 314, |
| 2474, |
| 645, |
| 389, |
| 12191, |
| 1447, |
| 355, |
| 619, |
| 57, |
| 417, |
| 307, |
| 15129, |
| 827, |
| 3416, |
| 975, |
| 502, |
| 26599, |
| 260, |
| 4292, |
| 1184, |
| 6567, |
| 93, |
| 2372, |
| 1746, |
| 260, |
| 1055, |
| 429, |
| 7980, |
| 1130, |
| 29025, |
| 22912, |
| 30, |
| 1069, |
| 1607, |
| 13490, |
| 6826, |
| 618, |
| 253, |
| 1165, |
| 932, |
| 282, |
| 5307, |
| 338, |
| 416, |
| 3948, |
| 260, |
| 1867, |
| 1833, |
| 30, |
| 198, |
| 198, |
| 2696, |
| 392, |
| 359, |
| 4421, |
| 288, |
| 1956, |
| 2385, |
| 653, |
| 3403, |
| 3564, |
| 1535, |
| 30 |
| ], |
| "z_proj_in_init_scale": 0.5, |
| "sharding": { |
| "embed_tokens.weight": [ |
| "fsdp", |
| null |
| ], |
| "lm_head.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.self_attn.q_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.self_attn.k_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.self_attn.v_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.self_attn.o_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.mlp.gate_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.mlp.up_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_model.layers.*.mlp.down_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "encoder_model.layers.*.input_layernorm.weight": [ |
| "fsdp" |
| ], |
| "encoder_model.layers.*.post_attention_layernorm.weight": [ |
| "fsdp" |
| ], |
| "encoder_model.norm.weight": [ |
| "fsdp" |
| ], |
| "decoder_model.layers.*.self_attn.q_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.self_attn.k_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.self_attn.v_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.self_attn.o_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.mlp.gate_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.mlp.up_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_model.layers.*.mlp.down_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "decoder_model.layers.*.input_layernorm.weight": [ |
| "fsdp" |
| ], |
| "decoder_model.layers.*.post_attention_layernorm.weight": [ |
| "fsdp" |
| ], |
| "decoder_model.norm.weight": [ |
| "fsdp" |
| ], |
| "mu_out_norm.cov_tracker.num_updates": [ |
| null |
| ], |
| "mu_out_norm.cov_tracker.weight": [ |
| null, |
| null, |
| null |
| ], |
| "mu_out_norm.mean_tracker.num_updates": [ |
| null |
| ], |
| "mu_out_norm.mean_tracker.weight": [ |
| null, |
| null |
| ], |
| "scheduler.timesteps": [ |
| null |
| ], |
| "scheduler.a": [ |
| null |
| ], |
| "scheduler.b": [ |
| null |
| ], |
| "diffusion_head.x_t_in_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "diffusion_head.layers.*.norm.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "diffusion_head.layers.*.mlp.gate_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "diffusion_head.layers.*.mlp.up_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "diffusion_head.layers.*.mlp.down_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "diffusion_head.layers.*.out_scale.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "diffusion_head.out_norm.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "diffusion_head.out_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_diffusion_head.x_t_in_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "uncond_diffusion_head.layers.*.norm.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_diffusion_head.layers.*.mlp.gate_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "uncond_diffusion_head.layers.*.mlp.up_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "uncond_diffusion_head.layers.*.mlp.down_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_diffusion_head.layers.*.out_scale.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_diffusion_head.out_norm.embed.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_diffusion_head.out_proj.weight": [ |
| null, |
| "fsdp" |
| ], |
| "uncond_tokens": [ |
| null, |
| "fsdp" |
| ], |
| "encoder_highway_out_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "highway_out_norm.mean_tracker.num_updates": [ |
| null |
| ], |
| "highway_out_norm.mean_tracker.weight": [ |
| null, |
| null |
| ], |
| "highway_out_norm.var_tracker.num_updates": [ |
| null |
| ], |
| "highway_out_norm.var_tracker.weight": [ |
| null, |
| null |
| ], |
| "decoder_highway_in_proj.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_sep_token": [ |
| null, |
| "fsdp" |
| ], |
| "encoder_z_tokens": [ |
| null, |
| "fsdp" |
| ], |
| "decoder_z_tokens": [ |
| null, |
| "fsdp" |
| ], |
| "decoder_start_output_token": [ |
| null, |
| "fsdp" |
| ], |
| "encoder_input_embeddings": [ |
| "fsdp" |
| ], |
| "encoder_output_embeddings": [ |
| "fsdp" |
| ], |
| "decoder_input_embeddings": [ |
| "fsdp" |
| ], |
| "decoder_output_embeddings": [ |
| "fsdp" |
| ], |
| "encoder_noise_proj_in.weight": [ |
| "fsdp", |
| null |
| ], |
| "decoder_z_proj_in.weight": [ |
| "fsdp", |
| null |
| ], |
| "encoder_mu_proj_out.weight": [ |
| null, |
| "fsdp" |
| ], |
| "embed_tokens": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "encoder_model.layers.*": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "decoder_model.layers.*": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "diffusion_head.layers.*": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "diffusion_head.out_proj": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "uncond_diffusion_head.layers.*": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "uncond_diffusion_head.out_proj": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "encoder_mu_proj_out": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ], |
| "lm_head": [ |
| [ |
| "data", |
| "fsdp" |
| ], |
| null, |
| null |
| ] |
| } |
| } |