{
  "type": "zlm.ZLMModel",
  "pretrained_url": "aklein4/ZEBRA_ar-1p7b-kernel",
  "pretrained_step": 7000,
  "pretrained_strict": true,
  "torch_dtype": "float32",
  "vocab_size": 49152,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "pad_token_id": 49152,
  "hidden_size": 2048,
  "num_hidden_layers": 24,
  "num_attention_heads": 32,
  "num_key_value_heads": 32,
  "intermediate_size": 8192,
  "hidden_act": "silu",
  "max_position_embeddings": 8192,
  "rope_theta": 130000,
  "initializer_range": 0.02,
  "attention_dropout": false,
  "attention_bias": false,
  "rms_norm_eps": 1e-05,
  "pad_attention_bias_value": -100.0,
  "attention_kernel": "flash_attention",
  "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
  "input_length": 256,
  "output_length": 512,
  "z_length": 384,
  "latent_size": 64,
  "z_ar_steps": 16,
  "head_intermediate_size": 8192,
  "lm_loss_ema_beta": 0.75,
  "pure_modules": [],
  "sharding": {
    "embed_tokens.weight": [
      "fsdp",
      null
    ],
    "lm_head.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.self_attn.q_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.self_attn.k_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.self_attn.v_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.self_attn.o_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.mlp.gate_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.mlp.up_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_model.layers.*.mlp.down_proj.weight": [
      null,
      "fsdp"
    ],
    "encoder_model.layers.*.input_layernorm.weight": [
      "fsdp"
    ],
    "encoder_model.layers.*.post_attention_layernorm.weight": [
      "fsdp"
    ],
    "encoder_model.norm.weight": [
      "fsdp"
    ],
    "decoder_model.layers.*.self_attn.q_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.self_attn.k_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.self_attn.v_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.self_attn.o_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.mlp.gate_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.mlp.up_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_model.layers.*.mlp.down_proj.weight": [
      null,
      "fsdp"
    ],
    "decoder_model.layers.*.input_layernorm.weight": [
      "fsdp"
    ],
    "decoder_model.layers.*.post_attention_layernorm.weight": [
      "fsdp"
    ],
    "decoder_model.norm.weight": [
      "fsdp"
    ],
    "encoder_head.states_gate_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_head.states_up_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_head.z_gate_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_head.z_gate_proj.mask": [
      "fsdp",
      null
    ],
    "encoder_head.z_up_proj.weight": [
      "fsdp",
      null
    ],
    "encoder_head.z_up_proj.mask": [
      "fsdp",
      null
    ],
    "encoder_head.down_proj.weight": [
      null,
      "fsdp"
    ],
    "encoder_head.down_proj.mask": [
      null,
      "fsdp"
    ],
    "encoder_head.cross_proj.weight": [
      null,
      "fsdp"
    ],
    "decoder_head.states_gate_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_head.states_up_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_head.z_gate_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_head.z_gate_proj.mask": [
      "fsdp",
      null
    ],
    "decoder_head.z_up_proj.weight": [
      "fsdp",
      null
    ],
    "decoder_head.z_up_proj.mask": [
      "fsdp",
      null
    ],
    "decoder_head.down_proj.weight": [
      null,
      "fsdp"
    ],
    "decoder_head.down_proj.mask": [
      null,
      "fsdp"
    ],
    "decoder_head.cross_proj.weight": [
      null,
      "fsdp"
    ],
    "uncond_decoder_head.states_gate_proj.weight": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.states_up_proj.weight": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.z_gate_proj.weight": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.z_gate_proj.mask": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.z_up_proj.weight": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.z_up_proj.mask": [
      "fsdp",
      null
    ],
    "uncond_decoder_head.down_proj.weight": [
      null,
      "fsdp"
    ],
    "uncond_decoder_head.down_proj.mask": [
      null,
      "fsdp"
    ],
    "uncond_decoder_head.cross_proj.weight": [
      null,
      "fsdp"
    ],
    "uncond_tokens": [
      null,
      "fsdp"
    ],
    "encoder_sep_token": [
      null,
      "fsdp"
    ],
    "encoder_z_tokens": [
      null,
      "fsdp"
    ],
    "decoder_z_tokens": [
      null,
      "fsdp"
    ],
    "decoder_start_output_token": [
      null,
      "fsdp"
    ],
    "encoder_input_embeddings": [
      "fsdp"
    ],
    "encoder_output_embeddings": [
      "fsdp"
    ],
    "decoder_input_embeddings": [
      "fsdp"
    ],
    "decoder_output_embeddings": [
      "fsdp"
    ],
    "encoder_noise_proj_in.weight": [
      "fsdp",
      null
    ],
    "decoder_z_proj_in.weight": [
      "fsdp",
      null
    ],
    "lm_loss_ema.num_updates": [
      null
    ],
    "lm_loss_ema.weight": [
      null
    ],
    "embed_tokens": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "encoder_model.layers.*": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "decoder_model.layers.*": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "encoder_head": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "decoder_head": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "uncond_decoder_head": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ],
    "lm_head": [
      [
        "data",
        "fsdp"
      ],
      null,
      null
    ]
  },
  "remat": {
    "advanced": [
      {
        "name": "self",
        "settings": {
          "activation_checkpoint_layers": [
            "ARHead"
          ],
          "optimization_barrier_layers": [
            "ARHead"
          ]
        }
      },
      {
        "name": "encoder_model",
        "settings": {
          "activation_checkpoint_layers": [
            "EncoderModelLayer"
          ],
          "optimization_barrier_layers": [
            "EncoderModelLayer"
          ],
          "scan_layers": "layers",
          "offload_tensors": [
            "encoder_model_input"
          ]
        }
      },
      {
        "name": "decoder_model",
        "settings": {
          "activation_checkpoint_layers": [
            "DecoderModelLayer"
          ],
          "optimization_barrier_layers": [
            "DecoderModelLayer"
          ],
          "scan_layers": "layers",
          "offload_tensors": [
            "decoder_model_input"
          ]
        }
      }
    ]
  }
}