{
"type": "zlm.ZLMModel",
"pretrained_url": "aklein4/ZEBRA_ar-1p7b-kernel",
"pretrained_step": 7000,
"pretrained_strict": true,
"torch_dtype": "float32",
"vocab_size": 49152,
"bos_token_id": 0,
"eos_token_id": 0,
"pad_token_id": 49152,
"hidden_size": 2048,
"num_hidden_layers": 24,
"num_attention_heads": 32,
"num_key_value_heads": 32,
"intermediate_size": 8192,
"hidden_act": "silu",
"max_position_embeddings": 8192,
"rope_theta": 130000,
"initializer_range": 0.02,
"attention_dropout": false,
"attention_bias": false,
"rms_norm_eps": 1e-05,
"pad_attention_bias_value": -100.0,
"attention_kernel": "flash_attention",
"pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
"input_length": 256,
"output_length": 512,
"z_length": 384,
"latent_size": 64,
"z_ar_steps": 16,
"head_intermediate_size": 8192,
"lm_loss_ema_beta": 0.75,
"pure_modules": [],
"sharding": {
"embed_tokens.weight": [
"fsdp",
null
],
"lm_head.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.q_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.k_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.v_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.self_attn.o_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"encoder_model.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"encoder_model.layers.*.input_layernorm.weight": [
"fsdp"
],
"encoder_model.layers.*.post_attention_layernorm.weight": [
"fsdp"
],
"encoder_model.norm.weight": [
"fsdp"
],
"decoder_model.layers.*.self_attn.q_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.k_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.v_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.self_attn.o_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.gate_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.up_proj.weight": [
"fsdp",
null
],
"decoder_model.layers.*.mlp.down_proj.weight": [
null,
"fsdp"
],
"decoder_model.layers.*.input_layernorm.weight": [
"fsdp"
],
"decoder_model.layers.*.post_attention_layernorm.weight": [
"fsdp"
],
"decoder_model.norm.weight": [
"fsdp"
],
"encoder_head.states_gate_proj.weight": [
"fsdp",
null
],
"encoder_head.states_up_proj.weight": [
"fsdp",
null
],
"encoder_head.z_gate_proj.weight": [
"fsdp",
null
],
"encoder_head.z_gate_proj.mask": [
"fsdp",
null
],
"encoder_head.z_up_proj.weight": [
"fsdp",
null
],
"encoder_head.z_up_proj.mask": [
"fsdp",
null
],
"encoder_head.down_proj.weight": [
null,
"fsdp"
],
"encoder_head.down_proj.mask": [
null,
"fsdp"
],
"encoder_head.cross_proj.weight": [
null,
"fsdp"
],
"decoder_head.states_gate_proj.weight": [
"fsdp",
null
],
"decoder_head.states_up_proj.weight": [
"fsdp",
null
],
"decoder_head.z_gate_proj.weight": [
"fsdp",
null
],
"decoder_head.z_gate_proj.mask": [
"fsdp",
null
],
"decoder_head.z_up_proj.weight": [
"fsdp",
null
],
"decoder_head.z_up_proj.mask": [
"fsdp",
null
],
"decoder_head.down_proj.weight": [
null,
"fsdp"
],
"decoder_head.down_proj.mask": [
null,
"fsdp"
],
"decoder_head.cross_proj.weight": [
null,
"fsdp"
],
"uncond_decoder_head.states_gate_proj.weight": [
"fsdp",
null
],
"uncond_decoder_head.states_up_proj.weight": [
"fsdp",
null
],
"uncond_decoder_head.z_gate_proj.weight": [
"fsdp",
null
],
"uncond_decoder_head.z_gate_proj.mask": [
"fsdp",
null
],
"uncond_decoder_head.z_up_proj.weight": [
"fsdp",
null
],
"uncond_decoder_head.z_up_proj.mask": [
"fsdp",
null
],
"uncond_decoder_head.down_proj.weight": [
null,
"fsdp"
],
"uncond_decoder_head.down_proj.mask": [
null,
"fsdp"
],
"uncond_decoder_head.cross_proj.weight": [
null,
"fsdp"
],
"uncond_tokens": [
null,
"fsdp"
],
"encoder_sep_token": [
null,
"fsdp"
],
"encoder_z_tokens": [
null,
"fsdp"
],
"decoder_z_tokens": [
null,
"fsdp"
],
"decoder_start_output_token": [
null,
"fsdp"
],
"encoder_input_embeddings": [
"fsdp"
],
"encoder_output_embeddings": [
"fsdp"
],
"decoder_input_embeddings": [
"fsdp"
],
"decoder_output_embeddings": [
"fsdp"
],
"encoder_noise_proj_in.weight": [
"fsdp",
null
],
"decoder_z_proj_in.weight": [
"fsdp",
null
],
"lm_loss_ema.num_updates": [
null
],
"lm_loss_ema.weight": [
null
],
"embed_tokens": [
[
"data",
"fsdp"
],
null,
null
],
"encoder_model.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"decoder_model.layers.*": [
[
"data",
"fsdp"
],
null,
null
],
"encoder_head": [
[
"data",
"fsdp"
],
null,
null
],
"decoder_head": [
[
"data",
"fsdp"
],
null,
null
],
"uncond_decoder_head": [
[
"data",
"fsdp"
],
null,
null
],
"lm_head": [
[
"data",
"fsdp"
],
null,
null
]
},
"remat": {
"advanced": [
{
"name": "self",
"settings": {
"activation_checkpoint_layers": [
"ARHead"
],
"optimization_barrier_layers": [
"ARHead"
]
}
},
{
"name": "encoder_model",
"settings": {
"activation_checkpoint_layers": [
"EncoderModelLayer"
],
"optimization_barrier_layers": [
"EncoderModelLayer"
],
"scan_layers": "layers",
"offload_tensors": [
"encoder_model_input"
]
}
},
{
"name": "decoder_model",
"settings": {
"activation_checkpoint_layers": [
"DecoderModelLayer"
],
"optimization_barrier_layers": [
"DecoderModelLayer"
],
"scan_layers": "layers",
"offload_tensors": [
"decoder_model_input"
]
}
}
]
}
}