Upload folder using huggingface_hub

16b2354 verified about 1 month ago

9.14 kB

	{
	"type": "zlm.ZLMModel",
	"pretrained_url": "aklein4/ZEBRA_ar-1p7b-kernel",
	"pretrained_step": 7000,
	"pretrained_strict": true,
	"torch_dtype": "float32",
	"vocab_size": 49152,
	"bos_token_id": 0,
	"eos_token_id": 0,
	"pad_token_id": 49152,
	"hidden_size": 2048,
	"num_hidden_layers": 24,
	"num_attention_heads": 32,
	"num_key_value_heads": 32,
	"intermediate_size": 8192,
	"hidden_act": "silu",
	"max_position_embeddings": 8192,
	"rope_theta": 130000,
	"initializer_range": 0.02,
	"attention_dropout": false,
	"attention_bias": false,
	"rms_norm_eps": 1e-05,
	"pad_attention_bias_value": -100.0,
	"attention_kernel": "flash_attention",
	"pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
	"input_length": 256,
	"output_length": 512,
	"z_length": 384,
	"latent_size": 64,
	"z_ar_steps": 16,
	"head_intermediate_size": 8192,
	"lm_loss_ema_beta": 0.75,
	"pure_modules": [],
	"sharding": {
	"embed_tokens.weight": [
	"fsdp",
	null
	],
	"lm_head.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.self_attn.q_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.self_attn.k_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.self_attn.v_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.self_attn.o_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.mlp.gate_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.mlp.up_proj.weight": [
	"fsdp",
	null
	],
	"encoder_model.layers.*.mlp.down_proj.weight": [
	null,
	"fsdp"
	],
	"encoder_model.layers.*.input_layernorm.weight": [
	"fsdp"
	],
	"encoder_model.layers.*.post_attention_layernorm.weight": [
	"fsdp"
	],
	"encoder_model.norm.weight": [
	"fsdp"
	],
	"decoder_model.layers.*.self_attn.q_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.self_attn.k_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.self_attn.v_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.self_attn.o_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.mlp.gate_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.mlp.up_proj.weight": [
	"fsdp",
	null
	],
	"decoder_model.layers.*.mlp.down_proj.weight": [
	null,
	"fsdp"
	],
	"decoder_model.layers.*.input_layernorm.weight": [
	"fsdp"
	],
	"decoder_model.layers.*.post_attention_layernorm.weight": [
	"fsdp"
	],
	"decoder_model.norm.weight": [
	"fsdp"
	],
	"encoder_head.states_gate_proj.weight": [
	"fsdp",
	null
	],
	"encoder_head.states_up_proj.weight": [
	"fsdp",
	null
	],
	"encoder_head.z_gate_proj.weight": [
	"fsdp",
	null
	],
	"encoder_head.z_gate_proj.mask": [
	"fsdp",
	null
	],
	"encoder_head.z_up_proj.weight": [
	"fsdp",
	null
	],
	"encoder_head.z_up_proj.mask": [
	"fsdp",
	null
	],
	"encoder_head.down_proj.weight": [
	null,
	"fsdp"
	],
	"encoder_head.down_proj.mask": [
	null,
	"fsdp"
	],
	"encoder_head.cross_proj.weight": [
	null,
	"fsdp"
	],
	"decoder_head.states_gate_proj.weight": [
	"fsdp",
	null
	],
	"decoder_head.states_up_proj.weight": [
	"fsdp",
	null
	],
	"decoder_head.z_gate_proj.weight": [
	"fsdp",
	null
	],
	"decoder_head.z_gate_proj.mask": [
	"fsdp",
	null
	],
	"decoder_head.z_up_proj.weight": [
	"fsdp",
	null
	],
	"decoder_head.z_up_proj.mask": [
	"fsdp",
	null
	],
	"decoder_head.down_proj.weight": [
	null,
	"fsdp"
	],
	"decoder_head.down_proj.mask": [
	null,
	"fsdp"
	],
	"decoder_head.cross_proj.weight": [
	null,
	"fsdp"
	],
	"uncond_decoder_head.states_gate_proj.weight": [
	"fsdp",
	null
	],
	"uncond_decoder_head.states_up_proj.weight": [
	"fsdp",
	null
	],
	"uncond_decoder_head.z_gate_proj.weight": [
	"fsdp",
	null
	],
	"uncond_decoder_head.z_gate_proj.mask": [
	"fsdp",
	null
	],
	"uncond_decoder_head.z_up_proj.weight": [
	"fsdp",
	null
	],
	"uncond_decoder_head.z_up_proj.mask": [
	"fsdp",
	null
	],
	"uncond_decoder_head.down_proj.weight": [
	null,
	"fsdp"
	],
	"uncond_decoder_head.down_proj.mask": [
	null,
	"fsdp"
	],
	"uncond_decoder_head.cross_proj.weight": [
	null,
	"fsdp"
	],
	"uncond_tokens": [
	null,
	"fsdp"
	],
	"encoder_sep_token": [
	null,
	"fsdp"
	],
	"encoder_z_tokens": [
	null,
	"fsdp"
	],
	"decoder_z_tokens": [
	null,
	"fsdp"
	],
	"decoder_start_output_token": [
	null,
	"fsdp"
	],
	"encoder_input_embeddings": [
	"fsdp"
	],
	"encoder_output_embeddings": [
	"fsdp"
	],
	"decoder_input_embeddings": [
	"fsdp"
	],
	"decoder_output_embeddings": [
	"fsdp"
	],
	"encoder_noise_proj_in.weight": [
	"fsdp",
	null
	],
	"decoder_z_proj_in.weight": [
	"fsdp",
	null
	],
	"lm_loss_ema.num_updates": [
	null
	],
	"lm_loss_ema.weight": [
	null
	],
	"embed_tokens": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"encoder_model.layers.*": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"decoder_model.layers.*": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"encoder_head": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"decoder_head": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"uncond_decoder_head": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	],
	"lm_head": [
	[
	"data",
	"fsdp"
	],
	null,
	null
	]
	},
	"remat": {
	"advanced": [
	{
	"name": "self",
	"settings": {
	"activation_checkpoint_layers": [
	"ARHead"
	],
	"optimization_barrier_layers": [
	"ARHead"
	]
	}
	},
	{
	"name": "encoder_model",
	"settings": {
	"activation_checkpoint_layers": [
	"EncoderModelLayer"
	],
	"optimization_barrier_layers": [
	"EncoderModelLayer"
	],
	"scan_layers": "layers",
	"offload_tensors": [
	"encoder_model_input"
	]
	}
	},
	{
	"name": "decoder_model",
	"settings": {
	"activation_checkpoint_layers": [
	"DecoderModelLayer"
	],
	"optimization_barrier_layers": [
	"DecoderModelLayer"
	],
	"scan_layers": "layers",
	"offload_tensors": [
	"decoder_model_input"
	]
	}
	}
	]
	}
	}