aklein4
/

ZEBRA_ar-1p7b-kernel-strong

Model card | Files and versions

xet

Community

aklein4 committed on Mar 13

Commit

16b2354

verified ·

1 Parent(s): 7745dcf

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

000000012000/config.json +373 -0
000000012000/model.pt +3 -0

000000012000/config.json ADDED Viewed

	@@ -0,0 +1,373 @@

+{
+    "type": "zlm.ZLMModel",
+    "pretrained_url": "aklein4/ZEBRA_ar-1p7b-kernel",
+    "pretrained_step": 7000,
+    "pretrained_strict": true,
+    "torch_dtype": "float32",
+    "vocab_size": 49152,
+    "bos_token_id": 0,
+    "eos_token_id": 0,
+    "pad_token_id": 49152,
+    "hidden_size": 2048,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 32,
+    "intermediate_size": 8192,
+    "hidden_act": "silu",
+    "max_position_embeddings": 8192,
+    "rope_theta": 130000,
+    "initializer_range": 0.02,
+    "attention_dropout": false,
+    "attention_bias": false,
+    "rms_norm_eps": 1e-05,
+    "pad_attention_bias_value": -100.0,
+    "attention_kernel": "flash_attention",
+    "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
+    "input_length": 256,
+    "output_length": 512,
+    "z_length": 384,
+    "latent_size": 64,
+    "z_ar_steps": 16,
+    "head_intermediate_size": 8192,
+    "lm_loss_ema_beta": 0.75,
+    "pure_modules": [],
+    "sharding": {
+        "embed_tokens.weight": [
+            "fsdp",
+            null
+        ],
+        "lm_head.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.q_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.k_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.v_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.o_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "encoder_model.layers.*.input_layernorm.weight": [
+            "fsdp"
+        ],
+        "encoder_model.layers.*.post_attention_layernorm.weight": [
+            "fsdp"
+        ],
+        "encoder_model.norm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.layers.*.self_attn.q_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.k_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.v_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.o_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "decoder_model.layers.*.input_layernorm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.layers.*.post_attention_layernorm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.norm.weight": [
+            "fsdp"
+        ],
+        "encoder_head.states_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.states_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.z_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.z_gate_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.z_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.z_up_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "encoder_head.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "encoder_head.down_proj.mask": [
+            null,
+            "fsdp"
+        ],
+        "encoder_head.cross_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "decoder_head.states_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.states_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.z_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.z_gate_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.z_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.z_up_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "decoder_head.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "decoder_head.down_proj.mask": [
+            null,
+            "fsdp"
+        ],
+        "decoder_head.cross_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_decoder_head.states_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.states_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.z_gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.z_gate_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.z_up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.z_up_proj.mask": [
+            "fsdp",
+            null
+        ],
+        "uncond_decoder_head.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_decoder_head.down_proj.mask": [
+            null,
+            "fsdp"
+        ],
+        "uncond_decoder_head.cross_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_tokens": [
+            null,
+            "fsdp"
+        ],
+        "encoder_sep_token": [
+            null,
+            "fsdp"
+        ],
+        "encoder_z_tokens": [
+            null,
+            "fsdp"
+        ],
+        "decoder_z_tokens": [
+            null,
+            "fsdp"
+        ],
+        "decoder_start_output_token": [
+            null,
+            "fsdp"
+        ],
+        "encoder_input_embeddings": [
+            "fsdp"
+        ],
+        "encoder_output_embeddings": [
+            "fsdp"
+        ],
+        "decoder_input_embeddings": [
+            "fsdp"
+        ],
+        "decoder_output_embeddings": [
+            "fsdp"
+        ],
+        "encoder_noise_proj_in.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_z_proj_in.weight": [
+            "fsdp",
+            null
+        ],
+        "lm_loss_ema.num_updates": [
+            null
+        ],
+        "lm_loss_ema.weight": [
+            null
+        ],
+        "embed_tokens": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "encoder_model.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "decoder_model.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "encoder_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "decoder_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "uncond_decoder_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "lm_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ]
+    },
+    "remat": {
+        "advanced": [
+            {
+                "name": "self",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "ARHead"
+                    ],
+                    "optimization_barrier_layers": [
+                        "ARHead"
+                    ]
+                }
+            },
+            {
+                "name": "encoder_model",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "EncoderModelLayer"
+                    ],
+                    "optimization_barrier_layers": [
+                        "EncoderModelLayer"
+                    ],
+                    "scan_layers": "layers",
+                    "offload_tensors": [
+                        "encoder_model_input"
+                    ]
+                }
+            },
+            {
+                "name": "decoder_model",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "DecoderModelLayer"
+                    ],
+                    "optimization_barrier_layers": [
+                        "DecoderModelLayer"
+                    ],
+                    "scan_layers": "layers",
+                    "offload_tensors": [
+                        "decoder_model_input"
+                    ]
+                }
+            }
+        ]
+    }
+}

000000012000/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2cb166983b7ad7df084d83709da0ba88c274d3098f758b744dcc49a7b2c63a3
+size 14143688111