aklein4
/

ZLM-v2_zlm-large-wait

Model card Files and versions

xet

Community

aklein4 committed on Jan 26

Commit

d5b3bba

verified ·

1 Parent(s): b906304

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

000000001000/config.json +369 -0
000000001000/model.pt +3 -0

000000001000/config.json ADDED Viewed

	@@ -0,0 +1,369 @@

+{
+    "pure_modules": [],
+    "remat": {
+        "activation_checkpoint_layers": [],
+        "scan_layers": null,
+        "offload_tensors": [],
+        "advanced": [
+            {
+                "name": "self",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "DiffusionHead"
+                    ],
+                    "activation_barrier_layers": [
+                        "DiffusionHead"
+                    ]
+                }
+            },
+            {
+                "name": "encoder_model",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "EncoderModelLayer"
+                    ],
+                    "optimization_barrier_layers": [
+                        "EncoderModelLayer"
+                    ],
+                    "scan_layers": "layers",
+                    "offload_tensors": [
+                        "encoder_model_input"
+                    ]
+                }
+            },
+            {
+                "name": "decoder_model",
+                "settings": {
+                    "activation_checkpoint_layers": [
+                        "DecoderModelLayer"
+                    ],
+                    "optimization_barrier_layers": [
+                        "DecoderModelLayer"
+                    ],
+                    "scan_layers": "layers",
+                    "offload_tensors": [
+                        "decoder_model_input"
+                    ]
+                }
+            }
+        ]
+    },
+    "type": "zlm.ZLMModel",
+    "pretrained_url": null,
+    "pretrained_step": null,
+    "pretrained_strict": null,
+    "torch_dtype": "float32",
+    "vocab_size": 49152,
+    "bos_token_id": 0,
+    "eos_token_id": 0,
+    "pad_token_id": 49152,
+    "hidden_size": 2048,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 32,
+    "intermediate_size": 8192,
+    "hidden_act": "silu",
+    "max_position_embeddings": 8192,
+    "rope_theta": 130000,
+    "attention_dropout": false,
+    "attention_bias": false,
+    "initializer_range": 0.02,
+    "rms_norm_eps": 1e-05,
+    "pad_attention_bias_value": -100.0,
+    "attention_kernel": "nan_safe_flash_attention",
+    "pretrained_llama": "aklein4/SmolLM2-1.7B-TPU",
+    "input_length": 256,
+    "output_length": 512,
+    "z_length": 384,
+    "latent_size": 64,
+    "minimum_diffusion_timestep": 0.5,
+    "num_diffusion_timesteps": 16,
+    "diffusion_in_proj": false,
+    "num_diffusion_head_layers": 2,
+    "diffusion_mlp_size": 3072,
+    "diffusion_output_init_scale": 0.1,
+    "sharding": {
+        "embed_tokens.weight": [
+            "fsdp",
+            null
+        ],
+        "lm_head.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.q_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.k_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.v_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.self_attn.o_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_model.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "encoder_model.layers.*.input_layernorm.weight": [
+            "fsdp"
+        ],
+        "encoder_model.layers.*.post_attention_layernorm.weight": [
+            "fsdp"
+        ],
+        "encoder_model.norm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.layers.*.self_attn.q_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.k_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.v_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.self_attn.o_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_model.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "decoder_model.layers.*.input_layernorm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.layers.*.post_attention_layernorm.weight": [
+            "fsdp"
+        ],
+        "decoder_model.norm.weight": [
+            "fsdp"
+        ],
+        "scheduler.timesteps": [
+            null
+        ],
+        "scheduler.a": [
+            null
+        ],
+        "scheduler.b": [
+            null
+        ],
+        "diffusion_head.x_t_in_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "diffusion_head.layers.*.norm.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "diffusion_head.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "diffusion_head.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "diffusion_head.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "diffusion_head.layers.*.out_scale.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "diffusion_head.out_norm.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "diffusion_head.out_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_diffusion_head.x_t_in_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_diffusion_head.layers.*.norm.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_diffusion_head.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_diffusion_head.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "uncond_diffusion_head.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_diffusion_head.layers.*.out_scale.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_diffusion_head.out_norm.embed.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_diffusion_head.out_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "mu_initial_batch_norm.initialized": [
+            null
+        ],
+        "mu_initial_batch_norm.shift": [
+            null,
+            null
+        ],
+        "mu_initial_batch_norm.scale": [
+            null,
+            null
+        ],
+        "encoder_sep_token": [
+            null,
+            "fsdp"
+        ],
+        "encoder_z_tokens": [
+            null,
+            "fsdp"
+        ],
+        "decoder_z_tokens": [
+            null,
+            "fsdp"
+        ],
+        "decoder_start_output_token": [
+            null,
+            "fsdp"
+        ],
+        "encoder_input_embeddings": [
+            "fsdp"
+        ],
+        "encoder_output_embeddings": [
+            "fsdp"
+        ],
+        "decoder_input_embeddings": [
+            "fsdp"
+        ],
+        "decoder_output_embeddings": [
+            "fsdp"
+        ],
+        "encoder_noise_proj_in.weight": [
+            "fsdp",
+            null
+        ],
+        "decoder_z_proj_in.weight": [
+            "fsdp",
+            null
+        ],
+        "encoder_mu_proj_out.weight": [
+            null,
+            "fsdp"
+        ],
+        "uncond_tokens": [
+            null,
+            "fsdp"
+        ],
+        "embed_tokens": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "encoder_model.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "decoder_model.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "diffusion_head.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "diffusion_head.out_proj": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "uncond_diffusion_head.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "uncond_diffusion_head.out_proj": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "encoder_mu_proj_out": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "lm_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ]
+    }
+}

000000001000/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd26f0e0c78ff123e52fc93e21c454f88e78bf6520e4945486946639c9f387b
+size 14007751759