aklein4
/

ZLM-v2_baseline-large

Model card | Files and versions

aklein4 committed on Feb 14

Commit

c924e2c

·

verified ·

1 Parent(s): 3ea9162

Upload folder using huggingface_hub

Files changed (2) hide show

000000020000/config.json +102 -0
000000020000/model.pt +3 -0

000000020000/config.json ADDED Viewed

	@@ -0,0 +1,102 @@

+{
+    "pure_modules": [],
+    "remat": {
+        "activation_checkpoint_layers": [
+            "LlamaDecoderLayer"
+        ],
+        "scan_layers": "model.layers",
+        "offload_tensors": [
+            "decoder_input"
+        ],
+        "optimization_barrier_layers": [
+            "LlamaDecoderLayer"
+        ]
+    },
+    "type": "custom_llama.CustomLlamaForCausalLM",
+    "pretrained_url": "aklein4/SmolLM2-1.7B-TPU",
+    "pretrained_step": 0,
+    "pretrained_strict": false,
+    "torch_dtype": "float32",
+    "vocab_size": 49152,
+    "bos_token_id": 0,
+    "eos_token_id": 0,
+    "pad_token_id": 49152,
+    "hidden_size": 2048,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 32,
+    "intermediate_size": 8192,
+    "hidden_act": "silu",
+    "max_position_embeddings": 8192,
+    "rope_theta": 130000,
+    "initializer_range": null,
+    "gaussian_init": true,
+    "attention_dropout": false,
+    "attention_bias": false,
+    "rms_norm_eps": 1e-05,
+    "pad_attention_bias_value": -100.0,
+    "attention_kernel": "flash_attention",
+    "sharding": {
+        "model.embed_tokens.weight": [
+            "fsdp",
+            null
+        ],
+        "lm_head.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.self_attn.q_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.self_attn.k_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.self_attn.v_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.self_attn.o_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.mlp.gate_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.mlp.up_proj.weight": [
+            "fsdp",
+            null
+        ],
+        "model.layers.*.mlp.down_proj.weight": [
+            null,
+            "fsdp"
+        ],
+        "model.layers.*.input_layernorm.weight": [
+            "fsdp"
+        ],
+        "model.layers.*.post_attention_layernorm.weight": [
+            "fsdp"
+        ],
+        "model.norm.weight": [
+            "fsdp"
+        ],
+        "model.layers.*": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ],
+        "lm_head": [
+            [
+                "data",
+                "fsdp"
+            ],
+            null,
+            null
+        ]
+    }
+}

000000020000/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:369764e76e0440c8b6915363e5c05c56c273a7e5012b6722f46f5083e2cb0d18
+size 7248226091