Upload folder using huggingface_hub

- README.md: +20 -14
- config.json: +2 -2
- generation_config.json: +1 -1
- model.safetensors: +2 -2
README.md

@@ -8,7 +8,7 @@ This tiny model is intended for debugging. It is randomly initialized using the
 
 | File path | Size |
 |------|------|
-| model.safetensors | 9.
+| model.safetensors | 9.4MB |
 
 
 ### Example usage:
@@ -102,7 +102,7 @@ config_json.update({
     "mlp_layer_types": ['dense'] + ['sparse'],
     "head_dim": head_dim,
     "hidden_size": 8,
-    "index_head_dim":
+    "index_head_dim": 128,
     "index_n_heads": 4,
     "intermediate_size": 32,
     "moe_intermediate_size": 32,
@@ -187,10 +187,12 @@ GlmMoeDsaForCausalLM(
         (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
         (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
         (o_proj): Linear(in_features=1024, out_features=8, bias=False)
-        (
-
-
-
+        (indexer): GlmMoeDsaIndexer(
+          (wq_b): Linear(in_features=32, out_features=512, bias=False)
+          (wk): Linear(in_features=8, out_features=128, bias=False)
+          (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
+          (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+        )
       )
       (mlp): GlmMoeDsaMLP(
         (gate_proj): Linear(in_features=8, out_features=32, bias=False)
@@ -210,10 +212,12 @@ GlmMoeDsaForCausalLM(
         (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
         (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
         (o_proj): Linear(in_features=1024, out_features=8, bias=False)
-        (
-
-
-
+        (indexer): GlmMoeDsaIndexer(
+          (wq_b): Linear(in_features=32, out_features=512, bias=False)
+          (wk): Linear(in_features=8, out_features=128, bias=False)
+          (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
+          (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+        )
       )
       (mlp): GlmMoeDsaMoE(
         (experts): GlmMoeDsaNaiveMoe(
@@ -247,10 +251,12 @@ GlmMoeDsaForCausalLM(
         (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
         (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
         (o_proj): Linear(in_features=1024, out_features=8, bias=False)
-        (
-
-
-
+        (indexer): GlmMoeDsaIndexer(
+          (wq_b): Linear(in_features=32, out_features=512, bias=False)
+          (wk): Linear(in_features=8, out_features=128, bias=False)
+          (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
+          (weights_proj): Linear(in_features=8, out_features=4, bias=False)
+        )
       )
       (mlp): GlmMoeDsaMoE(
         (experts): GlmMoeDsaNaiveMoe(
config.json

@@ -16,7 +16,7 @@
   "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 8,
-  "index_head_dim":
+  "index_head_dim": 128,
   "index_n_heads": 4,
   "index_topk": 2048,
   "indexer_rope_interleave": true,
@@ -57,7 +57,7 @@
   "tie_word_embeddings": true,
   "topk_group": 1,
   "topk_method": "noaux_tc",
-  "transformers_version": "5.
+  "transformers_version": "5.3.0.dev0",
   "use_cache": true,
   "v_head_dim": 256,
   "vocab_size": 154880
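A quick way to confirm the filled-in value locally is to read the config as plain JSON; the directory name below is a placeholder for wherever the repo is downloaded:

```python
import json

with open("tiny-glm-moe-dsa/config.json") as f:  # placeholder path
    cfg = json.load(f)

assert cfg["index_head_dim"] == 128
assert cfg["index_n_heads"] == 4
# wq_b's out_features in the model repr should equal the product:
print(cfg["index_n_heads"] * cfg["index_head_dim"])  # 512
```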
generation_config.json

@@ -9,5 +9,5 @@
   "pad_token_id": 154820,
   "temperature": 1.0,
   "top_p": 0.95,
-  "transformers_version": "5.
+  "transformers_version": "5.3.0.dev0"
 }
model.safetensors

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:688a21163128eb9c83409f069c9ab8f3fb4ed9c6129b1d7ba692c1db62147206
+size 9351152
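What Git stores for model.safetensors is only this LFS pointer; the actual weights are the 9,351,152-byte blob (≈9.4MB, matching the README's size table). A small sketch for checking a downloaded copy against the pointer, with the path as a placeholder:

```python
import hashlib
import os

path = "model.safetensors"  # placeholder: the resolved file, not the LFS pointer
assert os.path.getsize(path) == 9351152  # the pointer's "size" line

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == "688a21163128eb9c83409f069c9ab8f3fb4ed9c6129b1d7ba692c1db62147206"
print("pointer matches file")
```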