yujiepan committed on
Commit
6013b9e
·
verified ·
1 Parent(s): 82919c0

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +20 -14
  2. config.json +2 -2
  3. generation_config.json +1 -1
  4. model.safetensors +2 -2
README.md CHANGED
@@ -8,7 +8,7 @@ This tiny model is intended for debugging. It is randomly initialized using the
8
 
9
  | File path | Size |
10
  |------|------|
11
- | model.safetensors | 9.0MB |
12
 
13
 
14
  ### Example usage:
@@ -102,7 +102,7 @@ config_json.update({
102
  "mlp_layer_types": ['dense'] + ['sparse'],
103
  "head_dim": head_dim,
104
  "hidden_size": 8,
105
- "index_head_dim": 32,
106
  "index_n_heads": 4,
107
  "intermediate_size": 32,
108
  "moe_intermediate_size": 32,
@@ -187,10 +187,12 @@ GlmMoeDsaForCausalLM(
187
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
188
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
189
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
190
- (wq_b): Linear(in_features=32, out_features=1024, bias=False)
191
- (wk): Linear(in_features=8, out_features=256, bias=False)
192
- (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
193
- (weights_proj): Linear(in_features=8, out_features=4, bias=False)
 
 
194
  )
195
  (mlp): GlmMoeDsaMLP(
196
  (gate_proj): Linear(in_features=8, out_features=32, bias=False)
@@ -210,10 +212,12 @@ GlmMoeDsaForCausalLM(
210
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
211
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
212
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
213
- (wq_b): Linear(in_features=32, out_features=1024, bias=False)
214
- (wk): Linear(in_features=8, out_features=256, bias=False)
215
- (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
216
- (weights_proj): Linear(in_features=8, out_features=4, bias=False)
 
 
217
  )
218
  (mlp): GlmMoeDsaMoE(
219
  (experts): GlmMoeDsaNaiveMoe(
@@ -247,10 +251,12 @@ GlmMoeDsaForCausalLM(
247
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
248
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
249
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
250
- (wq_b): Linear(in_features=32, out_features=1024, bias=False)
251
- (wk): Linear(in_features=8, out_features=256, bias=False)
252
- (k_norm): GlmMoeDsaRMSNorm((256,), eps=1e-06)
253
- (weights_proj): Linear(in_features=8, out_features=4, bias=False)
 
 
254
  )
255
  (mlp): GlmMoeDsaMoE(
256
  (experts): GlmMoeDsaNaiveMoe(
 
8
 
9
  | File path | Size |
10
  |------|------|
11
+ | model.safetensors | 9.4MB |
12
 
13
 
14
  ### Example usage:
 
102
  "mlp_layer_types": ['dense'] + ['sparse'],
103
  "head_dim": head_dim,
104
  "hidden_size": 8,
105
+ "index_head_dim": 128,
106
  "index_n_heads": 4,
107
  "intermediate_size": 32,
108
  "moe_intermediate_size": 32,
 
187
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
188
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
189
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
190
+ (indexer): GlmMoeDsaIndexer(
191
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
192
+ (wk): Linear(in_features=8, out_features=128, bias=False)
193
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
194
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
195
+ )
196
  )
197
  (mlp): GlmMoeDsaMLP(
198
  (gate_proj): Linear(in_features=8, out_features=32, bias=False)
 
212
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
213
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
214
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
215
+ (indexer): GlmMoeDsaIndexer(
216
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
217
+ (wk): Linear(in_features=8, out_features=128, bias=False)
218
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
219
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
220
+ )
221
  )
222
  (mlp): GlmMoeDsaMoE(
223
  (experts): GlmMoeDsaNaiveMoe(
 
251
  (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
252
  (kv_b_proj): Linear(in_features=512, out_features=1792, bias=False)
253
  (o_proj): Linear(in_features=1024, out_features=8, bias=False)
254
+ (indexer): GlmMoeDsaIndexer(
255
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
256
+ (wk): Linear(in_features=8, out_features=128, bias=False)
257
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
258
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
259
+ )
260
  )
261
  (mlp): GlmMoeDsaMoE(
262
  (experts): GlmMoeDsaNaiveMoe(
config.json CHANGED
@@ -16,7 +16,7 @@
16
  "head_dim": 64,
17
  "hidden_act": "silu",
18
  "hidden_size": 8,
19
- "index_head_dim": 32,
20
  "index_n_heads": 4,
21
  "index_topk": 2048,
22
  "indexer_rope_interleave": true,
@@ -57,7 +57,7 @@
57
  "tie_word_embeddings": true,
58
  "topk_group": 1,
59
  "topk_method": "noaux_tc",
60
- "transformers_version": "5.2.0.dev0",
61
  "use_cache": true,
62
  "v_head_dim": 256,
63
  "vocab_size": 154880
 
16
  "head_dim": 64,
17
  "hidden_act": "silu",
18
  "hidden_size": 8,
19
+ "index_head_dim": 128,
20
  "index_n_heads": 4,
21
  "index_topk": 2048,
22
  "indexer_rope_interleave": true,
 
57
  "tie_word_embeddings": true,
58
  "topk_group": 1,
59
  "topk_method": "noaux_tc",
60
+ "transformers_version": "5.3.0.dev0",
61
  "use_cache": true,
62
  "v_head_dim": 256,
63
  "vocab_size": 154880
generation_config.json CHANGED
@@ -9,5 +9,5 @@
9
  "pad_token_id": 154820,
10
  "temperature": 1.0,
11
  "top_p": 0.95,
12
- "transformers_version": "5.2.0.dev0"
13
  }
 
9
  "pad_token_id": 154820,
10
  "temperature": 1.0,
11
  "top_p": 0.95,
12
+ "transformers_version": "5.3.0.dev0"
13
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ed734a563b043f3746f8922c9447c582f2084b63369e5a0a75347e0b210a90a
3
- size 9455168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688a21163128eb9c83409f069c9ab8f3fb4ed9c6129b1d7ba692c1db62147206
3
+ size 9351152