AdamF92 committed on
Commit
c3d88cd
·
verified ·
1 Parent(s): 0765942

Push model using huggingface_hub.

Browse files
Files changed (2) hide show
  1. config.json +44 -0
  2. model.safetensors +1 -1
config.json CHANGED
@@ -1,4 +1,48 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "encoder_config": {
3
  "att_groups": 8,
4
  "att_heads": 16,
 
1
  {
2
+ "decoder_config": {
3
+ "att_groups": 4,
4
+ "att_heads": 16,
5
+ "att_query_groups": 8,
6
+ "cross_att_type": "sqa",
7
+ "dense_layer_dim": 1536,
8
+ "embed_dim": 512,
9
+ "ff_activation": "silu",
10
+ "ff_dim": 192,
11
+ "ff_dropout": 0.0,
12
+ "final_stateless_layers_config": [
13
+ "moe",
14
+ "moe"
15
+ ],
16
+ "head_norm_type": "rms_norm",
17
+ "moe_bias_mode": "global",
18
+ "moe_grouped_gemm": true,
19
+ "moe_shared_experts_bias_mode": "global",
20
+ "moe_top_k": 10,
21
+ "moe_use_cutlass_grouped_gemm": true,
22
+ "moe_use_weighted_shared_experts": false,
23
+ "num_experts": 384,
24
+ "num_layers": 21,
25
+ "num_shared_experts": 2,
26
+ "rope_base": 100000,
27
+ "router_amp": true,
28
+ "self_att_type": "sqa",
29
+ "seq_len": 8192,
30
+ "shared_expert_dim": 384,
31
+ "stateless_layers_config": [
32
+ "dense",
33
+ "moe"
34
+ ],
35
+ "stm_size": 4096,
36
+ "use_attention_output_bias": false,
37
+ "use_flash_attention": true,
38
+ "use_gated": true,
39
+ "use_gated_attention": true,
40
+ "use_gated_cross_attention": false,
41
+ "use_head_norm": true,
42
+ "use_moe": true,
43
+ "use_vectorized_moe": true,
44
+ "vocab_size": 65536
45
+ },
46
  "encoder_config": {
47
  "att_groups": 8,
48
  "att_heads": 16,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4879d67758d3c6ed9aa9a3891d5976a518acdb7c7be19782859649747cec733e
3
  size 6099558592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1bbeee16809db34a0e38fc699e6cde268609721885caa7d2b1feab25bab27e1
3
  size 6099558592