fhai50032 committed on
Commit
f2f9ba0
·
verified ·
1 Parent(s): 0f18e59

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +88 -3
README.md CHANGED
@@ -1,3 +1,88 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ Layers 0 and -1 (the first and last layers) are dense
6
+
7
+ shared causal convolution expert
8
+
9
+ top-k noisy routing with load rebalancing (DeepSeek-style)
10
+
11
+
12
+ ```python
13
+
14
+ BiBoForCausalLM(
15
+ (model): BiBoModel(
16
+ (embed_tokens): Embedding(128000, 1024)
17
+ (layers): ModuleList(
18
+ (0): BiBoDecoderLayer(
19
+ (self_attn): BiBoAttention(
20
+ (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
21
+ (k_proj): Linear(in_features=1024, out_features=170, bias=True)
22
+ (v_proj): Linear(in_features=1024, out_features=170, bias=True)
23
+ (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
24
+ )
25
+ (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
26
+ (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
27
+ (mlp): BiBoMLP(
28
+ (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
29
+ (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
30
+ (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
31
+ (act_fn): SiLU()
32
+ )
33
+ )
34
+ (1-10): 10 x BiBoDecoderLayer(
35
+ (self_attn): BiBoAttention(
36
+ (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
37
+ (k_proj): Linear(in_features=1024, out_features=170, bias=True)
38
+ (v_proj): Linear(in_features=1024, out_features=170, bias=True)
39
+ (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
40
+ )
41
+ (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
42
+ (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
43
+ (mlp): BiBoMoELayer(
44
+ (routed_experts): ModuleList(
45
+ (0-8): 9 x MLPExpert(
46
+ (gate_proj): Linear(in_features=1024, out_features=512, bias=False)
47
+ (up_proj): Linear(in_features=1024, out_features=512, bias=False)
48
+ (down_proj): Linear(in_features=512, out_features=1024, bias=False)
49
+ (act_fn): SiLU()
50
+ )
51
+ (9): IdentityExpert()
52
+ )
53
+ (shared_experts_list): ModuleList(
54
+ (0): ModifiedConvolutionalExpert(
55
+ (gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False)
56
+ (up_proj): Linear(in_features=1024, out_features=512, bias=False)
57
+ (down_proj): Linear(in_features=512, out_features=1024, bias=False)
58
+ (act_fn): SiLU()
59
+ )
60
+ )
61
+ (gate): BiBoMoERouter(
62
+ (gate_proj): Linear(in_features=1024, out_features=10, bias=False)
63
+ )
64
+ )
65
+ )
66
+ (11): BiBoDecoderLayer(
67
+ (self_attn): BiBoAttention(
68
+ (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
69
+ (k_proj): Linear(in_features=1024, out_features=170, bias=True)
70
+ (v_proj): Linear(in_features=1024, out_features=170, bias=True)
71
+ (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
72
+ )
73
+ (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
74
+ (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
75
+ (mlp): BiBoMLP(
76
+ (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
77
+ (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
78
+ (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
79
+ (act_fn): SiLU()
80
+ )
81
+ )
82
+ )
83
+ (norm): BiBoRMSNorm((1024,), eps=1e-06)
84
+ (rotary_emb): BiBoRotaryEmbedding()
85
+ )
86
+ (lm_head): Linear(in_features=1024, out_features=128000, bias=False)
87
+ )
88
+ ```