|
|
--- |
|
|
license: apache-2.0 |
|
|
--- |
|
|
|
|
|
The first (index 0) and last (index -1) layers are dense |
|
|
|
|
|
Shared causal convolution expert |
|
|
|
|
|
Top-k noisy routing with load rebalancing (DeepSeek-style) |
|
|
|
|
|
|
|
|
```python |
|
|
|
|
|
BiBoForCausalLM( |
|
|
(model): BiBoModel( |
|
|
(embed_tokens): Embedding(128000, 1024) |
|
|
(layers): ModuleList( |
|
|
(0): BiBoDecoderLayer( |
|
|
(self_attn): BiBoAttention( |
|
|
(q_proj): Linear(in_features=1024, out_features=1020, bias=True) |
|
|
(k_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(v_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(o_proj): Linear(in_features=1020, out_features=1024, bias=False) |
|
|
) |
|
|
(input_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(mlp): BiBoMLP( |
|
|
(gate_proj): Linear(in_features=1024, out_features=49600, bias=False) |
|
|
(up_proj): Linear(in_features=1024, out_features=49600, bias=False) |
|
|
(down_proj): Linear(in_features=49600, out_features=1024, bias=False) |
|
|
(act_fn): SiLU() |
|
|
) |
|
|
) |
|
|
(1-10): 10 x BiBoDecoderLayer( |
|
|
(self_attn): BiBoAttention( |
|
|
(q_proj): Linear(in_features=1024, out_features=1020, bias=True) |
|
|
(k_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(v_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(o_proj): Linear(in_features=1020, out_features=1024, bias=False) |
|
|
) |
|
|
(input_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(mlp): BiBoMoELayer( |
|
|
(routed_experts): ModuleList( |
|
|
(0-8): 9 x MLPExpert( |
|
|
(gate_proj): Linear(in_features=1024, out_features=512, bias=False) |
|
|
(up_proj): Linear(in_features=1024, out_features=512, bias=False) |
|
|
(down_proj): Linear(in_features=512, out_features=1024, bias=False) |
|
|
(act_fn): SiLU() |
|
|
) |
|
|
(9): IdentityExpert() |
|
|
) |
|
|
(shared_experts_list): ModuleList( |
|
|
(0): ModifiedConvolutionalExpert( |
|
|
(gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False) |
|
|
(up_proj): Linear(in_features=1024, out_features=512, bias=False) |
|
|
(down_proj): Linear(in_features=512, out_features=1024, bias=False) |
|
|
(act_fn): SiLU() |
|
|
) |
|
|
) |
|
|
(gate): BiBoMoERouter( |
|
|
(gate_proj): Linear(in_features=1024, out_features=10, bias=False) |
|
|
) |
|
|
) |
|
|
) |
|
|
(11): BiBoDecoderLayer( |
|
|
(self_attn): BiBoAttention( |
|
|
(q_proj): Linear(in_features=1024, out_features=1020, bias=True) |
|
|
(k_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(v_proj): Linear(in_features=1024, out_features=170, bias=True) |
|
|
(o_proj): Linear(in_features=1020, out_features=1024, bias=False) |
|
|
) |
|
|
(input_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(mlp): BiBoMLP( |
|
|
(gate_proj): Linear(in_features=1024, out_features=49600, bias=False) |
|
|
(up_proj): Linear(in_features=1024, out_features=49600, bias=False) |
|
|
(down_proj): Linear(in_features=49600, out_features=1024, bias=False) |
|
|
(act_fn): SiLU() |
|
|
) |
|
|
) |
|
|
) |
|
|
(norm): BiBoRMSNorm((1024,), eps=1e-06) |
|
|
(rotary_emb): BiBoRotaryEmbedding() |
|
|
) |
|
|
(lm_head): Linear(in_features=1024, out_features=128000, bias=False) |
|
|
) |
|
|
``` |
|
|
|