---
license: apache-2.0
---

Layers 0 and -1 (the first and last decoder layers) are dense. Every layer in between uses a Mixture-of-Experts MLP with a shared causal-convolution expert and top-k noisy routing with load rebalancing (DeepSeek-style).

```python
BiBoForCausalLM(
  (model): BiBoModel(
    (embed_tokens): Embedding(128000, 1024)
    (layers): ModuleList(
      (0): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
      (1-10): 10 x BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMoELayer(
          (routed_experts): ModuleList(
            (0-8): 9 x MLPExpert(
              (gate_proj): Linear(in_features=1024, out_features=512, bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
            (9): IdentityExpert()
          )
          (shared_experts_list): ModuleList(
            (0): ModifiedConvolutionalExpert(
              (gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
          )
          (gate): BiBoMoERouter(
            (gate_proj): Linear(in_features=1024, out_features=10, bias=False)
          )
        )
      )
      (11): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
    )
    (norm): BiBoRMSNorm((1024,), eps=1e-06)
    (rotary_emb): BiBoRotaryEmbedding()
  )
  (lm_head): Linear(in_features=1024, out_features=128000, bias=False)
)
```
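
For orientation, here is a minimal sketch of how a noisy top-k router with a DeepSeek-style load-balancing auxiliary loss can work. This is an illustration, not the checkpoint's actual implementation: only the `gate_proj: Linear(1024, 10)` shape is taken from the printout above, while `top_k`, the unit Gaussian noise, and `aux_loss_alpha` are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyTopKRouter(nn.Module):
    """Sketch of noisy top-k routing with a DeepSeek-style balance loss."""

    def __init__(self, hidden_size=1024, num_experts=10, top_k=2, aux_loss_alpha=0.01):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, num_experts, bias=False)
        self.num_experts = num_experts
        self.top_k = top_k
        self.aux_loss_alpha = aux_loss_alpha

    def forward(self, hidden_states):
        # hidden_states: (num_tokens, hidden_size)
        logits = self.gate_proj(hidden_states)
        if self.training:
            # Gaussian noise on the logits encourages exploration across experts.
            logits = logits + torch.randn_like(logits)
        probs = F.softmax(logits, dim=-1)
        topk_probs, topk_idx = probs.topk(self.top_k, dim=-1)
        # Renormalize so the selected experts' weights sum to 1 per token.
        topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)

        # Load-balancing loss in the DeepSeek style: (scaled) fraction of tokens
        # dispatched to each expert times the mean router probability for it.
        mask = F.one_hot(topk_idx, self.num_experts).float().sum(dim=1)  # (tokens, E)
        f_i = mask.mean(dim=0) * self.num_experts / self.top_k
        p_i = probs.mean(dim=0)
        aux_loss = self.aux_loss_alpha * (f_i * p_i).sum()

        return topk_idx, topk_probs, aux_loss
```

During training, the `aux_loss` term would be added to the language-modeling loss so that the router is penalized for concentrating traffic on a few experts.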
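The shared `ModifiedConvolutionalExpert` appears, from the module names above, to be a SwiGLU-style block whose gate branch is a `Conv1d` over the sequence dimension rather than a `Linear`. Below is a minimal sketch under that reading; the left-padding used to make the convolution causal and the exact gating arithmetic are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalConvExpert(nn.Module):
    """Sketch of a shared expert with a causal convolutional gate branch."""

    def __init__(self, hidden_size=1024, intermediate_size=512, kernel_size=3):
        super().__init__()
        self.gate_conv = nn.Conv1d(hidden_size, intermediate_size, kernel_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()
        self.kernel_size = kernel_size

    def forward(self, x):
        # x: (batch, seq_len, hidden_size). Conv1d wants (batch, channels, seq_len),
        # so transpose, then pad on the left only: each position sees itself and
        # the previous kernel_size - 1 tokens, preserving causality.
        h = F.pad(x.transpose(1, 2), (self.kernel_size - 1, 0))
        gate = self.act_fn(self.gate_conv(h)).transpose(1, 2)  # (B, T, intermediate)
        return self.down_proj(gate * self.up_proj(x))
```

Unlike the per-token routed experts, this branch mixes information from a small causal window of neighboring tokens, which is presumably why it is shared rather than routed.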
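Note that routed expert 9 is an `IdentityExpert`: a token routed to it gets its hidden state back unchanged (scaled by the routing weight), so the router can effectively spend a top-k slot on "no extra computation" for easy tokens. Such an expert is a one-liner:

```python
import torch.nn as nn

class IdentityExpert(nn.Module):
    # Pass-through expert: selecting it skips the FFN for that token.
    def forward(self, x):
        return x
```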
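A hypothetical loading snippet follows; `<repo-id>` is a placeholder for the actual Hub repository id, and `trust_remote_code=True` is assumed to be required because the `BiBo*` classes are custom.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# "<repo-id>" is a placeholder, not the real repository name.
tokenizer = AutoTokenizer.from_pretrained("<repo-id>")
model = AutoModelForCausalLM.from_pretrained("<repo-id>", trust_remote_code=True)
```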