---
license: apache-2.0
---

Layers 0 and -1 (the first and last decoder layers) are dense; every layer in between uses the MoE block.
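
A minimal sketch of how that per-layer choice could be wired (the helper name and `config` fields are assumptions, not the repo's actual code; `BiBoMLP` and `BiBoMoELayer` are taken from the dump below):

```python
def build_mlp(config, layer_idx: int):
    # Layers 0 and -1 (first and last) stay dense; everything between is MoE.
    if layer_idx in (0, config.num_hidden_layers - 1):
        return BiBoMLP(config)      # dense SwiGLU FFN, as in layers 0 and 11 below
    return BiBoMoELayer(config)     # routed experts + shared conv expert
```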

Each MoE layer also has a shared causal convolution expert that every token passes through (the `ModifiedConvolutionalExpert` in the dump below).
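
The dump shows this shared expert as a SwiGLU-style FFN whose gate branch is a `Conv1d` over the sequence dimension. A minimal sketch, with shapes matching the dump (hidden 1024, intermediate 512, kernel 3); enforcing causality by left-padding the input by `kernel_size - 1` is an assumption about the implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalConvExpert(nn.Module):
    """Shared expert with a causal 1-D convolution as the gate branch."""

    def __init__(self, hidden=1024, inter=512, kernel=3):
        super().__init__()
        self.kernel = kernel
        self.gate_conv = nn.Conv1d(hidden, inter, kernel_size=kernel, bias=False)
        self.up_proj = nn.Linear(hidden, inter, bias=False)
        self.down_proj = nn.Linear(inter, hidden, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):                        # x: (batch, seq, hidden)
        h = x.transpose(1, 2)                    # Conv1d expects (batch, hidden, seq)
        h = F.pad(h, (self.kernel - 1, 0))       # left-pad: position t only sees <= t
        gate = self.act_fn(self.gate_conv(h)).transpose(1, 2)  # (batch, seq, inter)
        return self.down_proj(gate * self.up_proj(x))
```

Unlike a plain linear gate, the convolutional gate mixes each token with its two predecessors, giving the shared expert a small local-context window at negligible parameter cost.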

Routing is noisy top-k with DeepSeek-style load rebalancing.
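
A minimal sketch of that router, sized to the dump below (hidden 1024, 10 routed experts, one of which is an `IdentityExpert` no-op path); the `top_k` value, noise scale, and balancing coefficient are assumptions:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyTopKRouter(nn.Module):
    """Noisy top-k gating with a DeepSeek-style auxiliary balancing loss."""

    def __init__(self, hidden=1024, num_experts=10, top_k=2, noise_std=0.1):
        super().__init__()
        self.gate_proj = nn.Linear(hidden, num_experts, bias=False)
        self.top_k, self.noise_std = top_k, noise_std

    def forward(self, x):                          # x: (tokens, hidden)
        logits = self.gate_proj(x)
        if self.training:                          # inject noise only during training
            logits = logits + torch.randn_like(logits) * self.noise_std
        probs = F.softmax(logits, dim=-1)
        weights, indices = probs.topk(self.top_k, dim=-1)
        weights = weights / weights.sum(-1, keepdim=True)  # renormalize over the k winners

        # DeepSeek-style balancing: penalize experts that receive both a high
        # mean routing probability and a large share of actual top-k assignments.
        num_experts = probs.size(-1)
        mask = F.one_hot(indices, num_experts).sum(1).float()  # (tokens, experts)
        load = mask.mean(0) * num_experts / self.top_k          # f_i: scaled assignment share
        importance = probs.mean(0)                              # P_i: mean router probability
        aux_loss = (load * importance).sum()   # scale by a coefficient alpha in the total loss
        return weights, indices, aux_loss
```

The identity expert gives the router a cheap "do nothing" option, so a token can effectively skip the routed FFN while still consuming one of its top-k slots.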


```python
BiBoForCausalLM(
  (model): BiBoModel(
    (embed_tokens): Embedding(128000, 1024)
    (layers): ModuleList(
      (0): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
      (1-10): 10 x BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMoELayer(
          (routed_experts): ModuleList(
            (0-8): 9 x MLPExpert(
              (gate_proj): Linear(in_features=1024, out_features=512, bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
            (9): IdentityExpert()
          )
          (shared_experts_list): ModuleList(
            (0): ModifiedConvolutionalExpert(
              (gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False)
              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
              (act_fn): SiLU()
            )
          )
          (gate): BiBoMoERouter(
            (gate_proj): Linear(in_features=1024, out_features=10, bias=False)
          )
        )
      )
      (11): BiBoDecoderLayer(
        (self_attn): BiBoAttention(
          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
        )
        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
        (mlp): BiBoMLP(
          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      )
    )
    (norm): BiBoRMSNorm((1024,), eps=1e-06)
    (rotary_emb): BiBoRotaryEmbedding()
  )
  (lm_head): Linear(in_features=1024, out_features=128000, bias=False)
)
```