tinycompany
/

BiBo-MoE-Tiny

Model card Files Files and versions

xet

Community

fhai50032 committed on Apr 26, 2025

Commit

f2f9ba0

verified ·

1 Parent(s): 0f18e59

Update README.md

Browse files

Files changed (1) hide show

README.md +88 -3

README.md CHANGED Viewed

@@ -1,3 +1,88 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+---
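+- The first (index 0) and last (index -1) decoder layers are dense (plain MLP, no MoE).
+- Each MoE layer includes a shared causal convolution expert.
+- Top-k noisy routing with load rebalancing (DeepSeek-style).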
+```python
+BiBoForCausalLM(
+  (model): BiBoModel(
+    (embed_tokens): Embedding(128000, 1024)
+    (layers): ModuleList(
+      (0): BiBoDecoderLayer(
+        (self_attn): BiBoAttention(
+          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
+          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
+        )
+        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (mlp): BiBoMLP(
+          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
+          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
+          (act_fn): SiLU()
+        )
+      )
+      (1-10): 10 x BiBoDecoderLayer(
+        (self_attn): BiBoAttention(
+          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
+          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
+        )
+        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (mlp): BiBoMoELayer(
+          (routed_experts): ModuleList(
+            (0-8): 9 x MLPExpert(
+              (gate_proj): Linear(in_features=1024, out_features=512, bias=False)
+              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
+              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
+              (act_fn): SiLU()
+            )
+            (9): IdentityExpert()
+          )
+          (shared_experts_list): ModuleList(
+            (0): ModifiedConvolutionalExpert(
+              (gate_conv): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), bias=False)
+              (up_proj): Linear(in_features=1024, out_features=512, bias=False)
+              (down_proj): Linear(in_features=512, out_features=1024, bias=False)
+              (act_fn): SiLU()
+            )
+          )
+          (gate): BiBoMoERouter(
+            (gate_proj): Linear(in_features=1024, out_features=10, bias=False)
+          )
+        )
+      )
+      (11): BiBoDecoderLayer(
+        (self_attn): BiBoAttention(
+          (q_proj): Linear(in_features=1024, out_features=1020, bias=True)
+          (k_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (v_proj): Linear(in_features=1024, out_features=170, bias=True)
+          (o_proj): Linear(in_features=1020, out_features=1024, bias=False)
+        )
+        (input_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (post_attention_layernorm): BiBoRMSNorm((1024,), eps=1e-06)
+        (mlp): BiBoMLP(
+          (gate_proj): Linear(in_features=1024, out_features=49600, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=49600, bias=False)
+          (down_proj): Linear(in_features=49600, out_features=1024, bias=False)
+          (act_fn): SiLU()
+        )
+      )
+    )
+    (norm): BiBoRMSNorm((1024,), eps=1e-06)
+    (rotary_emb): BiBoRotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=1024, out_features=128000, bias=False)
+)
+```