Upload GptOssForCausalLM
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +560 -0
- README.md +123 -0
- checkpoint_metadata.json +8 -0
- config.json +162 -0
- generation_config.json +12 -0
- model/lm_head/kernel/.zarray +1 -0
- model/lm_head/kernel/0.0 +3 -0
- model/lm_head/kernel/0.1 +3 -0
- model/lm_head/kernel/0.2 +3 -0
- model/lm_head/kernel/0.3 +3 -0
- model/model/embed_tokens/embedding/.zarray +1 -0
- model/model/embed_tokens/embedding/0.0 +3 -0
- model/model/embed_tokens/embedding/0.1 +3 -0
- model/model/embed_tokens/embedding/0.2 +3 -0
- model/model/embed_tokens/embedding/0.3 +3 -0
- model/model/layers/0/input_layernorm/kernel/.zarray +1 -0
- model/model/layers/0/input_layernorm/kernel/0 +0 -0
- model/model/layers/0/mlp/experts/down_proj/bias/.zarray +1 -0
- model/model/layers/0/mlp/experts/down_proj/bias/0.0 +3 -0
- model/model/layers/0/mlp/experts/down_proj/kernel/.zarray +1 -0
- model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0 +3 -0
- model/model/layers/0/mlp/experts/gate_proj/bias/.zarray +1 -0
- model/model/layers/0/mlp/experts/gate_proj/bias/0.0 +3 -0
- model/model/layers/0/mlp/experts/gate_proj/kernel/.zarray +1 -0
- model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0 +3 -0
- model/model/layers/0/mlp/experts/up_proj/bias/.zarray +1 -0
- model/model/layers/0/mlp/experts/up_proj/bias/0.0 +3 -0
- model/model/layers/0/mlp/experts/up_proj/kernel/.zarray +1 -0
- model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0 +3 -0
- model/model/layers/0/mlp/router/bias/.zarray +1 -0
- model/model/layers/0/mlp/router/bias/0 +0 -0
- model/model/layers/0/mlp/router/kernel/.zarray +1 -0
- model/model/layers/0/mlp/router/kernel/0.0 +3 -0
- model/model/layers/0/post_attention_layernorm/kernel/.zarray +1 -0
- model/model/layers/0/post_attention_layernorm/kernel/0 +0 -0
- model/model/layers/0/self_attn/k_proj/bias/.zarray +1 -0
- model/model/layers/0/self_attn/k_proj/bias/0 +0 -0
- model/model/layers/0/self_attn/k_proj/kernel/.zarray +1 -0
- model/model/layers/0/self_attn/k_proj/kernel/0.0 +3 -0
- model/model/layers/0/self_attn/k_proj/kernel/0.1 +3 -0
- model/model/layers/0/self_attn/k_proj/kernel/0.2 +3 -0
- model/model/layers/0/self_attn/k_proj/kernel/0.3 +3 -0
- model/model/layers/0/self_attn/o_proj/bias/.zarray +1 -0
- model/model/layers/0/self_attn/o_proj/bias/0 +0 -0
- model/model/layers/0/self_attn/o_proj/kernel/.zarray +1 -0
- model/model/layers/0/self_attn/o_proj/kernel/0.0 +3 -0
- model/model/layers/0/self_attn/o_proj/kernel/1.0 +3 -0
- model/model/layers/0/self_attn/o_proj/kernel/2.0 +3 -0
- model/model/layers/0/self_attn/o_proj/kernel/3.0 +3 -0
- model/model/layers/0/self_attn/q_proj/bias/.zarray +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,563 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
model/model/embed_tokens/embedding/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
model/model/embed_tokens/embedding/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
model/model/embed_tokens/embedding/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
model/model/embed_tokens/embedding/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
model/model/layers/21/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
model/model/layers/21/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
model/model/layers/21/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
model/model/layers/21/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
model/model/layers/21/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
model/model/layers/21/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
model/model/layers/21/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
model/model/layers/21/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
model/model/layers/21/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
model/model/layers/21/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
model/model/layers/21/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
model/model/layers/21/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
model/model/layers/21/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
model/model/layers/21/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
model/model/layers/21/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
model/model/layers/21/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
model/model/layers/21/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
model/model/layers/21/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
model/model/layers/21/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
model/model/layers/21/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
model/model/layers/21/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
model/model/layers/21/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
model/model/layers/21/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
model/model/layers/13/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
model/model/layers/13/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
model/model/layers/13/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
model/model/layers/13/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
model/model/layers/13/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
model/model/layers/13/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
model/model/layers/13/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
model/model/layers/13/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
model/model/layers/13/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
model/model/layers/13/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
model/model/layers/13/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
model/model/layers/13/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
model/model/layers/13/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
model/model/layers/13/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
model/model/layers/13/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
model/model/layers/13/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
model/model/layers/13/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
model/model/layers/13/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
model/model/layers/13/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
model/model/layers/13/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
model/model/layers/13/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
model/model/layers/13/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
model/model/layers/13/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
model/model/layers/1/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
model/model/layers/1/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
model/model/layers/1/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
model/model/layers/1/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
model/model/layers/1/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
model/model/layers/1/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
model/model/layers/1/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
model/model/layers/1/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
model/model/layers/1/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
model/model/layers/1/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
model/model/layers/1/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
model/model/layers/1/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
model/model/layers/1/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
model/model/layers/1/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
model/model/layers/1/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
model/model/layers/1/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
model/model/layers/1/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
model/model/layers/1/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
model/model/layers/1/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
model/model/layers/1/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
model/model/layers/1/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 107 |
+
model/model/layers/1/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
model/model/layers/1/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
model/model/layers/3/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
model/model/layers/3/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
model/model/layers/3/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
model/model/layers/3/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
model/model/layers/3/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
model/model/layers/3/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
model/model/layers/3/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
model/model/layers/3/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
model/model/layers/3/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
model/model/layers/3/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
model/model/layers/3/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
model/model/layers/3/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
model/model/layers/3/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
model/model/layers/3/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
model/model/layers/3/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
model/model/layers/3/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
model/model/layers/3/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
model/model/layers/3/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 127 |
+
model/model/layers/3/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 128 |
+
model/model/layers/3/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 129 |
+
model/model/layers/3/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 130 |
+
model/model/layers/3/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 131 |
+
model/model/layers/3/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 132 |
+
model/model/layers/4/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 133 |
+
model/model/layers/4/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 134 |
+
model/model/layers/4/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 135 |
+
model/model/layers/4/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 136 |
+
model/model/layers/4/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 137 |
+
model/model/layers/4/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 138 |
+
model/model/layers/4/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 139 |
+
model/model/layers/4/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 140 |
+
model/model/layers/4/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 141 |
+
model/model/layers/4/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 142 |
+
model/model/layers/4/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 143 |
+
model/model/layers/4/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 144 |
+
model/model/layers/4/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 145 |
+
model/model/layers/4/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 146 |
+
model/model/layers/4/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 147 |
+
model/model/layers/4/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 148 |
+
model/model/layers/4/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 149 |
+
model/model/layers/4/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 150 |
+
model/model/layers/4/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 151 |
+
model/model/layers/4/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 152 |
+
model/model/layers/4/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 153 |
+
model/model/layers/4/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 154 |
+
model/model/layers/4/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
model/model/layers/5/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 156 |
+
model/model/layers/5/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 157 |
+
model/model/layers/5/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 158 |
+
model/model/layers/5/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 159 |
+
model/model/layers/5/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 160 |
+
model/model/layers/5/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 161 |
+
model/model/layers/5/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 162 |
+
model/model/layers/5/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 163 |
+
model/model/layers/5/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 164 |
+
model/model/layers/5/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 165 |
+
model/model/layers/5/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 166 |
+
model/model/layers/5/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 167 |
+
model/model/layers/5/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 168 |
+
model/model/layers/5/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 169 |
+
model/model/layers/5/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 170 |
+
model/model/layers/5/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 171 |
+
model/model/layers/5/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 172 |
+
model/model/layers/5/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 173 |
+
model/model/layers/5/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 174 |
+
model/model/layers/5/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 175 |
+
model/model/layers/5/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 176 |
+
model/model/layers/5/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 177 |
+
model/model/layers/5/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 178 |
+
model/model/layers/20/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 179 |
+
model/model/layers/20/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 180 |
+
model/model/layers/20/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 181 |
+
model/model/layers/20/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 182 |
+
model/model/layers/20/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 183 |
+
model/model/layers/20/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 184 |
+
model/model/layers/20/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 185 |
+
model/model/layers/20/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 186 |
+
model/model/layers/20/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 187 |
+
model/model/layers/20/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 188 |
+
model/model/layers/20/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 189 |
+
model/model/layers/20/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 190 |
+
model/model/layers/20/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 191 |
+
model/model/layers/20/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 192 |
+
model/model/layers/20/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 193 |
+
model/model/layers/20/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 194 |
+
model/model/layers/20/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 195 |
+
model/model/layers/20/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 196 |
+
model/model/layers/20/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 197 |
+
model/model/layers/20/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 198 |
+
model/model/layers/20/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 199 |
+
model/model/layers/20/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 200 |
+
model/model/layers/20/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 201 |
+
model/model/layers/18/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 202 |
+
model/model/layers/18/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 203 |
+
model/model/layers/18/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 204 |
+
model/model/layers/18/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 205 |
+
model/model/layers/18/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 206 |
+
model/model/layers/18/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 207 |
+
model/model/layers/18/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 208 |
+
model/model/layers/18/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 209 |
+
model/model/layers/18/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 210 |
+
model/model/layers/18/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 211 |
+
model/model/layers/18/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 212 |
+
model/model/layers/18/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 213 |
+
model/model/layers/18/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 214 |
+
model/model/layers/18/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 215 |
+
model/model/layers/18/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 216 |
+
model/model/layers/18/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 217 |
+
model/model/layers/18/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 218 |
+
model/model/layers/18/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 219 |
+
model/model/layers/18/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 220 |
+
model/model/layers/18/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 221 |
+
model/model/layers/18/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 222 |
+
model/model/layers/18/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 223 |
+
model/model/layers/18/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 224 |
+
model/model/layers/17/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 225 |
+
model/model/layers/17/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 226 |
+
model/model/layers/17/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 227 |
+
model/model/layers/17/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 228 |
+
model/model/layers/17/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 229 |
+
model/model/layers/17/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 230 |
+
model/model/layers/17/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 231 |
+
model/model/layers/17/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 232 |
+
model/model/layers/17/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 233 |
+
model/model/layers/17/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 234 |
+
model/model/layers/17/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 235 |
+
model/model/layers/17/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 236 |
+
model/model/layers/17/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 237 |
+
model/model/layers/17/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 238 |
+
model/model/layers/17/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 239 |
+
model/model/layers/17/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 240 |
+
model/model/layers/17/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 241 |
+
model/model/layers/17/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 242 |
+
model/model/layers/17/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 243 |
+
model/model/layers/17/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 244 |
+
model/model/layers/17/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 245 |
+
model/model/layers/17/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 246 |
+
model/model/layers/17/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 247 |
+
model/model/layers/19/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 248 |
+
model/model/layers/19/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 249 |
+
model/model/layers/19/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 250 |
+
model/model/layers/19/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 251 |
+
model/model/layers/19/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 252 |
+
model/model/layers/19/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 253 |
+
model/model/layers/19/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 254 |
+
model/model/layers/19/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 255 |
+
model/model/layers/19/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 256 |
+
model/model/layers/19/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 257 |
+
model/model/layers/19/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 258 |
+
model/model/layers/19/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 259 |
+
model/model/layers/19/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 260 |
+
model/model/layers/19/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 261 |
+
model/model/layers/19/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 262 |
+
model/model/layers/19/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 263 |
+
model/model/layers/19/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 264 |
+
model/model/layers/19/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 265 |
+
model/model/layers/19/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 266 |
+
model/model/layers/19/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 267 |
+
model/model/layers/19/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 268 |
+
model/model/layers/19/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 269 |
+
model/model/layers/19/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 270 |
+
model/model/layers/10/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 271 |
+
model/model/layers/10/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 272 |
+
model/model/layers/10/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 273 |
+
model/model/layers/10/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 274 |
+
model/model/layers/10/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 275 |
+
model/model/layers/10/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 276 |
+
model/model/layers/10/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 277 |
+
model/model/layers/10/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 278 |
+
model/model/layers/10/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 279 |
+
model/model/layers/10/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 280 |
+
model/model/layers/10/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 281 |
+
model/model/layers/10/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 282 |
+
model/model/layers/10/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 283 |
+
model/model/layers/10/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 284 |
+
model/model/layers/10/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 285 |
+
model/model/layers/10/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 286 |
+
model/model/layers/10/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 287 |
+
model/model/layers/10/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 288 |
+
model/model/layers/10/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 289 |
+
model/model/layers/10/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 290 |
+
model/model/layers/10/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 291 |
+
model/model/layers/10/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 292 |
+
model/model/layers/10/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 293 |
+
model/model/layers/8/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 294 |
+
model/model/layers/8/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 295 |
+
model/model/layers/8/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 296 |
+
model/model/layers/8/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 297 |
+
model/model/layers/8/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 298 |
+
model/model/layers/8/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 299 |
+
model/model/layers/8/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 300 |
+
model/model/layers/8/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 301 |
+
model/model/layers/8/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 302 |
+
model/model/layers/8/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 303 |
+
model/model/layers/8/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 304 |
+
model/model/layers/8/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 305 |
+
model/model/layers/8/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 306 |
+
model/model/layers/8/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 307 |
+
model/model/layers/8/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 308 |
+
model/model/layers/8/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 309 |
+
model/model/layers/8/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 310 |
+
model/model/layers/8/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 311 |
+
model/model/layers/8/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 312 |
+
model/model/layers/8/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 313 |
+
model/model/layers/8/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 314 |
+
model/model/layers/8/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 315 |
+
model/model/layers/8/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 316 |
+
model/model/layers/16/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 317 |
+
model/model/layers/16/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 318 |
+
model/model/layers/16/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 319 |
+
model/model/layers/16/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 320 |
+
model/model/layers/16/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 321 |
+
model/model/layers/16/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 322 |
+
model/model/layers/16/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 323 |
+
model/model/layers/16/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 324 |
+
model/model/layers/16/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 325 |
+
model/model/layers/16/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 326 |
+
model/model/layers/16/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 327 |
+
model/model/layers/16/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 328 |
+
model/model/layers/16/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 329 |
+
model/model/layers/16/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 330 |
+
model/model/layers/16/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 331 |
+
model/model/layers/16/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 332 |
+
model/model/layers/16/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 333 |
+
model/model/layers/16/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 334 |
+
model/model/layers/16/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 335 |
+
model/model/layers/16/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 336 |
+
model/model/layers/16/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 337 |
+
model/model/layers/16/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 338 |
+
model/model/layers/16/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 339 |
+
model/model/layers/11/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 340 |
+
model/model/layers/11/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 341 |
+
model/model/layers/11/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 342 |
+
model/model/layers/11/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 343 |
+
model/model/layers/11/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 344 |
+
model/model/layers/11/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 345 |
+
model/model/layers/11/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 346 |
+
model/model/layers/11/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 347 |
+
model/model/layers/11/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 348 |
+
model/model/layers/11/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 349 |
+
model/model/layers/11/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 350 |
+
model/model/layers/11/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 351 |
+
model/model/layers/11/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 352 |
+
model/model/layers/11/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 353 |
+
model/model/layers/11/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 354 |
+
model/model/layers/11/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 355 |
+
model/model/layers/11/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 356 |
+
model/model/layers/11/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 357 |
+
model/model/layers/11/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 358 |
+
model/model/layers/11/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 359 |
+
model/model/layers/11/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 360 |
+
model/model/layers/11/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 361 |
+
model/model/layers/11/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 362 |
+
model/model/layers/14/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 363 |
+
model/model/layers/14/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 364 |
+
model/model/layers/14/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 365 |
+
model/model/layers/14/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 366 |
+
model/model/layers/14/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 367 |
+
model/model/layers/14/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 368 |
+
model/model/layers/14/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 369 |
+
model/model/layers/14/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 370 |
+
model/model/layers/14/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 371 |
+
model/model/layers/14/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 372 |
+
model/model/layers/14/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 373 |
+
model/model/layers/14/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 374 |
+
model/model/layers/14/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 375 |
+
model/model/layers/14/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 376 |
+
model/model/layers/14/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 377 |
+
model/model/layers/14/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 378 |
+
model/model/layers/14/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 379 |
+
model/model/layers/14/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 380 |
+
model/model/layers/14/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 381 |
+
model/model/layers/14/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 382 |
+
model/model/layers/14/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 383 |
+
model/model/layers/14/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 384 |
+
model/model/layers/14/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 385 |
+
model/model/layers/6/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 386 |
+
model/model/layers/6/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 387 |
+
model/model/layers/6/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 388 |
+
model/model/layers/6/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 389 |
+
model/model/layers/6/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 390 |
+
model/model/layers/6/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 391 |
+
model/model/layers/6/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 392 |
+
model/model/layers/6/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 393 |
+
model/model/layers/6/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 394 |
+
model/model/layers/6/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 395 |
+
model/model/layers/6/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 396 |
+
model/model/layers/6/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 397 |
+
model/model/layers/6/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 398 |
+
model/model/layers/6/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 399 |
+
model/model/layers/6/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 400 |
+
model/model/layers/6/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 401 |
+
model/model/layers/6/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 402 |
+
model/model/layers/6/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 403 |
+
model/model/layers/6/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 404 |
+
model/model/layers/6/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 405 |
+
model/model/layers/6/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 406 |
+
model/model/layers/6/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 407 |
+
model/model/layers/6/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 408 |
+
model/model/layers/23/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 409 |
+
model/model/layers/23/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 410 |
+
model/model/layers/23/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 411 |
+
model/model/layers/23/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 412 |
+
model/model/layers/23/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 413 |
+
model/model/layers/23/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 414 |
+
model/model/layers/23/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 415 |
+
model/model/layers/23/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 416 |
+
model/model/layers/23/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 417 |
+
model/model/layers/23/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 418 |
+
model/model/layers/23/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 419 |
+
model/model/layers/23/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 420 |
+
model/model/layers/23/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 421 |
+
model/model/layers/23/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 422 |
+
model/model/layers/23/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 423 |
+
model/model/layers/23/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 424 |
+
model/model/layers/23/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 425 |
+
model/model/layers/23/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 426 |
+
model/model/layers/23/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 427 |
+
model/model/layers/23/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 428 |
+
model/model/layers/23/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 429 |
+
model/model/layers/23/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 430 |
+
model/model/layers/23/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 431 |
+
model/model/layers/2/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 432 |
+
model/model/layers/2/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 433 |
+
model/model/layers/2/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 434 |
+
model/model/layers/2/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 435 |
+
model/model/layers/2/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 436 |
+
model/model/layers/2/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 437 |
+
model/model/layers/2/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 438 |
+
model/model/layers/2/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 439 |
+
model/model/layers/2/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 440 |
+
model/model/layers/2/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 441 |
+
model/model/layers/2/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 442 |
+
model/model/layers/2/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 443 |
+
model/model/layers/2/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 444 |
+
model/model/layers/2/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 445 |
+
model/model/layers/2/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 446 |
+
model/model/layers/2/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 447 |
+
model/model/layers/2/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 448 |
+
model/model/layers/2/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 449 |
+
model/model/layers/2/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 450 |
+
model/model/layers/2/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 451 |
+
model/model/layers/2/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 452 |
+
model/model/layers/2/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 453 |
+
model/model/layers/2/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 454 |
+
model/model/layers/9/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 455 |
+
model/model/layers/9/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 456 |
+
model/model/layers/9/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 457 |
+
model/model/layers/9/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 458 |
+
model/model/layers/9/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 459 |
+
model/model/layers/9/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 460 |
+
model/model/layers/9/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 461 |
+
model/model/layers/9/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 462 |
+
model/model/layers/9/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 463 |
+
model/model/layers/9/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 464 |
+
model/model/layers/9/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 465 |
+
model/model/layers/9/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 466 |
+
model/model/layers/9/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 467 |
+
model/model/layers/9/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 468 |
+
model/model/layers/9/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 469 |
+
model/model/layers/9/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 470 |
+
model/model/layers/9/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 471 |
+
model/model/layers/9/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 472 |
+
model/model/layers/9/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 473 |
+
model/model/layers/9/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 474 |
+
model/model/layers/9/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 475 |
+
model/model/layers/9/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 476 |
+
model/model/layers/9/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 477 |
+
model/model/layers/22/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 478 |
+
model/model/layers/22/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 479 |
+
model/model/layers/22/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 480 |
+
model/model/layers/22/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 481 |
+
model/model/layers/22/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 482 |
+
model/model/layers/22/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 483 |
+
model/model/layers/22/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 484 |
+
model/model/layers/22/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 485 |
+
model/model/layers/22/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 486 |
+
model/model/layers/22/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 487 |
+
model/model/layers/22/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 488 |
+
model/model/layers/22/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 489 |
+
model/model/layers/22/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 490 |
+
model/model/layers/22/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 491 |
+
model/model/layers/22/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 492 |
+
model/model/layers/22/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 493 |
+
model/model/layers/22/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 494 |
+
model/model/layers/22/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 495 |
+
model/model/layers/22/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 496 |
+
model/model/layers/22/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 497 |
+
model/model/layers/22/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 498 |
+
model/model/layers/22/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 499 |
+
model/model/layers/22/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 500 |
+
model/model/layers/15/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 501 |
+
model/model/layers/15/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 502 |
+
model/model/layers/15/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 503 |
+
model/model/layers/15/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 504 |
+
model/model/layers/15/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 505 |
+
model/model/layers/15/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 506 |
+
model/model/layers/15/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 507 |
+
model/model/layers/15/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 508 |
+
model/model/layers/15/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 509 |
+
model/model/layers/15/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 510 |
+
model/model/layers/15/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 511 |
+
model/model/layers/15/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 512 |
+
model/model/layers/15/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 513 |
+
model/model/layers/15/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 514 |
+
model/model/layers/15/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 515 |
+
model/model/layers/15/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 516 |
+
model/model/layers/15/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 517 |
+
model/model/layers/15/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 518 |
+
model/model/layers/15/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 519 |
+
model/model/layers/15/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 520 |
+
model/model/layers/15/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 521 |
+
model/model/layers/15/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 522 |
+
model/model/layers/15/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 523 |
+
model/model/layers/7/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 524 |
+
model/model/layers/7/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 525 |
+
model/model/layers/7/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 526 |
+
model/model/layers/7/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 527 |
+
model/model/layers/7/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 528 |
+
model/model/layers/7/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 529 |
+
model/model/layers/7/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 530 |
+
model/model/layers/7/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 531 |
+
model/model/layers/7/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 532 |
+
model/model/layers/7/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 533 |
+
model/model/layers/7/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 534 |
+
model/model/layers/7/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 535 |
+
model/model/layers/7/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 536 |
+
model/model/layers/7/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 537 |
+
model/model/layers/7/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 538 |
+
model/model/layers/7/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 539 |
+
model/model/layers/7/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 540 |
+
model/model/layers/7/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 541 |
+
model/model/layers/7/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 542 |
+
model/model/layers/7/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 543 |
+
model/model/layers/7/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 544 |
+
model/model/layers/7/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 545 |
+
model/model/layers/7/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 546 |
+
model/model/layers/0/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 547 |
+
model/model/layers/0/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 548 |
+
model/model/layers/0/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 549 |
+
model/model/layers/0/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 550 |
+
model/model/layers/0/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 551 |
+
model/model/layers/0/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 552 |
+
model/model/layers/0/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 553 |
+
model/model/layers/0/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 554 |
+
model/model/layers/0/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 555 |
+
model/model/layers/0/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 556 |
+
model/model/layers/0/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 557 |
+
model/model/layers/0/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 558 |
+
model/model/layers/0/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 559 |
+
model/model/layers/0/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 560 |
+
model/model/layers/0/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 561 |
+
model/model/layers/0/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 562 |
+
model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 563 |
+
model/model/layers/0/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 564 |
+
model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 565 |
+
model/model/layers/0/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 566 |
+
model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 567 |
+
model/model/layers/0/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 568 |
+
model/model/layers/0/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 569 |
+
model/model/layers/12/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 570 |
+
model/model/layers/12/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 571 |
+
model/model/layers/12/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 572 |
+
model/model/layers/12/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 573 |
+
model/model/layers/12/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 574 |
+
model/model/layers/12/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 575 |
+
model/model/layers/12/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 576 |
+
model/model/layers/12/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 577 |
+
model/model/layers/12/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
|
| 578 |
+
model/model/layers/12/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
|
| 579 |
+
model/model/layers/12/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 580 |
+
model/model/layers/12/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
|
| 581 |
+
model/model/layers/12/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 582 |
+
model/model/layers/12/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 583 |
+
model/model/layers/12/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 584 |
+
model/model/layers/12/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
| 585 |
+
model/model/layers/12/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 586 |
+
model/model/layers/12/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 587 |
+
model/model/layers/12/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 588 |
+
model/model/layers/12/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 589 |
+
model/model/layers/12/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
|
| 590 |
+
model/model/layers/12/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 591 |
+
model/model/layers/12/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 592 |
+
model/lm_head/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
|
| 593 |
+
model/lm_head/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
|
| 594 |
+
model/lm_head/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
|
| 595 |
+
model/lm_head/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- EasyDeL
|
| 4 |
+
- GptOssForCausalLM
|
| 5 |
+
- TaskType.CAUSAL_LM
|
| 6 |
+
- AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3
|
| 7 |
+
- safetensors
|
| 8 |
+
- TPU
|
| 9 |
+
- GPU
|
| 10 |
+
- XLA
|
| 11 |
+
- Flax
|
| 12 |
+
---
|
| 13 |
+
<p align="center">
|
| 14 |
+
<a href="https://github.com/erfanzar/EasyDeL">
|
| 15 |
+
<img src="https://raw.githubusercontent.com/erfanzar/easydel/main/images/easydel-logo-with-text.png" height="80">
|
| 16 |
+
</a>
|
| 17 |
+
</p>
|
| 18 |
+
|
| 19 |
+
<p align="center">
|
| 20 |
+
<a href="https://github.com/erfanzar/EasyDeL">
|
| 21 |
+
<img src="https://img.shields.io/badge/🤗_EasyDeL-0.1.5-blue.svg" />
|
| 22 |
+
</a>
|
| 23 |
+
<a href="https://github.com/erfanzar/EasyDeL">
|
| 24 |
+
<img src="https://img.shields.io/badge/Model_Type-GptOssForCausalLM-green.svg" />
|
| 25 |
+
</a>
|
| 26 |
+
</p>
|
| 27 |
+
|
| 28 |
+
# EasyDeL/gpt-oss-20b
|
| 29 |
+
|
| 30 |
+
A model implemented using the EasyDeL framework, designed to deliver optimal performance for large-scale natural language processing tasks.
|
| 31 |
+
|
| 32 |
+
## Overview
|
| 33 |
+
|
| 34 |
+
This model is built using [EasyDeL](https://github.com/erfanzar/EasyDeL), an open-source framework designed to enhance and streamline the training and serving of machine learning models, with a primary focus on JAX/Flax on TPU/GPU at scale.
|
| 35 |
+
|
| 36 |
+
EasyDeL provides an efficient, highly optimized, and customizable machine learning model compatible with both GPU and TPU environments. Built with JAX, this model supports advanced features such as sharded model parallelism and customized kernels, making it suitable for distributed training and inference.
|
| 37 |
+
|
| 38 |
+
## Features Provided by EasyDeL
|
| 39 |
+
|
| 40 |
+
**EasyDeL Framework Features:**
|
| 41 |
+
|
| 42 |
+
- **Efficient Implementation**: Built with JAX/Flax for high-performance computation.
|
| 43 |
+
- **Modern Architecture**: Built on Flax NNX for better integration, modularity, and performance.
|
| 44 |
+
- **Multi-Device Support**: Optimized to run on TPU, GPU, and CPU environments.
|
| 45 |
+
- **Sharded Model Parallelism**: Supports model parallelism across multiple devices for scalability (using `auto_shard_model=True`).
|
| 46 |
+
- **Customizable Precision**: Allows specification of `dtype`, `param_dtype`, and `precision`.
|
| 47 |
+
- **Advanced Serving**: Includes the `eSurge` LLM serving engine, `vWhisper` speech endpoints, and OpenAI-compatible APIs.
|
| 48 |
+
- **Optimized Kernels**: Integrates multiple attention mechanisms (like `AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3`) and platform-specific optimizations.
|
| 49 |
+
|
| 50 |
+
## Installation
|
| 51 |
+
|
| 52 |
+
To use this model via EasyDeL, first install EasyDeL:
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
pip install easydel
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## Usage
|
| 59 |
+
|
| 60 |
+
### Loading the Pre-trained Model
|
| 61 |
+
|
| 62 |
+
To load this pre-trained model with EasyDeL:
|
| 63 |
+
|
| 64 |
+
```python
|
| 65 |
+
from easydel import AutoEasyDeLModelForCausalLM, EasyDeLBaseConfigDict, AttentionMechanisms
|
| 66 |
+
from jax import numpy as jnp, lax
|
| 67 |
+
|
| 68 |
+
# Define max_length if needed for memory optimization
|
| 69 |
+
max_length = None
|
| 70 |
+
|
| 71 |
+
# Load model and parameters
|
| 72 |
+
# Set auto_shard_model=True to automatically distribute across devices
|
| 73 |
+
model = AutoEasyDeLModelForCausalLM.from_pretrained(
|
| 74 |
+
"EasyDeL/gpt-oss-20b",
|
| 75 |
+
config_kwargs=EasyDeLBaseConfigDict(
|
| 76 |
+
# use_scan_mlp=False, # Set to True to potentially reduce memory usage
|
| 77 |
+
attn_dtype=jnp.float16, # Or jnp.bfloat16
|
| 78 |
+
# freq_max_position_embeddings=max_length, # Set if using RoPE and need truncation
|
| 79 |
+
# mask_max_position_embeddings=max_length, # Set if max length is defined
|
| 80 |
+
attn_mechanism=AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3 # Matches the mechanism recorded in this model's config.json
|
| 81 |
+
),
|
| 82 |
+
dtype=jnp.float16, # Or jnp.bfloat16 - Computation data type
|
| 83 |
+
param_dtype=jnp.float16, # Or jnp.bfloat16 - Parameter data type
|
| 84 |
+
precision=lax.Precision("fastest"), # Like "default", "fastest", "high", "highest"
|
| 85 |
+
auto_shard_model=True, # Auto-shard across available devices
|
| 86 |
+
)
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## Supported Tasks
|
| 90 |
+
|
| 91 |
+
The primary task for this model is **TaskType.CAUSAL_LM**. Further specific supported tasks are not explicitly listed.
|
| 92 |
+
|
| 93 |
+
## Limitations
|
| 94 |
+
|
| 95 |
+
**General Limitations:**
|
| 96 |
+
|
| 97 |
+
- **Hardware Dependency**: Performance can vary significantly based on the hardware (TPU/GPU) used.
|
| 98 |
+
- **JAX/Flax Setup Required**: The environment must support JAX/Flax for optimal use.
|
| 99 |
+
- **Experimental Features**: Some EasyDeL features (like custom kernels) may require additional configuration.
|
| 100 |
+
|
| 101 |
+
## License 📜
|
| 102 |
+
|
| 103 |
+
EasyDeL is released under the Apache License 2.0. The license for this specific model may differ; please consult the original model repository or documentation.
|
| 104 |
+
|
| 105 |
+
```code
|
| 106 |
+
# Apache License 2.0 (referring to EasyDeL Framework)
|
| 107 |
+
# ... (Full license text usually included in the main repo) ...
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
## Citation
|
| 111 |
+
|
| 112 |
+
If you use EasyDeL in your research or work, please cite it:
|
| 113 |
+
|
| 114 |
+
```bibtex
|
| 115 |
+
@misc{ZareChavoshi_2023,
|
| 116 |
+
title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models},
|
| 117 |
+
url={https://github.com/erfanzar/EasyDeL},
|
| 118 |
+
author={Zare Chavoshi, Erfan},
|
| 119 |
+
year={2023}
|
| 120 |
+
}
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
Please also consider citing the original paper or source for the **EasyDeL/gpt-oss-20b** model architecture if applicable.
|
checkpoint_metadata.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "0.0.84",
|
| 3 |
+
"timestamp": "2025-11-24T14:39:38.911214",
|
| 4 |
+
"checksum": {},
|
| 5 |
+
"array_metadata": {},
|
| 6 |
+
"framework_version": null,
|
| 7 |
+
"custom_metadata": {}
|
| 8 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"GptOssForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": true,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"attn_mechanism": "ragged_page_attention_v3",
|
| 8 |
+
"backend": null,
|
| 9 |
+
"bits": null,
|
| 10 |
+
"blocksize_b": 1,
|
| 11 |
+
"blocksize_k": 128,
|
| 12 |
+
"blocksize_q": 128,
|
| 13 |
+
"decode_attn_mechanism": null,
|
| 14 |
+
"dtype": "bfloat16",
|
| 15 |
+
"easy_method": "train",
|
| 16 |
+
"eos_token_id": 200002,
|
| 17 |
+
"experts_per_token": 4,
|
| 18 |
+
"fcm_max_ratio": 0.0,
|
| 19 |
+
"fcm_min_ratio": 0.0,
|
| 20 |
+
"flash_attention_backward_pass_impl": "triton",
|
| 21 |
+
"freq_max_position_embeddings": 4096,
|
| 22 |
+
"fsdp_is_ep_bound": true,
|
| 23 |
+
"gradient_checkpointing": "",
|
| 24 |
+
"gradient_checkpointing_targets": null,
|
| 25 |
+
"hardware_abstraction": true,
|
| 26 |
+
"head_dim": 64,
|
| 27 |
+
"hidden_act": "silu",
|
| 28 |
+
"hidden_size": 2880,
|
| 29 |
+
"initial_context_length": 4096,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 2880,
|
| 32 |
+
"kv_cache_quantization_blocksize": 128,
|
| 33 |
+
"kv_cache_quantization_method": "None",
|
| 34 |
+
"kv_cache_sharding_sequence_axis_name": "sp",
|
| 35 |
+
"layer_types": [
|
| 36 |
+
"sliding_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"sliding_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"sliding_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"sliding_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"sliding_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"sliding_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"sliding_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"sliding_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"sliding_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"sliding_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"sliding_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"sliding_attention",
|
| 59 |
+
"full_attention"
|
| 60 |
+
],
|
| 61 |
+
"mask_max_position_embeddings": 4096,
|
| 62 |
+
"max_position_embeddings": 131072,
|
| 63 |
+
"mlp_activations_limit": 7.0,
|
| 64 |
+
"model_type": "gpt_oss",
|
| 65 |
+
"moe_force_xla_gmm": false,
|
| 66 |
+
"moe_method": "fused_moe",
|
| 67 |
+
"moe_tiling_size_batch": 4,
|
| 68 |
+
"moe_tiling_size_dim": 128,
|
| 69 |
+
"moe_tiling_size_seqlen": 128,
|
| 70 |
+
"num_attention_heads": 64,
|
| 71 |
+
"num_experts_per_tok": 4,
|
| 72 |
+
"num_hidden_layers": 24,
|
| 73 |
+
"num_key_value_heads": 8,
|
| 74 |
+
"num_local_experts": 32,
|
| 75 |
+
"output_router_logits": false,
|
| 76 |
+
"pallas_k_block_size": 128,
|
| 77 |
+
"pallas_m_block_size": 128,
|
| 78 |
+
"pallas_n_block_size": 128,
|
| 79 |
+
"partition_axis": {
|
| 80 |
+
"attention_dim_axis": null,
|
| 81 |
+
"attention_kv_dim_axis": null,
|
| 82 |
+
"batch_axis": [
|
| 83 |
+
"fsdp",
|
| 84 |
+
"dp"
|
| 85 |
+
],
|
| 86 |
+
"bias_head_sequence_axis": null,
|
| 87 |
+
"bias_key_sequence_axis": null,
|
| 88 |
+
"data_parallel_axis": "dp",
|
| 89 |
+
"decode_attention_dim_axis": null,
|
| 90 |
+
"decode_attention_kv_dim_axis": null,
|
| 91 |
+
"decode_batch_axis": [
|
| 92 |
+
"fsdp",
|
| 93 |
+
"dp"
|
| 94 |
+
],
|
| 95 |
+
"decode_head_axis": "tp",
|
| 96 |
+
"decode_key_sequence_axis": "sp",
|
| 97 |
+
"decode_kv_head_axis": "tp",
|
| 98 |
+
"decode_query_sequence_axis": null,
|
| 99 |
+
"expert_axis": "ep",
|
| 100 |
+
"expert_gate_axis": null,
|
| 101 |
+
"expert_parallel_axis": "ep",
|
| 102 |
+
"fully_sharded_data_parallel_axis": "fsdp",
|
| 103 |
+
"head_axis": "tp",
|
| 104 |
+
"hidden_state_axis": "tp",
|
| 105 |
+
"key_sequence_axis": "sp",
|
| 106 |
+
"kv_head_axis": "tp",
|
| 107 |
+
"mlp_intermediate_axis": "tp",
|
| 108 |
+
"query_sequence_axis": "sp",
|
| 109 |
+
"sequence_axis": "sp",
|
| 110 |
+
"sequence_parallel_axis": "sp",
|
| 111 |
+
"tensor_parallel_axis": "tp",
|
| 112 |
+
"vocab_axis": "tp"
|
| 113 |
+
},
|
| 114 |
+
"platform": null,
|
| 115 |
+
"precompute_masks": true,
|
| 116 |
+
"pretraining_tp": 1,
|
| 117 |
+
"quantization_blocksize": 64,
|
| 118 |
+
"quantization_method": "None",
|
| 119 |
+
"quantization_pattern": ".*",
|
| 120 |
+
"rms_norm_eps": 1e-05,
|
| 121 |
+
"rope_scaling": {
|
| 122 |
+
"beta_fast": 32.0,
|
| 123 |
+
"beta_slow": 1.0,
|
| 124 |
+
"factor": 32.0,
|
| 125 |
+
"original_max_position_embeddings": 4096,
|
| 126 |
+
"rope_type": "yarn",
|
| 127 |
+
"truncate": false
|
| 128 |
+
},
|
| 129 |
+
"rope_theta": 150000,
|
| 130 |
+
"router_aux_loss_coef": 0.9,
|
| 131 |
+
"scan_attention_layers": false,
|
| 132 |
+
"scan_mlp_chunk_size": 1024,
|
| 133 |
+
"scan_ring_attention": true,
|
| 134 |
+
"sequence_axis_name": "sp",
|
| 135 |
+
"sharding_axis_dims": [
|
| 136 |
+
1,
|
| 137 |
+
1,
|
| 138 |
+
1,
|
| 139 |
+
-1,
|
| 140 |
+
1
|
| 141 |
+
],
|
| 142 |
+
"sharding_axis_names": [
|
| 143 |
+
"dp",
|
| 144 |
+
"fsdp",
|
| 145 |
+
"ep",
|
| 146 |
+
"tp",
|
| 147 |
+
"sp"
|
| 148 |
+
],
|
| 149 |
+
"sharding_dcn_axis_dims": null,
|
| 150 |
+
"sliding_window": 128,
|
| 151 |
+
"sp_is_ep_bound": true,
|
| 152 |
+
"swiglu_limit": 7.0,
|
| 153 |
+
"tie_word_embeddings": false,
|
| 154 |
+
"transformers_version": "4.57.1",
|
| 155 |
+
"use_cache": true,
|
| 156 |
+
"use_expert_tensor_mode": false,
|
| 157 |
+
"use_ring_of_experts": false,
|
| 158 |
+
"use_scan_mlp": false,
|
| 159 |
+
"use_sharded_kv_caching": false,
|
| 160 |
+
"use_sharding_constraint": false,
|
| 161 |
+
"vocab_size": 201088
|
| 162 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 199998,
|
| 3 |
+
"device": null,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": [
|
| 6 |
+
200002,
|
| 7 |
+
199999,
|
| 8 |
+
200012
|
| 9 |
+
],
|
| 10 |
+
"pad_token_id": 199999,
|
| 11 |
+
"transformers_version": "4.57.1"
|
| 12 |
+
}
|
model/lm_head/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880,50272],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,201088],"zarr_format":2}
|
model/lm_head/kernel/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a59a0697e61df795e9d452bca2567b951825a916bdb8e68de07dab2fe96a648
|
| 3 |
+
size 226641977
|
model/lm_head/kernel/0.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:738005c53d3d3985f9e69081cb9e9e43f04cb5471a8cec5325c9cb7ab76ba644
|
| 3 |
+
size 226681894
|
model/lm_head/kernel/0.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b4589e1d887a07e81513b318316dccd1f5fef87fa942d667ab4fefe7b8291b8
|
| 3 |
+
size 226770968
|
model/lm_head/kernel/0.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1e1220e3a3d6982aaae3f5bbf444b71e89adf553a954fdcde257b849a70e5f4
|
| 3 |
+
size 227725079
|
model/model/embed_tokens/embedding/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[201088,720],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[201088,2880],"zarr_format":2}
|
model/model/embed_tokens/embedding/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3adc019f3fca5e4a4d2f9d58ccacaa019db91b4658323093331f25f006cc9a00
|
| 3 |
+
size 238375444
|
model/model/embed_tokens/embedding/0.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d93e52183c1eaf471bf4511e12b3241ef43d3a68cc3049d4cf864388ed0d546
|
| 3 |
+
size 238013629
|
model/model/embed_tokens/embedding/0.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce50c59d118efd64f0902246f08d0313ecb7a22c2390769ace5644ff59805f00
|
| 3 |
+
size 237651927
|
model/model/embed_tokens/embedding/0.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15b170706b9ab63112ea73a2d78cac3ec75716a09d5bafa4c472c9631996827b
|
| 3 |
+
size 238261890
|
model/model/layers/0/input_layernorm/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
|
model/model/layers/0/input_layernorm/kernel/0
ADDED
|
Binary file (3.97 kB). View file
|
|
|
model/model/layers/0/mlp/experts/down_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/down_proj/bias/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aeb585a75b26b0719fd983cf91626a88e0aa9e02010c1ee5cd34bff3d004f5e
|
| 3 |
+
size 148544
|
model/model/layers/0/mlp/experts/down_proj/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ad0b1eb9f3fc213a92a640b3ba3ddfa2116c573a4f09c9e0f3a2ce56f305659
|
| 3 |
+
size 205875490
|
model/model/layers/0/mlp/experts/gate_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/gate_proj/bias/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73bf83d1d04f1f22ac7ee7b9c00b3bfd46dc85d0e2b8a2ab72df38a47591ed3e
|
| 3 |
+
size 126932
|
model/model/layers/0/mlp/experts/gate_proj/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8829b36b49b61dea3877236700cc78fd3f289a7a4aabc1c90b815ed96363632
|
| 3 |
+
size 176413980
|
model/model/layers/0/mlp/experts/up_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/up_proj/bias/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65c6a12a90133fa6f0d2b34e49478d42e4be9523e7caf2c85b1e3fb401d30921
|
| 3 |
+
size 108408
|
model/model/layers/0/mlp/experts/up_proj/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
|
model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1dfbccadccf5d0e6b3e69d45cb18c942432a1a62cc3ec245e4a346077503c98a
|
| 3 |
+
size 175659503
|
model/model/layers/0/mlp/router/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[32],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32],"zarr_format":2}
|
model/model/layers/0/mlp/router/bias/0
ADDED
|
Binary file (73 Bytes). View file
|
|
|
model/model/layers/0/mlp/router/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880,32],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,32],"zarr_format":2}
|
model/model/layers/0/mlp/router/kernel/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6916637eaabd97130c84226d31c856e5ec7eba6589c5c7bd598d40ddfbf8ad41
|
| 3 |
+
size 146759
|
model/model/layers/0/post_attention_layernorm/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
|
model/model/layers/0/post_attention_layernorm/kernel/0
ADDED
|
Binary file (4.12 kB). View file
|
|
|
model/model/layers/0/self_attn/k_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2}
|
model/model/layers/0/self_attn/k_proj/bias/0
ADDED
|
Binary file (21 Bytes). View file
|
|
|
model/model/layers/0/self_attn/k_proj/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880,128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2}
|
model/model/layers/0/self_attn/k_proj/kernel/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb28d0346a7f6d995f75fc86b6ad1b97c0b08ba9697c7fcb6de652ac26721ed5
|
| 3 |
+
size 591455
|
model/model/layers/0/self_attn/k_proj/kernel/0.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:126aefc77772730e4601722e663c581e96b4c1a6965d7644a244e4d5a33454a7
|
| 3 |
+
size 591779
|
model/model/layers/0/self_attn/k_proj/kernel/0.2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebef9e3d47721de81a32b985eb4808178311bde35775f25e2447686e335ef095
|
| 3 |
+
size 593231
|
model/model/layers/0/self_attn/k_proj/kernel/0.3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3ef0df3311ec6863669f88ca7f78f84cdc5f1b56682eeb304bfdd0d194cad98
|
| 3 |
+
size 590638
|
model/model/layers/0/self_attn/o_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
|
model/model/layers/0/self_attn/o_proj/bias/0
ADDED
|
Binary file (4.59 kB). View file
|
|
|
model/model/layers/0/self_attn/o_proj/kernel/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[1024,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096,2880],"zarr_format":2}
|
model/model/layers/0/self_attn/o_proj/kernel/0.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a1ac4e4b9b92bace048723b1910813b433c205f4de2b43aed7194e540dec035
|
| 3 |
+
size 4677833
|
model/model/layers/0/self_attn/o_proj/kernel/1.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e84b5d764029feb8b973b01797488701f432f8937e813ef1c89bb6119529246e
|
| 3 |
+
size 4673577
|
model/model/layers/0/self_attn/o_proj/kernel/2.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d48de19ebb6bc364d0e4ecdb3f9da8d4a0a7fef3d56363aa0ba83181eb4a91d2
|
| 3 |
+
size 4682913
|
model/model/layers/0/self_attn/o_proj/kernel/3.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fb890ca70a46f9f014efae03b2907716c044df9c50c81d86680bede3844c5a0
|
| 3 |
+
size 4695943
|
model/model/layers/0/self_attn/q_proj/bias/.zarray
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"chunks":[4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096],"zarr_format":2}
|