erfanzar commited on
Commit
461cfa6
·
verified ·
1 Parent(s): f980725

Upload GptOssForCausalLM

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +560 -0
  2. README.md +123 -0
  3. checkpoint_metadata.json +8 -0
  4. config.json +162 -0
  5. generation_config.json +12 -0
  6. model/lm_head/kernel/.zarray +1 -0
  7. model/lm_head/kernel/0.0 +3 -0
  8. model/lm_head/kernel/0.1 +3 -0
  9. model/lm_head/kernel/0.2 +3 -0
  10. model/lm_head/kernel/0.3 +3 -0
  11. model/model/embed_tokens/embedding/.zarray +1 -0
  12. model/model/embed_tokens/embedding/0.0 +3 -0
  13. model/model/embed_tokens/embedding/0.1 +3 -0
  14. model/model/embed_tokens/embedding/0.2 +3 -0
  15. model/model/embed_tokens/embedding/0.3 +3 -0
  16. model/model/layers/0/input_layernorm/kernel/.zarray +1 -0
  17. model/model/layers/0/input_layernorm/kernel/0 +0 -0
  18. model/model/layers/0/mlp/experts/down_proj/bias/.zarray +1 -0
  19. model/model/layers/0/mlp/experts/down_proj/bias/0.0 +3 -0
  20. model/model/layers/0/mlp/experts/down_proj/kernel/.zarray +1 -0
  21. model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0 +3 -0
  22. model/model/layers/0/mlp/experts/gate_proj/bias/.zarray +1 -0
  23. model/model/layers/0/mlp/experts/gate_proj/bias/0.0 +3 -0
  24. model/model/layers/0/mlp/experts/gate_proj/kernel/.zarray +1 -0
  25. model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0 +3 -0
  26. model/model/layers/0/mlp/experts/up_proj/bias/.zarray +1 -0
  27. model/model/layers/0/mlp/experts/up_proj/bias/0.0 +3 -0
  28. model/model/layers/0/mlp/experts/up_proj/kernel/.zarray +1 -0
  29. model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0 +3 -0
  30. model/model/layers/0/mlp/router/bias/.zarray +1 -0
  31. model/model/layers/0/mlp/router/bias/0 +0 -0
  32. model/model/layers/0/mlp/router/kernel/.zarray +1 -0
  33. model/model/layers/0/mlp/router/kernel/0.0 +3 -0
  34. model/model/layers/0/post_attention_layernorm/kernel/.zarray +1 -0
  35. model/model/layers/0/post_attention_layernorm/kernel/0 +0 -0
  36. model/model/layers/0/self_attn/k_proj/bias/.zarray +1 -0
  37. model/model/layers/0/self_attn/k_proj/bias/0 +0 -0
  38. model/model/layers/0/self_attn/k_proj/kernel/.zarray +1 -0
  39. model/model/layers/0/self_attn/k_proj/kernel/0.0 +3 -0
  40. model/model/layers/0/self_attn/k_proj/kernel/0.1 +3 -0
  41. model/model/layers/0/self_attn/k_proj/kernel/0.2 +3 -0
  42. model/model/layers/0/self_attn/k_proj/kernel/0.3 +3 -0
  43. model/model/layers/0/self_attn/o_proj/bias/.zarray +1 -0
  44. model/model/layers/0/self_attn/o_proj/bias/0 +0 -0
  45. model/model/layers/0/self_attn/o_proj/kernel/.zarray +1 -0
  46. model/model/layers/0/self_attn/o_proj/kernel/0.0 +3 -0
  47. model/model/layers/0/self_attn/o_proj/kernel/1.0 +3 -0
  48. model/model/layers/0/self_attn/o_proj/kernel/2.0 +3 -0
  49. model/model/layers/0/self_attn/o_proj/kernel/3.0 +3 -0
  50. model/model/layers/0/self_attn/q_proj/bias/.zarray +1 -0
.gitattributes CHANGED
@@ -33,3 +33,563 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model/model/embed_tokens/embedding/0.1 filter=lfs diff=lfs merge=lfs -text
37
+ model/model/embed_tokens/embedding/0.3 filter=lfs diff=lfs merge=lfs -text
38
+ model/model/embed_tokens/embedding/0.0 filter=lfs diff=lfs merge=lfs -text
39
+ model/model/embed_tokens/embedding/0.2 filter=lfs diff=lfs merge=lfs -text
40
+ model/model/layers/21/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
41
+ model/model/layers/21/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
42
+ model/model/layers/21/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
43
+ model/model/layers/21/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
44
+ model/model/layers/21/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
45
+ model/model/layers/21/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
46
+ model/model/layers/21/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
47
+ model/model/layers/21/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
48
+ model/model/layers/21/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
49
+ model/model/layers/21/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
50
+ model/model/layers/21/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
51
+ model/model/layers/21/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
52
+ model/model/layers/21/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
53
+ model/model/layers/21/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
54
+ model/model/layers/21/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
55
+ model/model/layers/21/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
56
+ model/model/layers/21/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
57
+ model/model/layers/21/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
58
+ model/model/layers/21/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
59
+ model/model/layers/21/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
60
+ model/model/layers/21/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
61
+ model/model/layers/21/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
62
+ model/model/layers/21/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
63
+ model/model/layers/13/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
64
+ model/model/layers/13/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
65
+ model/model/layers/13/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
66
+ model/model/layers/13/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
67
+ model/model/layers/13/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
68
+ model/model/layers/13/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
69
+ model/model/layers/13/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
70
+ model/model/layers/13/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
71
+ model/model/layers/13/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
72
+ model/model/layers/13/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
73
+ model/model/layers/13/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
74
+ model/model/layers/13/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
75
+ model/model/layers/13/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
76
+ model/model/layers/13/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
77
+ model/model/layers/13/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
78
+ model/model/layers/13/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
79
+ model/model/layers/13/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
80
+ model/model/layers/13/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
81
+ model/model/layers/13/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
82
+ model/model/layers/13/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
83
+ model/model/layers/13/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
84
+ model/model/layers/13/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
85
+ model/model/layers/13/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
86
+ model/model/layers/1/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
87
+ model/model/layers/1/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
88
+ model/model/layers/1/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
89
+ model/model/layers/1/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
90
+ model/model/layers/1/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
91
+ model/model/layers/1/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
92
+ model/model/layers/1/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
93
+ model/model/layers/1/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
94
+ model/model/layers/1/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
95
+ model/model/layers/1/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
96
+ model/model/layers/1/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
97
+ model/model/layers/1/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
98
+ model/model/layers/1/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
99
+ model/model/layers/1/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
100
+ model/model/layers/1/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
101
+ model/model/layers/1/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
102
+ model/model/layers/1/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
103
+ model/model/layers/1/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
104
+ model/model/layers/1/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
105
+ model/model/layers/1/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
106
+ model/model/layers/1/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
107
+ model/model/layers/1/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
108
+ model/model/layers/1/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
109
+ model/model/layers/3/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
110
+ model/model/layers/3/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
111
+ model/model/layers/3/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
112
+ model/model/layers/3/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
113
+ model/model/layers/3/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
114
+ model/model/layers/3/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
115
+ model/model/layers/3/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
116
+ model/model/layers/3/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
117
+ model/model/layers/3/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
118
+ model/model/layers/3/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
119
+ model/model/layers/3/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
120
+ model/model/layers/3/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
121
+ model/model/layers/3/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
122
+ model/model/layers/3/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
123
+ model/model/layers/3/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
124
+ model/model/layers/3/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
125
+ model/model/layers/3/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
126
+ model/model/layers/3/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
127
+ model/model/layers/3/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
128
+ model/model/layers/3/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
129
+ model/model/layers/3/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
130
+ model/model/layers/3/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
131
+ model/model/layers/3/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
132
+ model/model/layers/4/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
133
+ model/model/layers/4/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
134
+ model/model/layers/4/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
135
+ model/model/layers/4/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
136
+ model/model/layers/4/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
137
+ model/model/layers/4/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
138
+ model/model/layers/4/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
139
+ model/model/layers/4/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
140
+ model/model/layers/4/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
141
+ model/model/layers/4/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
142
+ model/model/layers/4/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
143
+ model/model/layers/4/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
144
+ model/model/layers/4/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
145
+ model/model/layers/4/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
146
+ model/model/layers/4/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
147
+ model/model/layers/4/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
148
+ model/model/layers/4/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
149
+ model/model/layers/4/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
150
+ model/model/layers/4/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
151
+ model/model/layers/4/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
152
+ model/model/layers/4/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
153
+ model/model/layers/4/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
154
+ model/model/layers/4/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
155
+ model/model/layers/5/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
156
+ model/model/layers/5/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
157
+ model/model/layers/5/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
158
+ model/model/layers/5/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
159
+ model/model/layers/5/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
160
+ model/model/layers/5/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
161
+ model/model/layers/5/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
162
+ model/model/layers/5/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
163
+ model/model/layers/5/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
164
+ model/model/layers/5/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
165
+ model/model/layers/5/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
166
+ model/model/layers/5/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
167
+ model/model/layers/5/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
168
+ model/model/layers/5/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
169
+ model/model/layers/5/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
170
+ model/model/layers/5/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
171
+ model/model/layers/5/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
172
+ model/model/layers/5/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
173
+ model/model/layers/5/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
174
+ model/model/layers/5/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
175
+ model/model/layers/5/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
176
+ model/model/layers/5/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
177
+ model/model/layers/5/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
178
+ model/model/layers/20/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
179
+ model/model/layers/20/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
180
+ model/model/layers/20/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
181
+ model/model/layers/20/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
182
+ model/model/layers/20/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
183
+ model/model/layers/20/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
184
+ model/model/layers/20/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
185
+ model/model/layers/20/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
186
+ model/model/layers/20/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
187
+ model/model/layers/20/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
188
+ model/model/layers/20/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
189
+ model/model/layers/20/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
190
+ model/model/layers/20/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
191
+ model/model/layers/20/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
192
+ model/model/layers/20/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
193
+ model/model/layers/20/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
194
+ model/model/layers/20/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
195
+ model/model/layers/20/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
196
+ model/model/layers/20/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
197
+ model/model/layers/20/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
198
+ model/model/layers/20/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
199
+ model/model/layers/20/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
200
+ model/model/layers/20/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
201
+ model/model/layers/18/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
202
+ model/model/layers/18/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
203
+ model/model/layers/18/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
204
+ model/model/layers/18/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
205
+ model/model/layers/18/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
206
+ model/model/layers/18/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
207
+ model/model/layers/18/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
208
+ model/model/layers/18/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
209
+ model/model/layers/18/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
210
+ model/model/layers/18/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
211
+ model/model/layers/18/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
212
+ model/model/layers/18/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
213
+ model/model/layers/18/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
214
+ model/model/layers/18/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
215
+ model/model/layers/18/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
216
+ model/model/layers/18/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
217
+ model/model/layers/18/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
218
+ model/model/layers/18/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
219
+ model/model/layers/18/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
220
+ model/model/layers/18/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
221
+ model/model/layers/18/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
222
+ model/model/layers/18/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
223
+ model/model/layers/18/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
224
+ model/model/layers/17/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
225
+ model/model/layers/17/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
226
+ model/model/layers/17/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
227
+ model/model/layers/17/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
228
+ model/model/layers/17/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
229
+ model/model/layers/17/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
230
+ model/model/layers/17/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
231
+ model/model/layers/17/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
232
+ model/model/layers/17/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
233
+ model/model/layers/17/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
234
+ model/model/layers/17/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
235
+ model/model/layers/17/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
236
+ model/model/layers/17/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
237
+ model/model/layers/17/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
238
+ model/model/layers/17/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
239
+ model/model/layers/17/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
240
+ model/model/layers/17/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
241
+ model/model/layers/17/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
242
+ model/model/layers/17/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
243
+ model/model/layers/17/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
244
+ model/model/layers/17/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
245
+ model/model/layers/17/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
246
+ model/model/layers/17/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
247
+ model/model/layers/19/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
248
+ model/model/layers/19/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
249
+ model/model/layers/19/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
250
+ model/model/layers/19/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
251
+ model/model/layers/19/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
252
+ model/model/layers/19/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
253
+ model/model/layers/19/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
254
+ model/model/layers/19/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
255
+ model/model/layers/19/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
256
+ model/model/layers/19/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
257
+ model/model/layers/19/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
258
+ model/model/layers/19/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
259
+ model/model/layers/19/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
260
+ model/model/layers/19/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
261
+ model/model/layers/19/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
262
+ model/model/layers/19/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
263
+ model/model/layers/19/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
264
+ model/model/layers/19/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
265
+ model/model/layers/19/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
266
+ model/model/layers/19/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
267
+ model/model/layers/19/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
268
+ model/model/layers/19/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
269
+ model/model/layers/19/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
270
+ model/model/layers/10/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
271
+ model/model/layers/10/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
272
+ model/model/layers/10/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
273
+ model/model/layers/10/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
274
+ model/model/layers/10/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
275
+ model/model/layers/10/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
276
+ model/model/layers/10/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
277
+ model/model/layers/10/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
278
+ model/model/layers/10/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
279
+ model/model/layers/10/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
280
+ model/model/layers/10/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
281
+ model/model/layers/10/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
282
+ model/model/layers/10/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
283
+ model/model/layers/10/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
284
+ model/model/layers/10/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
285
+ model/model/layers/10/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
286
+ model/model/layers/10/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
287
+ model/model/layers/10/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
288
+ model/model/layers/10/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
289
+ model/model/layers/10/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
290
+ model/model/layers/10/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
291
+ model/model/layers/10/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
292
+ model/model/layers/10/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
293
+ model/model/layers/8/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
294
+ model/model/layers/8/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
295
+ model/model/layers/8/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
296
+ model/model/layers/8/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
297
+ model/model/layers/8/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
298
+ model/model/layers/8/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
299
+ model/model/layers/8/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
300
+ model/model/layers/8/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
301
+ model/model/layers/8/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
302
+ model/model/layers/8/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
303
+ model/model/layers/8/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
304
+ model/model/layers/8/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
305
+ model/model/layers/8/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
306
+ model/model/layers/8/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
307
+ model/model/layers/8/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
308
+ model/model/layers/8/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
309
+ model/model/layers/8/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
310
+ model/model/layers/8/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
311
+ model/model/layers/8/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
312
+ model/model/layers/8/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
313
+ model/model/layers/8/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
314
+ model/model/layers/8/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
315
+ model/model/layers/8/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
316
+ model/model/layers/16/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
317
+ model/model/layers/16/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
318
+ model/model/layers/16/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
319
+ model/model/layers/16/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
320
+ model/model/layers/16/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
321
+ model/model/layers/16/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
322
+ model/model/layers/16/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
323
+ model/model/layers/16/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
324
+ model/model/layers/16/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
325
+ model/model/layers/16/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
326
+ model/model/layers/16/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
327
+ model/model/layers/16/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
328
+ model/model/layers/16/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
329
+ model/model/layers/16/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
330
+ model/model/layers/16/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
331
+ model/model/layers/16/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
332
+ model/model/layers/16/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
333
+ model/model/layers/16/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
334
+ model/model/layers/16/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
335
+ model/model/layers/16/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
336
+ model/model/layers/16/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
337
+ model/model/layers/16/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
338
+ model/model/layers/16/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
339
+ model/model/layers/11/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
340
+ model/model/layers/11/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
341
+ model/model/layers/11/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
342
+ model/model/layers/11/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
343
+ model/model/layers/11/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
344
+ model/model/layers/11/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
345
+ model/model/layers/11/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
346
+ model/model/layers/11/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
347
+ model/model/layers/11/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
348
+ model/model/layers/11/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
349
+ model/model/layers/11/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
350
+ model/model/layers/11/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
351
+ model/model/layers/11/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
352
+ model/model/layers/11/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
353
+ model/model/layers/11/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
354
+ model/model/layers/11/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
355
+ model/model/layers/11/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
356
+ model/model/layers/11/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
357
+ model/model/layers/11/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
358
+ model/model/layers/11/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
359
+ model/model/layers/11/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
360
+ model/model/layers/11/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
361
+ model/model/layers/11/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
362
+ model/model/layers/14/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
363
+ model/model/layers/14/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
364
+ model/model/layers/14/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
365
+ model/model/layers/14/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
366
+ model/model/layers/14/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
367
+ model/model/layers/14/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
368
+ model/model/layers/14/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
369
+ model/model/layers/14/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
370
+ model/model/layers/14/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
371
+ model/model/layers/14/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
372
+ model/model/layers/14/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
373
+ model/model/layers/14/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
374
+ model/model/layers/14/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
375
+ model/model/layers/14/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
376
+ model/model/layers/14/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
377
+ model/model/layers/14/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
378
+ model/model/layers/14/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
379
+ model/model/layers/14/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
380
+ model/model/layers/14/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
381
+ model/model/layers/14/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
382
+ model/model/layers/14/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
383
+ model/model/layers/14/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
384
+ model/model/layers/14/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
385
+ model/model/layers/6/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
386
+ model/model/layers/6/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
387
+ model/model/layers/6/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
388
+ model/model/layers/6/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
389
+ model/model/layers/6/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
390
+ model/model/layers/6/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
391
+ model/model/layers/6/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
392
+ model/model/layers/6/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
393
+ model/model/layers/6/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
394
+ model/model/layers/6/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
395
+ model/model/layers/6/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
396
+ model/model/layers/6/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
397
+ model/model/layers/6/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
398
+ model/model/layers/6/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
399
+ model/model/layers/6/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
400
+ model/model/layers/6/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
401
+ model/model/layers/6/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
402
+ model/model/layers/6/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
403
+ model/model/layers/6/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
404
+ model/model/layers/6/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
405
+ model/model/layers/6/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
406
+ model/model/layers/6/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
407
+ model/model/layers/6/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
408
+ model/model/layers/23/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
409
+ model/model/layers/23/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
410
+ model/model/layers/23/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
411
+ model/model/layers/23/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
412
+ model/model/layers/23/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
413
+ model/model/layers/23/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
414
+ model/model/layers/23/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
415
+ model/model/layers/23/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
416
+ model/model/layers/23/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
417
+ model/model/layers/23/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
418
+ model/model/layers/23/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
419
+ model/model/layers/23/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
420
+ model/model/layers/23/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
421
+ model/model/layers/23/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
422
+ model/model/layers/23/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
423
+ model/model/layers/23/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
424
+ model/model/layers/23/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
425
+ model/model/layers/23/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
426
+ model/model/layers/23/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
427
+ model/model/layers/23/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
428
+ model/model/layers/23/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
429
+ model/model/layers/23/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
430
+ model/model/layers/23/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
431
+ model/model/layers/2/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
432
+ model/model/layers/2/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
433
+ model/model/layers/2/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
434
+ model/model/layers/2/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
435
+ model/model/layers/2/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
436
+ model/model/layers/2/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
437
+ model/model/layers/2/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
438
+ model/model/layers/2/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
439
+ model/model/layers/2/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
440
+ model/model/layers/2/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
441
+ model/model/layers/2/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
442
+ model/model/layers/2/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
443
+ model/model/layers/2/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
444
+ model/model/layers/2/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
445
+ model/model/layers/2/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
446
+ model/model/layers/2/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
447
+ model/model/layers/2/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
448
+ model/model/layers/2/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
449
+ model/model/layers/2/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
450
+ model/model/layers/2/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
451
+ model/model/layers/2/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
452
+ model/model/layers/2/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
453
+ model/model/layers/2/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
454
+ model/model/layers/9/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
455
+ model/model/layers/9/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
456
+ model/model/layers/9/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
457
+ model/model/layers/9/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
458
+ model/model/layers/9/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
459
+ model/model/layers/9/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
460
+ model/model/layers/9/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
461
+ model/model/layers/9/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
462
+ model/model/layers/9/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
463
+ model/model/layers/9/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
464
+ model/model/layers/9/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
465
+ model/model/layers/9/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
466
+ model/model/layers/9/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
467
+ model/model/layers/9/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
468
+ model/model/layers/9/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
469
+ model/model/layers/9/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
470
+ model/model/layers/9/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
471
+ model/model/layers/9/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
472
+ model/model/layers/9/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
473
+ model/model/layers/9/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
474
+ model/model/layers/9/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
475
+ model/model/layers/9/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
476
+ model/model/layers/9/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
477
+ model/model/layers/22/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
478
+ model/model/layers/22/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
479
+ model/model/layers/22/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
480
+ model/model/layers/22/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
481
+ model/model/layers/22/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
482
+ model/model/layers/22/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
483
+ model/model/layers/22/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
484
+ model/model/layers/22/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
485
+ model/model/layers/22/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
486
+ model/model/layers/22/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
487
+ model/model/layers/22/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
488
+ model/model/layers/22/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
489
+ model/model/layers/22/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
490
+ model/model/layers/22/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
491
+ model/model/layers/22/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
492
+ model/model/layers/22/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
493
+ model/model/layers/22/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
494
+ model/model/layers/22/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
495
+ model/model/layers/22/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
496
+ model/model/layers/22/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
497
+ model/model/layers/22/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
498
+ model/model/layers/22/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
499
+ model/model/layers/22/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
500
+ model/model/layers/15/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
501
+ model/model/layers/15/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
502
+ model/model/layers/15/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
503
+ model/model/layers/15/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
504
+ model/model/layers/15/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
505
+ model/model/layers/15/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
506
+ model/model/layers/15/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
507
+ model/model/layers/15/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
508
+ model/model/layers/15/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
509
+ model/model/layers/15/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
510
+ model/model/layers/15/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
511
+ model/model/layers/15/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
512
+ model/model/layers/15/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
513
+ model/model/layers/15/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
514
+ model/model/layers/15/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
515
+ model/model/layers/15/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
516
+ model/model/layers/15/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
517
+ model/model/layers/15/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
518
+ model/model/layers/15/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
519
+ model/model/layers/15/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
520
+ model/model/layers/15/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
521
+ model/model/layers/15/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
522
+ model/model/layers/15/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
523
+ model/model/layers/7/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
524
+ model/model/layers/7/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
525
+ model/model/layers/7/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
526
+ model/model/layers/7/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
527
+ model/model/layers/7/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
528
+ model/model/layers/7/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
529
+ model/model/layers/7/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
530
+ model/model/layers/7/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
531
+ model/model/layers/7/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
532
+ model/model/layers/7/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
533
+ model/model/layers/7/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
534
+ model/model/layers/7/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
535
+ model/model/layers/7/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
536
+ model/model/layers/7/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
537
+ model/model/layers/7/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
538
+ model/model/layers/7/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
539
+ model/model/layers/7/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
540
+ model/model/layers/7/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
541
+ model/model/layers/7/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
542
+ model/model/layers/7/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
543
+ model/model/layers/7/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
544
+ model/model/layers/7/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
545
+ model/model/layers/7/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
546
+ model/model/layers/0/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
547
+ model/model/layers/0/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
548
+ model/model/layers/0/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
549
+ model/model/layers/0/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
550
+ model/model/layers/0/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
551
+ model/model/layers/0/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
552
+ model/model/layers/0/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
553
+ model/model/layers/0/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
554
+ model/model/layers/0/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
555
+ model/model/layers/0/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
556
+ model/model/layers/0/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
557
+ model/model/layers/0/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
558
+ model/model/layers/0/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
559
+ model/model/layers/0/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
560
+ model/model/layers/0/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
561
+ model/model/layers/0/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
562
+ model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
563
+ model/model/layers/0/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
564
+ model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
565
+ model/model/layers/0/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
566
+ model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
567
+ model/model/layers/0/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
568
+ model/model/layers/0/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
569
+ model/model/layers/12/self_attn/v_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
570
+ model/model/layers/12/self_attn/v_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
571
+ model/model/layers/12/self_attn/v_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
572
+ model/model/layers/12/self_attn/v_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
573
+ model/model/layers/12/self_attn/q_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
574
+ model/model/layers/12/self_attn/q_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
575
+ model/model/layers/12/self_attn/q_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
576
+ model/model/layers/12/self_attn/q_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
577
+ model/model/layers/12/self_attn/o_proj/kernel/1.0 filter=lfs diff=lfs merge=lfs -text
578
+ model/model/layers/12/self_attn/o_proj/kernel/3.0 filter=lfs diff=lfs merge=lfs -text
579
+ model/model/layers/12/self_attn/o_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
580
+ model/model/layers/12/self_attn/o_proj/kernel/2.0 filter=lfs diff=lfs merge=lfs -text
581
+ model/model/layers/12/self_attn/k_proj/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
582
+ model/model/layers/12/self_attn/k_proj/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
583
+ model/model/layers/12/self_attn/k_proj/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
584
+ model/model/layers/12/self_attn/k_proj/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
585
+ model/model/layers/12/mlp/experts/gate_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
586
+ model/model/layers/12/mlp/experts/gate_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
587
+ model/model/layers/12/mlp/experts/up_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
588
+ model/model/layers/12/mlp/experts/up_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
589
+ model/model/layers/12/mlp/experts/down_proj/kernel/0.0.0 filter=lfs diff=lfs merge=lfs -text
590
+ model/model/layers/12/mlp/experts/down_proj/bias/0.0 filter=lfs diff=lfs merge=lfs -text
591
+ model/model/layers/12/mlp/router/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
592
+ model/lm_head/kernel/0.1 filter=lfs diff=lfs merge=lfs -text
593
+ model/lm_head/kernel/0.3 filter=lfs diff=lfs merge=lfs -text
594
+ model/lm_head/kernel/0.0 filter=lfs diff=lfs merge=lfs -text
595
+ model/lm_head/kernel/0.2 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - EasyDeL
4
+ - GptOssForCausalLM
5
+ - TaskType.CAUSAL_LM
6
+ - AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3
7
+ - safetensors
8
+ - TPU
9
+ - GPU
10
+ - XLA
11
+ - Flax
12
+ ---
13
+ <p align="center">
14
+ <a href="https://github.com/erfanzar/EasyDeL">
15
+ <img src="https://raw.githubusercontent.com/erfanzar/easydel/main/images/easydel-logo-with-text.png" height="80">
16
+ </a>
17
+ </p>
18
+
19
+ <p align="center">
20
+ <a href="https://github.com/erfanzar/EasyDeL">
21
+ <img src="https://img.shields.io/badge/🤗_EasyDeL-0.1.5-blue.svg" />
22
+ </a>
23
+ <a href="https://github.com/erfanzar/EasyDeL">
24
+ <img src="https://img.shields.io/badge/Model_Type-GptOssForCausalLM-green.svg" />
25
+ </a>
26
+ </p>
27
+
28
+ # EasyDeL/gpt-oss-20b
29
+
30
+ A model implemented using the EasyDeL framework, designed to deliver optimal performance for large-scale natural language processing tasks.
31
+
32
+ ## Overview
33
+
34
+ This model is built using [EasyDeL](https://github.com/erfanzar/EasyDeL), an open-source framework designed to enhance and streamline the training and serving of machine learning models, with a primary focus on JAX/Flax on TPU/GPU at scale.
35
+
36
+ EasyDeL provides an efficient, highly optimized, and customizable implementation of this model that is compatible with both GPU and TPU environments. Built with JAX, it supports advanced features such as sharded model parallelism and customized kernels, making it suitable for distributed training and inference.
37
+
38
+ ## Features Provided by EasyDeL
39
+
40
+ **EasyDeL Framework Features:**
41
+
42
+ - **Efficient Implementation**: Built with JAX/Flax for high-performance computation.
43
+ - **Modern Architecture**: Built on Flax NNX for better integration, modularity, and performance.
44
+ - **Multi-Device Support**: Optimized to run on TPU, GPU, and CPU environments.
45
+ - **Sharded Model Parallelism**: Supports model parallelism across multiple devices for scalability (using `auto_shard_model=True`).
46
+ - **Customizable Precision**: Allows specification of `dtype`, `param_dtype`, and `precision`.
47
+ - **Advanced Serving**: Includes `eSurge` LLM serving engine, `vWhisper` speech endpoints, and OpenAI-compatible APIs.
48
+ - **Optimized Kernels**: Integrates multiple attention mechanisms (like `AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3`) and platform-specific optimizations.
49
+
50
+ ## Installation
51
+
52
+ To use this model via EasyDeL, first install EasyDeL:
53
+
54
+ ```bash
55
+ pip install easydel
56
+ ```
57
+
58
+ ## Usage
59
+
60
+ ### Loading the Pre-trained Model
61
+
62
+ To load this pre-trained model with EasyDeL:
63
+
64
+ ```python
65
+ from easydel import AutoEasyDeLModelForCausalLM, EasyDeLBaseConfigDict, AttentionMechanisms
66
+ from jax import numpy as jnp, lax
67
+
68
+ # Define max_length if needed for memory optimization
69
+ max_length = None
70
+
71
+ # Load model and parameters
72
+ # Set auto_shard_model=True to automatically distribute across devices
73
+ model = AutoEasyDeLModelForCausalLM.from_pretrained(
74
+ "EasyDeL/gpt-oss-20b",
75
+ config_kwargs=EasyDeLBaseConfigDict(
76
+ # use_scan_mlp=False, # Set to True to potentially reduce memory usage
77
+ attn_dtype=jnp.float16, # Or jnp.bfloat16
78
+ # freq_max_position_embeddings=max_length, # Set if using RoPE and need truncation
79
+ # mask_max_position_embeddings=max_length, # Set if max length is defined
80
+ attn_mechanism=AttentionMechanisms.RAGGED_PAGE_ATTENTION_V3 # Matches the mechanism used by this model
81
+ ),
82
+ dtype=jnp.float16, # Or jnp.bfloat16 - Computation data type
83
+ param_dtype=jnp.float16, # Or jnp.bfloat16 - Parameter data type
84
+ precision=lax.Precision("fastest"), # Like "default", "fastest", "high", "highest"
85
+ auto_shard_model=True, # Auto-shard across available devices
86
+ )
87
+ ```
88
+
89
+ ## Supported Tasks
90
+
91
+ The primary task for this model is **TaskType.CAUSAL_LM**. Further specific supported tasks are not explicitly listed.
92
+
93
+ ## Limitations
94
+
95
+ **General Limitations:**
96
+
97
+ - **Hardware Dependency**: Performance can vary significantly based on the hardware (TPU/GPU) used.
98
+ - **JAX/Flax Setup Required**: The environment must support JAX/Flax for optimal use.
99
+ - **Experimental Features**: Some EasyDeL features (like custom kernels) may require additional configuration.
100
+
101
+ ## License 📜
102
+
103
+ EasyDeL is released under the Apache License 2.0. The license for this specific model might differ; please consult the original model repository or documentation.
104
+
105
+ ```code
106
+ # Apache License 2.0 (referring to EasyDeL Framework)
107
+ # ... (Full license text usually included in the main repo) ...
108
+ ```
109
+
110
+ ## Citation
111
+
112
+ If you use EasyDeL in your research or work, please cite it:
113
+
114
+ ```bibtex
115
+ @misc{ZareChavoshi_2023,
116
+ title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models},
117
+ url={https://github.com/erfanzar/EasyDeL},
118
+ author={Zare Chavoshi, Erfan},
119
+ year={2023}
120
+ }
121
+ ```
122
+
123
+ Please also consider citing the original paper or source for the **EasyDeL/gpt-oss-20b** model architecture if applicable.
checkpoint_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "0.0.84",
3
+ "timestamp": "2025-11-24T14:39:38.911214",
4
+ "checksum": {},
5
+ "array_metadata": {},
6
+ "framework_version": null,
7
+ "custom_metadata": {}
8
+ }
config.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GptOssForCausalLM"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "attn_mechanism": "ragged_page_attention_v3",
8
+ "backend": null,
9
+ "bits": null,
10
+ "blocksize_b": 1,
11
+ "blocksize_k": 128,
12
+ "blocksize_q": 128,
13
+ "decode_attn_mechanism": null,
14
+ "dtype": "bfloat16",
15
+ "easy_method": "train",
16
+ "eos_token_id": 200002,
17
+ "experts_per_token": 4,
18
+ "fcm_max_ratio": 0.0,
19
+ "fcm_min_ratio": 0.0,
20
+ "flash_attention_backward_pass_impl": "triton",
21
+ "freq_max_position_embeddings": 4096,
22
+ "fsdp_is_ep_bound": true,
23
+ "gradient_checkpointing": "",
24
+ "gradient_checkpointing_targets": null,
25
+ "hardware_abstraction": true,
26
+ "head_dim": 64,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 2880,
29
+ "initial_context_length": 4096,
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 2880,
32
+ "kv_cache_quantization_blocksize": 128,
33
+ "kv_cache_quantization_method": "None",
34
+ "kv_cache_sharding_sequence_axis_name": "sp",
35
+ "layer_types": [
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "full_attention",
56
+ "sliding_attention",
57
+ "full_attention",
58
+ "sliding_attention",
59
+ "full_attention"
60
+ ],
61
+ "mask_max_position_embeddings": 4096,
62
+ "max_position_embeddings": 131072,
63
+ "mlp_activations_limit": 7.0,
64
+ "model_type": "gpt_oss",
65
+ "moe_force_xla_gmm": false,
66
+ "moe_method": "fused_moe",
67
+ "moe_tiling_size_batch": 4,
68
+ "moe_tiling_size_dim": 128,
69
+ "moe_tiling_size_seqlen": 128,
70
+ "num_attention_heads": 64,
71
+ "num_experts_per_tok": 4,
72
+ "num_hidden_layers": 24,
73
+ "num_key_value_heads": 8,
74
+ "num_local_experts": 32,
75
+ "output_router_logits": false,
76
+ "pallas_k_block_size": 128,
77
+ "pallas_m_block_size": 128,
78
+ "pallas_n_block_size": 128,
79
+ "partition_axis": {
80
+ "attention_dim_axis": null,
81
+ "attention_kv_dim_axis": null,
82
+ "batch_axis": [
83
+ "fsdp",
84
+ "dp"
85
+ ],
86
+ "bias_head_sequence_axis": null,
87
+ "bias_key_sequence_axis": null,
88
+ "data_parallel_axis": "dp",
89
+ "decode_attention_dim_axis": null,
90
+ "decode_attention_kv_dim_axis": null,
91
+ "decode_batch_axis": [
92
+ "fsdp",
93
+ "dp"
94
+ ],
95
+ "decode_head_axis": "tp",
96
+ "decode_key_sequence_axis": "sp",
97
+ "decode_kv_head_axis": "tp",
98
+ "decode_query_sequence_axis": null,
99
+ "expert_axis": "ep",
100
+ "expert_gate_axis": null,
101
+ "expert_parallel_axis": "ep",
102
+ "fully_sharded_data_parallel_axis": "fsdp",
103
+ "head_axis": "tp",
104
+ "hidden_state_axis": "tp",
105
+ "key_sequence_axis": "sp",
106
+ "kv_head_axis": "tp",
107
+ "mlp_intermediate_axis": "tp",
108
+ "query_sequence_axis": "sp",
109
+ "sequence_axis": "sp",
110
+ "sequence_parallel_axis": "sp",
111
+ "tensor_parallel_axis": "tp",
112
+ "vocab_axis": "tp"
113
+ },
114
+ "platform": null,
115
+ "precompute_masks": true,
116
+ "pretraining_tp": 1,
117
+ "quantization_blocksize": 64,
118
+ "quantization_method": "None",
119
+ "quantization_pattern": ".*",
120
+ "rms_norm_eps": 1e-05,
121
+ "rope_scaling": {
122
+ "beta_fast": 32.0,
123
+ "beta_slow": 1.0,
124
+ "factor": 32.0,
125
+ "original_max_position_embeddings": 4096,
126
+ "rope_type": "yarn",
127
+ "truncate": false
128
+ },
129
+ "rope_theta": 150000,
130
+ "router_aux_loss_coef": 0.9,
131
+ "scan_attention_layers": false,
132
+ "scan_mlp_chunk_size": 1024,
133
+ "scan_ring_attention": true,
134
+ "sequence_axis_name": "sp",
135
+ "sharding_axis_dims": [
136
+ 1,
137
+ 1,
138
+ 1,
139
+ -1,
140
+ 1
141
+ ],
142
+ "sharding_axis_names": [
143
+ "dp",
144
+ "fsdp",
145
+ "ep",
146
+ "tp",
147
+ "sp"
148
+ ],
149
+ "sharding_dcn_axis_dims": null,
150
+ "sliding_window": 128,
151
+ "sp_is_ep_bound": true,
152
+ "swiglu_limit": 7.0,
153
+ "tie_word_embeddings": false,
154
+ "transformers_version": "4.57.1",
155
+ "use_cache": true,
156
+ "use_expert_tensor_mode": false,
157
+ "use_ring_of_experts": false,
158
+ "use_scan_mlp": false,
159
+ "use_sharded_kv_caching": false,
160
+ "use_sharding_constraint": false,
161
+ "vocab_size": 201088
162
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 199998,
3
+ "device": null,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 200002,
7
+ 199999,
8
+ 200012
9
+ ],
10
+ "pad_token_id": 199999,
11
+ "transformers_version": "4.57.1"
12
+ }
model/lm_head/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880,50272],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,201088],"zarr_format":2}
model/lm_head/kernel/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a59a0697e61df795e9d452bca2567b951825a916bdb8e68de07dab2fe96a648
3
+ size 226641977
model/lm_head/kernel/0.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:738005c53d3d3985f9e69081cb9e9e43f04cb5471a8cec5325c9cb7ab76ba644
3
+ size 226681894
model/lm_head/kernel/0.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4589e1d887a07e81513b318316dccd1f5fef87fa942d667ab4fefe7b8291b8
3
+ size 226770968
model/lm_head/kernel/0.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1e1220e3a3d6982aaae3f5bbf444b71e89adf553a954fdcde257b849a70e5f4
3
+ size 227725079
model/model/embed_tokens/embedding/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[201088,720],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[201088,2880],"zarr_format":2}
model/model/embed_tokens/embedding/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3adc019f3fca5e4a4d2f9d58ccacaa019db91b4658323093331f25f006cc9a00
3
+ size 238375444
model/model/embed_tokens/embedding/0.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d93e52183c1eaf471bf4511e12b3241ef43d3a68cc3049d4cf864388ed0d546
3
+ size 238013629
model/model/embed_tokens/embedding/0.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce50c59d118efd64f0902246f08d0313ecb7a22c2390769ace5644ff59805f00
3
+ size 237651927
model/model/embed_tokens/embedding/0.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15b170706b9ab63112ea73a2d78cac3ec75716a09d5bafa4c472c9631996827b
3
+ size 238261890
model/model/layers/0/input_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
model/model/layers/0/input_layernorm/kernel/0 ADDED
Binary file (3.97 kB). View file
 
model/model/layers/0/mlp/experts/down_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/down_proj/bias/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aeb585a75b26b0719fd983cf91626a88e0aa9e02010c1ee5cd34bff3d004f5e
3
+ size 148544
model/model/layers/0/mlp/experts/down_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/down_proj/kernel/0.0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ad0b1eb9f3fc213a92a640b3ba3ddfa2116c573a4f09c9e0f3a2ce56f305659
3
+ size 205875490
model/model/layers/0/mlp/experts/gate_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/gate_proj/bias/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73bf83d1d04f1f22ac7ee7b9c00b3bfd46dc85d0e2b8a2ab72df38a47591ed3e
3
+ size 126932
model/model/layers/0/mlp/experts/gate_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/gate_proj/kernel/0.0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8829b36b49b61dea3877236700cc78fd3f289a7a4aabc1c90b815ed96363632
3
+ size 176413980
model/model/layers/0/mlp/experts/up_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/up_proj/bias/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65c6a12a90133fa6f0d2b34e49478d42e4be9523e7caf2c85b1e3fb401d30921
3
+ size 108408
model/model/layers/0/mlp/experts/up_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32,2880,2880],"zarr_format":2}
model/model/layers/0/mlp/experts/up_proj/kernel/0.0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dfbccadccf5d0e6b3e69d45cb18c942432a1a62cc3ec245e4a346077503c98a
3
+ size 175659503
model/model/layers/0/mlp/router/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[32],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[32],"zarr_format":2}
model/model/layers/0/mlp/router/bias/0 ADDED
Binary file (73 Bytes). View file
 
model/model/layers/0/mlp/router/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880,32],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,32],"zarr_format":2}
model/model/layers/0/mlp/router/kernel/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6916637eaabd97130c84226d31c856e5ec7eba6589c5c7bd598d40ddfbf8ad41
3
+ size 146759
model/model/layers/0/post_attention_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
model/model/layers/0/post_attention_layernorm/kernel/0 ADDED
Binary file (4.12 kB). View file
 
model/model/layers/0/self_attn/k_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2}
model/model/layers/0/self_attn/k_proj/bias/0 ADDED
Binary file (21 Bytes). View file
 
model/model/layers/0/self_attn/k_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880,128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2}
model/model/layers/0/self_attn/k_proj/kernel/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb28d0346a7f6d995f75fc86b6ad1b97c0b08ba9697c7fcb6de652ac26721ed5
3
+ size 591455
model/model/layers/0/self_attn/k_proj/kernel/0.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:126aefc77772730e4601722e663c581e96b4c1a6965d7644a244e4d5a33454a7
3
+ size 591779
model/model/layers/0/self_attn/k_proj/kernel/0.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebef9e3d47721de81a32b985eb4808178311bde35775f25e2447686e335ef095
3
+ size 593231
model/model/layers/0/self_attn/k_proj/kernel/0.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3ef0df3311ec6863669f88ca7f78f84cdc5f1b56682eeb304bfdd0d194cad98
3
+ size 590638
model/model/layers/0/self_attn/o_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2}
model/model/layers/0/self_attn/o_proj/bias/0 ADDED
Binary file (4.59 kB). View file
 
model/model/layers/0/self_attn/o_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096,2880],"zarr_format":2}
model/model/layers/0/self_attn/o_proj/kernel/0.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a1ac4e4b9b92bace048723b1910813b433c205f4de2b43aed7194e540dec035
3
+ size 4677833
model/model/layers/0/self_attn/o_proj/kernel/1.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e84b5d764029feb8b973b01797488701f432f8937e813ef1c89bb6119529246e
3
+ size 4673577
model/model/layers/0/self_attn/o_proj/kernel/2.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d48de19ebb6bc364d0e4ecdb3f9da8d4a0a7fef3d56363aa0ba83181eb4a91d2
3
+ size 4682913
model/model/layers/0/self_attn/o_proj/kernel/3.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb890ca70a46f9f014efae03b2907716c044df9c50c81d86680bede3844c5a0
3
+ size 4695943
model/model/layers/0/self_attn/q_proj/bias/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096],"zarr_format":2}