krishnateja95 committed on
Commit
9a0b26b
·
verified ·
1 Parent(s): 6c69754

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -88,9 +88,11 @@
88
  "num_bits": 8,
89
  "observer": null,
90
  "observer_kwargs": {},
 
91
  "strategy": "group",
92
  "symmetric": true,
93
- "type": "float"
 
94
  },
95
  "output_activations": null,
96
  "targets": [
@@ -107,9 +109,11 @@
107
  "num_bits": 8,
108
  "observer": "minmax",
109
  "observer_kwargs": {},
 
110
  "strategy": "block",
111
  "symmetric": true,
112
- "type": "float"
 
113
  }
114
  }
115
  },
@@ -118,120 +122,120 @@
118
  "ignore": [
119
  "model.layers.0.block_sparse_moe.router.layer",
120
  "model.layers.0.shared_mlp.input_linear",
121
- "model.layers.0.mixer.in_proj",
122
  "model.layers.1.block_sparse_moe.router.layer",
123
  "model.layers.1.shared_mlp.input_linear",
124
- "model.layers.1.mixer.in_proj",
125
  "model.layers.2.block_sparse_moe.router.layer",
126
  "model.layers.2.shared_mlp.input_linear",
127
- "model.layers.2.mixer.in_proj",
128
  "model.layers.3.block_sparse_moe.router.layer",
129
  "model.layers.3.shared_mlp.input_linear",
130
- "model.layers.3.mixer.in_proj",
131
  "model.layers.4.block_sparse_moe.router.layer",
132
  "model.layers.4.shared_mlp.input_linear",
133
- "model.layers.4.mixer.in_proj",
134
  "model.layers.5.block_sparse_moe.router.layer",
135
  "model.layers.5.shared_mlp.input_linear",
136
  "model.layers.6.block_sparse_moe.router.layer",
137
  "model.layers.6.shared_mlp.input_linear",
138
- "model.layers.6.mixer.in_proj",
139
  "model.layers.7.block_sparse_moe.router.layer",
140
  "model.layers.7.shared_mlp.input_linear",
141
- "model.layers.7.mixer.in_proj",
142
  "model.layers.8.block_sparse_moe.router.layer",
143
  "model.layers.8.shared_mlp.input_linear",
144
- "model.layers.8.mixer.in_proj",
145
  "model.layers.9.block_sparse_moe.router.layer",
146
  "model.layers.9.shared_mlp.input_linear",
147
- "model.layers.9.mixer.in_proj",
148
  "model.layers.10.block_sparse_moe.router.layer",
149
  "model.layers.10.shared_mlp.input_linear",
150
- "model.layers.10.mixer.in_proj",
151
  "model.layers.11.block_sparse_moe.router.layer",
152
  "model.layers.11.shared_mlp.input_linear",
153
- "model.layers.11.mixer.in_proj",
154
  "model.layers.12.block_sparse_moe.router.layer",
155
  "model.layers.12.shared_mlp.input_linear",
156
- "model.layers.12.mixer.in_proj",
157
  "model.layers.13.block_sparse_moe.router.layer",
158
  "model.layers.13.shared_mlp.input_linear",
159
- "model.layers.13.mixer.in_proj",
160
  "model.layers.14.block_sparse_moe.router.layer",
161
  "model.layers.14.shared_mlp.input_linear",
162
- "model.layers.14.mixer.in_proj",
163
  "model.layers.15.block_sparse_moe.router.layer",
164
  "model.layers.15.shared_mlp.input_linear",
165
  "model.layers.16.block_sparse_moe.router.layer",
166
  "model.layers.16.shared_mlp.input_linear",
167
- "model.layers.16.mixer.in_proj",
168
  "model.layers.17.block_sparse_moe.router.layer",
169
  "model.layers.17.shared_mlp.input_linear",
170
- "model.layers.17.mixer.in_proj",
171
  "model.layers.18.block_sparse_moe.router.layer",
172
  "model.layers.18.shared_mlp.input_linear",
173
- "model.layers.18.mixer.in_proj",
174
  "model.layers.19.block_sparse_moe.router.layer",
175
  "model.layers.19.shared_mlp.input_linear",
176
- "model.layers.19.mixer.in_proj",
177
  "model.layers.20.block_sparse_moe.router.layer",
178
  "model.layers.20.shared_mlp.input_linear",
179
- "model.layers.20.mixer.in_proj",
180
  "model.layers.21.block_sparse_moe.router.layer",
181
  "model.layers.21.shared_mlp.input_linear",
182
- "model.layers.21.mixer.in_proj",
183
  "model.layers.22.block_sparse_moe.router.layer",
184
  "model.layers.22.shared_mlp.input_linear",
185
- "model.layers.22.mixer.in_proj",
186
  "model.layers.23.block_sparse_moe.router.layer",
187
  "model.layers.23.shared_mlp.input_linear",
188
- "model.layers.23.mixer.in_proj",
189
  "model.layers.24.block_sparse_moe.router.layer",
190
  "model.layers.24.shared_mlp.input_linear",
191
- "model.layers.24.mixer.in_proj",
192
  "model.layers.25.block_sparse_moe.router.layer",
193
  "model.layers.25.shared_mlp.input_linear",
194
  "model.layers.26.block_sparse_moe.router.layer",
195
  "model.layers.26.shared_mlp.input_linear",
196
- "model.layers.26.mixer.in_proj",
197
  "model.layers.27.block_sparse_moe.router.layer",
198
  "model.layers.27.shared_mlp.input_linear",
199
- "model.layers.27.mixer.in_proj",
200
  "model.layers.28.block_sparse_moe.router.layer",
201
  "model.layers.28.shared_mlp.input_linear",
202
- "model.layers.28.mixer.in_proj",
203
  "model.layers.29.block_sparse_moe.router.layer",
204
  "model.layers.29.shared_mlp.input_linear",
205
- "model.layers.29.mixer.in_proj",
206
  "model.layers.30.block_sparse_moe.router.layer",
207
  "model.layers.30.shared_mlp.input_linear",
208
- "model.layers.30.mixer.in_proj",
209
  "model.layers.31.block_sparse_moe.router.layer",
210
  "model.layers.31.shared_mlp.input_linear",
211
- "model.layers.31.mixer.in_proj",
212
  "model.layers.32.block_sparse_moe.router.layer",
213
  "model.layers.32.shared_mlp.input_linear",
214
- "model.layers.32.mixer.in_proj",
215
  "model.layers.33.block_sparse_moe.router.layer",
216
  "model.layers.33.shared_mlp.input_linear",
217
- "model.layers.33.mixer.in_proj",
218
  "model.layers.34.block_sparse_moe.router.layer",
219
  "model.layers.34.shared_mlp.input_linear",
220
- "model.layers.34.mixer.in_proj",
221
  "model.layers.35.block_sparse_moe.router.layer",
222
  "model.layers.35.shared_mlp.input_linear",
223
  "model.layers.36.block_sparse_moe.router.layer",
224
  "model.layers.36.shared_mlp.input_linear",
225
- "model.layers.36.mixer.in_proj",
226
  "model.layers.37.block_sparse_moe.router.layer",
227
  "model.layers.37.shared_mlp.input_linear",
228
- "model.layers.37.mixer.in_proj",
229
  "model.layers.38.block_sparse_moe.router.layer",
230
  "model.layers.38.shared_mlp.input_linear",
231
- "model.layers.38.mixer.in_proj",
232
  "model.layers.39.block_sparse_moe.router.layer",
233
  "model.layers.39.shared_mlp.input_linear",
234
- "model.layers.39.mixer.in_proj",
235
  "lm_head"
236
  ],
237
  "kv_cache_scheme": null,
@@ -239,7 +243,7 @@
239
  "quantization_status": "compressed",
240
  "sparsity_config": {},
241
  "transform_config": {},
242
- "version": "0.12.3.dev15+g1c72e96"
243
  },
244
  "residual_multiplier": 0.22,
245
  "rms_norm_eps": 1e-05,
@@ -247,7 +251,7 @@
247
  "rope_theta": 10000,
248
  "router_aux_loss_coef": 0.0,
249
  "shared_intermediate_size": 1536,
250
- "tie_word_embeddings": false,
251
  "transformers_version": "4.57.1",
252
  "use_cache": true,
253
  "vocab_size": 100352
 
88
  "num_bits": 8,
89
  "observer": null,
90
  "observer_kwargs": {},
91
+ "scale_dtype": null,
92
  "strategy": "group",
93
  "symmetric": true,
94
+ "type": "float",
95
+ "zp_dtype": null
96
  },
97
  "output_activations": null,
98
  "targets": [
 
109
  "num_bits": 8,
110
  "observer": "minmax",
111
  "observer_kwargs": {},
112
+ "scale_dtype": null,
113
  "strategy": "block",
114
  "symmetric": true,
115
+ "type": "float",
116
+ "zp_dtype": null
117
  }
118
  }
119
  },
 
122
  "ignore": [
123
  "model.layers.0.block_sparse_moe.router.layer",
124
  "model.layers.0.shared_mlp.input_linear",
125
+ "model.layers.0.mamba.in_proj",
126
  "model.layers.1.block_sparse_moe.router.layer",
127
  "model.layers.1.shared_mlp.input_linear",
128
+ "model.layers.1.mamba.in_proj",
129
  "model.layers.2.block_sparse_moe.router.layer",
130
  "model.layers.2.shared_mlp.input_linear",
131
+ "model.layers.2.mamba.in_proj",
132
  "model.layers.3.block_sparse_moe.router.layer",
133
  "model.layers.3.shared_mlp.input_linear",
134
+ "model.layers.3.mamba.in_proj",
135
  "model.layers.4.block_sparse_moe.router.layer",
136
  "model.layers.4.shared_mlp.input_linear",
137
+ "model.layers.4.mamba.in_proj",
138
  "model.layers.5.block_sparse_moe.router.layer",
139
  "model.layers.5.shared_mlp.input_linear",
140
  "model.layers.6.block_sparse_moe.router.layer",
141
  "model.layers.6.shared_mlp.input_linear",
142
+ "model.layers.6.mamba.in_proj",
143
  "model.layers.7.block_sparse_moe.router.layer",
144
  "model.layers.7.shared_mlp.input_linear",
145
+ "model.layers.7.mamba.in_proj",
146
  "model.layers.8.block_sparse_moe.router.layer",
147
  "model.layers.8.shared_mlp.input_linear",
148
+ "model.layers.8.mamba.in_proj",
149
  "model.layers.9.block_sparse_moe.router.layer",
150
  "model.layers.9.shared_mlp.input_linear",
151
+ "model.layers.9.mamba.in_proj",
152
  "model.layers.10.block_sparse_moe.router.layer",
153
  "model.layers.10.shared_mlp.input_linear",
154
+ "model.layers.10.mamba.in_proj",
155
  "model.layers.11.block_sparse_moe.router.layer",
156
  "model.layers.11.shared_mlp.input_linear",
157
+ "model.layers.11.mamba.in_proj",
158
  "model.layers.12.block_sparse_moe.router.layer",
159
  "model.layers.12.shared_mlp.input_linear",
160
+ "model.layers.12.mamba.in_proj",
161
  "model.layers.13.block_sparse_moe.router.layer",
162
  "model.layers.13.shared_mlp.input_linear",
163
+ "model.layers.13.mamba.in_proj",
164
  "model.layers.14.block_sparse_moe.router.layer",
165
  "model.layers.14.shared_mlp.input_linear",
166
+ "model.layers.14.mamba.in_proj",
167
  "model.layers.15.block_sparse_moe.router.layer",
168
  "model.layers.15.shared_mlp.input_linear",
169
  "model.layers.16.block_sparse_moe.router.layer",
170
  "model.layers.16.shared_mlp.input_linear",
171
+ "model.layers.16.mamba.in_proj",
172
  "model.layers.17.block_sparse_moe.router.layer",
173
  "model.layers.17.shared_mlp.input_linear",
174
+ "model.layers.17.mamba.in_proj",
175
  "model.layers.18.block_sparse_moe.router.layer",
176
  "model.layers.18.shared_mlp.input_linear",
177
+ "model.layers.18.mamba.in_proj",
178
  "model.layers.19.block_sparse_moe.router.layer",
179
  "model.layers.19.shared_mlp.input_linear",
180
+ "model.layers.19.mamba.in_proj",
181
  "model.layers.20.block_sparse_moe.router.layer",
182
  "model.layers.20.shared_mlp.input_linear",
183
+ "model.layers.20.mamba.in_proj",
184
  "model.layers.21.block_sparse_moe.router.layer",
185
  "model.layers.21.shared_mlp.input_linear",
186
+ "model.layers.21.mamba.in_proj",
187
  "model.layers.22.block_sparse_moe.router.layer",
188
  "model.layers.22.shared_mlp.input_linear",
189
+ "model.layers.22.mamba.in_proj",
190
  "model.layers.23.block_sparse_moe.router.layer",
191
  "model.layers.23.shared_mlp.input_linear",
192
+ "model.layers.23.mamba.in_proj",
193
  "model.layers.24.block_sparse_moe.router.layer",
194
  "model.layers.24.shared_mlp.input_linear",
195
+ "model.layers.24.mamba.in_proj",
196
  "model.layers.25.block_sparse_moe.router.layer",
197
  "model.layers.25.shared_mlp.input_linear",
198
  "model.layers.26.block_sparse_moe.router.layer",
199
  "model.layers.26.shared_mlp.input_linear",
200
+ "model.layers.26.mamba.in_proj",
201
  "model.layers.27.block_sparse_moe.router.layer",
202
  "model.layers.27.shared_mlp.input_linear",
203
+ "model.layers.27.mamba.in_proj",
204
  "model.layers.28.block_sparse_moe.router.layer",
205
  "model.layers.28.shared_mlp.input_linear",
206
+ "model.layers.28.mamba.in_proj",
207
  "model.layers.29.block_sparse_moe.router.layer",
208
  "model.layers.29.shared_mlp.input_linear",
209
+ "model.layers.29.mamba.in_proj",
210
  "model.layers.30.block_sparse_moe.router.layer",
211
  "model.layers.30.shared_mlp.input_linear",
212
+ "model.layers.30.mamba.in_proj",
213
  "model.layers.31.block_sparse_moe.router.layer",
214
  "model.layers.31.shared_mlp.input_linear",
215
+ "model.layers.31.mamba.in_proj",
216
  "model.layers.32.block_sparse_moe.router.layer",
217
  "model.layers.32.shared_mlp.input_linear",
218
+ "model.layers.32.mamba.in_proj",
219
  "model.layers.33.block_sparse_moe.router.layer",
220
  "model.layers.33.shared_mlp.input_linear",
221
+ "model.layers.33.mamba.in_proj",
222
  "model.layers.34.block_sparse_moe.router.layer",
223
  "model.layers.34.shared_mlp.input_linear",
224
+ "model.layers.34.mamba.in_proj",
225
  "model.layers.35.block_sparse_moe.router.layer",
226
  "model.layers.35.shared_mlp.input_linear",
227
  "model.layers.36.block_sparse_moe.router.layer",
228
  "model.layers.36.shared_mlp.input_linear",
229
+ "model.layers.36.mamba.in_proj",
230
  "model.layers.37.block_sparse_moe.router.layer",
231
  "model.layers.37.shared_mlp.input_linear",
232
+ "model.layers.37.mamba.in_proj",
233
  "model.layers.38.block_sparse_moe.router.layer",
234
  "model.layers.38.shared_mlp.input_linear",
235
+ "model.layers.38.mamba.in_proj",
236
  "model.layers.39.block_sparse_moe.router.layer",
237
  "model.layers.39.shared_mlp.input_linear",
238
+ "model.layers.39.mamba.in_proj",
239
  "lm_head"
240
  ],
241
  "kv_cache_scheme": null,
 
243
  "quantization_status": "compressed",
244
  "sparsity_config": {},
245
  "transform_config": {},
246
+ "version": "0.12.3.dev28+g2763f81"
247
  },
248
  "residual_multiplier": 0.22,
249
  "rms_norm_eps": 1e-05,
 
251
  "rope_theta": 10000,
252
  "router_aux_loss_coef": 0.0,
253
  "shared_intermediate_size": 1536,
254
+ "tie_word_embeddings": true,
255
  "transformers_version": "4.57.1",
256
  "use_cache": true,
257
  "vocab_size": 100352
model-00001-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6079e8b47f4af64eec779eb54270564d6709fac81eb9ab6ce44f9424c8d4e57
3
- size 4559751176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75458e16c3887a62c563c793dee5594a53853529fa58c3811be027cd414d79a2
3
+ size 4997838432
model-00002-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:023d69432e84fb302850992a8fce05abc4c3c7517f10088c54ffc570813cc551
3
- size 4600816904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74db08e9e04b6f492e9d524eee5b4408f015880185e4c15fe74b1f04bed15d46
3
+ size 4997162928
model-00003-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edd4d5a25cea6bb45ea9d5d3825cb95c1044dc7834488fe83dee1037449a3320
3
- size 4525943648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c00d9caf3319d881d9b818b72302309dd903d3a7c27eba76825dc9cc29b169b8
3
+ size 4994025096
model-00004-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6367da9acbedcbbdf0eb47f5efbe9bb3d54aa39293fab38d06ce6134c099bac6
3
- size 4511676880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66579e49df70283bf3c410f4ed8cee49b05e6d6cdbfea3830fccbdbc688df10a
3
+ size 4994104968
model-00005-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65edbaef60d5262ffcc8558f80b802399cb2230d0b9a56396221b964736e4dc8
3
- size 4521081440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f126a6c9b00e193cd2fd76ba1fa8d7e20489ff681b9dc2226d4a1e06f648a6
3
+ size 4996130720
model-00006-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b715b3970a2f024fb2446add306bdc7184fbb0387436cca26d1cfd899a3bc662
3
- size 4507962856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebe97666e703d2910654e03e9ba4e66f0f833107e27e7e1a2faa180dd9bcae2b
3
+ size 4999699800
model-00007-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05af2aa6e913c9a399c2f75411f11af50adb48eb92ea869b60492a7bd2fee937
3
- size 4581798360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02a9e8e08dcc001f495d247c72bdf63042f6213455b23a35ee39496b44bf095a
3
+ size 4997172264
model-00008-of-00008.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81afcb5cb3ed2b78e2358b9d042be229bc3fc2b8d25c0fa548fcd7efd77047c8
3
- size 4624854264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b19fb0b7629518a94f08d907514b753d3c2cf6b848b300e7470549f547ffefa9
3
+ size 637290328
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff