Upload folder using huggingface_hub
Browse files
- config.json +44 -40
- model-00001-of-00008.safetensors +2 -2
- model-00002-of-00008.safetensors +2 -2
- model-00003-of-00008.safetensors +2 -2
- model-00004-of-00008.safetensors +2 -2
- model-00005-of-00008.safetensors +2 -2
- model-00006-of-00008.safetensors +2 -2
- model-00007-of-00008.safetensors +2 -2
- model-00008-of-00008.safetensors +2 -2
- model.safetensors.index.json +0 -0
config.json
CHANGED
|
@@ -88,9 +88,11 @@
|
|
| 88 |
"num_bits": 8,
|
| 89 |
"observer": null,
|
| 90 |
"observer_kwargs": {},
|
|
|
|
| 91 |
"strategy": "group",
|
| 92 |
"symmetric": true,
|
| 93 |
-
"type": "float"
|
|
|
|
| 94 |
},
|
| 95 |
"output_activations": null,
|
| 96 |
"targets": [
|
|
@@ -107,9 +109,11 @@
|
|
| 107 |
"num_bits": 8,
|
| 108 |
"observer": "minmax",
|
| 109 |
"observer_kwargs": {},
|
|
|
|
| 110 |
"strategy": "block",
|
| 111 |
"symmetric": true,
|
| 112 |
-
"type": "float"
|
|
|
|
| 113 |
}
|
| 114 |
}
|
| 115 |
},
|
|
@@ -118,120 +122,120 @@
|
|
| 118 |
"ignore": [
|
| 119 |
"model.layers.0.block_sparse_moe.router.layer",
|
| 120 |
"model.layers.0.shared_mlp.input_linear",
|
| 121 |
-
"model.layers.0.
|
| 122 |
"model.layers.1.block_sparse_moe.router.layer",
|
| 123 |
"model.layers.1.shared_mlp.input_linear",
|
| 124 |
-
"model.layers.1.
|
| 125 |
"model.layers.2.block_sparse_moe.router.layer",
|
| 126 |
"model.layers.2.shared_mlp.input_linear",
|
| 127 |
-
"model.layers.2.
|
| 128 |
"model.layers.3.block_sparse_moe.router.layer",
|
| 129 |
"model.layers.3.shared_mlp.input_linear",
|
| 130 |
-
"model.layers.3.
|
| 131 |
"model.layers.4.block_sparse_moe.router.layer",
|
| 132 |
"model.layers.4.shared_mlp.input_linear",
|
| 133 |
-
"model.layers.4.
|
| 134 |
"model.layers.5.block_sparse_moe.router.layer",
|
| 135 |
"model.layers.5.shared_mlp.input_linear",
|
| 136 |
"model.layers.6.block_sparse_moe.router.layer",
|
| 137 |
"model.layers.6.shared_mlp.input_linear",
|
| 138 |
-
"model.layers.6.
|
| 139 |
"model.layers.7.block_sparse_moe.router.layer",
|
| 140 |
"model.layers.7.shared_mlp.input_linear",
|
| 141 |
-
"model.layers.7.
|
| 142 |
"model.layers.8.block_sparse_moe.router.layer",
|
| 143 |
"model.layers.8.shared_mlp.input_linear",
|
| 144 |
-
"model.layers.8.
|
| 145 |
"model.layers.9.block_sparse_moe.router.layer",
|
| 146 |
"model.layers.9.shared_mlp.input_linear",
|
| 147 |
-
"model.layers.9.
|
| 148 |
"model.layers.10.block_sparse_moe.router.layer",
|
| 149 |
"model.layers.10.shared_mlp.input_linear",
|
| 150 |
-
"model.layers.10.
|
| 151 |
"model.layers.11.block_sparse_moe.router.layer",
|
| 152 |
"model.layers.11.shared_mlp.input_linear",
|
| 153 |
-
"model.layers.11.
|
| 154 |
"model.layers.12.block_sparse_moe.router.layer",
|
| 155 |
"model.layers.12.shared_mlp.input_linear",
|
| 156 |
-
"model.layers.12.
|
| 157 |
"model.layers.13.block_sparse_moe.router.layer",
|
| 158 |
"model.layers.13.shared_mlp.input_linear",
|
| 159 |
-
"model.layers.13.
|
| 160 |
"model.layers.14.block_sparse_moe.router.layer",
|
| 161 |
"model.layers.14.shared_mlp.input_linear",
|
| 162 |
-
"model.layers.14.
|
| 163 |
"model.layers.15.block_sparse_moe.router.layer",
|
| 164 |
"model.layers.15.shared_mlp.input_linear",
|
| 165 |
"model.layers.16.block_sparse_moe.router.layer",
|
| 166 |
"model.layers.16.shared_mlp.input_linear",
|
| 167 |
-
"model.layers.16.
|
| 168 |
"model.layers.17.block_sparse_moe.router.layer",
|
| 169 |
"model.layers.17.shared_mlp.input_linear",
|
| 170 |
-
"model.layers.17.
|
| 171 |
"model.layers.18.block_sparse_moe.router.layer",
|
| 172 |
"model.layers.18.shared_mlp.input_linear",
|
| 173 |
-
"model.layers.18.
|
| 174 |
"model.layers.19.block_sparse_moe.router.layer",
|
| 175 |
"model.layers.19.shared_mlp.input_linear",
|
| 176 |
-
"model.layers.19.
|
| 177 |
"model.layers.20.block_sparse_moe.router.layer",
|
| 178 |
"model.layers.20.shared_mlp.input_linear",
|
| 179 |
-
"model.layers.20.
|
| 180 |
"model.layers.21.block_sparse_moe.router.layer",
|
| 181 |
"model.layers.21.shared_mlp.input_linear",
|
| 182 |
-
"model.layers.21.
|
| 183 |
"model.layers.22.block_sparse_moe.router.layer",
|
| 184 |
"model.layers.22.shared_mlp.input_linear",
|
| 185 |
-
"model.layers.22.
|
| 186 |
"model.layers.23.block_sparse_moe.router.layer",
|
| 187 |
"model.layers.23.shared_mlp.input_linear",
|
| 188 |
-
"model.layers.23.
|
| 189 |
"model.layers.24.block_sparse_moe.router.layer",
|
| 190 |
"model.layers.24.shared_mlp.input_linear",
|
| 191 |
-
"model.layers.24.
|
| 192 |
"model.layers.25.block_sparse_moe.router.layer",
|
| 193 |
"model.layers.25.shared_mlp.input_linear",
|
| 194 |
"model.layers.26.block_sparse_moe.router.layer",
|
| 195 |
"model.layers.26.shared_mlp.input_linear",
|
| 196 |
-
"model.layers.26.
|
| 197 |
"model.layers.27.block_sparse_moe.router.layer",
|
| 198 |
"model.layers.27.shared_mlp.input_linear",
|
| 199 |
-
"model.layers.27.
|
| 200 |
"model.layers.28.block_sparse_moe.router.layer",
|
| 201 |
"model.layers.28.shared_mlp.input_linear",
|
| 202 |
-
"model.layers.28.
|
| 203 |
"model.layers.29.block_sparse_moe.router.layer",
|
| 204 |
"model.layers.29.shared_mlp.input_linear",
|
| 205 |
-
"model.layers.29.
|
| 206 |
"model.layers.30.block_sparse_moe.router.layer",
|
| 207 |
"model.layers.30.shared_mlp.input_linear",
|
| 208 |
-
"model.layers.30.
|
| 209 |
"model.layers.31.block_sparse_moe.router.layer",
|
| 210 |
"model.layers.31.shared_mlp.input_linear",
|
| 211 |
-
"model.layers.31.
|
| 212 |
"model.layers.32.block_sparse_moe.router.layer",
|
| 213 |
"model.layers.32.shared_mlp.input_linear",
|
| 214 |
-
"model.layers.32.
|
| 215 |
"model.layers.33.block_sparse_moe.router.layer",
|
| 216 |
"model.layers.33.shared_mlp.input_linear",
|
| 217 |
-
"model.layers.33.
|
| 218 |
"model.layers.34.block_sparse_moe.router.layer",
|
| 219 |
"model.layers.34.shared_mlp.input_linear",
|
| 220 |
-
"model.layers.34.
|
| 221 |
"model.layers.35.block_sparse_moe.router.layer",
|
| 222 |
"model.layers.35.shared_mlp.input_linear",
|
| 223 |
"model.layers.36.block_sparse_moe.router.layer",
|
| 224 |
"model.layers.36.shared_mlp.input_linear",
|
| 225 |
-
"model.layers.36.
|
| 226 |
"model.layers.37.block_sparse_moe.router.layer",
|
| 227 |
"model.layers.37.shared_mlp.input_linear",
|
| 228 |
-
"model.layers.37.
|
| 229 |
"model.layers.38.block_sparse_moe.router.layer",
|
| 230 |
"model.layers.38.shared_mlp.input_linear",
|
| 231 |
-
"model.layers.38.
|
| 232 |
"model.layers.39.block_sparse_moe.router.layer",
|
| 233 |
"model.layers.39.shared_mlp.input_linear",
|
| 234 |
-
"model.layers.39.
|
| 235 |
"lm_head"
|
| 236 |
],
|
| 237 |
"kv_cache_scheme": null,
|
|
@@ -239,7 +243,7 @@
|
|
| 239 |
"quantization_status": "compressed",
|
| 240 |
"sparsity_config": {},
|
| 241 |
"transform_config": {},
|
| 242 |
-
"version": "0.12.3.
|
| 243 |
},
|
| 244 |
"residual_multiplier": 0.22,
|
| 245 |
"rms_norm_eps": 1e-05,
|
|
@@ -247,7 +251,7 @@
|
|
| 247 |
"rope_theta": 10000,
|
| 248 |
"router_aux_loss_coef": 0.0,
|
| 249 |
"shared_intermediate_size": 1536,
|
| 250 |
-
"tie_word_embeddings":
|
| 251 |
"transformers_version": "4.57.1",
|
| 252 |
"use_cache": true,
|
| 253 |
"vocab_size": 100352
|
|
|
|
| 88 |
"num_bits": 8,
|
| 89 |
"observer": null,
|
| 90 |
"observer_kwargs": {},
|
| 91 |
+
"scale_dtype": null,
|
| 92 |
"strategy": "group",
|
| 93 |
"symmetric": true,
|
| 94 |
+
"type": "float",
|
| 95 |
+
"zp_dtype": null
|
| 96 |
},
|
| 97 |
"output_activations": null,
|
| 98 |
"targets": [
|
|
|
|
| 109 |
"num_bits": 8,
|
| 110 |
"observer": "minmax",
|
| 111 |
"observer_kwargs": {},
|
| 112 |
+
"scale_dtype": null,
|
| 113 |
"strategy": "block",
|
| 114 |
"symmetric": true,
|
| 115 |
+
"type": "float",
|
| 116 |
+
"zp_dtype": null
|
| 117 |
}
|
| 118 |
}
|
| 119 |
},
|
|
|
|
| 122 |
"ignore": [
|
| 123 |
"model.layers.0.block_sparse_moe.router.layer",
|
| 124 |
"model.layers.0.shared_mlp.input_linear",
|
| 125 |
+
"model.layers.0.mamba.in_proj",
|
| 126 |
"model.layers.1.block_sparse_moe.router.layer",
|
| 127 |
"model.layers.1.shared_mlp.input_linear",
|
| 128 |
+
"model.layers.1.mamba.in_proj",
|
| 129 |
"model.layers.2.block_sparse_moe.router.layer",
|
| 130 |
"model.layers.2.shared_mlp.input_linear",
|
| 131 |
+
"model.layers.2.mamba.in_proj",
|
| 132 |
"model.layers.3.block_sparse_moe.router.layer",
|
| 133 |
"model.layers.3.shared_mlp.input_linear",
|
| 134 |
+
"model.layers.3.mamba.in_proj",
|
| 135 |
"model.layers.4.block_sparse_moe.router.layer",
|
| 136 |
"model.layers.4.shared_mlp.input_linear",
|
| 137 |
+
"model.layers.4.mamba.in_proj",
|
| 138 |
"model.layers.5.block_sparse_moe.router.layer",
|
| 139 |
"model.layers.5.shared_mlp.input_linear",
|
| 140 |
"model.layers.6.block_sparse_moe.router.layer",
|
| 141 |
"model.layers.6.shared_mlp.input_linear",
|
| 142 |
+
"model.layers.6.mamba.in_proj",
|
| 143 |
"model.layers.7.block_sparse_moe.router.layer",
|
| 144 |
"model.layers.7.shared_mlp.input_linear",
|
| 145 |
+
"model.layers.7.mamba.in_proj",
|
| 146 |
"model.layers.8.block_sparse_moe.router.layer",
|
| 147 |
"model.layers.8.shared_mlp.input_linear",
|
| 148 |
+
"model.layers.8.mamba.in_proj",
|
| 149 |
"model.layers.9.block_sparse_moe.router.layer",
|
| 150 |
"model.layers.9.shared_mlp.input_linear",
|
| 151 |
+
"model.layers.9.mamba.in_proj",
|
| 152 |
"model.layers.10.block_sparse_moe.router.layer",
|
| 153 |
"model.layers.10.shared_mlp.input_linear",
|
| 154 |
+
"model.layers.10.mamba.in_proj",
|
| 155 |
"model.layers.11.block_sparse_moe.router.layer",
|
| 156 |
"model.layers.11.shared_mlp.input_linear",
|
| 157 |
+
"model.layers.11.mamba.in_proj",
|
| 158 |
"model.layers.12.block_sparse_moe.router.layer",
|
| 159 |
"model.layers.12.shared_mlp.input_linear",
|
| 160 |
+
"model.layers.12.mamba.in_proj",
|
| 161 |
"model.layers.13.block_sparse_moe.router.layer",
|
| 162 |
"model.layers.13.shared_mlp.input_linear",
|
| 163 |
+
"model.layers.13.mamba.in_proj",
|
| 164 |
"model.layers.14.block_sparse_moe.router.layer",
|
| 165 |
"model.layers.14.shared_mlp.input_linear",
|
| 166 |
+
"model.layers.14.mamba.in_proj",
|
| 167 |
"model.layers.15.block_sparse_moe.router.layer",
|
| 168 |
"model.layers.15.shared_mlp.input_linear",
|
| 169 |
"model.layers.16.block_sparse_moe.router.layer",
|
| 170 |
"model.layers.16.shared_mlp.input_linear",
|
| 171 |
+
"model.layers.16.mamba.in_proj",
|
| 172 |
"model.layers.17.block_sparse_moe.router.layer",
|
| 173 |
"model.layers.17.shared_mlp.input_linear",
|
| 174 |
+
"model.layers.17.mamba.in_proj",
|
| 175 |
"model.layers.18.block_sparse_moe.router.layer",
|
| 176 |
"model.layers.18.shared_mlp.input_linear",
|
| 177 |
+
"model.layers.18.mamba.in_proj",
|
| 178 |
"model.layers.19.block_sparse_moe.router.layer",
|
| 179 |
"model.layers.19.shared_mlp.input_linear",
|
| 180 |
+
"model.layers.19.mamba.in_proj",
|
| 181 |
"model.layers.20.block_sparse_moe.router.layer",
|
| 182 |
"model.layers.20.shared_mlp.input_linear",
|
| 183 |
+
"model.layers.20.mamba.in_proj",
|
| 184 |
"model.layers.21.block_sparse_moe.router.layer",
|
| 185 |
"model.layers.21.shared_mlp.input_linear",
|
| 186 |
+
"model.layers.21.mamba.in_proj",
|
| 187 |
"model.layers.22.block_sparse_moe.router.layer",
|
| 188 |
"model.layers.22.shared_mlp.input_linear",
|
| 189 |
+
"model.layers.22.mamba.in_proj",
|
| 190 |
"model.layers.23.block_sparse_moe.router.layer",
|
| 191 |
"model.layers.23.shared_mlp.input_linear",
|
| 192 |
+
"model.layers.23.mamba.in_proj",
|
| 193 |
"model.layers.24.block_sparse_moe.router.layer",
|
| 194 |
"model.layers.24.shared_mlp.input_linear",
|
| 195 |
+
"model.layers.24.mamba.in_proj",
|
| 196 |
"model.layers.25.block_sparse_moe.router.layer",
|
| 197 |
"model.layers.25.shared_mlp.input_linear",
|
| 198 |
"model.layers.26.block_sparse_moe.router.layer",
|
| 199 |
"model.layers.26.shared_mlp.input_linear",
|
| 200 |
+
"model.layers.26.mamba.in_proj",
|
| 201 |
"model.layers.27.block_sparse_moe.router.layer",
|
| 202 |
"model.layers.27.shared_mlp.input_linear",
|
| 203 |
+
"model.layers.27.mamba.in_proj",
|
| 204 |
"model.layers.28.block_sparse_moe.router.layer",
|
| 205 |
"model.layers.28.shared_mlp.input_linear",
|
| 206 |
+
"model.layers.28.mamba.in_proj",
|
| 207 |
"model.layers.29.block_sparse_moe.router.layer",
|
| 208 |
"model.layers.29.shared_mlp.input_linear",
|
| 209 |
+
"model.layers.29.mamba.in_proj",
|
| 210 |
"model.layers.30.block_sparse_moe.router.layer",
|
| 211 |
"model.layers.30.shared_mlp.input_linear",
|
| 212 |
+
"model.layers.30.mamba.in_proj",
|
| 213 |
"model.layers.31.block_sparse_moe.router.layer",
|
| 214 |
"model.layers.31.shared_mlp.input_linear",
|
| 215 |
+
"model.layers.31.mamba.in_proj",
|
| 216 |
"model.layers.32.block_sparse_moe.router.layer",
|
| 217 |
"model.layers.32.shared_mlp.input_linear",
|
| 218 |
+
"model.layers.32.mamba.in_proj",
|
| 219 |
"model.layers.33.block_sparse_moe.router.layer",
|
| 220 |
"model.layers.33.shared_mlp.input_linear",
|
| 221 |
+
"model.layers.33.mamba.in_proj",
|
| 222 |
"model.layers.34.block_sparse_moe.router.layer",
|
| 223 |
"model.layers.34.shared_mlp.input_linear",
|
| 224 |
+
"model.layers.34.mamba.in_proj",
|
| 225 |
"model.layers.35.block_sparse_moe.router.layer",
|
| 226 |
"model.layers.35.shared_mlp.input_linear",
|
| 227 |
"model.layers.36.block_sparse_moe.router.layer",
|
| 228 |
"model.layers.36.shared_mlp.input_linear",
|
| 229 |
+
"model.layers.36.mamba.in_proj",
|
| 230 |
"model.layers.37.block_sparse_moe.router.layer",
|
| 231 |
"model.layers.37.shared_mlp.input_linear",
|
| 232 |
+
"model.layers.37.mamba.in_proj",
|
| 233 |
"model.layers.38.block_sparse_moe.router.layer",
|
| 234 |
"model.layers.38.shared_mlp.input_linear",
|
| 235 |
+
"model.layers.38.mamba.in_proj",
|
| 236 |
"model.layers.39.block_sparse_moe.router.layer",
|
| 237 |
"model.layers.39.shared_mlp.input_linear",
|
| 238 |
+
"model.layers.39.mamba.in_proj",
|
| 239 |
"lm_head"
|
| 240 |
],
|
| 241 |
"kv_cache_scheme": null,
|
|
|
|
| 243 |
"quantization_status": "compressed",
|
| 244 |
"sparsity_config": {},
|
| 245 |
"transform_config": {},
|
| 246 |
+
"version": "0.12.3.dev28+g2763f81"
|
| 247 |
},
|
| 248 |
"residual_multiplier": 0.22,
|
| 249 |
"rms_norm_eps": 1e-05,
|
|
|
|
| 251 |
"rope_theta": 10000,
|
| 252 |
"router_aux_loss_coef": 0.0,
|
| 253 |
"shared_intermediate_size": 1536,
|
| 254 |
+
"tie_word_embeddings": true,
|
| 255 |
"transformers_version": "4.57.1",
|
| 256 |
"use_cache": true,
|
| 257 |
"vocab_size": 100352
|
model-00001-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75458e16c3887a62c563c793dee5594a53853529fa58c3811be027cd414d79a2
|
| 3 |
+
size 4997838432
|
model-00002-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74db08e9e04b6f492e9d524eee5b4408f015880185e4c15fe74b1f04bed15d46
|
| 3 |
+
size 4997162928
|
model-00003-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c00d9caf3319d881d9b818b72302309dd903d3a7c27eba76825dc9cc29b169b8
|
| 3 |
+
size 4994025096
|
model-00004-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66579e49df70283bf3c410f4ed8cee49b05e6d6cdbfea3830fccbdbc688df10a
|
| 3 |
+
size 4994104968
|
model-00005-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60f126a6c9b00e193cd2fd76ba1fa8d7e20489ff681b9dc2226d4a1e06f648a6
|
| 3 |
+
size 4996130720
|
model-00006-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebe97666e703d2910654e03e9ba4e66f0f833107e27e7e1a2faa180dd9bcae2b
|
| 3 |
+
size 4999699800
|
model-00007-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02a9e8e08dcc001f495d247c72bdf63042f6213455b23a35ee39496b44bf095a
|
| 3 |
+
size 4997172264
|
model-00008-of-00008.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b19fb0b7629518a94f08d907514b753d3c2cf6b848b300e7470549f547ffefa9
|
| 3 |
+
size 637290328
|
model.safetensors.index.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|