Upload quantized EXAONE model

Files changed (3) hide show

config.json CHANGED Viewed

@@ -90,6 +90,15 @@
     "format": "int-quantized",
     "global_compression_ratio": null,
     "ignore": [
       "lm_head"
     ],
     "kv_cache_scheme": {

     "format": "int-quantized",
     "global_compression_ratio": null,
     "ignore": [
+      "model.layers.27.mlp.gate_proj",
+      "model.layers.27.mlp.up_proj",
+      "model.layers.27.mlp.down_proj",
+      "model.layers.28.mlp.gate_proj",
+      "model.layers.28.mlp.up_proj",
+      "model.layers.28.mlp.down_proj",
+      "model.layers.29.mlp.gate_proj",
+      "model.layers.29.mlp.up_proj",
+      "model.layers.29.mlp.down_proj",
       "lm_head"
     ],
     "kv_cache_scheme": {

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8dc16c2492096a6d724994f4e1ae746d323b9d2a32a9c3788e50b7094d7ba5f1
-size 1909656064

 version https://git-lfs.github.com/spec/v1
+oid sha256:da147ceac30e7a9680fd4ac87f8633e40eeba4a024659b4b5c34a94775440a37
+size 1985091120

recipe.yaml CHANGED Viewed

@@ -6,9 +6,12 @@ default_stage:
       - - - .*norm.*
           - [.*(q|k|v)_proj]
       ignore: []
-    QuantizationModifier:
       targets: [Linear]
-      ignore: [embed_tokens, lm_head]
       scheme: W8A8
       kv_cache_scheme:
         num_bits: 8
@@ -24,3 +27,7 @@ default_stage:
         observer: minmax
         observer_kwargs: {}
       observer: {weights: minmax, input: minmax}

       - - - .*norm.*
           - [.*(q|k|v)_proj]
       ignore: []
+    GPTQModifier:
       targets: [Linear]
+      ignore: [embed_tokens, lm_head, model.layers.27.mlp.gate_proj, model.layers.27.mlp.up_proj,
+        model.layers.27.mlp.down_proj, model.layers.28.mlp.gate_proj, model.layers.28.mlp.up_proj,
+        model.layers.28.mlp.down_proj, model.layers.29.mlp.gate_proj, model.layers.29.mlp.up_proj,
+        model.layers.29.mlp.down_proj]
       scheme: W8A8
       kv_cache_scheme:
         num_bits: 8
         observer: minmax
         observer_kwargs: {}
       observer: {weights: minmax, input: minmax}
+      block_size: 128
+      dampening_frac: 0.01
+      actorder: static
+      offload_hessians: false