Update README.md
README.md
CHANGED
@@ -33,15 +33,36 @@ You can either perform the dequantization manually using this [conversion script

 **Quantization scripts:**
 ```
-
-
-
-
-
-
-
-
-
+# pip install amd-quark
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+from quark.torch import ModelQuantizer, export_safetensors
+from quark.torch.quantization import FP8E4M3PerChannelSpec
+from quark.torch.quantization.config.config import Config, QuantizationConfig
+
+ckpt_path = "unsloth/DeepSeek-R1-0528-BF16"
+exclude_layers = ["lm_head","*mlp.gate"]
+output_dir = ckpt_path.rstrip("/").split("/")[-1] + "-ptpc"
+
+# Load the original floating-point model
+model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", torch_dtype="auto", trust_remote_code=True)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
+
+# Set the quantization configuration
+FP8_PER_CHANNEL_SPEC = FP8E4M3PerChannelSpec(is_dynamic=False, ch_axis=0).to_quantization_spec()
+FP8_PER_TOKEN_DYNAMIC_SPEC = FP8E4M3PerChannelSpec(is_dynamic=True, ch_axis=1).to_quantization_spec()
+W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG = QuantizationConfig(input_tensors=FP8_PER_TOKEN_DYNAMIC_SPEC, weight=FP8_PER_CHANNEL_SPEC)
+quant_config = Config(global_quant_config=W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG, exclude=exclude_layers)
+
+# Apply quantization
+quantizer = ModelQuantizer(quant_config)
+model = quantizer.quantize_model(model)
+
+# Export quantized model
+model = quantizer.freeze(model)
+export_safetensors(model, output_dir)
+tokenizer.save_pretrained(output_dir)
 ```

 # Deployment
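In the added config, the variable names spell out the scheme: weights use static per-channel FP8 (`is_dynamic=False`, `ch_axis=0`) and activations use dynamic per-token FP8 (`is_dynamic=True`, `ch_axis=1`), the per-token/per-channel ("ptpc") scheme the output directory suffix refers to; `lm_head` and `*mlp.gate` are excluded from quantization. As a quick sanity check of the export, here is a minimal sketch (not part of the original script): it assumes the `output_dir` produced above and that `amd-quark` is installed so the Quark-exported checkpoint can be reloaded through transformers, and the prompt is purely illustrative.

```
# Minimal reload sanity check (a sketch, not part of the original script).
# Assumes output_dir matches the script above and amd-quark is installed
# so the Quark-exported FP8 weights can be loaded.
from transformers import AutoModelForCausalLM, AutoTokenizer

output_dir = "DeepSeek-R1-0528-BF16-ptpc"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(
    output_dir, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

inputs = tokenizer("What is FP8 quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```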