haoyang-amd committed on
Commit
2c46775
·
verified ·
1 Parent(s): ce2b5eb

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +30 -9
README.md CHANGED
@@ -33,15 +33,36 @@ You can either perform the dequantization manually using this [conversion script
33
 
34
  **Quantization scripts:**
35
  ```
36
- cd Quark/examples/torch/language_modeling/llm_ptq/
37
- python3 internal_scripts/quantize_quark.py \
38
- --model_dir deepseek-ai/DeepSeek-R1-0528-bf16 \
39
- --quant_scheme w_fp8_per_channel_static_a_fp8_per_token_dynamic \
40
- --exclude_layers "*lm_head" "*mlp.gate" \
41
- --num_calib_data 128 \
42
- --output_dir DeepSeek-R1-0528-ptpc \
43
- --model_export hf_format \
44
- --multi_gpu
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  ```
46
 
47
  # Deployment
 
33
 
34
  **Quantization scripts:**
35
  ```
36
+ # pip install amd-quark
37
+ from transformers import AutoTokenizer, AutoModelForCausalLM
38
+
39
+ from quark.torch import ModelQuantizer, export_safetensors
40
+ from quark.torch.quantization import FP8E4M3PerChannelSpec
41
+ from quark.torch.quantization.config.config import Config, QuantizationConfig
42
+
43
+ ckpt_path = "unsloth/DeepSeek-R1-0528-BF16"
44
+ exclude_layers = ["lm_head","*mlp.gate"]
45
+ output_dir = ckpt_path.rstrip("/").split("/")[-1] + "-ptpc"
46
+
47
+ # Load the original floating-point model
48
+ model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", torch_dtype="auto", trust_remote_code=True)
49
+ model.eval()
50
+ tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
51
+
52
+ # Set the quantization configuration
53
+ FP8_PER_CHANNEL_SPEC = FP8E4M3PerChannelSpec(is_dynamic=False, ch_axis=0).to_quantization_spec()
54
+ FP8_PER_TOKEN_DYNAMIC_SPEC = FP8E4M3PerChannelSpec(is_dynamic=True, ch_axis=1).to_quantization_spec()
55
+ W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG = QuantizationConfig(input_tensors=FP8_PER_TOKEN_DYNAMIC_SPEC, weight=FP8_PER_CHANNEL_SPEC)
56
+ quant_config = Config(global_quant_config=W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG, exclude=exclude_layers)
57
+
58
+ # Apply quantization
59
+ quantizer = ModelQuantizer(quant_config)
60
+ model = quantizer.quantize_model(model)
61
+
62
+ # Export quantized model
63
+ model = quantizer.freeze(model)
64
+ export_safetensors(model, output_dir)
65
+ tokenizer.save_pretrained(output_dir)
66
  ```
67
 
68
  # Deployment