Update README.md
README.md
CHANGED
@@ -33,15 +33,36 @@ You can either perform the dequantization manually using this [conversion script

 **Quantization scripts:**
 ```
-
-
-
-
-
-
-
-
-
+# pip install amd-quark
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+from quark.torch import ModelQuantizer, export_safetensors
+from quark.torch.quantization import FP8E4M3PerChannelSpec
+from quark.torch.quantization.config.config import Config, QuantizationConfig
+
+ckpt_path = "unsloth/DeepSeek-R1-0528-BF16"
+exclude_layers = ["lm_head","*mlp.gate"]
+output_dir = ckpt_path.rstrip("/").split("/")[-1] + "-ptpc"
+
+# Load the original floating-point model
+model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", torch_dtype="auto", trust_remote_code=True)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
+
+# Set the quantization configuration
+FP8_PER_CHANNEL_SPEC = FP8E4M3PerChannelSpec(is_dynamic=False, ch_axis=0).to_quantization_spec()
+FP8_PER_TOKEN_DYNAMIC_SPEC = FP8E4M3PerChannelSpec(is_dynamic=True, ch_axis=1).to_quantization_spec()
+W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG = QuantizationConfig(input_tensors=FP8_PER_TOKEN_DYNAMIC_SPEC, weight=FP8_PER_CHANNEL_SPEC)
+quant_config = Config(global_quant_config=W_FP8_PER_CHANNEL_STATIC_A_FP8_PER_TOKEN_DYNAMIC_CONFIG, exclude=exclude_layers)
+
+# Apply quantization
+quantizer = ModelQuantizer(quant_config)
+model = quantizer.quantize_model(model)
+
+# Export quantized model
+model = quantizer.freeze(model)
+export_safetensors(model, output_dir)
+tokenizer.save_pretrained(output_dir)
 ```

 # Deployment
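In the added config, the variable names spell out the scheme: weights use static per-channel FP8 (`is_dynamic=False`, `ch_axis=0`) and activations use dynamic per-token FP8 (`is_dynamic=True`, `ch_axis=1`), the per-token/per-channel ("ptpc") scheme the output directory suffix refers to; `lm_head` and `*mlp.gate` are excluded from quantization. As a quick sanity check of the export, here is a minimal sketch (not part of the original script): it assumes the `output_dir` produced above and that `amd-quark` is installed so the Quark-exported checkpoint can be reloaded through transformers, and the prompt is purely illustrative.

```
# Minimal reload sanity check (a sketch, not part of the original script).
# Assumes output_dir matches the script above and amd-quark is installed
# so the Quark-exported FP8 weights can be loaded.
from transformers import AutoModelForCausalLM, AutoTokenizer

output_dir = "DeepSeek-R1-0528-BF16-ptpc"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(
    output_dir, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

inputs = tokenizer("What is FP8 quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```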