# File size: 1,151 Bytes
# Revision: bdd08f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# take in a model path and quantization args
import argparse

def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for one quantization run.

    Args:
        argv: Argument strings to parse. ``None`` makes argparse read
            ``sys.argv[1:]``, which is what a normal script invocation wants.

    Returns:
        Namespace holding the model/output paths, the quantization settings
        and the optional calibration overrides (``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description="Quantize a causal LM with AutoAWQ and save the result."
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        help="Model identifier or local path to load.",
    )
    # NOTE(review): the default output name is kept for backward
    # compatibility; nothing in this script ties it to a calibration dataset.
    parser.add_argument(
        "--quant_path",
        type=str,
        default="r1-14b-awq-max-ptb",
        help="Directory the quantized model and tokenizer are written to.",
    )

    # Quantization settings; the defaults match the previously hard-coded
    # quant_config.
    parser.add_argument("--w_bit", type=int, default=4, help="Weight bit width.")
    parser.add_argument(
        "--q_group_size", type=int, default=128, help="Quantization group size."
    )
    parser.add_argument(
        "--version", type=str, default="GEMM", help="AWQ kernel version name."
    )
    parser.add_argument(
        "--no_zero_point",
        dest="zero_point",
        action="store_false",
        help="Disable zero-point quantization (enabled by default).",
    )

    # Calibration overrides; when omitted they are not passed to quantize()
    # at all, so the library's own defaults apply.
    parser.add_argument(
        "--calib_data", type=str, default=None, help="Calibration dataset name."
    )
    parser.add_argument("--max_calib_samples", type=int, default=None)
    parser.add_argument("--max_calib_seq_len", type=int, default=None)
    parser.add_argument("--n_parallel_calib_samples", type=int, default=None)

    return parser.parse_args(argv)


def build_quant_config(args: argparse.Namespace) -> dict:
    """Return the ``quant_config`` dict passed to ``model.quantize``."""
    return {
        "zero_point": args.zero_point,
        "q_group_size": args.q_group_size,
        "w_bit": args.w_bit,
        "version": args.version,
    }


def build_calib_kwargs(args: argparse.Namespace) -> dict:
    """Return only the calibration options the user explicitly supplied."""
    option_names = (
        "calib_data",
        "max_calib_samples",
        "max_calib_seq_len",
        "n_parallel_calib_samples",
    )
    return {
        name: getattr(args, name)
        for name in option_names
        if getattr(args, name) is not None
    }


def main(argv=None) -> None:
    """Load the model, quantize it and save model plus tokenizer.

    Args:
        argv: Optional argument list forwarded to ``parse_args``.
    """
    args = parse_args(argv)

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_path,
        trust_remote_code=True,
    )

    # Quantize
    model.quantize(
        tokenizer,
        quant_config=build_quant_config(args),
        **build_calib_kwargs(args),
    )

    # Save quantized model
    model.save_quantized(args.quant_path)
    tokenizer.save_pretrained(args.quant_path)

    print(f'Model is quantized and saved at "{args.quant_path}"')


if __name__ == "__main__":
    main()