readctrl / code /convert_awq.py
shahidul034's picture
Add files using upload-large-folder tool
1db7196 verified
"""Quantize a bf16 Qwen3-32B checkpoint to 4-bit AWQ and save the result.

Loads the full-precision model, runs AutoAWQ's calibration-based weight
quantization (default pile-val calibration set), and writes the quantized
model plus tokenizer to QUANT_PATH.
"""
import os

# GPU selection must happen BEFORE any CUDA-aware library (torch via awq)
# is imported, so the process only ever sees the chosen physical device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source (full-precision bf16) checkpoint and destination for the AWQ output.
MODEL_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
QUANT_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B-subclaims-support-check-8b_ctx_AWQ"

# AWQ quantization configuration:
#   zero_point   - asymmetric quantization with per-group zero-point offsets
#   q_group_size - weights quantized in groups of 128 (size/accuracy trade-off)
#   w_bit        - 4-bit weight precision
#   version      - "GEMM" kernel layout (the common choice for inference)
QUANT_CONFIG = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}


def main() -> None:
    """Run the full load -> quantize -> save pipeline."""
    print("Loading model...")
    model = AutoAWQForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

    print("Starting quantization (this may take a while)...")
    # With no calib_data argument, AutoAWQ downloads and uses its default
    # calibration dataset (pile-val) for activation-aware scaling.
    model.quantize(tokenizer, quant_config=QUANT_CONFIG)

    print(f"Saving quantized model to {QUANT_PATH}...")
    model.save_quantized(QUANT_PATH)
    tokenizer.save_pretrained(QUANT_PATH)
    print("Quantization Complete!")


# Guard so importing this module does not kick off a multi-hour GPU job.
if __name__ == "__main__":
    main()