import os

# Pin the job to a single GPU. These must be set before any import that
# initializes CUDA (awq pulls in torch below), or they have no effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Paths: BF16 source checkpoint and output directory for the AWQ model
model_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
quant_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B-subclaims-support-check-8b_ctx_AWQ"
# Quantization configuration (standard 4-bit AWQ settings)
quant_config = {
    "zero_point": True,   # asymmetric quantization with zero points
    "q_group_size": 128,  # quantize weights in groups of 128
    "w_bit": 4,           # 4-bit weights
    "version": "GEMM",    # GEMM kernels; the common default
}
# Load model and tokenizer
print("Loading model...")
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize. AutoAWQ runs activation-aware calibration using its default
# calibration dataset (pile-val) when no calibration data is supplied.
print("Starting quantization (this may take a while)...")
model.quantize(tokenizer, quant_config=quant_config)
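# If the model targets a narrow domain, calibrating on in-domain text can
# help quality. A sketch (assumes a recent AutoAWQ where `calib_data`
# accepts a list of strings; check your installed version):
#
#   calib_texts = [...]  # a few hundred representative samples
#   model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_texts)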
# Save quantized model and tokenizer files alongside it
print(f"Saving quantized model to {quant_path}...")
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print("Quantization Complete!")