import os

# Set GPU environment variables (before any CUDA-aware import)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Paths
model_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
quant_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B-subclaims-support-check-8b_ctx_AWQ"

# Quantization configuration: 4-bit weights, group size 128, GEMM kernel
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load model and tokenizer
print("Loading model...")
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
print("Starting quantization (this may take a while)...")
# AutoAWQ uses a default calibration dataset (pile-val)
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
print(f"Saving quantized model to {quant_path}...")
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print("Quantization Complete!")
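
# Optional sanity check: reload the quantized weights and generate a few tokens.
# This is a minimal sketch, not part of the original pipeline; it assumes the
# saved folder loads via AutoAWQ's from_quantized, and the prompt string below
# is illustrative. Best run as a separate process so the full-precision model
# is not resident in GPU memory alongside the reloaded quantized one.
print("Reloading quantized model for a quick sanity check...")
quant_model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
quant_tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

tokens = quant_tokenizer("Hello, my name is", return_tensors="pt").input_ids.to("cuda")
output = quant_model.generate(tokens, max_new_tokens=16)
print(quant_tokenizer.decode(output[0], skip_special_tokens=True))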