import json
import os
import sys

from transformers import AutoTokenizer

from llmtuner.compression.quantization.AutoAWQ.awq import AutoAWQForCausalLM

model_path = sys.argv[1]
quant_path = sys.argv[2]
bits = sys.argv[3]

# Weight-quantization group size; 128 is the common AutoAWQ default
# (assumed here, since the original left this variable undefined).
q_group_size = 128

quant_config = {
    "zero_point": True,
    "q_group_size": q_group_size,
    "w_bit": int(bits),
    "version": "GEMM",
}
print(f"quant_config: {quant_config}")

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path, low_cpu_mem_usage=True, use_cache=False
)

# Prefer the slow tokenizer; fall back to the fast one if it cannot be loaded.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=False,
        trust_remote_code=True,
    )
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=True,
        trust_remote_code=True,
    )

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# Persist the quantization config alongside the weights.
with open(os.path.join(quant_path, "quantize_config.json"), "w") as f:
    f.write(json.dumps(quant_config, indent=2, sort_keys=True))

print(f'Model is quantized and saved at "{quant_path}"')
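
# Usage sketch (the script name and paths below are illustrative assumptions,
# not fixed by the repo):
#
#   python quantize.py /path/to/base-model /path/to/awq-output 4
#
# To load the quantized checkpoint afterwards, assuming the vendored AutoAWQ
# fork mirrors upstream AutoAWQ's `from_quantized` API:
#
#   from llmtuner.compression.quantization.AutoAWQ.awq import AutoAWQForCausalLM
#   model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)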