from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AwqConfig
from huggingface_hub import notebook_login, HfApi
from peft import PeftModel, PeftConfig
from optimum.gptq import GPTQQuantizer, load_quantized_model
from accelerate import Accelerator
import torch
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
quant_dataset = "c4"
gptq_repo = "Granther/Mistral-Nemo-Instruct-GPTQ"
awq_repo = ""

# GPTQ
gptq_dir = "gptq/"
# AWQ
awq_dir = "awq/"
accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load in fp16 so the 12B model fits in memory for calibration
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
# prepare() places the model; tokenizers pass through unchanged
model, tokenizer = accelerator.prepare(model, tokenizer)
quantizer = GPTQQuantizer(
    bits=4,
    dataset=quant_dataset,    # calibration dataset defined above
    group_size=64,            # number of weights sharing one set of quantization params
    desc_act=True,            # better perplexity, slower inference
    sym=True,                 # symmetric quantization
    true_sequential=True,     # quantize layers one at a time, in order, within each block
    # block_name_to_quantize="layers.0",  # uncomment to quantize only a single block
    tokenizer=tokenizer,
)
print("Starting GPTQ quantization")
quantized_model = quantizer.quantize_model(model, tokenizer=tokenizer)
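
# Quick sanity check before saving -- not part of the original script, just a
# minimal sketch to confirm the quantized weights still generate sensibly.
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)
with torch.no_grad():
    out = quantized_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))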
tokenizer.save_pretrained(gptq_dir)
quantized_model.save_pretrained(gptq_dir)
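
# Everything below is a sketch of the follow-on steps implied by the unused
# names above (gptq_repo, load_quantized_model, awq_dir); none of it is in
# the original script.

# Push the GPTQ checkpoint to the Hub repo named above. Assumes you are
# already authenticated (e.g. via notebook_login() or `huggingface-cli login`).
quantized_model.push_to_hub(gptq_repo)
tokenizer.push_to_hub(gptq_repo)

# Reload the saved checkpoint with optimum's load_quantized_model, which
# expects an empty (meta-device) model to fill in.
from accelerate import init_empty_weights

with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
empty_model.tie_weights()
reloaded = load_quantized_model(empty_model, save_folder=gptq_dir, device_map="auto")

# AWQ path for awq_dir. The transformers AwqConfig import above only covers
# *loading* AWQ checkpoints; producing one needs the autoawq package, which
# is an assumption here.
from awq import AutoAWQForCausalLM

awq_model = AutoAWQForCausalLM.from_pretrained(model_id)
awq_model.quantize(
    tokenizer,
    quant_config={"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"},
)
awq_model.save_quantized(awq_dir)
tokenizer.save_pretrained(awq_dir)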