# naruto-lora / quantize_llama.py
# Author: poppingout1325
# Commit: "End of training" (5f58fbe, verified)
"""Load a LLaMA checkpoint, quantize it to 8-bit, and save the result.

Fix: the original called ``bnb.nn.quantize_linear``, which does not exist in
the bitsandbytes public API and raises ``AttributeError``. The supported way
to 8-bit-quantize a transformers model is to pass
``BitsAndBytesConfig(load_in_8bit=True)`` to ``from_pretrained`` — bitsandbytes
is then used internally by transformers to replace the Linear layers.
"""
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
import torch
import bitsandbytes as bnb  # backend for the 8-bit linear layers; imported to fail fast if missing

# Model repo id.
# NOTE(review): 'huggingface/llama-3b' does not look like a real Hub repo id —
# confirm against the intended checkpoint (e.g. 'openlm-research/open_llama_3b').
model_name = 'huggingface/llama-3b'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Quantize at load time: weights are converted to int8 (LLM.int8 scheme) as
# the checkpoint is loaded; device_map='auto' is required by transformers for
# 8-bit loading so layers are placed on available devices.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map='auto',
)

# Persist the quantized model and its tokenizer side by side.
model.save_pretrained('llama-3b-quantized')
tokenizer.save_pretrained('llama-3b-quantized')
print("Model quantized and saved successfully.")