"""Load a LLaMA checkpoint in 8-bit precision and save the quantized copy.

Uses the Transformers <-> bitsandbytes integration: quantization happens at
load time via ``BitsAndBytesConfig(load_in_8bit=True)``. The previous
approach (``bnb.nn.quantize_linear``) is not a real bitsandbytes API and
raised ``AttributeError``; it also loaded the full fp32 weights first,
defeating the purpose of quantizing.

Requires: transformers>=4.32 (for saving 8-bit checkpoints), bitsandbytes,
accelerate (for ``device_map="auto"``), and a CUDA device for int8 kernels.
"""
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

# NOTE(review): 'huggingface/llama-3b' is not a real Hub repo id. Using the
# open 3B LLaMA reproduction as a working default — replace with your own
# checkpoint (e.g. a local path or a gated meta-llama repo) as needed.
MODEL_NAME = "openlm-research/open_llama_3b"
OUTPUT_DIR = "llama-3b-quantized"


def main() -> None:
    """Download tokenizer + model, quantize to int8 on load, and save both."""
    tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)

    # 8-bit quantization must be requested at load time; bitsandbytes then
    # replaces nn.Linear layers with Linear8bitLt as weights are loaded.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    model = LlamaForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_config,
        device_map="auto",  # required for 8-bit loading; needs `accelerate`
    )

    # Saving int8 checkpoints is supported in transformers >= 4.32.
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Model quantized and saved successfully.")


if __name__ == "__main__":
    main()