"""
This module handles loading and saving of LLaMA models with efficient quantization.
This is already implemented and ready to use -- you don't need to modify this file.
Key Features:
- Loads LLaMA models from Hugging Face or local storage
- Implements 4-bit quantization for memory efficiency
- Provides save/load functionality for model persistence
- Handles model loading errors gracefully
Example Usage:
from model import load_model, save_model
# Load a model (will download if not found locally)
model, tokenizer = load_model("meta-llama/Llama-2-7b-chat-hf")
# Save model after making changes
save_model(model, tokenizer)
"""
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc
# Choose a model
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Change this to your preferred model
# Other options:
# MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# MODEL_NAME = "openlm-research/open_llama_3b"
# Path to save and load models
MODEL_SAVE_PATH = "models/school_chatbot"
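
# Note: BitsAndBytesConfig is imported above but not used in this module.
# The commented-out sketch below is illustrative only -- it shows roughly how a
# 4-bit quantization config could be built and passed to from_pretrained() on a
# CUDA machine with the bitsandbytes package installed. It is not part of the
# CPU workflow implemented here, and QUANT_CONFIG is a made-up name.
#
# QUANT_CONFIG = BitsAndBytesConfig(
#     load_in_4bit=True,                     # store weights in 4-bit NF4 format
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,  # run compute in float16
# )
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=QUANT_CONFIG)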

def save_model(model, tokenizer, save_directory="models/school_chatbot"):
    """
    Save the model and tokenizer to a local directory with CPU memory optimization.
    """
    # Create directory if it doesn't exist
    os.makedirs(save_directory, exist_ok=True)

    # Move model to CPU if it's on GPU
    model = model.cpu()

    # Save in half precision to reduce file size
    model.half()  # Convert to float16

    try:
        # Save in smaller chunks
        model.save_pretrained(
            save_directory,
            safe_serialization=True,  # memory-efficient safetensors serialization
            max_shard_size="500MB"    # split the checkpoint into smaller files
        )
        # Save tokenizer (relatively small, no special handling needed)
        tokenizer.save_pretrained(save_directory)
        print(f"Model and tokenizer saved to {save_directory}")
    finally:
        # Clean up memory
        gc.collect()
        # Convert back to float32 for continued use if needed
        model.float()
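
# Example usage (illustrative only; "models/my_finetuned_chatbot" is a made-up path):
#     save_model(model, tokenizer, save_directory="models/my_finetuned_chatbot")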

def load_model():
    """
    Load the model for CPU usage
    """
    try:
        if os.path.exists(MODEL_SAVE_PATH):
            print("Loading model from local storage...")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_SAVE_PATH,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
        else:
            print("Downloading model from Hugging Face... Should take 2-3 minutes.")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
            # Save for future use
            save_model(model, tokenizer)

        # Move model to CPU
        model = model.to("cpu")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
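
# The helper below is a minimal, illustrative sketch of how a model/tokenizer pair
# returned by load_model() can be used for text generation. generate_reply is NOT
# part of the original module API, and the sampling settings are arbitrary defaults.
def generate_reply(model, tokenizer, prompt, max_new_tokens=100):
    """
    Generate a reply to `prompt` with a model/tokenizer pair from load_model().
    """
    # Tokenize the prompt into input IDs (PyTorch tensors on CPU)
    inputs = tokenizer(prompt, return_tensors="pt")
    # Sample a continuation of up to max_new_tokens tokens
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
    )
    # Decode the full sequence (prompt + continuation) back into text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)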

if __name__ == "__main__":
    model, tokenizer = load_model()
    print(model)
    print(tokenizer)