"""
This module handles loading and saving of a small LLaMA-family chat model on CPU.

This is already implemented and ready to use -- you don't need to modify this file.

Key Features:
- Loads the model from Hugging Face or local storage
- Saves checkpoints in float16 with sharded safetensors files for memory efficiency
- Provides save/load functionality for model persistence
- Handles model loading errors gracefully

Example Usage:
    from model import load_model, save_model

    # Load the model (downloads it if no local copy is found)
    model, tokenizer = load_model()

    # Save the model after making changes
    save_model(model, tokenizer)
"""

import os
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model ID to download when no local copy exists.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Local directory where the model and tokenizer are cached after download.
MODEL_SAVE_PATH = "models/school_chatbot"


def save_model(model, tokenizer, save_directory=MODEL_SAVE_PATH):
    """
    Save the model and tokenizer to a local directory with CPU memory optimization.

    The model is moved to the CPU and cast to float16 before saving so the
    checkpoint stays small, then cast back to float32 afterwards.
    """
    os.makedirs(save_directory, exist_ok=True)

    # Move to CPU and halve the precision to shrink the saved checkpoint.
    model = model.cpu()
    model.half()

    try:
        # Shard the weights so no single safetensors file exceeds 500MB.
        model.save_pretrained(
            save_directory,
            safe_serialization=True,
            max_shard_size="500MB"
        )
        tokenizer.save_pretrained(save_directory)
        print(f"Model and tokenizer saved to {save_directory}")
    finally:
        # Free temporary buffers and restore float32 for further CPU use.
        gc.collect()
        model.float()


def load_model():
    """
    Load the model and tokenizer for CPU usage.

    Uses the local copy in MODEL_SAVE_PATH when it exists; otherwise downloads
    MODEL_NAME from Hugging Face and caches a local copy for the next run.
    Returns (None, None) if loading fails.
    """
    try:
        if os.path.exists(MODEL_SAVE_PATH):
            print("Loading model from local storage...")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_SAVE_PATH,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
        else:
            print("Downloading model from Hugging Face... This should take 2-3 minutes.")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
            # Cache a local copy so future runs skip the download.
            save_model(model, tokenizer)

        model = model.to("cpu")
        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None


if __name__ == "__main__":
    model, tokenizer = load_model()
    print(model)
    print(tokenizer)
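    # Optional smoke test -- an illustrative sketch, not required elsewhere in
    # the project. It runs a short CPU generation to confirm the loaded model
    # and tokenizer work together; the prompt string is just an example.
    if model is not None and tokenizer is not None:
        inputs = tokenizer("Hello! What can you help me with?", return_tensors="pt")
        output_ids = model.generate(**inputs, max_new_tokens=20)
        print(tokenizer.decode(output_ids[0], skip_special_tokens=True))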