"""
This module handles loading and saving of a small LLaMA-family chat model on CPU.

This is already implemented and ready to use -- you don't need to modify this file.

Key Features:
- Loads the model from Hugging Face or local storage
- Saves checkpoints in float16 with sharded safetensors files for memory efficiency
- Provides save/load functionality for model persistence
- Handles model loading errors gracefully

Example Usage:
    from model import load_model, save_model

    # Load the model (downloads it if no local copy is found)
    model, tokenizer = load_model()

    # Save the model after making changes
    save_model(model, tokenizer)
"""

import os
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model ID to download when no local copy exists.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Local directory where the model and tokenizer are cached after download.
MODEL_SAVE_PATH = "models/school_chatbot"


def save_model(model, tokenizer, save_directory=MODEL_SAVE_PATH):
    """
    Save the model and tokenizer to a local directory with CPU memory optimization.

    The model is moved to the CPU and cast to float16 before saving so the
    checkpoint stays small, then cast back to float32 afterwards.
    """
    os.makedirs(save_directory, exist_ok=True)

    # Move to CPU and halve the precision to shrink the saved checkpoint.
    model = model.cpu()
    model.half()

    try:
        # Shard the weights so no single safetensors file exceeds 500MB.
        model.save_pretrained(
            save_directory,
            safe_serialization=True,
            max_shard_size="500MB"
        )
        tokenizer.save_pretrained(save_directory)
        print(f"Model and tokenizer saved to {save_directory}")
    finally:
        # Free temporary buffers and restore float32 for further CPU use.
        gc.collect()
        model.float()


def load_model():
    """
    Load the model and tokenizer for CPU usage.

    Uses the local copy in MODEL_SAVE_PATH when it exists; otherwise downloads
    MODEL_NAME from Hugging Face and caches a local copy for the next run.
    Returns (None, None) if loading fails.
    """
    try:
        if os.path.exists(MODEL_SAVE_PATH):
            print("Loading model from local storage...")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_SAVE_PATH,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
        else:
            print("Downloading model from Hugging Face... This should take 2-3 minutes.")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )
            # Cache a local copy so future runs skip the download.
            save_model(model, tokenizer)

        model = model.to("cpu")
        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None


if __name__ == "__main__":
    model, tokenizer = load_model()
    print(model)
    print(tokenizer)
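    # Optional smoke test -- an illustrative sketch, not required elsewhere in
    # the project. It runs a short CPU generation to confirm the loaded model
    # and tokenizer work together; the prompt string is just an example.
    if model is not None and tokenizer is not None:
        inputs = tokenizer("Hello! What can you help me with?", return_tensors="pt")
        output_ids = model.generate(**inputs, max_new_tokens=20)
        print(tokenizer.decode(output_ids[0], skip_special_tokens=True))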