Mistral_Test / optimized_loading.py
eesfeg's picture
Add application file
1e639fb
raw
history blame contribute delete
592 Bytes
# optimized_loading.py
import torch

from accelerate import infer_auto_device_map
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# 4-bit quantization (reduces weight memory by ~75% vs fp16).
# NOTE: passing `load_in_4bit=True` directly to from_pretrained is
# deprecated in recent transformers releases — the supported path is a
# BitsAndBytesConfig passed via `quantization_config` (still requires the
# bitsandbytes package at runtime, same as the old kwarg).
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,  # use load_in_8bit=True here for 8-bit instead
        bnb_4bit_compute_dtype=torch.float16,  # compute in fp16, not fp32
    ),
    device_map="auto",  # let accelerate place layers across available devices
    torch_dtype=torch.float16,
)
# CPU/disk offloading for low-RAM machines: layers that do not fit on the
# accelerator are spilled into the "offload" directory on disk.
# NOTE(review): this rebinds `model`, discarding the instance loaded above —
# presumably intentional in a snippet demonstrating two alternatives.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    offload_state_dict=True,   # stream the state dict to disk while loading
    offload_folder="offload",  # scratch directory for offloaded weights
    device_map="auto",         # accelerate decides GPU/CPU/disk placement
)