# optimized_loading.py
# requires: pip install accelerate bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Option 1: 4-bit quantization (cuts weight memory by ~75% vs. float16)
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # or load_in_8bit=True
    device_map="auto",
    torch_dtype=torch.float16,  # dtype for the modules that stay unquantized
)

# Option 2: CPU/disk offloading (for low GPU memory). device_map="auto"
# places layers that don't fit on the GPU into CPU RAM, then into the
# offload folder on disk.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    offload_folder="offload",    # where spilled weights are written
    offload_state_dict=True,     # offload the state dict during loading to cap peak RAM
)
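
# Quick sanity check (a minimal sketch, not part of the loading recipes above):
# print the weight memory footprint to verify the savings, then run one short
# generation. Assumes the TinyLlama-Chat tokenizer ships a chat template, and
# the prompt text here is just an illustrative placeholder.
print(f"Weight memory: {model.get_memory_footprint() / 1e9:.2f} GB")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Say hello in one sentence."}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))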