File size: 592 Bytes
1e639fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# optimized_loading.py
import torch
from accelerate import infer_auto_device_map
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit quantization (cuts weight memory roughly 75% vs fp16).
# NOTE: passing `load_in_4bit=True` directly to from_pretrained is
# deprecated in recent transformers releases; the supported path is a
# BitsAndBytesConfig handed in via `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,  # or load_in_8bit=True for 8-bit
        bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    ),
    device_map="auto",  # accelerate places layers on available devices
    torch_dtype=torch.float16,  # dtype for the non-quantized modules
)

# CPU/disk offloading for low-RAM machines: any weights that do not fit
# on the available devices are spilled to the "offload" directory.
# NOTE(review): this rebinds `model`, discarding whatever was loaded
# above — presumably the two loads are alternative recipes; confirm.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    offload_state_dict=True,  # stage the state dict on disk while loading
    offload_folder="offload",  # directory receiving offloaded shards
    device_map="auto",  # accelerate decides GPU/CPU/disk placement
)