# model_loader.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from safetensors.torch import load_file
from config import DEVICE, MODEL_LIST
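
# The config module is not shown in this file. A minimal sketch of what it is
# assumed to provide, based solely on the import above (names exist in the
# import; the values below are assumptions, not confirmed by the source):
#
#     # config.py
#     import torch
#     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     MODEL_LIST = ["gpt2", "gpt2-medium"]  # presumably consumed elsewhere
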
def load_model(model_name):
    """
    Load a causal LM efficiently, with memory optimizations.

    Supports:
    - Hugging Face Hub repos
    - local safetensors weight files (assumed GPT-2-compatible, see below)

    Optimizations:
    - FP16 weights on GPU
    - CPU offload fallback when GPU memory is insufficient
    """
    try:
        if model_name.endswith(".safetensors"):
            print(f"[INFO] Loading safetensors weights: {model_name}")
            # Assumption carried over from the original code: the file holds
            # GPT-2-compatible tensors, which are injected into a GPT-2
            # skeleton. Placement uses an explicit .to(DEVICE) here, since
            # passing state_dict together with device_map="auto" conflicts in
            # some transformers versions.
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            model = AutoModelForCausalLM.from_pretrained(
                "gpt2",
                state_dict=load_file(model_name),
                torch_dtype=torch.float16,
            )
            model.to(DEVICE)  # may raise RuntimeError on OOM -> caught below
        else:
            print(f"[INFO] Loading Hugging Face model: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",   # automatically places layers on GPU/CPU
                torch_dtype=torch.float16,
            )
    except RuntimeError as e:
        print(f"[WARN] GPU memory insufficient, switching to CPU offload. {e}")
        # CPU offload: build an empty (meta-device) skeleton from the config,
        # then stream the weights in and pin every module to the CPU.
        # Note: this fallback assumes model_name is a Hub repo id or a local
        # model directory; it will not work for a bare .safetensors path.
        import os
        from accelerate import init_empty_weights, load_checkpoint_and_dispatch
        from huggingface_hub import snapshot_download
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(model_name)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        # load_checkpoint_and_dispatch needs a local path, so fetch the repo
        # first when model_name is not already a directory on disk.
        checkpoint = model_name if os.path.isdir(model_name) else snapshot_download(model_name)
        model = load_checkpoint_and_dispatch(
            model,
            checkpoint,
            device_map={"": "cpu"},
            no_split_module_classes=["GPT2Block"],  # only relevant for GPT-2-style models
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Device placement is already handled above (explicit .to(DEVICE) or
    # device_map); calling model.to(DEVICE) on a dispatched model can raise,
    # so no blanket move is done here.
    return tokenizer, model
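
# Example usage (a minimal sketch; "gpt2" is only an illustrative model id and
# the generation settings are assumptions, not part of the original module):
if __name__ == "__main__":
    tokenizer, model = load_model("gpt2")
    inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))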