import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM, AutoTokenizer

from config import DEVICE, MODEL_LIST

|
def load_model(model_name):
    """
    Load a model and tokenizer efficiently with memory optimization.

    Supports:
    - Hugging Face Hub repo IDs
    - local .safetensors weight files (loaded into the GPT-2 architecture)

    Optimizations:
    - FP16 weights on load
    - CPU offloading if GPU memory is low
    """
    try:
        if model_name.endswith(".safetensors"):
            print(f"[INFO] Loading safetensor model: {model_name}")
            # A bare weight file carries no tokenizer or config, so reuse GPT-2's.
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            model = AutoModelForCausalLM.from_pretrained(
                "gpt2",
                state_dict=load_file(model_name),
                device_map="auto",
                torch_dtype=torch.float16,
            )
        else:
            print(f"[INFO] Loading Hugging Face model: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.float16,
            )
    except RuntimeError as e:
        print(f"[WARN] GPU memory insufficient, switching to CPU offload. {e}")

        from accelerate import init_empty_weights, load_checkpoint_and_dispatch
        from huggingface_hub import snapshot_download
        from transformers import AutoConfig

        # A local .safetensors file has no config of its own, so fall back to
        # GPT-2's config and tokenizer, matching the branch above. Hub repos
        # must be materialized locally first, because
        # load_checkpoint_and_dispatch expects a path, not a repo ID.
        if model_name.endswith(".safetensors"):
            base, checkpoint = "gpt2", model_name
        else:
            base, checkpoint = model_name, snapshot_download(model_name)

        config = AutoConfig.from_pretrained(base)
        # Build the model skeleton without allocating weight memory, then
        # stream the checkpoint weights directly onto the CPU.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        model = load_checkpoint_and_dispatch(
            model,
            checkpoint,
            device_map={"": "cpu"},
            no_split_module_classes=model._no_split_modules,
        )
        tokenizer = AutoTokenizer.from_pretrained(base)

    # Models loaded with a device_map are already placed by accelerate;
    # calling .to() on a dispatched model raises an error in recent
    # transformers versions, so only move models that were not dispatched.
    if not hasattr(model, "hf_device_map"):
        model.to(DEVICE)
    return tokenizer, model
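

# Minimal usage sketch. The model name below is illustrative; any causal-LM
# repo ID or local .safetensors path accepted by load_model() works here.
# The prompt and generation settings are assumptions, not project defaults.
if __name__ == "__main__":
    tokenizer, model = load_model("gpt2")
    inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))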