# model_loader.py
import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM, AutoTokenizer

from config import DEVICE


def load_model(model_name):
    """
    Load a model efficiently with memory optimization.

    Supports:
    - Hugging Face repo ids
    - Local safetensors weight files

    Optimizations:
    - FP16 weights (torch.float16)
    - CPU offloading if GPU memory is low
    """
    try:
        if model_name.endswith(".safetensors"):
            print(f"[INFO] Loading safetensors weights: {model_name}")
            # A bare weights file carries no config or tokenizer, so reuse
            # the gpt2 architecture and inject the weights as a state dict.
            # (The state_dict is loaded eagerly in memory, so device_map's
            # disk-streaming placement is skipped; placement happens below.)
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            model = AutoModelForCausalLM.from_pretrained(
                "gpt2",
                state_dict=load_file(model_name),
                torch_dtype=torch.float16,
            )
        else:
            print(f"[INFO] Loading Hugging Face model: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",  # automatically places layers on GPU/CPU
                torch_dtype=torch.float16,
            )
    except RuntimeError as e:
        print(f"[WARN] GPU memory insufficient, switching to CPU offload. {e}")
        # CPU offload: build the model skeleton without allocating memory,
        # then stream the checkpoint weights directly onto the CPU. This
        # fallback assumes model_name is a Hub repo id.
        from accelerate import init_empty_weights, load_checkpoint_and_dispatch
        from huggingface_hub import snapshot_download
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(model_name)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        # load_checkpoint_and_dispatch needs a local path, so fetch the
        # repo snapshot first.
        checkpoint_path = snapshot_download(model_name)
        model = load_checkpoint_and_dispatch(
            model,
            checkpoint_path,
            device_map={"": "cpu"},
            # only relevant if the device map ever splits across devices
            no_split_module_classes=["GPT2Block"],
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Stay on CPU; moving back to the GPU would re-trigger the OOM.
        return tokenizer, model

    # Models loaded with a device_map are already placed by accelerate and
    # must not be moved again with .to(); only move single-device models.
    if not hasattr(model, "hf_device_map"):
        model.to(DEVICE)
    return tokenizer, model
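
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the loader): exercises
# load_model() end to end with a short generation pass. "gpt2" stands in for
# any causal-LM repo id; this assumes config.DEVICE points at a CUDA device,
# since FP16 matmuls are generally unsupported on CPU.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tokenizer, model = load_model("gpt2")
    inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))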