# model_loader.py
import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM, AutoTokenizer

from config import DEVICE


def load_model(model_name):
    """
    Load a model efficiently with memory optimization.

    Supports:
    - Hugging Face repo ids
    - Local safetensors weight files

    Optimizations:
    - FP16 weights (torch.float16)
    - CPU offloading if GPU memory is low
    """
    try:
        if model_name.endswith(".safetensors"):
            print(f"[INFO] Loading safetensors weights: {model_name}")
            # A bare weights file carries no config or tokenizer, so reuse
            # the gpt2 architecture and inject the weights as a state dict.
            # (The state_dict is loaded eagerly in memory, so device_map's
            # disk-streaming placement is skipped; placement happens below.)
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            model = AutoModelForCausalLM.from_pretrained(
                "gpt2",
                state_dict=load_file(model_name),
                torch_dtype=torch.float16,
            )
        else:
            print(f"[INFO] Loading Hugging Face model: {model_name}")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",  # automatically places layers on GPU/CPU
                torch_dtype=torch.float16,
            )
    except RuntimeError as e:
        print(f"[WARN] GPU memory insufficient, switching to CPU offload. {e}")
        # CPU offload: build the model skeleton without allocating memory,
        # then stream the checkpoint weights directly onto the CPU. This
        # fallback assumes model_name is a Hub repo id.
        from accelerate import init_empty_weights, load_checkpoint_and_dispatch
        from huggingface_hub import snapshot_download
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(model_name)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        # load_checkpoint_and_dispatch needs a local path, so fetch the
        # repo snapshot first.
        checkpoint_path = snapshot_download(model_name)
        model = load_checkpoint_and_dispatch(
            model,
            checkpoint_path,
            device_map={"": "cpu"},
            # only relevant if the device map ever splits across devices
            no_split_module_classes=["GPT2Block"],
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Stay on CPU; moving back to the GPU would re-trigger the OOM.
        return tokenizer, model

    # Models loaded with a device_map are already placed by accelerate and
    # must not be moved again with .to(); only move single-device models.
    if not hasattr(model, "hf_device_map"):
        model.to(DEVICE)
    return tokenizer, model
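
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the loader): exercises
# load_model() end to end with a short generation pass. "gpt2" stands in for
# any causal-LM repo id; this assumes config.DEVICE points at a CUDA device,
# since FP16 matmuls are generally unsupported on CPU.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tokenizer, model = load_model("gpt2")
    inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))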