"""Load Gemma-4 E2B + DataSense SFT LoRA via Unsloth (Gemma4ClippableLinear-aware)."""

from __future__ import annotations

import os
from functools import lru_cache
from typing import Any

import torch

from config import ADAPTER_MODEL, LOAD_IN_4BIT, MAX_SEQ_LENGTH


@lru_cache(maxsize=1)
def load_model_and_tokenizer() -> tuple[Any, Any]:
    """Load the ``ADAPTER_MODEL`` checkpoint via Unsloth and return ``(model, tokenizer)``.

    The checkpoint is fetched with ``FastModel.from_pretrained`` using
    ``MAX_SEQ_LENGTH`` and, when CUDA is available, ``LOAD_IN_4BIT`` and
    ``device_map="auto"``. A Hugging Face token is read from ``HF_TOKEN``,
    falling back to ``HUGGING_FACE_HUB_TOKEN``.

    After loading, ``FastModel.for_inference`` and ``model.eval()`` are
    called, and a short summary (parameter counts, plus allocated CUDA memory
    when CUDA is available) is printed.

    The function is wrapped in ``lru_cache(maxsize=1)``, so the load happens
    on the first call only and later calls return the same pair.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Imported inside the function, so importing this module alone does not
    # import unsloth.
    from unsloth import FastModel

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

    cuda_ok = torch.cuda.is_available()
    # 4-bit loading is only requested when a CUDA device is present.
    use_4bit = LOAD_IN_4BIT and cuda_ok

    print(f"Loading checkpoint: {ADAPTER_MODEL}")
    print(f"  4-bit: {use_4bit}")

    load_kwargs = {
        "model_name": ADAPTER_MODEL,
        "max_seq_length": MAX_SEQ_LENGTH,
        "load_in_4bit": use_4bit,
        "dtype": None,
        "device_map": "auto" if cuda_ok else None,
        "token": hf_token,
    }
    model, processor = FastModel.from_pretrained(**load_kwargs)

    # Use the loaded object's `tokenizer` attribute when it has one;
    # otherwise treat the object itself as the tokenizer.
    tokenizer = getattr(processor, "tokenizer", processor)
    if getattr(tokenizer, "pad_token", None) is None:
        # No pad token set: reuse the end-of-sequence token.
        tokenizer.pad_token = tokenizer.eos_token

    FastModel.for_inference(model)
    model.eval()

    # Tally parameter counts in one pass over the model.
    n_trainable = 0
    n_total = 0
    for param in model.parameters():
        size = param.numel()
        n_total += size
        if param.requires_grad:
            n_trainable += size
    pct_trainable = 100 * n_trainable / n_total
    print(f"✓ Loaded — trainable {n_trainable:,} / {n_total:,} ({pct_trainable:.2f}%)")

    if cuda_ok:
        vram_gb = torch.cuda.memory_allocated() / 1e9
        print(f"  VRAM: {vram_gb:.2f} GB")

    return model, tokenizer