"""Load Gemma-4 E2B + DataSense SFT LoRA via Unsloth (Gemma4ClippableLinear-aware).""" from __future__ import annotations import os from functools import lru_cache from typing import Any import torch from config import ADAPTER_MODEL, LOAD_IN_4BIT, MAX_SEQ_LENGTH @lru_cache(maxsize=1) def load_model_and_tokenizer() -> tuple[Any, Any]: from unsloth import FastModel token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") print(f"Loading checkpoint: {ADAPTER_MODEL}") print(f" 4-bit: {LOAD_IN_4BIT and torch.cuda.is_available()}") model, processor = FastModel.from_pretrained( model_name=ADAPTER_MODEL, max_seq_length=MAX_SEQ_LENGTH, load_in_4bit=LOAD_IN_4BIT and torch.cuda.is_available(), dtype=None, device_map="auto" if torch.cuda.is_available() else None, token=token, ) tokenizer = getattr(processor, "tokenizer", processor) if getattr(tokenizer, "pad_token", None) is None: tokenizer.pad_token = tokenizer.eos_token FastModel.for_inference(model) model.eval() trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f"✓ Loaded — trainable {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)") if torch.cuda.is_available(): print(f" VRAM: {torch.cuda.memory_allocated() / 1e9:.2f} GB") return model, tokenizer