# Source: DataSense_E2B / model_loader.py (1.46 kB)
# Last commit 6214995 (verified) by sanjaymalladi:
#   "Fix LoRA via Unsloth, custom UI, CSV upload, loading states"
"""Load Gemma-4 E2B + DataSense SFT LoRA via Unsloth (Gemma4ClippableLinear-aware)."""
from __future__ import annotations
import os
from functools import lru_cache
from typing import Any
import torch
from config import ADAPTER_MODEL, LOAD_IN_4BIT, MAX_SEQ_LENGTH
@lru_cache(maxsize=1)
def load_model_and_tokenizer() -> tuple[Any, Any]:
    """Load the ``ADAPTER_MODEL`` checkpoint via Unsloth and prepare it for inference.

    The function takes no arguments and is wrapped in ``lru_cache(maxsize=1)``,
    so after the first successful call every later call returns the same
    ``(model, tokenizer)`` pair without loading again.

    Configuration read from the environment and ``config``:

    * The Hub token is taken from ``HF_TOKEN``, falling back to
      ``HUGGING_FACE_HUB_TOKEN``; ``None`` is passed when neither is set
      (or both are empty).
    * 4-bit loading is requested only when ``LOAD_IN_4BIT`` is truthy *and*
      CUDA is available.
    * ``device_map`` is ``"auto"`` when CUDA is available, otherwise ``None``.

    After loading, the pad token is set to the EOS token if the tokenizer has
    none, the model is switched to inference mode, and a short summary
    (parameter counts and, on CUDA, allocated memory) is printed to stdout.

    Returns:
        A ``(model, tokenizer)`` tuple. ``tokenizer`` is the ``tokenizer``
        attribute of the object returned by ``FastModel.from_pretrained``
        when it has one, otherwise that object itself.
    """
    # Function-scope import, as in the original — presumably to defer the
    # cost of importing Unsloth until the model is actually needed.
    from unsloth import FastModel

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

    # Evaluate these once so the value we log is exactly the value we pass on.
    cuda_available = torch.cuda.is_available()
    use_4bit = LOAD_IN_4BIT and cuda_available

    print(f"Loading checkpoint: {ADAPTER_MODEL}")
    print(f" 4-bit: {use_4bit}")

    model, processor = FastModel.from_pretrained(
        model_name=ADAPTER_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=use_4bit,
        dtype=None,
        device_map="auto" if cuda_available else None,
        token=token,
    )

    # The loader may hand back a processor wrapping a tokenizer, or a bare
    # tokenizer; normalise to the tokenizer in both cases.
    tokenizer = getattr(processor, "tokenizer", processor)
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    FastModel.for_inference(model)
    model.eval()

    # Count parameters in a single pass instead of walking the model twice.
    trainable = 0
    total = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the percentage so a model reporting zero parameters cannot raise
    # ZeroDivisionError in what is only a diagnostic message.
    percent = 100 * trainable / total if total else 0.0
    print(f"✓ Loaded — trainable {trainable:,} / {total:,} ({percent:.2f}%)")

    if cuda_available:
        print(f" VRAM: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    return model, tokenizer