Spaces:

build-small-hackathon
/

DataSense_E2B

Running on Zero

DataSense_E2B / model_loader.py

Fix LoRA via Unsloth, custom UI, CSV upload, loading states

6214995 verified 19 days ago

1.46 kB

	"""Load Gemma-4 E2B + DataSense SFT LoRA via Unsloth (Gemma4ClippableLinear-aware)."""

	from __future__ import annotations

	import os
	from functools import lru_cache
	from typing import Any

	import torch

	from config import ADAPTER_MODEL, LOAD_IN_4BIT, MAX_SEQ_LENGTH


	@lru_cache(maxsize=1)
	def load_model_and_tokenizer() -> tuple[Any, Any]:
	    """Load the ``ADAPTER_MODEL`` checkpoint through Unsloth and return it with its tokenizer.

	    The result is memoized by ``lru_cache(maxsize=1)``, so repeated calls
	    return the same ``(model, tokenizer)`` pair instead of loading again.

	    Behavior visible in this function:

	    * The Hub token is read from ``HF_TOKEN``, falling back to
	      ``HUGGING_FACE_HUB_TOKEN`` when the former is unset or empty.
	    * 4-bit loading is requested only when ``LOAD_IN_4BIT`` is truthy and
	      CUDA is available; ``device_map`` is ``"auto"`` on CUDA, else ``None``.
	    * If the loader returns an object with a ``tokenizer`` attribute, that
	      attribute is used as the tokenizer; otherwise the object itself is.
	    * A tokenizer without a ``pad_token`` gets its ``eos_token`` as padding.
	    * Parameter counts (and allocated VRAM on CUDA) are printed to stdout.

	    Returns:
	        A ``(model, tokenizer)`` tuple, with the model switched to
	        inference/eval mode.
	    """
	    # Imported here rather than at module level, so importing this module
	    # does not by itself import unsloth.
	    from unsloth import FastModel

	    hub_token = os.environ.get("HF_TOKEN")
	    if not hub_token:
	        hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

	    # Evaluate the CUDA check once for everything decided before the load.
	    cuda_ready = torch.cuda.is_available()
	    use_4bit = LOAD_IN_4BIT and cuda_ready

	    print(f"Loading checkpoint: {ADAPTER_MODEL}")
	    print(f" 4-bit: {use_4bit}")

	    load_kwargs = {
	        "model_name": ADAPTER_MODEL,
	        "max_seq_length": MAX_SEQ_LENGTH,
	        "load_in_4bit": use_4bit,
	        "dtype": None,
	        "device_map": "auto" if cuda_ready else None,
	        "token": hub_token,
	    }
	    model, processor = FastModel.from_pretrained(**load_kwargs)

	    # The loader may hand back a processor wrapping the tokenizer, or the
	    # tokenizer itself; handle both.
	    tokenizer = getattr(processor, "tokenizer", processor)
	    if getattr(tokenizer, "pad_token", None) is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    FastModel.for_inference(model)
	    model.eval()

	    # Single pass over the parameters to tally trainable vs. total sizes.
	    n_trainable = 0
	    n_total = 0
	    for param in model.parameters():
	        size = param.numel()
	        n_total += size
	        if param.requires_grad:
	            n_trainable += size

	    print(f"✓ Loaded — trainable {n_trainable:,} / {n_total:,} ({100 * n_trainable / n_total:.2f}%)")
	    if torch.cuda.is_available():
	        print(f" VRAM: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

	    return model, tokenizer