Spaces:
Running on Zero
Running on Zero
| """Load Gemma-4 E2B + DataSense SFT LoRA via Unsloth (Gemma4ClippableLinear-aware).""" | |
| from __future__ import annotations | |
| import os | |
| from functools import lru_cache | |
| from typing import Any | |
| import torch | |
| from config import ADAPTER_MODEL, LOAD_IN_4BIT, MAX_SEQ_LENGTH | |
def load_model_and_tokenizer() -> tuple[Any, Any]:
    """Load the checkpoint named by ``ADAPTER_MODEL`` via Unsloth, ready for inference.

    The Hugging Face access token is read from ``HF_TOKEN`` or, failing
    that, ``HUGGING_FACE_HUB_TOKEN``; ``None`` is passed when neither is set.
    4-bit loading is requested only when ``LOAD_IN_4BIT`` is truthy and CUDA
    is available. Progress, parameter counts and (on CUDA) allocated memory
    are printed to stdout.

    Returns:
        A ``(model, tokenizer)`` pair. The model has been passed through
        ``FastModel.for_inference`` and switched to eval mode. The tokenizer
        is the loaded processor's ``tokenizer`` attribute when it has one,
        otherwise the processor itself; its ``pad_token`` is set to
        ``eos_token`` when no pad token is defined.
    """
    # Function-scope import: unsloth is only imported when this function runs.
    from unsloth import FastModel

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

    print(f"Loading checkpoint: {ADAPTER_MODEL}")

    # Query CUDA once so the value that is logged is exactly the value that
    # is handed to from_pretrained.
    has_cuda = torch.cuda.is_available()
    use_4bit = LOAD_IN_4BIT and has_cuda
    print(f" 4-bit: {use_4bit}")

    model, processor = FastModel.from_pretrained(
        model_name=ADAPTER_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=use_4bit,
        dtype=None,
        device_map="auto" if has_cuda else None,
        token=token,
    )

    # The loader may hand back a processor wrapping a tokenizer, or a bare
    # tokenizer; fall back to the object itself when there is no attribute.
    tokenizer = getattr(processor, "tokenizer", processor)
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    FastModel.for_inference(model)
    model.eval()

    # Count trainable and total parameters in a single pass.
    trainable = 0
    total = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    # Guard the ratio so a model exposing no parameters cannot raise
    # ZeroDivisionError in what is only a log line.
    percent = 100 * trainable / total if total else 0.0
    # NOTE(review): the "β" glyphs below look like mis-decoded symbols --
    # confirm against the original file before changing them.
    print(f"β Loaded β trainable {trainable:,} / {total:,} ({percent:.2f}%)")

    if has_cuda:
        print(f" VRAM: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    return model, tokenizer