rag-backend / model/loader.py
imtrt004
fix: update model
ab16882
"""
Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.
All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).
Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
#1 TinyLlama/TinyLlama-1.1B-Chat-v1.0 ~1 GB 40-60 tok/s Apache 2.0 demos, prototypes
#2 Qwen/Qwen3-0.6B ~0.5 GB 45-55 tok/s Apache 2.0 speed-critical, Think mode
#3 meta-llama/Llama-3.2-1B-Instruct ~1.5 GB 35-50 tok/s Community 128K ctx, long-context (needs HF_TOKEN)
#4 HuggingFaceTB/SmolLM2-1.7B-Instruct ~2 GB 25-35 tok/s Apache 2.0 good quality/size ratio
#5 HuggingFaceTB/SmolLM2-360M-Instruct [DEF] ~0.4 GB 60-80 tok/s Apache 2.0 fastest, no token needed
#6 Qwen/Qwen2.5-1.5B-Instruct ~2 GB 25-40 tok/s Apache 2.0 multilingual, 32K ctx
#7 stabilityai/stablelm-2-zephyr-1_6b ~2 GB 25-40 tok/s MIT DPO-tuned chat feel
#8 Qwen/Qwen2.5-Coder-1.5B-Instruct ~2 GB 25-40 tok/s Apache 2.0 code completion/review
#9 microsoft/phi-2 ~3 GB 18-28 tok/s MIT reasoning & logic
#10 google/gemma-3-1b-it ~1.5 GB 35-48 tok/s Gemma ToU multilingual, 140+ langs
Note:
- Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
- Qwen3 supports /think and /no_think prefixes for reasoning depth control.
"""
import os
import time
import threading
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.log import section, step, ok, warn, error
# Suppress torch_dtype deprecation warning from transformers dev build:
# _load() deliberately passes torch_dtype= (not dtype=) for compatibility
# with older transformers releases, so this FutureWarning is expected noise.
warnings.filterwarnings(
    "ignore",
    message=r".*torch_dtype.*is deprecated.*Use.*dtype.*",
    category=FutureWarning,
)
MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
# Models that need trust_remote_code=True (custom architectures)
_TRUST_REMOTE_CODE_MODELS = (
"LiquidAI/",
"DavidAU/LFM",
)
def _needs_trust_remote_code(model_id: str) -> bool:
return any(model_id.startswith(prefix) for prefix in _TRUST_REMOTE_CODE_MODELS)
# ── Module-level loader state ─────────────────────────────────────────────
# Mutated by _load() and switch_model(); read by the getter functions below.
# switch_model() serializes concurrent switches through _switch_lock.
_tokenizer: AutoTokenizer | None = None   # shared tokenizer, None until first load
_llm: AutoModelForCausalLM | None = None  # shared model, None until first load
_llm_ready: bool = False                  # True once weights are loaded and eval()'d
_loading: bool = False                    # True while a load/switch is in flight
_loading_msg: str = ""                    # human-readable progress for admin polling
_loading_error: str | None = None         # last switch failure message, if any
_switch_lock = threading.Lock()           # guards the unload→reload sequence
def _load() -> None:
    """Load the tokenizer and weights for MODEL_ID into the module globals.

    Idempotent: returns immediately when a model is already resident.
    Publishes progress through _loading_msg (polled via get_loading_status)
    and flips _llm_ready to True on success.  Any exception from the HF
    downloads is logged and re-raised for the caller to record.
    """
    global _tokenizer, _llm, _llm_ready, _loading_msg
    if _llm is not None:
        return  # already loaded — nothing to do
    t0 = time.perf_counter()
    _trc = _needs_trust_remote_code(MODEL_ID)
    section("MODEL", f"Loading {MODEL_ID}")
    if _trc:
        step("MODEL", "trust_remote_code=True (custom architecture)")
    # ── Tokenizer ─────────────────────────────────────────────────────────────
    _loading_msg = "Loading tokenizer…"  # fix: was a pointless f-string
    step("MODEL", "Fetching tokenizer…")  # fix: was a pointless f-string
    try:
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=_trc)
        ok("MODEL", "Tokenizer loaded")
    except Exception as exc:
        error("MODEL", f"Tokenizer load failed → {exc}")
        raise
    # ── Weights ───────────────────────────────────────────────────────────────
    _loading_msg = "Loading model weights… (downloads on first run, then cached)"
    step("MODEL", "Loading weights (first run will download — subsequent boots use cache)")
    device_info = "CUDA" if torch.cuda.is_available() else "CPU"
    step("MODEL", f"Device: {device_info} · dtype: bfloat16")
    try:
        # torch_dtype= kept (not dtype=) for older-transformers compatibility;
        # the resulting FutureWarning is filtered at module import time.
        _llm = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            trust_remote_code=_trc,
            device_map="cpu",
            low_cpu_mem_usage=True,
        )
    except Exception as exc:
        error("MODEL", str(exc))
        raise
    _llm.eval()  # inference mode: disables dropout etc.
    _llm_ready = True
    _loading_msg = ""
    elapsed = time.perf_counter() - t0
    params = sum(p.numel() for p in _llm.parameters()) / 1e6
    ok("MODEL", f"Ready · {params:.0f}M params · {elapsed:.1f}s")
    section("MODEL", "Model online")
def get_tokenizer() -> AutoTokenizer:
    """Ensure the model stack is loaded, then hand back the shared tokenizer."""
    _load()
    tok = _tokenizer
    return tok  # type: ignore
def get_llm() -> AutoModelForCausalLM:
    """Ensure the model stack is loaded, then hand back the shared model."""
    _load()
    model = _llm
    return model  # type: ignore
def get_model_name() -> str:
    """Report which model repo id is currently configured."""
    current = MODEL_ID
    return current
def is_loading() -> bool:
    """Whether a model load/switch is currently in progress."""
    return bool(_loading)
def get_loading_status() -> dict:
    """Snapshot the loader's state machine for the admin polling endpoint."""
    status: dict = {
        "model": MODEL_ID,
        "ready": _llm_ready,
    }
    status["loading"] = _loading
    status["msg"] = _loading_msg
    status["error"] = _loading_error
    return status
def switch_model(new_model_id: str) -> None:
    """Unload the current model, update MODEL_ID, and load the new model.

    Designed to be called from a background thread: progress and failures
    are published through the _loading/_loading_msg/_loading_error globals
    (polled via get_loading_status) rather than raised to the caller.
    Concurrent switches are serialized by _switch_lock.
    """
    global MODEL_ID, _tokenizer, _llm, _llm_ready
    global _loading, _loading_msg, _loading_error
    with _switch_lock:
        prev = MODEL_ID
        section("SWITCH", f"{prev} → {new_model_id}")
        _loading = True
        _loading_error = None
        _loading_msg = f"Unloading {prev}…"
        # Flip not-ready before dropping references so readers see a
        # consistent "unavailable" state during the swap.
        _llm_ready = False
        # ── Release current model from memory ─────────────────────────────────
        step("SWITCH", f"Unloading {prev}")
        try:
            import gc
            _llm = None
            _tokenizer = None
            gc.collect()  # reclaim the dropped weight tensors promptly
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                step("SWITCH", "CUDA cache cleared")
            ok("SWITCH", "Memory freed")
        except Exception as exc:
            # Best-effort cleanup: a failure here must not block the new load.
            warn("SWITCH", f"Cleanup warning: {exc}")
        MODEL_ID = new_model_id
        step("SWITCH", f"Starting load of {new_model_id}")
        try:
            _load()  # uses updated MODEL_ID; sets _llm_ready = True
            _loading = False
            ok("SWITCH", f"Switch complete → {new_model_id}")
        except Exception as exc:
            # NOTE(review): MODEL_ID keeps the failed id here, so a later
            # _load() will retry the broken model instead of reverting to
            # `prev` — confirm this is intentional.
            _loading_error = str(exc)
            _loading = False
            _loading_msg = ""
            error("SWITCH", f"Failed to load {new_model_id}\n    {exc}")
def is_llm_ready() -> bool:
    """Whether weights are resident and the model can serve requests."""
    return bool(_llm_ready)