rag-backend / model/loader.py
imtrt004
fix: update model
ab16882
"""
Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.
All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).
Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
#1 TinyLlama/TinyLlama-1.1B-Chat-v1.0 ~1 GB 40-60 tok/s Apache 2.0 demos, prototypes
#2 Qwen/Qwen3-0.6B ~0.5 GB 45-55 tok/s Apache 2.0 speed-critical, Think mode
#3 meta-llama/Llama-3.2-1B-Instruct ~1.5 GB 35-50 tok/s Community 128K ctx, long-context (needs HF_TOKEN)
#4 HuggingFaceTB/SmolLM2-1.7B-Instruct ~2 GB 25-35 tok/s Apache 2.0 good quality/size ratio
#5 HuggingFaceTB/SmolLM2-360M-Instruct [DEF] ~0.4 GB 60-80 tok/s Apache 2.0 fastest, no token needed
#6 Qwen/Qwen2.5-1.5B-Instruct ~2 GB 25-40 tok/s Apache 2.0 multilingual, 32K ctx
#7 stabilityai/stablelm-2-zephyr-1_6b ~2 GB 25-40 tok/s MIT DPO-tuned chat feel
#8 Qwen/Qwen2.5-Coder-1.5B-Instruct ~2 GB 25-40 tok/s Apache 2.0 code completion/review
#9 microsoft/phi-2 ~3 GB 18-28 tok/s MIT reasoning & logic
#10 google/gemma-3-1b-it ~1.5 GB 35-48 tok/s Gemma ToU multilingual, 140+ langs
Note:
- Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
- Qwen3 supports /think and /no_think prefixes for reasoning depth control.
"""
import os
import time
import threading
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.log import section, step, ok, warn, error
# Suppress torch_dtype deprecation warning from transformers dev build:
# _load() deliberately passes torch_dtype= (not dtype=) for compatibility
# with older transformers releases, so this FutureWarning is expected noise.
warnings.filterwarnings(
    "ignore",
    message=r".*torch_dtype.*is deprecated.*Use.*dtype.*",
    category=FutureWarning,
)
MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
# Models that need trust_remote_code=True (custom architectures)
_TRUST_REMOTE_CODE_MODELS = (
"LiquidAI/",
"DavidAU/LFM",
)
def _needs_trust_remote_code(model_id: str) -> bool:
return any(model_id.startswith(prefix) for prefix in _TRUST_REMOTE_CODE_MODELS)
# ── Module-level loader state ─────────────────────────────────────────────
# Mutated by _load() and switch_model(); read by the getter functions below.
# switch_model() serializes concurrent switches through _switch_lock.
_tokenizer: AutoTokenizer | None = None   # shared tokenizer, None until first load
_llm: AutoModelForCausalLM | None = None  # shared model, None until first load
_llm_ready: bool = False                  # True once weights are loaded and eval()'d
_loading: bool = False                    # True while a load/switch is in flight
_loading_msg: str = ""                    # human-readable progress for admin polling
_loading_error: str | None = None         # last switch failure message, if any
_switch_lock = threading.Lock()           # guards the unload→reload sequence
def _load() -> None:
    """Load the tokenizer and weights for MODEL_ID into the module globals.

    Idempotent: returns immediately when a model is already resident.
    Publishes progress through _loading_msg (polled via get_loading_status)
    and flips _llm_ready to True on success.  Any exception from the HF
    downloads is logged and re-raised for the caller to record.
    """
    global _tokenizer, _llm, _llm_ready, _loading_msg
    if _llm is not None:
        return  # already loaded — nothing to do
    t0 = time.perf_counter()
    _trc = _needs_trust_remote_code(MODEL_ID)
    section("MODEL", f"Loading {MODEL_ID}")
    if _trc:
        step("MODEL", "trust_remote_code=True (custom architecture)")
    # ── Tokenizer ─────────────────────────────────────────────────────────────
    _loading_msg = "Loading tokenizer…"  # fix: was a pointless f-string
    step("MODEL", "Fetching tokenizer…")  # fix: was a pointless f-string
    try:
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=_trc)
        ok("MODEL", "Tokenizer loaded")
    except Exception as exc:
        error("MODEL", f"Tokenizer load failed → {exc}")
        raise
    # ── Weights ───────────────────────────────────────────────────────────────
    _loading_msg = "Loading model weights… (downloads on first run, then cached)"
    step("MODEL", "Loading weights (first run will download — subsequent boots use cache)")
    device_info = "CUDA" if torch.cuda.is_available() else "CPU"
    step("MODEL", f"Device: {device_info} · dtype: bfloat16")
    try:
        # torch_dtype= kept (not dtype=) for older-transformers compatibility;
        # the resulting FutureWarning is filtered at module import time.
        _llm = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            trust_remote_code=_trc,
            device_map="cpu",
            low_cpu_mem_usage=True,
        )
    except Exception as exc:
        error("MODEL", str(exc))
        raise
    _llm.eval()  # inference mode: disables dropout etc.
    _llm_ready = True
    _loading_msg = ""
    elapsed = time.perf_counter() - t0
    params = sum(p.numel() for p in _llm.parameters()) / 1e6
    ok("MODEL", f"Ready · {params:.0f}M params · {elapsed:.1f}s")
    section("MODEL", "Model online")
def get_tokenizer() -> AutoTokenizer:
    """Ensure the model stack is loaded, then hand back the shared tokenizer."""
    _load()
    tok = _tokenizer
    return tok  # type: ignore
def get_llm() -> AutoModelForCausalLM:
    """Ensure the model stack is loaded, then hand back the shared model."""
    _load()
    model = _llm
    return model  # type: ignore
def get_model_name() -> str:
    """Report which model repo id is currently configured."""
    current = MODEL_ID
    return current
def is_loading() -> bool:
    """Whether a model load/switch is currently in progress."""
    return bool(_loading)
def get_loading_status() -> dict:
    """Snapshot the loader's state machine for the admin polling endpoint."""
    status: dict = {
        "model": MODEL_ID,
        "ready": _llm_ready,
    }
    status["loading"] = _loading
    status["msg"] = _loading_msg
    status["error"] = _loading_error
    return status
def switch_model(new_model_id: str) -> None:
    """Unload the current model, update MODEL_ID, and load the new model.

    Designed to be called from a background thread: progress and failures
    are published through the _loading/_loading_msg/_loading_error globals
    (polled via get_loading_status) rather than raised to the caller.
    Concurrent switches are serialized by _switch_lock.
    """
    global MODEL_ID, _tokenizer, _llm, _llm_ready
    global _loading, _loading_msg, _loading_error
    with _switch_lock:
        prev = MODEL_ID
        section("SWITCH", f"{prev} → {new_model_id}")
        _loading = True
        _loading_error = None
        _loading_msg = f"Unloading {prev}…"
        # Flip not-ready before dropping references so readers see a
        # consistent "unavailable" state during the swap.
        _llm_ready = False
        # ── Release current model from memory ─────────────────────────────────
        step("SWITCH", f"Unloading {prev}")
        try:
            import gc
            _llm = None
            _tokenizer = None
            gc.collect()  # reclaim the dropped weight tensors promptly
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                step("SWITCH", "CUDA cache cleared")
            ok("SWITCH", "Memory freed")
        except Exception as exc:
            # Best-effort cleanup: a failure here must not block the new load.
            warn("SWITCH", f"Cleanup warning: {exc}")
        MODEL_ID = new_model_id
        step("SWITCH", f"Starting load of {new_model_id}")
        try:
            _load()  # uses updated MODEL_ID; sets _llm_ready = True
            _loading = False
            ok("SWITCH", f"Switch complete → {new_model_id}")
        except Exception as exc:
            # NOTE(review): MODEL_ID keeps the failed id here, so a later
            # _load() will retry the broken model instead of reverting to
            # `prev` — confirm this is intentional.
            _loading_error = str(exc)
            _loading = False
            _loading_msg = ""
            error("SWITCH", f"Failed to load {new_model_id}\n    {exc}")
def is_llm_ready() -> bool:
    """Whether weights are resident and the model can serve requests."""
    return bool(_llm_ready)