"""
Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.

All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).

Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):

  #1   TinyLlama/TinyLlama-1.1B-Chat-v1.0          ~1 GB    40-60 tok/s  Apache 2.0  demos, prototypes
  #2   Qwen/Qwen3-0.6B                             ~0.5 GB  45-55 tok/s  Apache 2.0  speed-critical, Think mode
  #3   meta-llama/Llama-3.2-1B-Instruct            ~1.5 GB  35-50 tok/s  Community   128K ctx, long-context (needs HF_TOKEN)
  #4   HuggingFaceTB/SmolLM2-1.7B-Instruct         ~2 GB    25-35 tok/s  Apache 2.0  good quality/size ratio
  #5   HuggingFaceTB/SmolLM2-360M-Instruct [DEF]   ~0.4 GB  60-80 tok/s  Apache 2.0  fastest, no token needed
  #6   Qwen/Qwen2.5-1.5B-Instruct                  ~2 GB    25-40 tok/s  Apache 2.0  multilingual, 32K ctx
  #7   stabilityai/stablelm-2-zephyr-1_6b          ~2 GB    25-40 tok/s  MIT         DPO-tuned chat feel
  #8   Qwen/Qwen2.5-Coder-1.5B-Instruct            ~2 GB    25-40 tok/s  Apache 2.0  code completion/review
  #9   microsoft/phi-2                             ~3 GB    18-28 tok/s  MIT         reasoning & logic
  #10  google/gemma-3-1b-it                        ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs

Note:
  - Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
  - Qwen3 supports /think and /no_think prefixes for reasoning depth control.
"""

import gc
import os
import threading
import time
import warnings

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from model.log import section, step, ok, warn, error

# Suppress torch_dtype deprecation warning from transformers dev build
warnings.filterwarnings(
    "ignore",
    message=r".*torch_dtype.*is deprecated.*Use.*dtype.*",
    category=FutureWarning,
)

MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")

# Models that need trust_remote_code=True (custom architectures)
_TRUST_REMOTE_CODE_MODELS = (
    "LiquidAI/",
    "DavidAU/LFM",
)

# Placement passed to from_pretrained(); also what the load log reports, so
# the log can never disagree with where the weights actually live.
_DEVICE_MAP = "cpu"


def _needs_trust_remote_code(model_id: str) -> bool:
    """Return True when *model_id* starts with a prefix listed in
    ``_TRUST_REMOTE_CODE_MODELS``."""
    # str.startswith accepts a tuple of prefixes: one call, no Python-level loop.
    return model_id.startswith(_TRUST_REMOTE_CODE_MODELS)


_tokenizer: AutoTokenizer | None = None
_llm: AutoModelForCausalLM | None = None
_llm_ready: bool = False
_loading: bool = False
_loading_msg: str = ""
_loading_error: str | None = None

# Serialises whole switch_model() calls against each other.
_switch_lock = threading.Lock()
# Serialises every load/unload of the model globals. Re-entrant because
# switch_model() holds it while calling _load(), which acquires it again.
# Lock order is always _switch_lock -> _load_lock.
_load_lock = threading.RLock()


def _load() -> None:
    """Load the tokenizer and weights for ``MODEL_ID`` if no model is loaded.

    Returns immediately when a model is already published. Otherwise the
    tokenizer and model are built in local variables and only assigned to the
    module globals once both have loaded and the model is in eval mode, so a
    failed load never leaves a tokenizer without a model.

    Raises:
        Exception: whatever ``from_pretrained`` raised; it is logged and
            re-raised unchanged.
    """
    global _tokenizer, _llm, _llm_ready, _loading_msg

    # Fast path: no locking once a model is published.
    if _llm is not None:
        return

    with _load_lock:
        # Re-check: another thread may have finished loading while we waited.
        if _llm is not None:
            return

        model_id = MODEL_ID
        t0 = time.perf_counter()
        trc = _needs_trust_remote_code(model_id)

        section("MODEL", f"Loading {model_id}")
        if trc:
            step("MODEL", "trust_remote_code=True (custom architecture)")

        try:
            # ── Tokenizer ─────────────────────────────────────────────────────
            _loading_msg = "Loading tokenizer…"
            step("MODEL", "Fetching tokenizer…")
            try:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_id, trust_remote_code=trc
                )
            except Exception as exc:
                error("MODEL", f"Tokenizer load failed → {exc}")
                raise
            ok("MODEL", "Tokenizer loaded")

            # ── Weights ───────────────────────────────────────────────────────
            _loading_msg = "Loading model weights… (downloads on first run, then cached)"
            step("MODEL", "Loading weights (first run will download — subsequent boots use cache)")
            # Report the placement we actually request below, not mere CUDA
            # availability.
            step("MODEL", f"Device: {_DEVICE_MAP.upper()} · dtype: bfloat16")
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    torch_dtype=torch.bfloat16,
                    trust_remote_code=trc,
                    device_map=_DEVICE_MAP,
                    low_cpu_mem_usage=True,
                )
            except Exception as exc:
                error("MODEL", str(exc))
                raise
            model.eval()
        finally:
            # Cleared on success and on failure so status polling never shows
            # a stale "Loading…" message after the attempt has ended.
            _loading_msg = ""

        # Publish the tokenizer first: the fast path above keys on _llm, so
        # anyone who sees _llm set must also find _tokenizer set.
        _tokenizer = tokenizer
        _llm = model
        _llm_ready = True

        elapsed = time.perf_counter() - t0
        params = sum(p.numel() for p in model.parameters()) / 1e6
        ok("MODEL", f"Ready · {params:.0f}M params · {elapsed:.1f}s")
        section("MODEL", "Model online")


def get_tokenizer() -> AutoTokenizer:
    """Return the tokenizer, loading the model first if none is loaded."""
    _load()
    return _tokenizer  # type: ignore


def get_llm() -> AutoModelForCausalLM:
    """Return the model, loading it first if none is loaded."""
    _load()
    return _llm  # type: ignore


def get_model_name() -> str:
    """Return the current ``MODEL_ID``."""
    return MODEL_ID


def is_loading() -> bool:
    """Return True while ``switch_model`` is unloading/loading a model."""
    return _loading


def get_loading_status() -> dict:
    """Return the current model loading state for admin polling."""
    return {
        "model": MODEL_ID,
        "ready": _llm_ready,
        "loading": _loading,
        "msg": _loading_msg,
        "error": _loading_error,
    }


def switch_model(new_model_id: str) -> None:
    """Unload the current model, update MODEL_ID, and load the new model.

    Designed to be called from a background thread. Load failures are not
    raised: they are logged and recorded in the ``error`` field reported by
    ``get_loading_status``.

    An empty or whitespace-only *new_model_id* is rejected before anything is
    unloaded, so the currently loaded model stays available.
    """
    global MODEL_ID, _tokenizer, _llm, _llm_ready
    global _loading, _loading_msg, _loading_error

    with _switch_lock:
        if not new_model_id or not new_model_id.strip():
            _loading_error = "Empty model id"
            error("SWITCH", "Refusing to switch: empty model id")
            return

        prev = MODEL_ID
        section("SWITCH", f"{prev} → {new_model_id}")
        _loading = True
        _loading_error = None
        _loading_msg = f"Unloading {prev}…"
        _llm_ready = False

        try:
            # Hold the load lock across unload + MODEL_ID update + load so a
            # concurrent get_llm()/get_tokenizer() cannot reload the previous
            # model in the gap and make the _load() below return early.
            with _load_lock:
                # A load that was already in flight may have set this to True
                # while we waited for the lock.
                _llm_ready = False

                # ── Release current model from memory ─────────────────────────
                step("SWITCH", f"Unloading {prev}")
                try:
                    _llm = None
                    _tokenizer = None
                    gc.collect()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                        step("SWITCH", "CUDA cache cleared")
                    ok("SWITCH", "Memory freed")
                except Exception as exc:
                    # Best-effort cleanup: a failure here must not block the switch.
                    warn("SWITCH", f"Cleanup warning: {exc}")

                MODEL_ID = new_model_id
                step("SWITCH", f"Starting load of {new_model_id}")
                _load()  # uses updated MODEL_ID; sets _llm_ready = True
            ok("SWITCH", f"Switch complete → {new_model_id}")
        except Exception as exc:
            _loading_error = str(exc)
            _loading_msg = ""
            error("SWITCH", f"Failed to load {new_model_id}\n {exc}")
        finally:
            _loading = False


def is_llm_ready() -> bool:
    """Return True once a model has been loaded and published."""
    return _llm_ready