Spaces:
Running
Running
| """ | |
Self-hosted LLM using HuggingFace Transformers — zero external API, no C++ compilation.
| All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier). | |
| Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed): | |
| #1 TinyLlama/TinyLlama-1.1B-Chat-v1.0 ~1 GB 40-60 tok/s Apache 2.0 demos, prototypes | |
| #2 Qwen/Qwen3-0.6B ~0.5 GB 45-55 tok/s Apache 2.0 speed-critical, Think mode | |
| #3 meta-llama/Llama-3.2-1B-Instruct ~1.5 GB 35-50 tok/s Community 128K ctx, long-context (needs HF_TOKEN) | |
| #4 HuggingFaceTB/SmolLM2-1.7B-Instruct ~2 GB 25-35 tok/s Apache 2.0 good quality/size ratio | |
#5  HuggingFaceTB/SmolLM2-360M-Instruct [DEF]  ~0.4 GB  60-80 tok/s  Apache 2.0  fastest, no token needed
#6  Qwen/Qwen2.5-1.5B-Instruct                 ~2 GB    25-40 tok/s  Apache 2.0  multilingual, 32K ctx
#7  stabilityai/stablelm-2-zephyr-1_6b         ~2 GB    25-40 tok/s  MIT         DPO-tuned chat feel
#8  Qwen/Qwen2.5-Coder-1.5B-Instruct           ~2 GB    25-40 tok/s  Apache 2.0  code completion/review
#9  microsoft/phi-2                            ~3 GB    18-28 tok/s  MIT         reasoning & logic
#10 google/gemma-3-1b-it                       ~1.5 GB  35-48 tok/s  Gemma ToU   multilingual, 140+ langs
| Note: | |
| - Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models). | |
| - Qwen3 supports /think and /no_think prefixes for reasoning depth control. | |
| """ | |
| import os | |
| import time | |
| import threading | |
| import warnings | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from model.log import section, step, ok, warn, error | |
# Suppress torch_dtype deprecation warning from transformers dev build.
# transformers is renaming the kwarg torch_dtype -> dtype; the
# from_pretrained() call in _load() below still passes torch_dtype for
# compatibility, so this FutureWarning would fire on every boot otherwise.
warnings.filterwarnings(
    "ignore",
    message=r".*torch_dtype.*is deprecated.*Use.*dtype.*",
    category=FutureWarning,
)
| MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct") | |
| # Models that need trust_remote_code=True (custom architectures) | |
| _TRUST_REMOTE_CODE_MODELS = ( | |
| "LiquidAI/", | |
| "DavidAU/LFM", | |
| ) | |
| def _needs_trust_remote_code(model_id: str) -> bool: | |
| return any(model_id.startswith(prefix) for prefix in _TRUST_REMOTE_CODE_MODELS) | |
# Module-level singleton state.  Mutated by _load() and switch_model(),
# read by the getter/status functions below.
_tokenizer: AutoTokenizer | None = None   # shared tokenizer instance
_llm: AutoModelForCausalLM | None = None  # shared model instance
_llm_ready: bool = False                  # True once weights are loaded and eval()'d
_loading: bool = False                    # True while switch_model() is in progress
_loading_msg: str = ""                    # human-readable progress message for pollers
_loading_error: str | None = None         # last load failure message, if any
_switch_lock = threading.Lock()           # serializes concurrent switch_model() calls
def _load() -> None:
    """Load tokenizer and weights for MODEL_ID into the module globals.

    Idempotent: returns immediately if a model is already loaded.  Progress
    is published via _loading_msg for get_loading_status() pollers.

    Raises:
        Exception: re-raises any tokenizer/weights load failure after logging.
    """
    global _tokenizer, _llm, _llm_ready, _loading_msg
    if _llm is not None:
        # Already loaded — nothing to do.
        return
    t0 = time.perf_counter()
    _trc = _needs_trust_remote_code(MODEL_ID)
    section("MODEL", f"Loading {MODEL_ID}")
    if _trc:
        step("MODEL", "trust_remote_code=True (custom architecture)")
    # ── Tokenizer ──────────────────────────────────────────────────────────
    _loading_msg = "Loading tokenizer…"  # plain string: no placeholders (was a spurious f-string)
    step("MODEL", "Fetching tokenizer…")
    try:
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=_trc)
        ok("MODEL", "Tokenizer loaded")
    except Exception as exc:
        error("MODEL", f"Tokenizer load failed — {exc}")
        raise
    # ── Weights ────────────────────────────────────────────────────────────
    _loading_msg = "Loading model weights… (downloads on first run, then cached)"
    step("MODEL", "Loading weights (first run will download — subsequent boots use cache)")
    # device_map is pinned to "cpu" below, so report CPU unconditionally
    # instead of claiming CUDA whenever a GPU happens to be visible.
    step("MODEL", "Device: CPU · dtype: bfloat16")
    try:
        _llm = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,  # kwarg kept for compat; FutureWarning suppressed above
            trust_remote_code=_trc,
            device_map="cpu",
            low_cpu_mem_usage=True,
        )
    except Exception as exc:
        error("MODEL", str(exc))
        raise
    _llm.eval()
    _llm_ready = True
    _loading_msg = ""
    elapsed = time.perf_counter() - t0
    params = sum(p.numel() for p in _llm.parameters()) / 1e6
    ok("MODEL", f"Ready · {params:.0f}M params · {elapsed:.1f}s")
    section("MODEL", "Model online")
def get_tokenizer() -> AutoTokenizer:
    """Return the shared tokenizer, lazily loading the model on first call."""
    _load()
    return _tokenizer  # type: ignore[return-value]
def get_llm() -> AutoModelForCausalLM:
    """Return the shared model, lazily loading it on first call."""
    _load()
    return _llm  # type: ignore[return-value]
def get_model_name() -> str:
    """Return the currently selected HF model id (env-overridable)."""
    return MODEL_ID
def is_loading() -> bool:
    """True while a model load/switch is in progress."""
    return bool(_loading)
def get_loading_status() -> dict:
    """Return a snapshot of the model loading state for admin polling."""
    status = {
        "model": MODEL_ID,
        "ready": _llm_ready,
        "loading": _loading,
        "msg": _loading_msg,
        "error": _loading_error,
    }
    return status
def switch_model(new_model_id: str) -> None:
    """Unload the current model, update MODEL_ID, and load the new model.

    Designed to be called from a background thread; _switch_lock serializes
    concurrent switch requests.  Progress/errors are exposed through
    get_loading_status().  Fix: log strings were mojibake-garbled
    ("β" in place of the intended arrow/ellipsis characters).
    """
    global MODEL_ID, _tokenizer, _llm, _llm_ready
    global _loading, _loading_msg, _loading_error
    with _switch_lock:
        prev = MODEL_ID
        section("SWITCH", f"{prev} → {new_model_id}")
        # Flip status flags first so pollers see the switch immediately.
        _loading = True
        _loading_error = None
        _loading_msg = f"Unloading {prev}…"
        _llm_ready = False
        # ── Release current model from memory ─────────────────────────────
        step("SWITCH", f"Unloading {prev}")
        try:
            import gc
            _llm = None
            _tokenizer = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                step("SWITCH", "CUDA cache cleared")
            ok("SWITCH", "Memory freed")
        except Exception as exc:
            # Best-effort cleanup: a failure here must not abort the switch.
            warn("SWITCH", f"Cleanup warning: {exc}")
        MODEL_ID = new_model_id
        step("SWITCH", f"Starting load of {new_model_id}")
        try:
            _load()  # uses updated MODEL_ID; sets _llm_ready = True
            _loading = False
            ok("SWITCH", f"Switch complete — {new_model_id}")
        except Exception as exc:
            # Leave MODEL_ID pointing at the requested model so the status
            # endpoint reports which model failed; _llm stays None.
            _loading_error = str(exc)
            _loading = False
            _loading_msg = ""
            error("SWITCH", f"Failed to load {new_model_id}\n {exc}")
def is_llm_ready() -> bool:
    """True once the model weights are fully loaded and ready to generate."""
    return _llm_ready