Spaces:
Running on Zero
Running on Zero
| """ | |
| Static catalogue: the frozen facts the advisor reasons over. | |
| Everything here is build-time data — no network calls at runtime. That keeps | |
| the tool fully offline-capable (the "Off the Grid" goal) and means the advice | |
| can't silently drift when some external API changes. | |
| Sources for the numbers (so anyone can check our work): | |
| - bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs | |
| - "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide | |
| - 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs | |
| - architecture sizes (layers / hidden): typical published configs per size class | |
| """ | |
| from dataclasses import dataclass, field | |
| # -------------------------------------------------------------------------- | |
| # Quantisation tiers | |
| # -------------------------------------------------------------------------- | |
| # "Quantisation" = squashing the model's numbers into fewer bits so it takes | |
| # less memory. Fewer bits = smaller + faster, but slightly less sharp. | |
| # gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params). | |
@dataclass(frozen=True)
class QuantTier:
    """One quantisation level the advisor can offer.

    Declared as a dataclass so the positional ``QuantTier(...)`` entries in
    the catalogue construct correctly: a bare class carrying only annotations
    has no generated ``__init__`` and would raise ``TypeError``. Frozen
    because catalogue entries are build-time constants, which also keeps
    instances hashable.
    """

    key: str                   # quant identifier, e.g. "Q4_K_M"
    plain_name: str            # what a normal person sees
    bits_per_weight: float     # average bits stored per model weight
    blurb: str                 # one honest sentence about the trade-off
    recommended: bool = False  # marks the tier suggested by default

    # NOTE(review): kept as a plain method, exactly as written here. If call
    # sites read `tier.gb_per_billion` as an attribute, a `@property` line
    # was presumably dropped along with the class decorator — confirm.
    def gb_per_billion(self) -> float:
        """Return the weight memory in GB per billion parameters.

        Bits to bytes is a divide by 8, and one billion one-byte parameters
        occupy one GB, so the figure is simply ``bits_per_weight / 8``.
        """
        return self.bits_per_weight / 8.0
# The tiers, ordered from most bits per weight (largest) to fewest (smallest).
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb="Practically indistinguishable from full quality, about half the size.",
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb="The sweet spot most people use: small, fast, and still very good.",
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Lookup of each tier by its key.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}

# The first tier flagged `recommended` (Q4_K_M is the only one that sets it).
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)
| # -------------------------------------------------------------------------- | |
| # Model size classes | |
| # -------------------------------------------------------------------------- | |
| # We reason in *size classes* rather than individual models, because the | |
| # memory maths is driven by parameter count + architecture shape. Each class | |
| # carries an approximate architecture so we can estimate the KV cache (chat | |
| # memory) honestly. Layers/hidden are conservative typicals, not exact. | |
@dataclass(frozen=True)
class ModelClass:
    """A model size class with a representative architecture and example model.

    Declared as a dataclass so the nine-argument positional ``ModelClass(...)``
    entries in the catalogue construct correctly: a bare class carrying only
    annotations has no generated ``__init__`` and would raise ``TypeError``.
    Frozen because catalogue entries are build-time constants.
    """

    key: str            # short identifier, e.g. "medium"
    billions: float     # parameter count in billions (representative)
    plain_name: str     # human-readable label for the class
    good_for: str       # plain-English "what it's actually good at"
    n_layers: int       # approximate layer count for this size class
    hidden: int         # approximate hidden size for this size class
    # Example concrete models for the copy-paste commands (real, well-known).
    example_label: str  # display name of the example model
    ollama_tag: str     # what you'd type after `ollama run`
    gguf_repo: str      # a real Hugging Face GGUF repo for llama.cpp
# The size classes, ordered from fewest parameters to most.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Lookup of each size class by its key.
MODEL_BY_KEY = {size_class.key: size_class for size_class in MODEL_CLASSES}
| # -------------------------------------------------------------------------- | |
| # Use cases (jobs people actually want done) | |
| # -------------------------------------------------------------------------- | |
| # Each maps to a *minimum* sensible size and a *comfortable* size. We never | |
| # pretend a job works on a model that's too small for it. | |
@dataclass(frozen=True)
class UseCase:
    """A job the user wants done, with the model sizes that suit it.

    Declared as a dataclass so the ``UseCase(...)`` entries in the catalogue
    construct correctly: a bare class carrying only annotations has no
    generated ``__init__`` and would raise ``TypeError``. Frozen because
    catalogue entries are build-time constants.
    """

    key: str          # short identifier, e.g. "coding"
    plain_name: str   # human-readable label for the job
    description: str  # one sentence on what the job involves
    min_class: str    # smallest model that does an OK job (a size-class key)
    good_class: str   # where it starts feeling genuinely useful (a size-class key)
    # Extra memory headroom multiplier for this job (RAG/agents need more
    # context; fine-tuning needs much more). 1.0 = normal inference.
    overhead_factor: float = 1.0
    note: str = ""    # optional caveat; empty when there is none
# The supported jobs. `min_class` / `good_class` name entries by size-class key.
USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description="Letting the model call tools, search, or take steps for you.",
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Lookup of each use case by its key.
USE_CASE_BY_KEY = {job.key: job for job in USE_CASES}