"""Static catalogue: the frozen facts the advisor reasons over.

All of this is build-time data; nothing here makes a network call at runtime.
That keeps the tool fully offline-capable (the "Off the Grid" goal) and means
the advice can't silently drift when some external API changes.

Sources for the numbers (so anyone can check our work):
  - bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
  - "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
  - 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
  - architecture sizes (layers / hidden): typical published configs per size
    class
"""

from dataclasses import dataclass, field


# --------------------------------------------------------------------------
# Quantisation tiers
# --------------------------------------------------------------------------
# "Quantisation" means storing the model's numbers in fewer bits so the model
# takes less memory. Fewer bits = smaller + faster, but slightly less sharp.
# gb_per_billion is simply bits_per_weight / 8
# (bits -> bytes -> GB per 1B params).

@dataclass(frozen=True)
class QuantTier:
    """One quantisation level the advisor can suggest.

    Instances are immutable. ``recommended`` flags the tier that is exposed
    module-wide as ``RECOMMENDED_QUANT``.
    """

    key: str
    plain_name: str  # the label a non-technical user sees
    bits_per_weight: float
    blurb: str  # one honest sentence about the trade-off
    recommended: bool = False

    @property
    def gb_per_billion(self) -> float:
        """Return the gigabytes needed per billion parameters at this tier."""
        bytes_per_weight = self.bits_per_weight / 8.0
        return bytes_per_weight


# Listed from the most bits per weight down to the fewest.
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb=(
            "Practically indistinguishable from full quality, "
            "about half the size."
        ),
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb=(
            "The sweet spot most people use: "
            "small, fast, and still very good."
        ),
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Lookup by tier key, plus the first tier flagged as recommended.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)


# --------------------------------------------------------------------------
# Model size classes
# --------------------------------------------------------------------------
# We reason in *size classes* rather than individual models, because the
# memory maths is driven by parameter count + architecture shape. Every class
# carries an approximate architecture so the KV cache (chat memory) can be
# estimated honestly. Layers/hidden are conservative typicals, not exact.

@dataclass(frozen=True)
class ModelClass:
    """A size class of models, with a representative architecture shape.

    Instances are immutable. The ``example_*``/tag/repo fields name one
    concrete, well-known model used for the copy-paste commands.
    """

    key: str
    billions: float  # parameter count in billions (representative)
    plain_name: str
    good_for: str  # plain-English "what it's actually good at"
    n_layers: int
    hidden: int
    # Example concrete models for the copy-paste commands (real, well-known).
    example_label: str
    ollama_tag: str  # what you'd type after `ollama run`
    gguf_repo: str  # a real Hugging Face GGUF repo for llama.cpp


# Listed from the smallest class to the largest.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for=(
            "Quick simple chat, basic questions, tidying text. "
            "Runs on almost anything."
        ),
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for=(
            "Surprisingly capable everyday chat, summarising, "
            "and light coding help."
        ),
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for=(
            "A solid all-rounder: good chat, real coding help, "
            "decent reasoning."
        ),
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for=(
            "Noticeably smarter and more reliable. "
            "Wants a real graphics card."
        ),
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Lookup by size-class key.
MODEL_BY_KEY = {size_class.key: size_class for size_class in MODEL_CLASSES}


# --------------------------------------------------------------------------
# Use cases (jobs people actually want done)
# --------------------------------------------------------------------------
# Each one maps to a *minimum* sensible size and a *comfortable* size. We
# never pretend a job works on a model that's too small for it.

@dataclass(frozen=True)
class UseCase:
    """A job a user wants done, with the model sizes that suit it.

    Instances are immutable. ``min_class`` and ``good_class`` hold
    ``ModelClass.key`` strings.
    """

    key: str
    plain_name: str
    description: str
    min_class: str  # smallest model that does an OK job
    good_class: str  # where it starts feeling genuinely useful
    # Extra memory headroom multiplier for this job (RAG/agents need more
    # context; fine-tuning needs much more). 1.0 = normal inference.
    overhead_factor: float = 1.0
    note: str = ""


USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description=(
            "Letting the model call tools, search, or take steps for you."
        ),
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Lookup by use-case key.
USE_CASE_BY_KEY = {job.key: job for job in USE_CASES}