"""
Static catalogue: the frozen facts the advisor reasons over.
Everything here is build-time data — no network calls at runtime. That keeps
the tool fully offline-capable (the "Off the Grid" goal) and means the advice
can't silently drift when some external API changes.
Sources for the numbers (so anyone can check our work):
- bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
- "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
- 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
- architecture sizes (layers / hidden): typical published configs per size class
"""
from dataclasses import dataclass, field
# --------------------------------------------------------------------------
# Quantisation tiers
# --------------------------------------------------------------------------
# "Quantisation" = squashing the model's numbers into fewer bits so it takes
# less memory. Fewer bits = smaller + faster, but slightly less sharp.
# gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params).
@dataclass(frozen=True)
class QuantTier:
    """One quantisation level a model's weights can be stored at.

    Instances are immutable (frozen dataclass). The only derived value is
    ``gb_per_billion``, computed from ``bits_per_weight``.
    """

    key: str  # short identifier for the tier, e.g. "Q4_K_M"
    plain_name: str  # what a normal person sees
    bits_per_weight: float  # bits stored per model weight at this tier
    blurb: str  # one honest sentence about the trade-off
    recommended: bool = False  # True for the tier suggested by default

    @property
    def gb_per_billion(self) -> float:
        """Return the gigabytes needed per one billion parameters.

        Dividing bits per weight by 8 gives bytes per weight; one billion
        weights at that many bytes each is the same number of gigabytes.
        """
        return self.bits_per_weight / 8.0
# Every supported tier, ordered from the most bits per weight to the fewest.
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb="Practically indistinguishable from full quality, about half the size.",
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb="The sweet spot most people use: small, fast, and still very good.",
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Lookup table: tier key -> tier.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}

# The first tier flagged as recommended. next() raises StopIteration at
# import time if no tier carries the flag.
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)
# --------------------------------------------------------------------------
# Model size classes
# --------------------------------------------------------------------------
# We reason in *size classes* rather than individual models, because the
# memory maths is driven by parameter count + architecture shape. Each class
# carries an approximate architecture so we can estimate the KV cache (chat
# memory) honestly. Layer counts and hidden sizes are conservative typical
# values, not exact figures.
@dataclass(frozen=True)
class ModelClass:
    """A size class of models, with an approximate architecture shape.

    Instances are immutable (frozen dataclass). All nine fields are
    required and are accepted positionally in the order declared below.
    """

    key: str  # short identifier for the size class, e.g. "medium"
    billions: float  # parameter count in billions (representative)
    plain_name: str  # human-readable label for the size class
    good_for: str  # plain-English "what it's actually good at"
    n_layers: int  # approximate number of layers for this size class
    hidden: int  # approximate hidden size for this size class

    # Example concrete models for the copy-paste commands (real, well-known).
    example_label: str  # display name of the example model
    ollama_tag: str  # what you'd type after `ollama run`
    gguf_repo: str  # a real Hugging Face GGUF repo for llama.cpp
# Every size class, ordered from the smallest parameter count to the largest.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Lookup table: size-class key -> size class.
MODEL_BY_KEY = {size_class.key: size_class for size_class in MODEL_CLASSES}
# --------------------------------------------------------------------------
# Use cases (jobs people actually want done)
# --------------------------------------------------------------------------
# Each maps to a *minimum* sensible size and a *comfortable* size. We never
# pretend a job works on a model that's too small for it.
@dataclass(frozen=True)
class UseCase:
    """A job a user wants done, with the model sizes that suit it.

    Instances are immutable (frozen dataclass). The first five fields are
    required; ``overhead_factor`` and ``note`` have defaults.
    """

    key: str  # short identifier for the job, e.g. "coding"
    plain_name: str  # human-readable label for the job
    description: str  # one-sentence explanation of what the job covers
    min_class: str  # smallest model that does an OK job (a size-class key)
    good_class: str  # where it starts feeling genuinely useful (a size-class key)

    # Extra memory headroom multiplier for this job (RAG/agents need more
    # context; fine-tuning needs much more). 1.0 = normal inference.
    overhead_factor: float = 1.0
    note: str = ""  # optional extra caveat; empty when there is none
# Every supported job. Entries that omit overhead_factor / note fall back to
# the UseCase defaults.
USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description="Letting the model call tools, search, or take steps for you.",
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Lookup table: use-case key -> use case.
USE_CASE_BY_KEY = {use_case.key: use_case for use_case in USE_CASES}
|