File size: 7,166 Bytes
12d2e34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Static catalogue: the frozen facts the advisor reasons over.

Everything here is build-time data — no network calls at runtime. That keeps
the tool fully offline-capable (the "Off the Grid" goal) and means the advice
can't silently drift when some external API changes.

Sources for the numbers (so anyone can check our work):
  - bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
  - "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
  - 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
  - architecture sizes (layers / hidden): typical published configs per size class
"""

from dataclasses import dataclass, field


# --------------------------------------------------------------------------
# Quantisation tiers
# --------------------------------------------------------------------------
# "Quantisation" = squashing the model's numbers into fewer bits so it takes
# less memory. Fewer bits = smaller + faster, but slightly less sharp.
# gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params).

@dataclass(frozen=True)
class QuantTier:
    """One quantisation tier: an immutable record of a compression level.

    Attributes:
        key: Identifier used to look the tier up.
        plain_name: The label a non-technical reader sees.
        bits_per_weight: Average number of bits stored per model weight.
        blurb: One honest sentence describing the trade-off.
        recommended: Whether this tier is flagged as the suggested choice;
            defaults to False.
    """

    key: str
    plain_name: str
    bits_per_weight: float
    blurb: str
    recommended: bool = False

    @property
    def gb_per_billion(self) -> float:
        """Return the GB needed per 1B parameters (bits per weight / 8)."""
        # bits -> bytes; one byte per weight across 1B weights is one GB.
        bytes_per_weight = self.bits_per_weight / 8.0
        return bytes_per_weight


# Every supported tier, ordered from the most bits per weight to the fewest.
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb="Practically indistinguishable from full quality, about half the size.",
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb="The sweet spot most people use: small, fast, and still very good.",
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Tier lookup by key.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}
# First tier flagged as recommended; raises StopIteration at import time if
# no tier in the list carries the flag.
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)


# --------------------------------------------------------------------------
# Model size classes
# --------------------------------------------------------------------------
# We reason in *size classes* rather than individual models, because the
# memory maths is driven by parameter count + architecture shape. Each class
# carries an approximate architecture so we can estimate the KV cache (chat
# memory) honestly. Layer counts and hidden sizes are conservative typical
# values for the size class, not exact figures for any one model.

@dataclass(frozen=True)
class ModelClass:
    """One model size class: an immutable record of size, shape and examples.

    Attributes:
        key: Identifier used to look the class up.
        billions: Representative parameter count, in billions.
        plain_name: Human-readable label for the class.
        good_for: Plain-English description of what the class is good at.
        n_layers: Approximate layer count, part of the architecture shape
            used for estimating the KV cache.
        hidden: Approximate hidden size, the other part of that shape.
        example_label: Display name of a real, well-known example model used
            in the copy-paste commands.
        ollama_tag: What you would type after `ollama run`.
        gguf_repo: A real Hugging Face GGUF repo for llama.cpp.
    """

    key: str
    billions: float
    plain_name: str
    good_for: str
    n_layers: int
    hidden: int
    example_label: str
    ollama_tag: str
    gguf_repo: str


# Every size class, ordered from the smallest parameter count to the largest.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Size-class lookup by key.
MODEL_BY_KEY = {size_class.key: size_class for size_class in MODEL_CLASSES}


# --------------------------------------------------------------------------
# Use cases (jobs people actually want done)
# --------------------------------------------------------------------------
# Each maps to a *minimum* sensible size and a *comfortable* size. We never
# pretend a job works on a model that's too small for it.

@dataclass(frozen=True)
class UseCase:
    """One job a user wants done: an immutable record of its size needs.

    Attributes:
        key: Identifier used to look the use case up.
        plain_name: Human-readable label for the job.
        description: Short explanation of what the job involves.
        min_class: Key of the smallest model class that does an OK job.
        good_class: Key of the class where the job starts feeling genuinely
            useful.
        overhead_factor: Extra memory headroom multiplier for this job
            (RAG/agents need more context; fine-tuning needs much more).
            Defaults to 1.0, meaning normal inference.
        note: Optional extra remark; defaults to the empty string.
    """

    key: str
    plain_name: str
    description: str
    min_class: str
    good_class: str
    overhead_factor: float = 1.0
    note: str = ""


# Every supported job. Entries that omit overhead_factor or note fall back to
# the UseCase defaults.
USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description="Letting the model call tools, search, or take steps for you.",
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Use-case lookup by key.
USE_CASE_BY_KEY = {job.key: job for job in USE_CASES}