Spaces:
Running on Zero
Running on Zero
File size: 8,090 Bytes
12d2e34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | """
The advisor: turn a machine + a goal into an honest verdict.
Output is organised into three plain bands, because that is what makes the
tool trustworthy instead of hypey:
- WORKS NOW : runs well, on the fast path, today.
- WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal.
- DON'T BOTHER : not realistic on this machine — said plainly.
No fake promises. If something doesn't fit, we say so and explain why.
"""
from dataclasses import dataclass, field
from .catalogue import (
MODEL_CLASSES,
QUANT_TIERS,
RECOMMENDED_QUANT,
QUANT_BY_KEY,
MODEL_BY_KEY,
ModelClass,
QuantTier,
UseCase,
USE_CASE_BY_KEY,
)
from .estimator import MemoryEstimate, estimate_memory
from .hardware import HardwareSpec
from .runtimes import Runtime, pick_runtimes
# Assumed amount of text (context) per job, in tokens, keyed by use-case key.
# Rule of thumb for scale: 1000 tokens is roughly 750 words.
_CONTEXT_FOR_USE_CASE = dict(
    chat=4096,
    writing=4096,
    coding=4096,
    agents=4096,
    rag=8192,
    finetune=2048,
)

# A memory budget is only ever filled to this fraction; the remainder is
# deliberately left free as breathing room.
_SAFETY_FILL = 0.9

# The three verdict bands a model can land in.
VERDICT_WORKS, VERDICT_COMPROMISE, VERDICT_NO = (
    "works_now",
    "compromises",
    "dont_bother",
)
@dataclass
class ModelVerdict:
    """The outcome of checking one model size class against one machine.

    Field order matters: instances are constructed positionally elsewhere in
    this module, so new fields must only ever be appended.
    """

    # The model size class that was evaluated.
    model: ModelClass
    # Which band it landed in: one of the VERDICT_* constants.
    verdict: str
    # The quant we'd actually recommend for this model on this machine.
    quant: QuantTier
    # The memory estimate that the verdict was based on.
    estimate: MemoryEstimate
    # True if it runs on the GPU at fp16/near-full quality.
    full_quality_on_fast: bool
    # Plain-language remarks explaining the verdict; a fresh list per instance.
    notes: list[str] = field(default_factory=list)
@dataclass
class Advice:
    """The complete result of advising on one machine for one goal.

    Besides the raw fields, three read-only properties slice ``verdicts``
    into the bands the output is organised around.
    """

    # The machine the advice was produced for.
    spec: HardwareSpec
    # The goal the advice was produced for.
    use_case: UseCase
    # Context size, in tokens, that the memory estimates assumed.
    context_tokens: int
    # One verdict per model class, kept in big→small order.
    verdicts: list[ModelVerdict]
    # The single best pick for this goal, or None when nothing is usable.
    headline: ModelVerdict | None
    # Runtimes suggested for this machine.
    runtimes: list[Runtime]
    # Does the headline satisfy the use case?
    meets_goal: bool

    def _in_band(self, band: str) -> list[ModelVerdict]:
        """Return the verdicts whose band equals *band*, in original order."""
        return [entry for entry in self.verdicts if entry.verdict == band]

    @property
    def works_now(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS NOW band."""
        return self._in_band(VERDICT_WORKS)

    @property
    def compromises(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS WITH COMPROMISES band."""
        return self._in_band(VERDICT_COMPROMISE)

    @property
    def dont_bother(self) -> list[ModelVerdict]:
        """Verdicts in the DON'T BOTHER band."""
        return self._in_band(VERDICT_NO)
def _evaluate_model(
    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
) -> ModelVerdict:
    """Sort one model size class into a verdict band for this machine.

    Three outcomes are tried in order and the first that fits wins:

    1. WORKS NOW: fits the fast budget at the recommended 4-bit quant or
       better.
    2. WORKS WITH COMPROMISES: fits the total budget at 4-bit, or at a
       smaller quant if needed.
    3. DON'T BOTHER: does not fit the total budget even at Q2_K.

    Every fit test is made against ``budget * _SAFETY_FILL``, never the raw
    budget.

    Args:
        model: The model size class to evaluate.
        spec: The machine; supplies the fast and total memory budgets and
            whether a fast path exists at all.
        use_case: The goal; supplies the job overhead factor.
        context_tokens: Context size, in tokens, to estimate memory for.

    Returns:
        A ModelVerdict carrying the band, the chosen quant, the memory
        estimate for that quant, and plain-language notes.
    """
    fast = spec.fast_budget_gb
    total = spec.total_budget_gb
    of = use_case.overhead_factor
    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor

    def estimate_at(quant):
        # Every band uses the same estimator call; only the quant varies.
        return estimate_memory(model, quant, context_tokens=context_tokens,
                               job_overhead_factor=of)

    # --- Fast path: best *quality* quant that fits on the GPU/shared mem ---
    # We only call it "Works now" if it fits fast at 4-bit or better. Cramming
    # a big model down to 2-bit just to claim it "fits" is exactly the kind of
    # overpromise this tool refuses to make — that path becomes a compromise.
    if spec.has_fast_path:
        for q in QUANT_TIERS:  # ordered best-quality -> smallest
            if q.bits_per_weight < q4_bpw:
                break  # don't accept sub-4-bit as a clean "works now"
            est = estimate_at(q)
            if est.total_gb <= fast * _SAFETY_FILL:
                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
                notes = []
                if q is not RECOMMENDED_QUANT and not full_q:
                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
    # Prefer the everyday 4-bit; drop smaller only if needed.
    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]):
        est = estimate_at(q)
        if est.total_gb <= total * _SAFETY_FILL:
            notes = []
            if not spec.has_fast_path:
                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
            else:
                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
            if q is not RECOMMENDED_QUANT:
                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

    # --- Doesn't fit even at the smallest setting --------------------------
    smallest = QUANT_BY_KEY["Q2_K"]
    est = estimate_at(smallest)
    # Measure the shortfall against the same usable budget the fit test above
    # rejected it on. Comparing with the raw total would report zero or a
    # negative shortfall for a model that lands inside the safety margin.
    usable = total * _SAFETY_FILL
    # Floor at 0.1 so a real but tiny shortfall never rounds to "0 GB more".
    short_by = max(round(est.total_gb - usable, 1), 0.1)
    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
             f"around {short_by:g} GB more than this machine can give it."]
    return ModelVerdict(model, VERDICT_NO, smallest, est, False, notes)
def _rank(model_key: str) -> int:
    """Return the position of the model class with *model_key* in MODEL_CLASSES.

    Args:
        model_key: The ``key`` of a model class.

    Returns:
        The index of the first entry in MODEL_CLASSES whose ``key`` matches.

    Raises:
        KeyError: If no model class has that key. Raised explicitly so an
            unknown key never surfaces as a bare StopIteration, which
            enclosing iteration machinery could misread or swallow.
    """
    for position, candidate in enumerate(MODEL_CLASSES):
        if candidate.key == model_key:
            return position
    raise KeyError(f"unknown model class key: {model_key!r}")
def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
    """Build the full advice for one machine and one goal.

    Args:
        spec: The machine to advise on.
        use_case_key: Key of the goal; an unrecognised key falls back to
            the "chat" use case.

    Returns:
        An Advice holding a verdict for every model class (biggest first),
        the headline pick (or None if nothing is usable), the suggested
        runtimes, and whether the headline meets the goal.
    """
    use_case = USE_CASE_BY_KEY.get(use_case_key, USE_CASE_BY_KEY["chat"])
    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

    # Walk the size classes biggest first so the table reads top-down.
    verdicts = []
    for model_class in reversed(MODEL_CLASSES):
        verdicts.append(_evaluate_model(model_class, spec, use_case, context_tokens))

    ideal_rank = _rank(use_case.good_class)
    floor_rank = _rank(use_case.min_class)
    floor_bpw = RECOMMENDED_QUANT.bits_per_weight

    def size_rank(entry):
        return _rank(entry.model.key)

    def biggest(pool):
        return max(pool, key=size_rank)

    def closest_to_ideal(pool):
        # Stay at or below the ideal size: overshooting only buys slowness.
        # If everything overshoots, settle for the smallest overshoot.
        not_oversized = [entry for entry in pool if size_rank(entry) <= ideal_rank]
        if not_oversized:
            return biggest(not_oversized)
        return min(pool, key=size_rank)

    def prefer_clean_quant(pool):
        # Avoid headlining a desperate sub-4-bit squeeze when a cleaner
        # option exists; fall back to the whole pool when none does.
        clean = [entry for entry in pool if entry.quant.bits_per_weight >= floor_bpw]
        return clean or pool

    fast_fits = [entry for entry in verdicts if entry.verdict == VERDICT_WORKS]
    slow_fits = [entry for entry in verdicts if entry.verdict == VERDICT_COMPROMISE]
    fast_capable = [entry for entry in fast_fits if size_rank(entry) >= floor_rank]
    slow_capable = [entry for entry in slow_fits if size_rank(entry) >= floor_rank]

    # --- Headline: the single "just use this" pick -----------------------
    # In priority order:
    #   1. the biggest fast model that is big enough for the job;
    #   2. failing that, the compromise model that does the job and sits
    #      closest to the ideal size;
    #   3. failing that, the best of whatever runs at all, flagged below-par.
    meets_goal = bool(fast_capable or slow_capable)
    if fast_capable:
        headline = biggest(fast_capable)
    elif slow_capable:
        headline = closest_to_ideal(prefer_clean_quant(slow_capable))
    elif fast_fits:
        headline = biggest(fast_fits)
    elif slow_fits:
        headline = closest_to_ideal(prefer_clean_quant(slow_fits))
    else:
        headline = None

    if headline is not None and not meets_goal:
        # Be upfront: this pick is smaller than the goal really wants.
        below_par = (
            f"This is the best this machine can do, but it's on the small "
            f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great."
        )
        headline.notes.insert(0, below_par)

    return Advice(
        spec=spec,
        use_case=use_case,
        context_tokens=context_tokens,
        verdicts=verdicts,
        headline=headline,
        runtimes=pick_runtimes(spec),
        meets_goal=meets_goal,
    )
|