FitCheck / engine /advisor.py
cn0303's picture
Deploy FitCheck: engine + Nemotron model brick on ZeroGPU
12d2e34 verified
"""
The advisor: turn a machine + a goal into an honest verdict.
Output is organised into three plain bands, because that is what makes the
tool trustworthy instead of hypey:
- WORKS NOW : runs well, on the fast path, today.
- WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal.
- DON'T BOTHER : not realistic on this machine — said plainly.
No fake promises. If something doesn't fit, we say so and explain why.
"""
from dataclasses import dataclass, field
from .catalogue import (
MODEL_CLASSES,
QUANT_TIERS,
RECOMMENDED_QUANT,
QUANT_BY_KEY,
MODEL_BY_KEY,
ModelClass,
QuantTier,
UseCase,
USE_CASE_BY_KEY,
)
from .estimator import MemoryEstimate, estimate_memory
from .hardware import HardwareSpec
from .runtimes import Runtime, pick_runtimes
# Assumed context size for each kind of job, in tokens
# (roughly 750 words per 1,000 tokens).
_CONTEXT_FOR_USE_CASE = {
    **dict.fromkeys(("chat", "writing", "coding", "agents"), 4096),
    "rag": 8192,
    "finetune": 2048,
}

# A memory budget is only ever filled to this fraction; the remainder is
# deliberately left free as breathing room.
_SAFETY_FILL = 0.90

# The three verdict bands a model can land in.
VERDICT_WORKS = "works_now"
VERDICT_COMPROMISE = "compromises"
VERDICT_NO = "dont_bother"
@dataclass
class ModelVerdict:
    """The outcome of checking one model size class against one machine.

    Bundles the band the model landed in with the quantisation that was
    settled on, the memory estimate for that quantisation, and any
    plain-language remarks explaining the result.
    """

    # The model size class that was evaluated.
    model: ModelClass
    # One of the VERDICT_* constants.
    verdict: str
    # The quant we'd actually recommend.
    quant: QuantTier
    # Memory estimate computed for `model` at `quant`.
    estimate: MemoryEstimate
    # True if it runs on the GPU at fp16/near-full quality.
    full_quality_on_fast: bool
    # Human-readable remarks; each instance gets its own fresh list.
    notes: list[str] = field(default_factory=list)
@dataclass
class Advice:
    """Everything the advisor has to say about one machine and one goal.

    The ``works_now``, ``compromises`` and ``dont_bother`` properties are
    filtered views over ``verdicts``; each preserves the order of that list.
    """

    # The machine the advice was produced for.
    spec: HardwareSpec
    # The goal the advice was produced for.
    use_case: UseCase
    # Context size, in tokens, assumed for the memory estimates.
    context_tokens: int
    # One per model class, big→small order kept.
    verdicts: list[ModelVerdict]
    # The single best pick for this goal (None when there is no pick).
    headline: ModelVerdict | None
    runtimes: list[Runtime]
    # Does the headline satisfy the use case?
    meets_goal: bool

    def _in_band(self, band: str) -> list[ModelVerdict]:
        """Return the verdicts whose ``verdict`` equals ``band``, in order."""
        return [entry for entry in self.verdicts if entry.verdict == band]

    @property
    def works_now(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS NOW band."""
        return self._in_band(VERDICT_WORKS)

    @property
    def compromises(self) -> list[ModelVerdict]:
        """Verdicts in the WORKS WITH COMPROMISES band."""
        return self._in_band(VERDICT_COMPROMISE)

    @property
    def dont_bother(self) -> list[ModelVerdict]:
        """Verdicts in the DON'T BOTHER band."""
        return self._in_band(VERDICT_NO)
def _evaluate_model(
    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
) -> ModelVerdict:
    """Sort one model size class into a verdict band for this machine.

    Three outcomes are tried in order:

    1. WORKS NOW - the machine has a fast path and the model fits inside the
       fast budget at the recommended 4-bit quant or better.
    2. WORKS WITH COMPROMISES - the model fits inside the total budget at the
       recommended quant, or failing that at Q3_K_M or Q2_K.
    3. DON'T BOTHER - it does not fit the total budget even at Q2_K.

    Every fit test compares against ``budget * _SAFETY_FILL`` rather than the
    raw budget, so some headroom is always left free.

    Args:
        model: The model size class to evaluate.
        spec: The machine; ``has_fast_path``, ``fast_budget_gb`` and
            ``total_budget_gb`` are read from it.
        use_case: The job; its ``overhead_factor`` is passed on to the
            memory estimator.
        context_tokens: Context size assumed for the memory estimate.

    Returns:
        A ModelVerdict carrying the band, the chosen quant, the memory
        estimate at that quant, and explanatory notes.
    """
    of = use_case.overhead_factor
    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor
    usable_total = spec.total_budget_gb * _SAFETY_FILL

    # --- Fast path: best *quality* quant that fits on the GPU/shared mem ---
    # We only call it "Works now" if it fits fast at 4-bit or better. Cramming
    # a big model down to 2-bit just to claim it "fits" is exactly the kind of
    # overpromise this tool refuses to make — that path becomes a compromise.
    if spec.has_fast_path:
        # The fast budget is only touched when a fast path actually exists.
        usable_fast = spec.fast_budget_gb * _SAFETY_FILL
        for q in QUANT_TIERS:  # ordered best-quality -> smallest
            if q.bits_per_weight < q4_bpw:
                break  # don't accept sub-4-bit as a clean "works now"
            est = estimate_memory(model, q, context_tokens=context_tokens,
                                  job_overhead_factor=of)
            if est.total_gb <= usable_fast:
                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
                notes = []
                if q is not RECOMMENDED_QUANT and not full_q:
                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
    # Prefer the everyday 4-bit; drop smaller only if needed.
    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]):
        est = estimate_memory(model, q, context_tokens=context_tokens,
                              job_overhead_factor=of)
        if est.total_gb <= usable_total:
            notes = []
            if not spec.has_fast_path:
                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
            else:
                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
            if q is not RECOMMENDED_QUANT:
                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

    # --- Doesn't fit even at the smallest setting --------------------------
    smallest = QUANT_BY_KEY["Q2_K"]
    est = estimate_memory(model, smallest, context_tokens=context_tokens,
                          job_overhead_factor=of)
    # Measure the shortfall against the same usable budget the fit test above
    # used. Comparing with the raw total would report a zero or negative
    # "more than this machine can give it" for a model that needs between
    # _SAFETY_FILL and 100% of the budget. The floor keeps a tiny overshoot
    # from rounding down to "0 GB more".
    short_by = max(round(est.total_gb - usable_total, 1), 0.1)
    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
             f"around {short_by:g} GB more than this machine can give it."]
    return ModelVerdict(model, VERDICT_NO, smallest, est, False, notes)
def _rank(model_key: str) -> int:
    """Return the index in ``MODEL_CLASSES`` of the entry keyed ``model_key``.

    The first matching entry wins. A key that matches no entry raises
    ``StopIteration``.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(MODEL_CLASSES)
        if candidate.key == model_key
    )
    return next(matching_positions)
def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
    """Produce full advice for a machine and a goal.

    An unrecognised ``use_case_key`` falls back to the "chat" use case. Every
    entry of ``MODEL_CLASSES`` is evaluated, then one verdict is promoted to
    headline by the first rule that has any candidates:

    1. The biggest WORKS NOW model that is at least the use case's minimum
       class.
    2. A COMPROMISE model of at least the minimum class.
    3. The biggest WORKS NOW model of any size, flagged as below-par.
    4. A COMPROMISE model of any size, flagged as below-par.

    Among COMPROMISE candidates, those still at 4-bit or better are preferred
    when any exist; of those, the biggest one no larger than the use case's
    ideal class wins, or the smallest candidate if all are larger than ideal.

    "Flagged as below-par" means a warning is inserted at the front of the
    chosen verdict's ``notes``. The headline is ``None`` when nothing fits.
    """
    use_case = USE_CASE_BY_KEY.get(use_case_key, USE_CASE_BY_KEY["chat"])
    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

    # Walk the catalogue in reverse so the table reads biggest-first.
    verdicts = [
        _evaluate_model(size_class, spec, use_case, context_tokens)
        for size_class in reversed(MODEL_CLASSES)
    ]

    ideal_rank = _rank(use_case.good_class)
    floor_rank = _rank(use_case.min_class)
    four_bit = RECOMMENDED_QUANT.bits_per_weight

    def size_rank(candidate):
        return _rank(candidate.model.key)

    def biggest(pool):
        return max(pool, key=size_rank)

    def closest_to_ideal(pool):
        # Nearest to the ideal size without overshooting into needless
        # slowness; if everything overshoots, take the least oversized.
        not_oversized = [c for c in pool if size_rank(c) <= ideal_rank]
        if not_oversized:
            return biggest(not_oversized)
        return min(pool, key=size_rank)

    def best_compromise(pool):
        # A model that only fits at a sub-4-bit squeeze is passed over
        # whenever a cleaner candidate exists.
        clean = [c for c in pool if c.quant.bits_per_weight >= four_bit]
        return closest_to_ideal(clean or pool)

    fast_fits = [v for v in verdicts if v.verdict == VERDICT_WORKS]
    slow_fits = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE]
    fast_big_enough = [v for v in fast_fits if size_rank(v) >= floor_rank]
    slow_big_enough = [v for v in slow_fits if size_rank(v) >= floor_rank]

    # Priority ladder: (candidates, how to choose among them, meets the goal?).
    ladder = (
        (fast_big_enough, biggest, True),
        (slow_big_enough, best_compromise, True),
        (fast_fits, biggest, False),
        (slow_fits, best_compromise, False),
    )
    headline, meets_goal = None, False
    for pool, choose, satisfies in ladder:
        if pool:
            headline, meets_goal = choose(pool), satisfies
            break

    if headline is not None and not meets_goal:
        # Lead with the caveat so it is the first note on the headline.
        headline.notes.insert(
            0, f"This is the best this machine can do, but it's on the small "
               f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great.")

    return Advice(
        spec=spec,
        use_case=use_case,
        context_tokens=context_tokens,
        verdicts=verdicts,
        headline=headline,
        runtimes=pick_runtimes(spec),
        meets_goal=meets_goal,
    )