""" The advisor: turn a machine + a goal into an honest verdict. Output is organised into three plain bands, because that is what makes the tool trustworthy instead of hypey: - WORKS NOW : runs well, on the fast path, today. - WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal. - DON'T BOTHER : not realistic on this machine — said plainly. No fake promises. If something doesn't fit, we say so and explain why. """ from dataclasses import dataclass, field from .catalogue import ( MODEL_CLASSES, QUANT_TIERS, RECOMMENDED_QUANT, QUANT_BY_KEY, MODEL_BY_KEY, ModelClass, QuantTier, UseCase, USE_CASE_BY_KEY, ) from .estimator import MemoryEstimate, estimate_memory from .hardware import HardwareSpec from .runtimes import Runtime, pick_runtimes # How much text (context) we assume per job, in tokens. ~750 words per 1000. _CONTEXT_FOR_USE_CASE = { "chat": 4096, "writing": 4096, "coding": 4096, "agents": 4096, "rag": 8192, "finetune": 2048, } # We only ever fill a budget to this fraction — the rest is breathing room. _SAFETY_FILL = 0.90 VERDICT_WORKS = "works_now" VERDICT_COMPROMISE = "compromises" VERDICT_NO = "dont_bother" @dataclass class ModelVerdict: model: ModelClass verdict: str # one of the VERDICT_* constants quant: QuantTier # the quant we'd actually recommend estimate: MemoryEstimate full_quality_on_fast: bool # True if it runs on the GPU at fp16/near-full notes: list[str] = field(default_factory=list) @dataclass class Advice: spec: HardwareSpec use_case: UseCase context_tokens: int verdicts: list[ModelVerdict] # one per model class, big→small order kept headline: ModelVerdict | None # the single best pick for this goal runtimes: list[Runtime] meets_goal: bool # does the headline satisfy the use case? @property def works_now(self) -> list[ModelVerdict]: return [v for v in self.verdicts if v.verdict == VERDICT_WORKS] @property def compromises(self) -> list[ModelVerdict]: return [v for v in self.verdicts if v.verdict == VERDICT_COMPROMISE] @property def dont_bother(self) -> list[ModelVerdict]: return [v for v in self.verdicts if v.verdict == VERDICT_NO] def _evaluate_model( model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int ) -> ModelVerdict: fast = spec.fast_budget_gb total = spec.total_budget_gb of = use_case.overhead_factor q4_bpw = RECOMMENDED_QUANT.bits_per_weight # the 4-bit quality floor # --- Fast path: best *quality* quant that fits on the GPU/shared mem --- # We only call it "Works now" if it fits fast at 4-bit or better. Cramming # a big model down to 2-bit just to claim it "fits" is exactly the kind of # overpromise this tool refuses to make — that path becomes a compromise. if spec.has_fast_path: for q in QUANT_TIERS: # ordered best-quality -> smallest if q.bits_per_weight < q4_bpw: break # don't accept sub-4-bit as a clean "works now" est = estimate_memory(model, q, context_tokens=context_tokens, job_overhead_factor=of) if est.total_gb <= fast * _SAFETY_FILL: full_q = q.key in ("fp16", "Q8_0", "Q6_K") notes = [] if q is not RECOMMENDED_QUANT and not full_q: notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.") return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes) # --- Compromise path: fits if we let it use ordinary RAM (slower) ------ # Prefer the everyday 4-bit; drop smaller only if needed. for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]): est = estimate_memory(model, q, context_tokens=context_tokens, job_overhead_factor=of) if est.total_gb <= total * _SAFETY_FILL: notes = [] if not spec.has_fast_path: notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.") else: notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.") if q is not RECOMMENDED_QUANT: notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.") return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes) # --- Doesn't fit even at the smallest setting -------------------------- est = estimate_memory(model, QUANT_BY_KEY["Q2_K"], context_tokens=context_tokens, job_overhead_factor=of) short_by = round(est.total_gb - total, 1) notes = [f"Needs about {est.total_gb:g} GB even squeezed down — " f"around {short_by:g} GB more than this machine can give it."] return ModelVerdict(model, VERDICT_NO, QUANT_BY_KEY["Q2_K"], est, False, notes) def _rank(model_key: str) -> int: return next(i for i, m in enumerate(MODEL_CLASSES) if m.key == model_key) def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice: """Produce full advice for a machine and a goal.""" use_case = USE_CASE_BY_KEY.get(use_case_key, USE_CASE_BY_KEY["chat"]) context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096) # Evaluate every size class, biggest first (so the table reads top-down). verdicts = [ _evaluate_model(m, spec, use_case, context_tokens) for m in reversed(MODEL_CLASSES) ] # --- Headline: the single "just use this" pick ----------------------- # Priorities, in order: # 1. The biggest model that WORKS NOW (fast + good quality) and is at # least big enough for the job. Fast-and-capable is the best answer. # 2. If nothing fast is big enough, the best COMPROMISE that does the # job — sized close to ideal, not needlessly oversized-and-slow. # 3. Otherwise, the best we can honestly offer, flagged as below-par. good_rank = _rank(use_case.good_class) min_rank = _rank(use_case.min_class) q4_bpw = RECOMMENDED_QUANT.bits_per_weight works = [v for v in verdicts if v.verdict == VERDICT_WORKS] comp = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE] def largest(vs): return max(vs, key=lambda v: _rank(v.model.key)) def nearest_good(vs): # Closest to the ideal size without overshooting into needless slowness. below = [v for v in vs if _rank(v.model.key) <= good_rank] return largest(below) if below else min(vs, key=lambda v: _rank(v.model.key)) def decent(vs): # Don't headline a model that only fits at a desperate sub-4-bit squeeze # if a cleaner option exists — quality matters more than size on the box. return [v for v in vs if v.quant.bits_per_weight >= q4_bpw] works_ok = [v for v in works if _rank(v.model.key) >= min_rank] comp_ok = [v for v in comp if _rank(v.model.key) >= min_rank] headline = None meets_goal = False if works_ok: headline, meets_goal = largest(works_ok), True elif comp_ok: headline, meets_goal = nearest_good(decent(comp_ok) or comp_ok), True elif works: headline, meets_goal = largest(works), False elif comp: headline, meets_goal = nearest_good(decent(comp) or comp), False if headline is not None and not meets_goal: headline.notes.insert( 0, f"This is the best this machine can do, but it's on the small " f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great.") return Advice( spec=spec, use_case=use_case, context_tokens=context_tokens, verdicts=verdicts, headline=headline, runtimes=pick_runtimes(spec), meets_goal=meets_goal, )