Spaces:
Running on Zero
Running on Zero
| """ | |
| The advisor: turn a machine + a goal into an honest verdict. | |
| Output is organised into three plain bands, because that is what makes the | |
| tool trustworthy instead of hypey: | |
| - WORKS NOW : runs well, on the fast path, today. | |
| - WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal. | |
| - DON'T BOTHER : not realistic on this machine — said plainly. | |
| No fake promises. If something doesn't fit, we say so and explain why. | |
| """ | |
| from dataclasses import dataclass, field | |
| from .catalogue import ( | |
| MODEL_CLASSES, | |
| QUANT_TIERS, | |
| RECOMMENDED_QUANT, | |
| QUANT_BY_KEY, | |
| MODEL_BY_KEY, | |
| ModelClass, | |
| QuantTier, | |
| UseCase, | |
| USE_CASE_BY_KEY, | |
| ) | |
| from .estimator import MemoryEstimate, estimate_memory | |
| from .hardware import HardwareSpec | |
| from .runtimes import Runtime, pick_runtimes | |
# Assumed context length per job, in tokens (1000 tokens is roughly 750 words).
_CONTEXT_FOR_USE_CASE = dict(
    chat=4096,
    writing=4096,
    coding=4096,
    agents=4096,
    rag=8192,
    finetune=2048,
)

# A memory budget is only ever filled to this fraction; the remainder is
# deliberately left free as breathing room.
_SAFETY_FILL = 0.90

# Identifiers for the three verdict bands.
VERDICT_WORKS, VERDICT_COMPROMISE, VERDICT_NO = (
    "works_now",
    "compromises",
    "dont_bother",
)
# @dataclass is required: this class is built positionally elsewhere in the
# module (ModelVerdict(model, verdict, quant, est, flag, notes)), and
# field(default_factory=list) only yields a fresh per-instance list when the
# dataclass machinery processes it.
@dataclass
class ModelVerdict:
    """The advisor's judgement on one model size class for one machine and job."""

    model: ModelClass  # the size class that was evaluated
    verdict: str  # one of the VERDICT_* constants
    quant: QuantTier  # the quant we'd actually recommend
    estimate: MemoryEstimate  # memory estimate computed at `quant`
    full_quality_on_fast: bool  # True if it runs on the GPU at fp16/near-full
    notes: list[str] = field(default_factory=list)  # plain-language caveats
# @dataclass is required: advise() constructs this class with keyword
# arguments, so it needs the generated __init__.
@dataclass
class Advice:
    """Everything the advisor has to say about one machine and one goal."""

    spec: HardwareSpec
    use_case: UseCase
    context_tokens: int
    verdicts: list[ModelVerdict]  # one per model class, big→small order kept
    headline: ModelVerdict | None  # the single best pick for this goal
    runtimes: list[Runtime]
    meets_goal: bool  # does the headline satisfy the use case?

    def works_now(self) -> list[ModelVerdict]:
        """Return the verdicts in the WORKS NOW band, in `verdicts` order."""
        return [v for v in self.verdicts if v.verdict == VERDICT_WORKS]

    def compromises(self) -> list[ModelVerdict]:
        """Return the verdicts in the WORKS WITH COMPROMISES band, in `verdicts` order."""
        return [v for v in self.verdicts if v.verdict == VERDICT_COMPROMISE]

    def dont_bother(self) -> list[ModelVerdict]:
        """Return the verdicts in the DON'T BOTHER band, in `verdicts` order."""
        return [v for v in self.verdicts if v.verdict == VERDICT_NO]
def _evaluate_model(
    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
) -> ModelVerdict:
    """Place one model size class into a verdict band for this machine.

    Tries, in order: the fast path at 4-bit or better (WORKS NOW), the whole
    memory budget at 4-bit or smaller (WORKS WITH COMPROMISES), and otherwise
    reports DON'T BOTHER together with the size of the shortfall.

    Args:
        model: The model size class to evaluate.
        spec: The machine; its fast and total memory budgets (GB) are read.
        use_case: The job; supplies the memory overhead factor.
        context_tokens: Context length passed to the memory estimate.

    Returns:
        A ModelVerdict carrying the chosen quant, its memory estimate and
        plain-language notes explaining any caveat.
    """
    fast = spec.fast_budget_gb
    total = spec.total_budget_gb
    of = use_case.overhead_factor
    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor
    usable_total = total * _SAFETY_FILL

    def estimate_at(q: QuantTier) -> MemoryEstimate:
        # One place for the estimate call so every band uses identical inputs.
        return estimate_memory(model, q, context_tokens=context_tokens,
                               job_overhead_factor=of)

    # --- Fast path: best *quality* quant that fits on the GPU/shared mem ---
    # We only call it "Works now" if it fits fast at 4-bit or better. Cramming
    # a big model down to 2-bit just to claim it "fits" is exactly the kind of
    # overpromise this tool refuses to make — that path becomes a compromise.
    if spec.has_fast_path:
        # Computed only here: the fast budget is not touched arithmetically
        # on machines without a fast path.
        usable_fast = fast * _SAFETY_FILL
        for q in QUANT_TIERS:  # ordered best-quality -> smallest
            if q.bits_per_weight < q4_bpw:
                break  # don't accept sub-4-bit as a clean "works now"
            est = estimate_at(q)
            if est.total_gb <= usable_fast:
                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
                notes = []
                if q is not RECOMMENDED_QUANT and not full_q:
                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
    # Prefer the everyday 4-bit; drop smaller only if needed.
    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], QUANT_BY_KEY["Q2_K"]):
        est = estimate_at(q)
        if est.total_gb <= usable_total:
            notes = []
            if not spec.has_fast_path:
                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
            else:
                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
            if q is not RECOMMENDED_QUANT:
                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

    # --- Doesn't fit even at the smallest setting --------------------------
    smallest = QUANT_BY_KEY["Q2_K"]
    est = estimate_at(smallest)
    # Measure the shortfall against the budget we actually allow ourselves
    # (total * _SAFETY_FILL), the same threshold the fit test above uses.
    # Comparing against the raw total reported zero or negative "GB more"
    # whenever the estimate landed inside the safety margin. The 0.1 floor
    # stops rounding from printing "0 GB more" for a real, tiny shortfall.
    short_by = max(round(est.total_gb - usable_total, 1), 0.1)
    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
             f"around {short_by:g} GB more than this machine can give it."]
    return ModelVerdict(model, VERDICT_NO, smallest, est, False, notes)
def _rank(model_key: str) -> int:
    """Return the index of the model class with key *model_key* in MODEL_CLASSES.

    Raises:
        KeyError: if no entry in MODEL_CLASSES has that key.
    """
    for position, model_class in enumerate(MODEL_CLASSES):
        if model_class.key == model_key:
            return position
    # Raise a descriptive lookup error instead of letting a bare next() leak
    # StopIteration, which turns into an opaque RuntimeError if this is ever
    # reached from inside a generator.
    raise KeyError(f"unknown model class key: {model_key!r}")
def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
    """Build the complete advice for one machine and one goal.

    An unrecognised ``use_case_key`` falls back to the "chat" use case.
    Every model class gets a verdict, and one of them may be promoted to the
    headline recommendation.
    """
    use_case = USE_CASE_BY_KEY.get(use_case_key, USE_CASE_BY_KEY["chat"])
    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

    # One verdict per size class, largest first so the table reads top-down.
    verdicts = []
    for model_class in reversed(MODEL_CLASSES):
        verdicts.append(_evaluate_model(model_class, spec, use_case, context_tokens))

    ideal_rank = _rank(use_case.good_class)
    floor_rank = _rank(use_case.min_class)
    floor_bpw = RECOMMENDED_QUANT.bits_per_weight

    def size_of(verdict):
        return _rank(verdict.model.key)

    def biggest(candidates):
        return max(candidates, key=size_of)

    def closest_to_ideal(candidates):
        # Largest candidate that does not overshoot the ideal size; if all of
        # them overshoot, settle for the smallest one.
        not_oversized = [c for c in candidates if size_of(c) <= ideal_rank]
        if not_oversized:
            return biggest(not_oversized)
        return min(candidates, key=size_of)

    def prefer_clean_quant(candidates):
        # Keep only candidates at 4-bit or better when any exist; a sub-4-bit
        # squeeze is a last resort for the headline.
        clean = [c for c in candidates if c.quant.bits_per_weight >= floor_bpw]
        return clean or candidates

    fast_fits = [v for v in verdicts if v.verdict == VERDICT_WORKS]
    slow_fits = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE]
    fast_big_enough = [v for v in fast_fits if size_of(v) >= floor_rank]
    slow_big_enough = [v for v in slow_fits if size_of(v) >= floor_rank]

    def choose_headline():
        # Headline priorities, highest first:
        #   1. biggest WORKS NOW model that is big enough for the job;
        #   2. best COMPROMISE that is big enough, sized close to ideal;
        #   3. whatever is left, flagged as not meeting the goal.
        if fast_big_enough:
            return biggest(fast_big_enough), True
        if slow_big_enough:
            return closest_to_ideal(prefer_clean_quant(slow_big_enough)), True
        if fast_fits:
            return biggest(fast_fits), False
        if slow_fits:
            return closest_to_ideal(prefer_clean_quant(slow_fits)), False
        return None, False

    headline, meets_goal = choose_headline()

    if not meets_goal and headline is not None:
        # The headline is the same object held in `verdicts`, so this caveat
        # also shows on that model's own row.
        caveat = (
            f"This is the best this machine can do, but it's on the small "
            f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great."
        )
        headline.notes.insert(0, caveat)

    return Advice(
        spec=spec,
        use_case=use_case,
        context_tokens=context_tokens,
        verdicts=verdicts,
        headline=headline,
        runtimes=pick_runtimes(spec),
        meets_goal=meets_goal,
    )