Spaces:

build-small-hackathon
/

FitCheck

Running on Zero

App Files Files Community

FitCheck / engine /advisor.py

cn0303

Deploy FitCheck: engine + Nemotron model brick on ZeroGPU

12d2e34 verified 1 day ago

raw

history blame contribute delete

8.09 kB

	"""
	The advisor: turn a machine + a goal into an honest verdict.

	Output is organised into three plain bands, because that is what makes the
	tool trustworthy instead of hypey:

	- WORKS NOW : runs well, on the fast path, today.
	- WORKS WITH COMPROMISES : it'll run, but slower or smaller than ideal.
	- DON'T BOTHER : not realistic on this machine — said plainly.

	No fake promises. If something doesn't fit, we say so and explain why.
	"""

	from dataclasses import dataclass, field

	from .catalogue import (
	MODEL_CLASSES,
	QUANT_TIERS,
	RECOMMENDED_QUANT,
	QUANT_BY_KEY,
	MODEL_BY_KEY,
	ModelClass,
	QuantTier,
	UseCase,
	USE_CASE_BY_KEY,
	)
	from .estimator import MemoryEstimate, estimate_memory
	from .hardware import HardwareSpec
	from .runtimes import Runtime, pick_runtimes


	# Assumed context length per job, in tokens, keyed by use-case key.
	# As a rule of thumb, 1000 tokens is roughly 750 words of text.
	_CONTEXT_FOR_USE_CASE = {
	    "chat": 4096,
	    "writing": 4096,
	    "coding": 4096,
	    "agents": 4096,
	    "rag": 8192,
	    "finetune": 2048,
	}

	# A memory estimate has to fit inside this fraction of a budget; the
	# remainder is deliberately left free as breathing room.
	_SAFETY_FILL = 0.90

	# Identifiers for the three verdict bands (stored in ModelVerdict.verdict).
	VERDICT_WORKS = "works_now"
	VERDICT_COMPROMISE = "compromises"
	VERDICT_NO = "dont_bother"


	@dataclass
	class ModelVerdict:
	    """The outcome of checking one model class against a machine and a goal.

	    Attributes:
	        model: The model class that was evaluated.
	        verdict: One of the VERDICT_* constants.
	        quant: The quant tier we would actually recommend for this model.
	        estimate: The memory estimate computed for ``quant``.
	        full_quality_on_fast: True if it runs on the GPU at fp16/near-full
	            quality.
	        notes: Plain-language remarks attached to the verdict; starts empty
	            unless supplied.
	    """

	    model: ModelClass
	    verdict: str
	    quant: QuantTier
	    estimate: MemoryEstimate
	    full_quality_on_fast: bool
	    notes: list[str] = field(default_factory=list)


	@dataclass
	class Advice:
	    """Everything the advisor concluded for one machine and one goal.

	    Attributes:
	        spec: The hardware the advice was computed for.
	        use_case: The goal the advice was computed for.
	        context_tokens: Context length (in tokens) assumed for the estimates.
	        verdicts: One verdict per model class, big→small order kept.
	        headline: The single best pick for this goal, or None when there is
	            nothing to recommend.
	        runtimes: Runtimes picked for this machine.
	        meets_goal: Whether the headline satisfies the use case.
	    """

	    spec: HardwareSpec
	    use_case: UseCase
	    context_tokens: int
	    verdicts: list[ModelVerdict]
	    # Plain ``X | None`` union: the original text carried a stray backslash
	    # before the pipe, which is not valid Python.
	    headline: ModelVerdict | None
	    runtimes: list[Runtime]
	    meets_goal: bool

	    @property
	    def works_now(self) -> list[ModelVerdict]:
	        """Verdicts in the WORKS NOW band, in ``verdicts`` order."""
	        return [v for v in self.verdicts if v.verdict == VERDICT_WORKS]

	    @property
	    def compromises(self) -> list[ModelVerdict]:
	        """Verdicts in the WORKS WITH COMPROMISES band, in ``verdicts`` order."""
	        return [v for v in self.verdicts if v.verdict == VERDICT_COMPROMISE]

	    @property
	    def dont_bother(self) -> list[ModelVerdict]:
	        """Verdicts in the DON'T BOTHER band, in ``verdicts`` order."""
	        return [v for v in self.verdicts if v.verdict == VERDICT_NO]


	def _evaluate_model(
	    model: ModelClass, spec: HardwareSpec, use_case: UseCase, context_tokens: int
	) -> ModelVerdict:
	    """Sort one model class into a verdict band for this machine and goal.

	    The bands are tried in order:

	    1. WORKS NOW: the machine has a fast path and the model fits inside the
	       fast budget at the recommended 4-bit quant or better.
	    2. WORKS WITH COMPROMISES: the model fits inside the total budget at the
	       recommended quant, or failing that at Q3_K_M, then Q2_K.
	    3. DON'T BOTHER: it does not fit the total budget even at Q2_K.

	    Every fit test is made against ``budget * _SAFETY_FILL`` so some
	    headroom is always left free.

	    Args:
	        model: The model class to evaluate.
	        spec: The machine; supplies the fast and total memory budgets.
	        use_case: The goal; supplies the job overhead factor.
	        context_tokens: Context length (in tokens) to size the estimate for.

	    Returns:
	        A ModelVerdict carrying the band, the quant it was judged at, the
	        memory estimate for that quant, and plain-language notes.
	    """
	    fast = spec.fast_budget_gb
	    total = spec.total_budget_gb
	    of = use_case.overhead_factor
	    q4_bpw = RECOMMENDED_QUANT.bits_per_weight  # the 4-bit quality floor
	    usable_total = total * _SAFETY_FILL

	    def estimate(q: QuantTier) -> MemoryEstimate:
	        # One place for the estimate call so every band sizes it identically.
	        return estimate_memory(model, q, context_tokens=context_tokens,
	                               job_overhead_factor=of)

	    # --- Fast path: best quality quant that fits on the GPU/shared mem ---
	    # We only call it "Works now" if it fits fast at 4-bit or better. Cramming
	    # a big model down to 2-bit just to claim it "fits" is exactly the kind of
	    # overpromise this tool refuses to make — that path becomes a compromise.
	    if spec.has_fast_path:
	        # Only computed here: the fast budget is not used without a fast path.
	        usable_fast = fast * _SAFETY_FILL
	        for q in QUANT_TIERS:  # relies on best-quality -> smallest ordering
	            if q.bits_per_weight < q4_bpw:
	                break  # don't accept sub-4-bit as a clean "works now"
	            est = estimate(q)
	            if est.total_gb <= usable_fast:
	                full_q = q.key in ("fp16", "Q8_0", "Q6_K")
	                notes = []
	                if q is not RECOMMENDED_QUANT and not full_q:
	                    notes.append(f"Runs at {q.plain_name} — even a touch sharper than the usual 4-bit.")
	                return ModelVerdict(model, VERDICT_WORKS, q, est, full_q, notes)

	    # --- Compromise path: fits if we let it use ordinary RAM (slower) ------
	    # Prefer the everyday 4-bit; drop smaller only if needed.
	    smallest = QUANT_BY_KEY["Q2_K"]
	    for q in (RECOMMENDED_QUANT, QUANT_BY_KEY["Q3_K_M"], smallest):
	        est = estimate(q)
	        if est.total_gb <= usable_total:
	            notes = []
	            if not spec.has_fast_path:
	                notes.append("Runs on the processor (no graphics card to speed it up) — expect slow replies.")
	            else:
	                notes.append("Too big to fit the graphics card on its own — part runs on slower memory, so replies come more slowly.")
	            if q is not RECOMMENDED_QUANT:
	                notes.append(f"Had to shrink it to {q.plain_name} to fit — some quality is lost.")
	            return ModelVerdict(model, VERDICT_COMPROMISE, q, est, False, notes)

	    # --- Doesn't fit even at the smallest setting --------------------------
	    # `est` is still the estimate for `smallest`, the last quant tried above.
	    # The shortfall is measured against the same usable budget the fit test
	    # used; comparing with the raw total could report zero or a negative
	    # number of GB "more" for a model that only misses the safety margin.
	    # Floor at 0.1 so a near miss never rounds down to "0 GB more".
	    short_by = max(round(est.total_gb - usable_total, 1), 0.1)
	    notes = [f"Needs about {est.total_gb:g} GB even squeezed down — "
	             f"around {short_by:g} GB more than this machine can give it."]
	    return ModelVerdict(model, VERDICT_NO, smallest, est, False, notes)


	def _rank(model_key: str) -> int:
	    """Return the index in MODEL_CLASSES of the model class with this key.

	    Callers in this file compare these indices to order models by size,
	    treating a higher index as a bigger model.

	    Args:
	        model_key: The ``key`` of a model class.

	    Returns:
	        The zero-based position of the first matching entry.

	    Raises:
	        ValueError: If no model class has that key. A bare ``next()`` would
	            raise StopIteration instead, which an enclosing iterator (for
	            example ``map``) silently treats as "no more items".
	    """
	    for index, candidate in enumerate(MODEL_CLASSES):
	        if candidate.key == model_key:
	            return index
	    raise ValueError(f"unknown model class key: {model_key!r}")


	def advise(spec: HardwareSpec, use_case_key: str = "chat") -> Advice:
	    """Build the complete advice for one machine and one goal.

	    An unknown ``use_case_key`` falls back to the "chat" use case, and a use
	    case without an entry in the context table is sized for 4096 tokens.

	    The headline (the single "just use this" pick) is chosen by the first
	    rule that yields a candidate:

	    1. The biggest WORKS NOW model that is at least the use case's minimum
	       size; this meets the goal.
	    2. A COMPROMISE model of at least the minimum size; this meets the goal.
	    3. The biggest WORKS NOW model of any size; flagged as below-par.
	    4. A COMPROMISE model of any size; flagged as below-par.

	    For the compromise rules, models at 4-bit or better are preferred when
	    any exist, and the pick is the biggest one not larger than the use
	    case's ideal size (or the smallest one if all are larger).

	    Args:
	        spec: The machine to advise on.
	        use_case_key: Key of the goal; defaults to "chat".

	    Returns:
	        An Advice with one verdict per model class (biggest first), the
	        headline pick (None if nothing runs), and the runtimes for the spec.
	    """
	    use_case = USE_CASE_BY_KEY.get(use_case_key, USE_CASE_BY_KEY["chat"])
	    context_tokens = _CONTEXT_FOR_USE_CASE.get(use_case.key, 4096)

	    # Walk the size classes biggest first so the table reads top-down.
	    verdicts = []
	    for model_class in reversed(MODEL_CLASSES):
	        verdicts.append(
	            _evaluate_model(model_class, spec, use_case, context_tokens)
	        )

	    ideal_rank = _rank(use_case.good_class)
	    floor_rank = _rank(use_case.min_class)
	    quality_floor = RECOMMENDED_QUANT.bits_per_weight

	    def size_rank(verdict):
	        return _rank(verdict.model.key)

	    def biggest(candidates):
	        return max(candidates, key=size_rank)

	    def closest_to_ideal(candidates):
	        # Stay at or under the ideal size when possible, so we don't pick
	        # something needlessly oversized and slow.
	        not_oversized = [c for c in candidates if size_rank(c) <= ideal_rank]
	        if not_oversized:
	            return biggest(not_oversized)
	        return min(candidates, key=size_rank)

	    def prefer_clean_quant(candidates):
	        # A model that only fits at a sub-4-bit squeeze should not be the
	        # headline while a cleaner option exists.
	        clean = [c for c in candidates if c.quant.bits_per_weight >= quality_floor]
	        return clean or candidates

	    fast_picks = [v for v in verdicts if v.verdict == VERDICT_WORKS]
	    slow_picks = [v for v in verdicts if v.verdict == VERDICT_COMPROMISE]
	    fast_big_enough = [v for v in fast_picks if size_rank(v) >= floor_rank]
	    slow_big_enough = [v for v in slow_picks if size_rank(v) >= floor_rank]

	    if fast_big_enough:
	        headline = biggest(fast_big_enough)
	        meets_goal = True
	    elif slow_big_enough:
	        headline = closest_to_ideal(prefer_clean_quant(slow_big_enough))
	        meets_goal = True
	    elif fast_picks:
	        headline = biggest(fast_picks)
	        meets_goal = False
	    elif slow_picks:
	        headline = closest_to_ideal(prefer_clean_quant(slow_picks))
	        meets_goal = False
	    else:
	        headline = None
	        meets_goal = False

	    if headline is not None and not meets_goal:
	        # The headline is one of the objects in `verdicts`, so this caveat
	        # also appears on that model's row.
	        caveat = (
	            f"This is the best this machine can do, but it's on the small "
	            f"side for {use_case.plain_name.lower()} — treat results as 'okay', not great."
	        )
	        headline.notes.insert(0, caveat)

	    return Advice(
	        spec=spec,
	        use_case=use_case,
	        context_tokens=context_tokens,
	        verdicts=verdicts,
	        headline=headline,
	        runtimes=pick_runtimes(spec),
	        meets_goal=meets_goal,
	    )