# Spaces: build-small-hackathon / FitCheck · Running on Zero · File size: 7,166 Bytes · 12d2e34

"""
Static catalogue: the frozen facts the advisor reasons over.

Everything here is build-time data — no network calls at runtime. That keeps
the tool fully offline-capable (the "Off the Grid" goal) and means the advice
can't silently drift when some external API changes.

Sources for the numbers (so anyone can check our work):
  - bits-per-weight for GGUF quant families: llama.cpp / Hugging Face GGUF docs
  - "~2 GB per 1B params at fp16": Hugging Face Transformers optimisation guide
  - 8-bit ≈ 50% of fp16, 4-bit ≈ 25-30%: bitsandbytes docs
  - architecture sizes (layers / hidden): typical published configs per size class
"""

from dataclasses import dataclass, field


# --------------------------------------------------------------------------
# Quantisation tiers
# --------------------------------------------------------------------------
# "Quantisation" = squashing the model's numbers into fewer bits so it takes
# less memory. Fewer bits = smaller + faster, but slightly less sharp.
# gb_per_billion is just bits_per_weight / 8 (bits -> bytes -> GB per 1B params).

@dataclass(frozen=True)
class QuantTier:
    """One quantisation level: its identifier, display label and size cost.

    Instances are immutable (frozen dataclass). Constructor arguments are
    accepted positionally in the order the fields are declared.
    """

    # Identifier used to look the tier up.
    key: str
    # What a normal person sees.
    plain_name: str
    # Bits used per model weight at this tier.
    bits_per_weight: float
    # One honest sentence about the trade-off.
    blurb: str
    # True for the tier suggested by default.
    recommended: bool = False

    @property
    def gb_per_billion(self) -> float:
        """Return the gigabytes needed per one billion parameters.

        Dividing bits by eight gives bytes per weight, and one billion
        weights at N bytes each come to N GB, so the two figures coincide.
        """
        bits_in_a_byte = 8.0
        gigabytes = self.bits_per_weight / bits_in_a_byte
        return gigabytes


# Listed from the most bits per weight down to the fewest. Exactly one tier is
# flagged recommended=True.
QUANT_TIERS: list[QuantTier] = [
    QuantTier(
        key="fp16",
        plain_name="Full quality (fp16)",
        bits_per_weight=16.0,
        blurb="The original, uncompressed model. Biggest and slowest to load.",
    ),
    QuantTier(
        key="Q8_0",
        plain_name="Near-full (8-bit)",
        bits_per_weight=8.5,
        blurb="Practically indistinguishable from full quality, about half the size.",
    ),
    QuantTier(
        key="Q6_K",
        plain_name="High (6-bit)",
        bits_per_weight=6.56,
        blurb="Very close to full quality, a bit smaller again.",
    ),
    QuantTier(
        key="Q5_K_M",
        plain_name="Balanced+ (5-bit)",
        bits_per_weight=5.67,
        blurb="A touch sharper than 4-bit for a little more memory.",
    ),
    QuantTier(
        key="Q4_K_M",
        plain_name="Balanced (4-bit)",
        bits_per_weight=4.83,
        blurb="The sweet spot most people use: small, fast, and still very good.",
        recommended=True,
    ),
    QuantTier(
        key="Q3_K_M",
        plain_name="Compact (3-bit)",
        bits_per_weight=3.91,
        blurb="Smaller still, with a slight, usually-acceptable quality dip.",
    ),
    QuantTier(
        key="Q2_K",
        plain_name="Tiny (2-bit)",
        bits_per_weight=3.35,
        blurb="Last resort to make something fit — noticeably less reliable.",
    ),
]

# Tier lookup keyed by each tier's `key` string.
QUANT_BY_KEY = {tier.key: tier for tier in QUANT_TIERS}
# The first tier flagged as recommended (StopIteration at import if none is).
RECOMMENDED_QUANT = next(tier for tier in QUANT_TIERS if tier.recommended)


# --------------------------------------------------------------------------
# Model size classes
# --------------------------------------------------------------------------
# We reason in *size classes* rather than individual models, because the
# memory maths is driven by parameter count + architecture shape. Each class
# carries an approximate architecture so we can estimate the KV cache (chat
# memory) honestly. The layer counts and hidden sizes are conservative typical
# values, not exact figures.

@dataclass(frozen=True)
class ModelClass:
    """A model size class: a representative parameter count plus example models.

    Instances are immutable (frozen dataclass). Constructor arguments are
    accepted positionally in the order the fields are declared.
    """

    # Identifier used to look the size class up.
    key: str
    # Parameter count in billions (representative).
    billions: float
    # Label shown to the reader.
    plain_name: str
    # Plain-English "what it's actually good at".
    good_for: str
    # Approximate architecture shape: layer count and hidden size.
    n_layers: int
    hidden: int
    # Example concrete models for the copy-paste commands (real, well-known).
    example_label: str
    # What you'd type after `ollama run`.
    ollama_tag: str
    # A real Hugging Face GGUF repo for llama.cpp.
    gguf_repo: str


# Listed from the smallest parameter count to the largest.
MODEL_CLASSES: list[ModelClass] = [
    ModelClass(
        key="tiny",
        billions=1.0,
        plain_name="Tiny (around 1 billion)",
        good_for="Quick simple chat, basic questions, tidying text. Runs on almost anything.",
        n_layers=24,
        hidden=2048,
        example_label="Llama 3.2 1B",
        ollama_tag="llama3.2:1b",
        gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
    ),
    ModelClass(
        key="small",
        billions=3.5,
        plain_name="Small (3-4 billion)",
        good_for="Surprisingly capable everyday chat, summarising, and light coding help.",
        n_layers=28,
        hidden=3072,
        example_label="Llama 3.2 3B",
        ollama_tag="llama3.2:3b",
        gguf_repo="bartowski/Llama-3.2-3B-Instruct-GGUF",
    ),
    ModelClass(
        key="medium",
        billions=8.0,
        plain_name="Medium (7-9 billion)",
        good_for="A solid all-rounder: good chat, real coding help, decent reasoning.",
        n_layers=32,
        hidden=4096,
        example_label="Qwen2.5 7B",
        ollama_tag="qwen2.5:7b",
        gguf_repo="bartowski/Qwen2.5-7B-Instruct-GGUF",
    ),
    ModelClass(
        key="large",
        billions=14.0,
        plain_name="Large (13-14 billion)",
        good_for="Noticeably smarter and more reliable. Wants a real graphics card.",
        n_layers=40,
        hidden=5120,
        example_label="Qwen2.5 14B",
        ollama_tag="qwen2.5:14b",
        gguf_repo="bartowski/Qwen2.5-14B-Instruct-GGUF",
    ),
    ModelClass(
        key="xlarge",
        billions=32.0,
        plain_name="Very large (30-34 billion)",
        good_for="Near-premium quality. Needs a strong GPU or a lot of memory.",
        n_layers=48,
        hidden=6656,
        example_label="Qwen2.5 32B",
        ollama_tag="qwen2.5:32b",
        gguf_repo="bartowski/Qwen2.5-32B-Instruct-GGUF",
    ),
    ModelClass(
        key="huge",
        billions=70.0,
        plain_name="Huge (70 billion)",
        good_for="Top-tier open quality. Serious hardware only.",
        n_layers=80,
        hidden=8192,
        example_label="Llama 3.3 70B",
        ollama_tag="llama3.3:70b",
        gguf_repo="bartowski/Llama-3.3-70B-Instruct-GGUF",
    ),
]

# Size-class lookup keyed by each class's `key` string.
MODEL_BY_KEY = {size_class.key: size_class for size_class in MODEL_CLASSES}


# --------------------------------------------------------------------------
# Use cases (jobs people actually want done)
# --------------------------------------------------------------------------
# Each maps to a *minimum* sensible size and a *comfortable* size. We never
# pretend a job works on a model that's too small for it.

@dataclass(frozen=True)
class UseCase:
    """A job someone wants done, with the model sizes that suit it.

    Instances are immutable (frozen dataclass). The first five fields are
    required; `overhead_factor` and `note` fall back to their defaults.
    """

    # Identifier used to look the use case up.
    key: str
    # Label shown to the reader.
    plain_name: str
    # Short explanation of what the job involves.
    description: str
    # Smallest model that does an OK job (a size-class key such as "small").
    min_class: str
    # Where it starts feeling genuinely useful (also a size-class key).
    good_class: str
    # Extra memory headroom multiplier for this job: RAG/agents need more
    # context, fine-tuning needs much more. 1.0 = normal inference.
    overhead_factor: float = 1.0
    # Optional extra remark about this job; empty when there is none.
    note: str = ""


# Entries that omit overhead_factor use the default of 1.0; entries that omit
# note use the empty string.
USE_CASES: list[UseCase] = [
    UseCase(
        key="chat",
        plain_name="Just chatting / asking questions",
        description="General conversation, explanations, everyday questions.",
        min_class="tiny",
        good_class="small",
    ),
    UseCase(
        key="writing",
        plain_name="Writing & summarising",
        description="Drafting emails, rewriting, condensing long text.",
        min_class="small",
        good_class="medium",
    ),
    UseCase(
        key="coding",
        plain_name="Coding help",
        description="Explaining code, writing functions, fixing bugs.",
        min_class="small",
        good_class="medium",
        note="Bigger models are much more reliable for code.",
    ),
    UseCase(
        key="agents",
        plain_name="Tool use / agents",
        description="Letting the model call tools, search, or take steps for you.",
        min_class="medium",
        good_class="medium",
        overhead_factor=1.15,
        note="Needs steady instruction-following — go medium or larger.",
    ),
    UseCase(
        key="rag",
        plain_name="Document Q&A (your own files)",
        description="Answering questions over your PDFs/notes (a.k.a. RAG).",
        min_class="small",
        good_class="medium",
        overhead_factor=1.25,
        note="Long documents use extra memory for context.",
    ),
    UseCase(
        key="finetune",
        plain_name="Teaching it your own data (fine-tuning)",
        description="Training a small adapter (LoRA/QLoRA) on your examples.",
        min_class="small",
        good_class="medium",
        overhead_factor=2.2,
        note="Training needs roughly 2-3x the memory of just chatting.",
    ),
]

# Use-case lookup keyed by each entry's `key` string.
USE_CASE_BY_KEY = {job.key: job for job in USE_CASES}