# Spaces: Running on Zero
"""
Putting it in plain words.

The advisor produces structured facts; this module turns them into sentences a
non-technical person actually understands, and into commands they can copy and
paste. No jargon survives here without being explained.
"""
| from .advisor import ( | |
| Advice, | |
| ModelVerdict, | |
| VERDICT_WORKS, | |
| VERDICT_COMPROMISE, | |
| VERDICT_NO, | |
| ) | |
# Traffic-light marker displayed for each verdict constant.
VERDICT_EMOJI = dict(
    (
        (VERDICT_WORKS, "🟢"),
        (VERDICT_COMPROMISE, "🟡"),
        (VERDICT_NO, "🔴"),
    )
)
# Short human-readable label for each verdict constant.
VERDICT_WORD = dict(
    (
        (VERDICT_WORKS, "Works now"),
        (VERDICT_COMPROMISE, "Works, with compromises"),
        (VERDICT_NO, "Don't bother"),
    )
)
def speed_hint(v: ModelVerdict, spec) -> str:
    """Give a one-line, plain-words impression of reply speed.

    Args:
        v: The verdict being described. Only ``v.verdict`` and
            ``v.model.billions`` are read.
        spec: Accepted for the caller's convenience; not used here.

    Returns:
        ``"—"`` when the verdict is ``VERDICT_NO``, a "Slow" sentence when it
        is ``VERDICT_COMPROMISE``, and otherwise a sentence chosen by model
        size: up to 4 billion parameters, up to 14 billion, or larger.
    """
    outcome = v.verdict
    if outcome == VERDICT_NO:
        return "—"
    if outcome == VERDICT_COMPROMISE:
        return "Slow — usable for short tasks, not snappy chat."

    # Remaining verdicts: size is what decides the feel, since bigger models
    # are slower even when they run well.
    size_tiers = (
        (4, "Fast — replies feel instant."),
        (14, "Comfortable — quick enough for live chat."),
    )
    billions = v.model.billions
    for ceiling, phrase in size_tiers:
        if billions <= ceiling:
            return phrase
    return "Steady — fine, just not instant on big answers."
# --------------------------------------------------------------------------
# Commands
# --------------------------------------------------------------------------
def ollama_command(v: ModelVerdict) -> str:
    """Return the copy-paste ``ollama run`` command for this verdict's model.

    The model is identified by ``v.model.ollama_tag``.
    """
    tag = v.model.ollama_tag
    return f"ollama run {tag}"
def llamacpp_command(v: ModelVerdict) -> str:
    """Return the copy-paste ``llama-server`` command for this verdict.

    The command names the model as ``<gguf_repo>:<quant key>`` after ``-hf``
    (llama.cpp can pull a GGUF straight from Hugging Face that way) and passes
    ``v.estimate.context_tokens`` after ``-c``.
    """
    repo = v.model.gguf_repo
    quant_key = v.quant.key
    context = v.estimate.context_tokens
    words = ["llama-server", "-hf", f"{repo}:{quant_key}", "-c", f"{context}"]
    return " ".join(words)
# --------------------------------------------------------------------------
# Headline summary, in human words
# --------------------------------------------------------------------------
def headline_text(advice: Advice) -> str:
    """Compose the Markdown headline answering "can this machine run it?".

    Reads ``advice.spec``, ``advice.use_case`` and ``advice.headline``.

    Returns:
        When there is no headline pick (``advice.headline is None``), a fixed
        "can't comfortably run" message quoting ``spec.ram_gb``. Otherwise a
        bold lead sentence chosen by the pick's verdict, a paragraph naming
        the model and quantisation setting, a paragraph with the memory
        breakdown against the machine's budgets, then, when present, the
        use-case note and the pick's notes as a bullet list. Paragraphs are
        separated by blank lines.
    """
    machine = advice.spec
    job = advice.use_case
    pick = advice.headline

    if pick is None:
        return "".join(
            [
                "**Honest answer: this machine can't comfortably run local AI ",
                f"for {job.plain_name.lower()} yet.**\n\n",
                "Even the smallest models need more memory than the ",
                f"{machine.ram_gb:g} GB available here once everything else is ",
                "running. That's not a failure — small computers just have small ",
                "budgets. A free cloud option, or adding memory, would open this up.",
            ]
        )

    model = pick.model
    setting = pick.quant

    # Only a "works now" pick on a machine with a fast path is described as
    # running on the graphics card; everything else is on the processor.
    on_gpu = machine.has_fast_path and pick.verdict == VERDICT_WORKS
    where = "on the graphics card" if on_gpu else "on the processor"

    if pick.verdict == VERDICT_WORKS:
        lead = f"**Yes — you can run a {model.plain_name} model {where}, today.**"
    elif pick.verdict == VERDICT_COMPROMISE:
        lead = f"**Sort of — a {model.plain_name} model will run, but with trade-offs.**"
    else:
        lead = f"**Not really — even a {model.plain_name} model is a stretch here.**"

    recommendation = (
        f"For **{job.plain_name.lower()}**, the sweet spot on your machine is a "
        f"**{model.plain_name}** model at the **{setting.plain_name}** setting. "
        f"{model.good_for}"
    )
    memory = (
        f"That needs about **{pick.estimate.total_gb:g} GB** of memory "
        f"(model {pick.estimate.weights_gb:g} GB + chat memory "
        f"{pick.estimate.kv_cache_gb:g} GB + working space "
        f"{pick.estimate.overhead_gb:g} GB), "
        f"and you have roughly **{machine.fast_budget_gb:g} GB** fast / "
        f"**{machine.total_budget_gb:g} GB** total to play with."
    )

    paragraphs = [lead, recommendation, memory]
    if job.note:
        paragraphs.append(f"*Note for this job:* {job.note}")
    if pick.notes:
        paragraphs.append("\n".join(f"- {n}" for n in pick.notes))
    return "\n\n".join(paragraphs)
def jargon_glossary() -> str:
    """Return the plain-English glossary as a Markdown string.

    The result is a bold heading, a blank line, then one bullet per term,
    with no trailing newline.
    """
    heading = "**Plain-English glossary**"
    entries = (
        "- **Model** — the AI's 'brain'. Bigger = smarter but heavier.",
        "- **Parameters (e.g. 7B)** — how big the brain is. 7B = 7 billion. "
        "More = smarter and hungrier for memory.",
        "- **Quantisation (4-bit, 8-bit)** — shrinking the model so it fits. "
        "4-bit is the popular sweet spot: much smaller, barely-noticeable quality loss.",
        "- **VRAM** — the fast memory on a graphics card. The single biggest "
        "factor in what you can run quickly.",
        "- **RAM** — your computer's normal memory. Models can use it too, but it's slower.",
        "- **KV cache / 'chat memory'** — scratch space the model uses to "
        "remember the current conversation. Longer chats use more.",
        "- **GGUF** — a single-file model format made for running locally.",
        "- **llama.cpp / Ollama** — the programs that actually run the model on your machine.",
    )
    return heading + "\n\n" + "\n".join(entries)
def how_to_find_specs(os_hint: str = "windows") -> str:
    """Return Markdown steps for looking up RAM and graphics card on one OS.

    Args:
        os_hint: Which operating system to give instructions for. Matching
            ignores case and surrounding whitespace. ``"macos"`` (also
            ``"mac"``, ``"osx"`` and ``"darwin"``) selects the Mac steps and
            ``"linux"`` the Linux steps. Anything else, including ``None`` or
            a non-string value, falls back to the Windows steps.

    Returns:
        A Markdown string: a bold lead-in line, a blank line, then a bullet
        list. No variant ends with a trailing newline.
    """
    # Normalise first so values such as "macOS", " Linux " or "Darwin" (what
    # platform.system() reports on a Mac) pick the right instructions instead
    # of silently falling through to the Windows ones.
    hint = os_hint.strip().lower() if isinstance(os_hint, str) else ""

    common = "**Not sure of your specs? Here's how to check:**\n\n"

    if hint in ("macos", "mac", "osx", "darwin"):
        return common + (
            "- Click the Apple menu (top-left) → **About This Mac**.\n"
            "- It shows your chip (e.g. *Apple M2*) and **Memory** (e.g. *16 GB*).\n"
            "- On a Mac, that one memory number is all you need — the graphics "
            "share it."
        )
    if hint == "linux":
        # No trailing newline, so all three variants end the same way.
        return common + (
            "- RAM: run `free -h` in a terminal.\n"
            "- Graphics card: run `nvidia-smi` (NVIDIA) or `lspci | grep VGA`."
        )
    return common + (
        "- **RAM:** press `Ctrl + Shift + Esc` → **Performance** tab → **Memory**.\n"
        "- **Graphics card:** same window → **GPU**. The name is at the top "
        "right (e.g. *NVIDIA RTX 3060*).\n"
        "- No GPU section showing a real card? You likely have built-in "
        "graphics — that's fine, just pick the 'built-in' option."
    )