lifeos / scripts /modal_bench.py
awaisaziz's picture
Add LifeOS: local-first personal assistant on Nemotron-3-Nano-4B
40cf485
Raw
History Blame Contribute Delete
1.93 kB
"""Dev-time benchmark matrix: Nemotron-3 (Mamba hybrid) vs Llama-3.1-Nemotron
(llama arch) at different thread counts, to pick the model + Spaces hardware.
Run: modal run scripts/modal_bench.py
"""
import modal
# Modal application that groups every function defined in this script.
app = modal.App("lifeos-bench")

# Container image for the benchmark: the apt packages provide a compiler
# toolchain (presumably so llama-cpp-python can be built during pip install),
# and the pip packages are the libraries the benchmark function imports.
image = (
    modal.Image.debian_slim(python_version="3.13")
    .apt_install(
        "build-essential",
        "cmake",
        "ninja-build",
        "git",
    )
    .pip_install(
        "llama-cpp-python==0.3.28",
        "huggingface-hub",
        "numpy",
    )
)
# Benchmark candidates: short key -> (Hugging Face repo id, GGUF filename).
MODELS = {
    "nemotron3-nano-4b": (
        "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
    ),
    "llama31-nemotron-4b": (
        "bartowski/nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-GGUF",
        "nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-Q4_K_M.gguf",
    ),
}
# Fixed chat prompt sent to every model so that runs are comparable.
MESSAGES = [
    {
        "role": "system",
        "content": "/no_think\nYou are LifeOS, a concise local assistant.",
    },
    {
        "role": "user",
        "content": "In 3 bullet points, what's a cheap high-protein dinner using chicken thighs at $2.49/lb?",
    },
]
@app.function(image=image, cpu=8.0, memory=16384, timeout=3600)
def bench(model_key: str, n_threads: int) -> dict:
    """Time one streamed chat completion for a model / thread-count pair.

    Downloads the GGUF file registered under ``model_key`` in ``MODELS``,
    loads it with llama-cpp-python, runs a short warm-up completion, and then
    times a streamed completion of ``MESSAGES`` capped at 128 tokens.

    Args:
        model_key: Key into ``MODELS`` selecting the (repo, filename) pair.
        n_threads: Thread count handed to llama.cpp, applied to both token
            generation and prompt/batch processing.

    Returns:
        A dict with keys ``model``, ``threads``, ``tokens`` (number of
        streamed chunks that carried non-empty content, used as the token
        count), ``sec`` (elapsed seconds, one decimal) and ``tps``
        (``tokens`` per second, two decimals; ``0.0`` if no time elapsed).

    Raises:
        KeyError: If ``model_key`` is not present in ``MODELS``.
    """
    import time

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    repo, fname = MODELS[model_key]
    path = hf_hub_download(repo_id=repo, filename=fname)

    # NOTE: llama-cpp-python sizes the prompt/batch thread pool separately from
    # n_threads (per its API docs it defaults to the machine's CPU count), so
    # pin it as well; otherwise the low-thread runs still use every core for
    # prompt evaluation and the thread matrix is not comparable.
    llm = Llama(
        model_path=path,
        n_ctx=4096,
        n_threads=n_threads,
        n_threads_batch=n_threads,
        verbose=False,
    )

    # warm once (memory map, graph build)
    llm.create_chat_completion(MESSAGES, max_tokens=8)

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    n = 0
    for chunk in llm.create_chat_completion(MESSAGES, max_tokens=128, stream=True):
        d = chunk["choices"][0]["delta"]
        # Only chunks with non-empty content are counted; deltas without a
        # truthy "content" value are skipped.
        if d.get("content"):
            n += 1
    dt = time.perf_counter() - t0

    # Guard the division so a degenerate zero-length interval cannot raise.
    tps = round(n / dt, 2) if dt > 0 else 0.0
    return {
        "model": model_key,
        "threads": n_threads,
        "tokens": n,
        "sec": round(dt, 1),
        "tps": tps,
    }
@app.local_entrypoint()
def main():
    """Run ``bench`` over every model at 2 and 8 threads and print each result.

    Jobs are ordered model-major (all thread counts for the first model, then
    the next model) and dispatched through ``bench.starmap``; each returned
    result is printed on its own line prefixed with ``RESULT:``.
    """
    thread_counts = (2, 8)

    # Build the (model_key, n_threads) argument tuples for starmap.
    matrix = []
    for model_key in MODELS:
        for n_threads in thread_counts:
            matrix.append((model_key, n_threads))

    for outcome in bench.starmap(matrix):
        print("RESULT:", outcome)