Spaces:
Running
Running
"""Dev-time benchmark matrix: Nemotron-3 (Mamba hybrid) vs. Llama-3.1-Nemotron
(Llama architecture) at different thread counts, used to pick the model and the
Spaces hardware.

Run: modal run scripts/modal_bench.py
"""
| import modal | |
# Modal application object under which this benchmark is registered.
app = modal.App("lifeos-bench")

# Container image definition: Debian slim base, then system packages, then
# pinned Python packages.
image = (
    modal.Image.debian_slim(python_version="3.13")
    # Native toolchain; presumably needed so llama-cpp-python can be compiled
    # from source during pip install -- confirm.
    .apt_install(
        "build-essential",
        "cmake",
        "ninja-build",
        "git",
    )
    .pip_install(
        "llama-cpp-python==0.3.28",
        "huggingface-hub",
        "numpy",
    )
)
# Benchmark candidates: short model key -> (Hugging Face repo id, GGUF filename).
# The tuple is unpacked and handed to hf_hub_download as repo_id / filename.
MODELS = {
    "nemotron3-nano-4b": (
        "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
    ),
    "llama31-nemotron-4b": (
        "bartowski/nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-GGUF",
        "nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-Q4_K_M.gguf",
    ),
}
# Fixed chat prompt sent to every model, for both the warm-up call and the
# timed call, so that runs are comparable.
# NOTE(review): the leading "/no_think" line presumably switches off reasoning
# output for these models -- confirm against the model cards.
MESSAGES = [
    {
        "role": "system",
        "content": "/no_think\nYou are LifeOS, a concise local assistant.",
    },
    {
        "role": "user",
        "content": "In 3 bullet points, what's a cheap high-protein dinner using chicken thighs at $2.49/lb?",
    },
]
def bench(model_key: str, n_threads: int) -> dict:
    """Benchmark streamed chat generation for one model at one thread count.

    Downloads the GGUF file registered under ``model_key`` in ``MODELS``, loads
    it with llama-cpp-python (4096-token context), runs one short untimed
    warm-up completion, then times a streamed completion of ``MESSAGES`` capped
    at 128 tokens.

    NOTE(review): ``main`` calls ``bench.starmap(...)``, which a plain Python
    function does not provide. This presumably relies on a Modal
    ``@app.function(image=image, ...)`` decorator that is not visible in this
    view -- confirm it is present in the real file.

    Args:
        model_key: Key into ``MODELS`` selecting the repo and GGUF filename.
        n_threads: Thread count forwarded to ``Llama(n_threads=...)``.

    Returns:
        A dict with keys ``model``, ``threads``, ``tokens`` (number of streamed
        chunks that carried non-empty content, used as the token count),
        ``sec`` (elapsed seconds, 1 decimal) and ``tps`` (tokens per second,
        2 decimals; ``0.0`` if no time elapsed).

    Raises:
        KeyError: If ``model_key`` is not a key of ``MODELS``.
    """
    import time

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    repo, fname = MODELS[model_key]
    path = hf_hub_download(repo_id=repo, filename=fname)
    llm = Llama(model_path=path, n_ctx=4096, n_threads=n_threads, verbose=False)

    # warm once (memory map, graph build) so one-time setup is not timed
    llm.create_chat_completion(MESSAGES, max_tokens=8)

    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be adjusted mid-run and would skew the measurement.
    t0 = time.perf_counter()
    n = 0
    for chunk in llm.create_chat_completion(MESSAGES, max_tokens=128, stream=True):
        d = chunk["choices"][0]["delta"]
        # Only chunks that actually carry text count towards throughput.
        if d.get("content"):
            n += 1
    dt = time.perf_counter() - t0

    # Guard the division so a zero-length interval cannot raise.
    tps = round(n / dt, 2) if dt > 0 else 0.0
    return {
        "model": model_key,
        "threads": n_threads,
        "tokens": n,
        "sec": round(dt, 1),
        "tps": tps,
    }
def main():
    """Run the full benchmark matrix and print one line per result.

    Builds every (model key, thread count) pair from ``MODELS`` and the thread
    counts 2 and 8, hands the pairs to ``bench.starmap`` and prints each
    returned result prefixed with ``RESULT:``.
    """
    thread_counts = (2, 8)

    # Model-major order: all thread counts for the first model, then the next.
    matrix = []
    for model_key in MODELS:
        for n_threads in thread_counts:
            matrix.append((model_key, n_threads))

    for outcome in bench.starmap(matrix):
        print("RESULT:", outcome)