"""Dev-time benchmark matrix: Nemotron-3 (Mamba hybrid) vs Llama-3.1-Nemotron (llama arch) at different thread counts, to pick the model + Spaces hardware. Run: modal run scripts/modal_bench.py """ import modal app = modal.App("lifeos-bench") image = ( modal.Image.debian_slim(python_version="3.13") .apt_install("build-essential", "cmake", "ninja-build", "git") .pip_install("llama-cpp-python==0.3.28", "huggingface-hub", "numpy") ) MODELS = { "nemotron3-nano-4b": ("nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF", "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf"), "llama31-nemotron-4b": ("bartowski/nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-GGUF", "nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-Q4_K_M.gguf"), } MESSAGES = [ {"role": "system", "content": "/no_think\nYou are LifeOS, a concise local assistant."}, {"role": "user", "content": "In 3 bullet points, what's a cheap high-protein dinner using chicken thighs at $2.49/lb?"}, ] @app.function(image=image, cpu=8.0, memory=16384, timeout=3600) def bench(model_key: str, n_threads: int) -> dict: import time from huggingface_hub import hf_hub_download from llama_cpp import Llama repo, fname = MODELS[model_key] path = hf_hub_download(repo_id=repo, filename=fname) llm = Llama(model_path=path, n_ctx=4096, n_threads=n_threads, verbose=False) # warm once (memory map, graph build) llm.create_chat_completion(MESSAGES, max_tokens=8) t0 = time.time() n = 0 for chunk in llm.create_chat_completion(MESSAGES, max_tokens=128, stream=True): d = chunk["choices"][0]["delta"] if d.get("content"): n += 1 dt = time.time() - t0 return {"model": model_key, "threads": n_threads, "tokens": n, "sec": round(dt, 1), "tps": round(n / dt, 2)} @app.local_entrypoint() def main(): jobs = [(m, t) for m in MODELS for t in (2, 8)] for res in bench.starmap(jobs): print("RESULT:", res)