"""Dev-time benchmark matrix: Nemotron-3 (Mamba hybrid) vs Llama-3.1-Nemotron
(llama arch) at different thread counts, to pick the model + Spaces hardware.

Run: modal run scripts/modal_bench.py
"""
import modal

# Modal application that owns the benchmark function and the local entrypoint.
app = modal.App("lifeos-bench")

# Container image, assembled one layer at a time: slim Debian with Python 3.13,
# then the native toolchain (presumably needed so pip can compile
# llama-cpp-python from source -- confirm), then the Python dependencies.
image = modal.Image.debian_slim(python_version="3.13")
image = image.apt_install("build-essential", "cmake", "ninja-build", "git")
image = image.pip_install("llama-cpp-python==0.3.28", "huggingface-hub", "numpy")

# Benchmark candidates, keyed by a short label.
# Each value is (Hugging Face repo id, GGUF filename inside that repo).
MODELS = {
    "nemotron3-nano-4b": (
        "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF",
        "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf",
    ),
    "llama31-nemotron-4b": (
        "bartowski/nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-GGUF",
        "nvidia_Llama-3.1-Nemotron-Nano-4B-v1.1-Q4_K_M.gguf",
    ),
}

# Fixed chat prompt sent to every model so runs are comparable.
# NOTE(review): "/no_think" presumably switches off reasoning output on the
# Nemotron models -- confirm against the model cards.
MESSAGES = [
    {
        "role": "system",
        "content": "/no_think\nYou are LifeOS, a concise local assistant.",
    },
    {
        "role": "user",
        "content": "In 3 bullet points, what's a cheap high-protein dinner using chicken thighs at $2.49/lb?",
    },
]


@app.function(image=image, cpu=8.0, memory=16384, timeout=3600)
def bench(model_key: str, n_threads: int) -> dict:
    """Time one streamed chat completion for a model at a given thread count.

    Downloads the GGUF file registered under ``model_key`` in ``MODELS``,
    loads it with llama-cpp-python, runs one short untimed warm-up completion
    and then times a streamed completion of ``MESSAGES`` capped at 128 tokens.

    Args:
        model_key: Key into ``MODELS`` selecting the (repo, filename) pair.
        n_threads: Thread count applied to both token generation and
            prompt/batch processing.

    Returns:
        A dict with keys ``model``, ``threads``, ``tokens`` (number of
        streamed chunks that carried non-empty content, used as the token
        count), ``sec`` (elapsed seconds, rounded to 0.1) and ``tps``
        (``tokens`` divided by the unrounded elapsed time, rounded to 0.01).

    Raises:
        KeyError: If ``model_key`` is not present in ``MODELS``.
    """
    import time

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    repo, fname = MODELS[model_key]
    path = hf_hub_download(repo_id=repo, filename=fname)
    llm = Llama(
        model_path=path,
        n_ctx=4096,
        n_threads=n_threads,
        # Pin prompt/batch threads too. When left unset llama-cpp-python picks
        # its own default (the machine's CPU count per its API docs), so the
        # prompt phase would not reflect the thread count being benchmarked.
        n_threads_batch=n_threads,
        verbose=False,
    )

    # warm once (memory map, graph build)
    # NOTE(review): the warm-up uses the same prompt as the timed run, so the
    # library may reuse cached prompt state and the timed run may be mostly
    # decode -- confirm if prefill cost matters for the comparison.
    llm.create_chat_completion(MESSAGES, max_tokens=8)

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted mid-run.
    t0 = time.perf_counter()
    stream = llm.create_chat_completion(MESSAGES, max_tokens=128, stream=True)
    # Count only chunks whose delta has non-empty content; role-only and
    # final chunks are skipped.
    n = sum(1 for chunk in stream if chunk["choices"][0]["delta"].get("content"))
    dt = time.perf_counter() - t0

    # Guard against a zero interval so a degenerate run reports 0 tps
    # instead of raising ZeroDivisionError.
    tps = round(n / dt, 2) if dt > 0 else 0.0
    return {
        "model": model_key,
        "threads": n_threads,
        "tokens": n,
        "sec": round(dt, 1),
        "tps": tps,
    }


@app.local_entrypoint()
def main():
    """Run ``bench`` for every model/thread-count pair and print each result.

    The matrix is every key of ``MODELS`` crossed with thread counts 2 and 8,
    ordered model-major. Each result dict is printed prefixed with
    ``RESULT:`` as it is yielded by ``bench.starmap``.
    """
    thread_counts = (2, 8)

    matrix = []
    for model_key in MODELS:
        for n_threads in thread_counts:
            matrix.append((model_key, n_threads))

    for outcome in bench.starmap(matrix):
        print("RESULT:", outcome)