microfactory-lab / scripts /bench_latency.py
kylebrodeur's picture
Upload folder using huggingface_hub
6b09b49 verified
Raw
History Blame Contribute Delete
1.84 kB
"""Pre-window latency check β€” does the model respond fast enough for a live demo?
Run on the ACTUAL target hardware (your laptop / the Space). If a turn takes
~40s, switch to a smaller quant now (gemma4:e2b), not on June 13.
Run: `make bench` (optionally CHIEF_ENGINEER_MODEL=gemma4:e2b)
"""
from __future__ import annotations
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) # repo root on path
from core import llm
from core.models import Environment, Job
from core.prompts import build_system_prompt
# Number of timed chat calls per benchmark run; the first call is reported
# separately as the cold time and the remaining calls form the warm average.
N = 3
def main() -> None:
    """Time ``N`` identical chat turns against the configured model and print a verdict.

    If ``llm.is_available()`` is false, a hint naming the target model is
    printed and the function returns without benchmarking. Otherwise each
    run's duration is printed along with whether ``llm.chat_json`` returned a
    truthy result ("ok") or not ("parse-fail"), followed by a summary line.

    The first call is reported separately as the cold time; the remaining
    calls (or the single call, when only one was made) are averaged into the
    warm time that selects the verdict band.
    """
    if not llm.is_available():
        print("⚠ Ollama not reachable — start `ollama serve` and pull the model, then re-run.")
        print(f" target model: {llm.MODEL}")
        return

    job = Job(geometry_type="overhang", material="PLA", description="45° bracket")
    env = Environment(temp=28, humidity=50)
    system = build_system_prompt(job, env, [])

    times = []
    for i in range(N):
        # perf_counter() is monotonic, so a wall-clock adjustment (NTP, DST,
        # manual change) during a long model call cannot skew the measurement.
        t0 = time.perf_counter()
        out = llm.chat_json(system, "Give your recommendation for THIS job now.")
        dt = time.perf_counter() - t0
        times.append(dt)
        ok = "ok" if out else "parse-fail"
        print(f" run {i + 1}: {dt:5.1f}s ({ok})")

    # Same cold/warm split + bands as preflight G2 (calibrated 6/10: warm <20s
    # reads fine in a narrated demo).
    # `times[1:] or times` falls back to the only sample when a single run was made.
    cold, warm = times[0], (times[1:] or times)
    warm_avg = sum(warm) / len(warm)
    if warm_avg < 20:
        verdict = "✅ fine for a live narrated demo"
    elif warm_avg < 35:
        verdict = "🟡 long pauses — tighten prompt or use e2b/ZeroGPU"
    else:
        verdict = "🔴 too slow — use gemma4:e2b"
    print(f"\n{llm.MODEL}: warm avg {warm_avg:.1f}s (first call {cold:.1f}s) over {N} runs → {verdict}")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()