Spaces:

build-small-hackathon
/

microfactory-lab

Runtime error

App Files Files Community

microfactory-lab / scripts /bench_latency.py

kylebrodeur

Upload folder using huggingface_hub

6b09b49 verified 9 days ago

Raw

History Blame Contribute Delete

1.84 kB

	"""Pre-window latency check — does the model respond fast enough for a live demo?

	Run on the ACTUAL target hardware (your laptop / the Space). If a turn takes
	~40s, switch to a smaller quant now (gemma4:e2b), not on June 13.
	Run: `make bench` (optionally CHIEF_ENGINEER_MODEL=gemma4:e2b)
	"""

	from __future__ import annotations

	import sys
	import time
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) # repo root on path

	from core import llm
	from core.models import Environment, Job
	from core.prompts import build_system_prompt

	# Number of timed chat calls per benchmark; main() reports the first call
	# separately as the cold run and averages the remaining ones as "warm".
	N = 3


	def main() -> None:
	    """Time N chat turns against the configured model and print a verdict.

	    When ``llm.is_available()`` is false, prints a hint plus the target
	    model name and returns without benchmarking. Otherwise builds a system
	    prompt for a fixed sample job/environment, issues the same request N
	    times, prints each turn's latency, and finally prints the warm average
	    (all runs after the first) with a traffic-light verdict.
	    """
	    if not llm.is_available():
	        print("⚠ Ollama not reachable — start `ollama serve` and pull the model, then re-run.")
	        print(f" target model: {llm.MODEL}")
	        return

	    # Fixed sample inputs so every run sends an identical prompt.
	    job = Job(geometry_type="overhang", material="PLA", description="45° bracket")
	    env = Environment(temp=28, humidity=50)
	    system = build_system_prompt(job, env, [])

	    times = []
	    for i in range(N):
	        # perf_counter() is monotonic and high-resolution; time.time() is a
	        # wall clock that can jump (NTP / manual changes) mid-measurement.
	        t0 = time.perf_counter()
	        out = llm.chat_json(system, "Give your recommendation for THIS job now.")
	        dt = time.perf_counter() - t0
	        times.append(dt)
	        # A falsy result is reported as "parse-fail"; its latency still counts.
	        ok = "ok" if out else "parse-fail"
	        print(f" run {i + 1}: {dt:5.1f}s ({ok})")

	    # Same cold/warm split + bands as preflight G2 (calibrated 6/10: warm <20s
	    # reads fine in a narrated demo).
	    # With a single run there is no warm sample, so fall back to that run.
	    cold, warm = times[0], (times[1:] or times)
	    warm_avg = sum(warm) / len(warm)
	    if warm_avg < 20:
	        verdict = "✅ fine for a live narrated demo"
	    elif warm_avg < 35:
	        verdict = "🟡 long pauses — tighten prompt or use e2b/ZeroGPU"
	    else:
	        verdict = "🔴 too slow — use gemma4:e2b"
	    print(f"\n{llm.MODEL}: warm avg {warm_avg:.1f}s (first call {cold:.1f}s) over {N} runs → {verdict}")


	# Run the benchmark only when executed as a script (e.g. via `make bench`),
	# not when this module is imported.
	if __name__ == "__main__":
	    main()