Spaces:
Runtime error
Runtime error
| """Pre-window latency check β does the model respond fast enough for a live demo? | |
| Run on the ACTUAL target hardware (your laptop / the Space). If a turn takes | |
| ~40s, switch to a smaller quant now (gemma4:e2b), not on June 13. | |
| Run: `make bench` (optionally CHIEF_ENGINEER_MODEL=gemma4:e2b) | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| import time | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) # repo root on path | |
| from core import llm | |
| from core.models import Environment, Job | |
| from core.prompts import build_system_prompt | |
| N = 3 | |
| def main() -> None: | |
| if not llm.is_available(): | |
| print("β Ollama not reachable β start `ollama serve` and pull the model, then re-run.") | |
| print(f" target model: {llm.MODEL}") | |
| return | |
| job = Job(geometry_type="overhang", material="PLA", description="45Β° bracket") | |
| env = Environment(temp=28, humidity=50) | |
| system = build_system_prompt(job, env, []) | |
| times = [] | |
| for i in range(N): | |
| t0 = time.time() | |
| out = llm.chat_json(system, "Give your recommendation for THIS job now.") | |
| dt = time.time() - t0 | |
| times.append(dt) | |
| ok = "ok" if out else "parse-fail" | |
| print(f" run {i + 1}: {dt:5.1f}s ({ok})") | |
| # Same cold/warm split + bands as preflight G2 (calibrated 6/10: warm <20s | |
| # reads fine in a narrated demo). | |
| cold, warm = times[0], (times[1:] or times) | |
| warm_avg = sum(warm) / len(warm) | |
| verdict = ("β fine for a live narrated demo" if warm_avg < 20 else | |
| ("π‘ long pauses β tighten prompt or use e2b/ZeroGPU" if warm_avg < 35 else | |
| "π΄ too slow β use gemma4:e2b")) | |
| print(f"\n{llm.MODEL}: warm avg {warm_avg:.1f}s (first call {cold:.1f}s) over {N} runs β {verdict}") | |
| if __name__ == "__main__": | |
| main() | |