Spaces:
Runtime error
Runtime error
File size: 15,305 Bytes
"""Preflight — gated GO/NO-GO check for the real local stack. Run this FIRST
thing when you sit down locally, before touching anything else:
    ollama serve & (if not already running)
    make preflight [CHIEF_ENGINEER_MODEL=gemma4:e2b make preflight]
    (or: uv run python -m scripts.preflight)
It exercises the REAL model path (the thing the sandbox could never verify) and
grades every gate the demo depends on. Each FAIL points at the matching section
of docs/plan/06-CONTINGENCY.md — so a failure costs minutes, not a night.
Never touches demo state: uses a temp ledger copy. Offline gates still run
without Ollama (reported as SKIP for the live ones). Exit code 1 if any
REQUIRED gate fails — safe to wire into a pre-record ritual.
"""
from __future__ import annotations
import json
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path
HERE = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(HERE))
from core import llm # noqa: E402
from core.ledger import LedgerManager # noqa: E402
from core.models import Advice, Environment, Job # noqa: E402
from core.prompts import REFLECT_SYSTEM, build_reflect_prompt, build_system_prompt # noqa: E402
from core.spine import SpineValidator # noqa: E402
from core.models import PrintSettings # noqa: E402
RESULTS: list[tuple[str, str, str]] = []  # (gate, status, detail), in recording order
CONTINGENCY = "docs/plan/06-CONTINGENCY.md"  # runbook path quoted in FAIL pointers


def record(gate: str, status: str, detail: str, section: str = "") -> None:
    """Store one gate outcome in RESULTS and print it as a single status line.

    Args:
        gate: Display name of the gate (e.g. "G1 env").
        status: One of "PASS", "WARN", "FAIL" or "SKIP".
        detail: Human-readable explanation printed after the status.
        section: Section label inside the CONTINGENCY runbook. It is appended to
            the printed line only when it is non-empty and status is "FAIL".

    Raises:
        KeyError: If status is not one of the four known values. The lookup
            happens first, so nothing is stored or printed in that case.
    """
    # NOTE(review): the PASS icon literal was split across two physical lines by
    # a stray NEL byte (UTF-8 read as ISO-8859-7); it is rejoined here as the
    # check mark those bytes decode to. The other icons, the dash and the
    # section sign look like the same mis-decoding and are left untouched —
    # confirm against the upstream file before normalising them.
    icons = {"PASS": "✅", "WARN": "π‘", "FAIL": "π΄", "SKIP": "β"}
    icon = icons[status]  # validate before mutating RESULTS
    ptr = f" β see {CONTINGENCY} Β§{section}" if (section and status == "FAIL") else ""
    RESULTS.append((gate, status, detail))
    print(f"{icon} {gate}: {status} β {detail}{ptr}")
def _temp_ledger() -> LedgerManager:
    """Build a LedgerManager backed by a throwaway copy of the seed lessons.

    Each call creates a fresh `preflight_*` temp directory. When
    `data/seed_lessons.jsonl` exists under HERE it is copied in as
    `lessons.jsonl`; otherwise an empty `lessons.jsonl` is created. The
    directory is removed at interpreter exit so repeated runs do not pile up
    temp directories.

    Returns:
        A LedgerManager constructed with `path` set to the temporary file.
    """
    import atexit  # local import: only this helper needs it

    workdir = Path(tempfile.mkdtemp(prefix="preflight_"))
    # Best-effort cleanup: ignore_errors keeps a locked or already-deleted
    # directory from turning interpreter shutdown into a traceback.
    atexit.register(shutil.rmtree, workdir, ignore_errors=True)

    ledger_path = workdir / "lessons.jsonl"
    seeds = HERE / "data" / "seed_lessons.jsonl"
    if seeds.exists():
        shutil.copy(seeds, ledger_path)
    else:
        ledger_path.touch()
    return LedgerManager(path=ledger_path)
# --- G1: environment ---------------------------------------------------------
def g1_environment() -> bool:
    """Gate G1: check the model tag, the daemon, and that the model is pulled.

    Records its verdict through `record` and, when the environment is usable,
    also runs the informational Tiny Titan check.

    Returns:
        True when the live gates may run; False after a FAIL has been recorded.
    """
    if llm.MODEL.split(":")[-1] == "4b":
        record("G1 env", "FAIL", f"model tag '{llm.MODEL}' β gemma4:4b DOES NOT EXIST (Kaggle landmine)", "G1")
        return False
    if not llm.is_available():
        record("G1 env", "FAIL", "Ollama daemon unreachable (is `ollama serve` running?)", "G1")
        return False

    try:
        import ollama

        tags = [m.get("model") or m.get("name") for m in ollama.list().get("models", [])]
    except Exception as e:  # best-effort probe: any client/shape error is only a WARN
        # None means "could not list", which is different from "listed, nothing
        # pulled". The WARN is the only G1 verdict in that case, so no PASS
        # claiming the model is present gets recorded.
        tags = None
        record("G1 env", "WARN", f"daemon up but list() odd: {e!r}")

    if tags is not None:
        # Substring match also covers the "tag starts with MODEL" case. An
        # empty list means nothing is pulled at all, which is a FAIL as well.
        if not any(llm.MODEL in (t or "") for t in tags):
            record("G1 env", "FAIL", f"'{llm.MODEL}' not pulled. Available: {tags}", "G1")
            return False
        record("G1 env", "PASS", f"daemon up, model '{llm.MODEL}' present ({len(tags)} tags local)")

    _tiny_titan_check()
    return True
def _param_billions(size: object) -> float | None:
    """Convert a `parameter_size` label such as "4.3B" or "999.89M" to billions.

    A trailing K/M/B/T (case-insensitive) scales the number; a bare number is
    taken to be billions already. Returns None when `size` is not a string or
    the numeric part does not parse.
    """
    if not isinstance(size, str):
        return None
    text = size.strip().upper()
    scale = {"K": 1e-6, "M": 1e-3, "B": 1.0, "T": 1e3}
    factor = 1.0
    if text and text[-1] in scale:
        factor = scale[text[-1]]
        text = text[:-1]
    try:
        return float(text) * factor
    except ValueError:
        return None


def _tiny_titan_check() -> None:
    """Report Tiny Titan ($1.5k ≤4B special award) eligibility from `ollama show`.

    Informational — never blocks the demo. Verified 6/10: the field guide's 32B cap
    counts TOTAL params ("not just active"); no ruling found for MatFormer E-models
    (raw 5.1B/8.0B vs effective ~2B/~4B) on the ≤4B award — treat as ambiguous and
    ASK in the org discussions before tagging.

    The raw size comes from a `*parameter_count` entry in the model info when
    one is present, otherwise from the `parameter_size` label in the details.
    """
    import re

    try:
        import ollama

        info = ollama.show(llm.MODEL)
    except Exception as e:
        record("Tiny Titan", "SKIP", f"`ollama show` unavailable ({e!r:.60}) β run it by hand")
        return

    def _get(obj, *keys):
        # The response may be a plain dict or an attribute-style object, so try
        # both access styles for each candidate key, in order.
        for k in keys:
            if isinstance(obj, dict) and k in obj:
                return obj[k]
            if hasattr(obj, k):
                return getattr(obj, k)
        return None

    details = _get(info, "details") or {}
    modelinfo = _get(info, "modelinfo", "model_info") or {}
    psize = _get(details, "parameter_size")  # e.g. "4.3B"

    b = None
    if isinstance(modelinfo, dict):
        for k, v in modelinfo.items():
            if str(k).endswith("parameter_count") and isinstance(v, (int, float)):
                b = float(v) / 1e9
    if b is None:
        # Fallback: parse the human-readable label, including sub-billion
        # sizes like "999.89M" that a bare strip-the-"B" parse rejects.
        b = _param_billions(psize)

    # Gemma 3n E-models report RAW params via ollama (E4B~8B) but are designed as
    # EFFECTIVE 4B/2B (MatFormer + per-layer embeddings). The badge counts the
    # effective size, so key off the model NAME, not the raw count.
    em = re.search(r"e(\d+)b", llm.MODEL.lower())
    eff = float(em.group(1)) if em else None
    raw = f"{b:.1f}B raw" if b is not None else "raw n/a"

    if eff is not None:
        if eff <= 4.0:
            # Verified 6/10: the guide's 32B cap counts TOTAL params ("not just
            # active") and no ruling exists for E-models on the <=4B award — so
            # effective-params eligibility is genuinely AMBIGUOUS. Ask, don't tag.
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B but {raw} β $1.5k award counts params "
                   f"ambiguously for E-models (32B cap counts TOTAL). ASK in the org "
                   f"discussions before tagging tiny-titan")
        else:
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B > 4B β outside Tiny Titan either way")
    elif b is None:
        record("Tiny Titan", "WARN", f"couldn't parse params (details={psize!r}); check `ollama show {llm.MODEL}` by hand")
    elif b <= 4.0:
        record("Tiny Titan", "PASS", f"{b:.2f}B β€ 4B β ELIGIBLE; add the tag")
    else:
        record("Tiny Titan", "WARN", f"{b:.2f}B > 4B β outside Tiny Titan; skip that badge")
# --- G2-G4: the load-bearing live calls ---------------------------------------
def g2_g4_live_calls() -> None:
    """Run the live model calls and grade gates G2, G3, G4, G4b and G5.

    Makes N=3 timed recommendation calls for a precedent-rich job (G2 latency,
    G3 contract, G4 reasoning), one call for a novel job (G4b) and one
    reflection call (G5). Retrieval goes through the ledger returned by
    `_temp_ledger()`. Every verdict is reported through `record`.
    """
    lm = _temp_ledger()
    ask = "Give your recommendation for THIS job now."

    # Case A: precedent-rich (humid PETG stringing — seeds 007/008/012 match)
    job_a = Job(geometry_type="stringing", material="PETG", description="calibration tower, humid day")
    env_a = Environment(temp=25, humidity=65)
    retrieved = lm.retrieve("PETG", "stringing", 25, 65, k=3)
    sys_a = build_system_prompt(job_a, env_a, retrieved)

    # Case B: novel (TPU vase — no precedent in seeds)
    job_b = Job(geometry_type="vase", material="TPU", description="flexible vase")
    env_b = Environment(temp=22, humidity=45)
    sys_b = build_system_prompt(job_b, env_b, lm.retrieve("TPU", "vase", 22, 45, k=3))

    # Prompt-length budget (GEMMA-STEERING Technique 5): small-Gemma attention
    # quality degrades past ~800 tokens. Informational — trim references/k if hot.
    est = len(sys_a) // 4  # rough chars-per-token estimate
    flag = " β over the ~800-token small-Gemma budget β trim references / k" if est > 800 else ""
    print(f" prompt size: ~{est} tokens (precedent-rich case){flag}")

    times: list[float] = []
    schemas = 0
    advice_a = None  # last schema-valid Advice; graded by G4
    N = 3
    for i in range(N):
        # perf_counter is monotonic, so a wall-clock adjustment mid-call cannot
        # skew (or negate) the measured latency.
        t0 = time.perf_counter()
        raw = llm.chat_json(sys_a, ask)
        dt = time.perf_counter() - t0
        times.append(dt)
        parsed = raw is not None  # same test drives the label and the schema attempt
        print(f" live call {i+1}/{N}: {dt:5.1f}s {'(json ok)' if parsed else '(parse FAIL)'}")
        if not parsed:
            continue
        try:
            advice_a = Advice(**raw)
            schemas += 1
        except Exception as e:
            print(f" schema reject: {e!s:.120}")

    # G2 latency — separate the one-time COLD model-load from WARM steady-state.
    # The cold call (first) only happens once; you pre-warm before recording, so
    # the demo experience is the warm number. Gate on warm, report cold as a tip.
    cold = times[0]
    warm = times[1:] if len(times) > 1 else times
    warm_avg = sum(warm) / len(warm)
    print(f" cold-start {cold:5.1f}s (one-time model load) Β· warm avg {warm_avg:.1f}s "
          f"over {len(warm)} β pre-warm with one throwaway call before recording")
    # Bands calibrated against real cockpit driving (Kyle, 6/10): warm ~18s on
    # e4b reads fine in a narrated demo, so <20s is a PASS, not a warning.
    if warm_avg < 20:
        record("G2 latency", "PASS",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) β fine for a live narrated demo ({llm.MODEL}); pre-warm before recording")
    elif warm_avg < 35:
        record("G2 latency", "WARN",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) β long pauses; tighten prompt, or gemma4:e2b / ZeroGPU", "G2")
    else:
        record("G2 latency", "FAIL",
               f"warm avg {warm_avg:.1f}s β too slow even warm; use gemma4:e2b or ZeroGPU", "G2")

    # G3 contract
    if schemas == N:
        record("G3 contract", "PASS", f"{schemas}/{N} valid JSON + Advice schema")
    elif schemas >= 1:
        record("G3 contract", "WARN", f"only {schemas}/{N} schema-valid (fallback will cover, but video needs live)", "G3")
    else:
        record("G3 contract", "FAIL", f"0/{N} valid β live path unusable as-is", "G3")

    # G4 reasoning quality — the load-bearing moment, heuristically graded
    if advice_a is not None:
        r = advice_a.reasoning.lower()
        checks = {
            "evaluates precedent (cites a job/precedent/prior)": any(w in r for w in ("precedent", "prior", "job", "seed-", "last time", "before")),
            "reasons about the room (humidity/temp/moisture/dry)": any(w in r for w in ("humid", "moisture", "temp", "Β°c", " rh", "dry", "wet")),
            "substantive (>120 chars)": len(advice_a.reasoning) > 120,
            "flags at least one risk region": len(advice_a.risks) >= 1,
        }
        failed = [k for k, ok in checks.items() if not ok]
        print(f" reasoning sample: \"{advice_a.reasoning[:180]}...\"")
        if not failed:
            record("G4 reasoning", "PASS", "precedent-evaluation text present and substantive")
        else:
            record("G4 reasoning", "WARN", f"weak on: {'; '.join(failed)} β prompt-tune before recording", "G4")
    else:
        # Nothing to grade means the contract gate is the real problem, hence §G3.
        record("G4 reasoning", "FAIL", "no schema-valid advice to grade", "G3")

    # G4b novel case — must NOT hallucinate precedent
    raw_b = llm.chat_json(sys_b, ask)
    if raw_b:
        try:
            adv_b = Advice(**raw_b)
            rb = adv_b.reasoning.lower()
            honest = any(w in rb for w in ("no close precedent", "no precedent", "no prior", "novel", "material properties", "first "))
            cites_fake = "seed-" in rb
            if honest and not cites_fake:
                record("G4b novel-case", "PASS", "says no-precedent / reasons from material properties")
            else:
                record("G4b novel-case", "WARN", f"novel-job reasoning suspect (honest={honest}, cites_fake={cites_fake}) β check by eye", "G4")
            print(f" novel sample: \"{adv_b.reasoning[:180]}...\"")
        except Exception:
            record("G4b novel-case", "WARN", "novel call returned but schema-invalid", "G3")
    else:
        record("G4b novel-case", "WARN", "novel call failed to parse", "G3")

    # G5 reflection
    raw_r = llm.chat_json(REFLECT_SYSTEM, build_reflect_prompt(
        job_a, env_a, "nozzle 230Β°C, bed 80Β°C, retraction 4.5mm, fan 40%, first-layer fan 0%", "success"))
    lesson = raw_r.get("lesson") if isinstance(raw_r, dict) else None
    if not isinstance(lesson, str):
        # Model output is untrusted: a number or object under "lesson" would
        # crash len()/slicing below, so treat it as "no lesson".
        lesson = None
    if lesson and len(lesson) > 30:
        record("G5 reflection", "PASS", f"lesson distilled: \"{lesson[:100]}...\"")
    elif lesson:
        record("G5 reflection", "WARN", f"lesson thin: \"{lesson}\"", "G4")
    else:
        record("G5 reflection", "WARN", "reflect returned no lesson (deterministic fallback covers it)", "G3")
# --- G6: spine (offline, always) ----------------------------------------------
def g6_spine() -> None:
    """Gate G6: hand the Spine a 260-degree PLA nozzle setting and expect a clamp.

    PASS requires both a non-empty `vetoes` result and a returned nozzle
    temperature below the 260 that was submitted; anything else is a FAIL.
    """
    proposed = PrintSettings(
        nozzle_temp=260,
        bed_temp=60,
        retraction_mm=5,
        fan_pct=100,
        first_layer_fan_pct=0,
    )
    verdict = SpineValidator().check(proposed, "PLA")

    clamped = verdict.vetoes and verdict.settings.nozzle_temp < 260
    if not clamped:
        record("G6 spine", "FAIL", "Spine did NOT clamp an unsafe setting β demo safety claim broken", "G6")
        return
    record("G6 spine", "PASS", f"unsafe PLA 260Β°C clamped to {verdict.settings.nozzle_temp:.0f}Β°C ({len(verdict.vetoes)} veto)")
# --- G7: app serves (offline, always) -------------------------------------------
def g7_app() -> None:
    """Gate G7: build the app, serve it on 127.0.0.1:7991 and probe "/" over HTTP.

    PASS on HTTP 200. Any other status, and any exception raised while
    importing, building, launching or probing, is recorded as FAIL. Once the
    launch has succeeded the server is always closed, even if the probe raises.
    """
    try:
        import urllib.request

        import app as A

        d = A.build()
        d.launch(prevent_thread_lock=True, server_name="127.0.0.1", server_port=7991, quiet=True)
        try:
            # The context manager closes the HTTP response; the finally clause
            # stops the server so a failed probe cannot leave port 7991 bound.
            with urllib.request.urlopen("http://127.0.0.1:7991/", timeout=15) as resp:
                code = resp.status
        finally:
            d.close()
        if code == 200:
            record("G7 app", "PASS", "build() + launch + HTTP 200")
        else:
            record("G7 app", "FAIL", f"HTTP {code}", "G7")
    except Exception as e:  # top-level boundary for this gate: report, never crash the run
        record("G7 app", "FAIL", f"{e!r:.140}", "G7")
# --- G8: assets + data (offline, always) ---------------------------------------
def g8_assets() -> None:
    """Gate G8: check the four demo meshes exist and the seed ledger has 12 lessons.

    FAIL when any mesh under `assets/` is missing; WARN when all meshes are
    present but the number of non-blank lines in `data/seed_lessons.jsonl` is
    not 12 (a missing seed file counts as 0); PASS otherwise.
    """
    mesh_names = ("overhang.glb", "bridge.glb", "vase.glb", "cube.glb")
    missing = [n for n in mesh_names if not (HERE / "assets" / n).exists()]

    seeds = HERE / "data" / "seed_lessons.jsonl"
    n_seeds = 0
    if seeds.exists():
        # Count on raw bytes: this avoids decoding with the locale's default
        # encoding, and bytes.splitlines() only breaks on \n / \r / \r\n, so a
        # Unicode separator inside a JSON string cannot inflate the count.
        n_seeds = sum(1 for line in seeds.read_bytes().splitlines() if line.strip())

    if not missing and n_seeds == 12:
        record("G8 assets", "PASS", "4 meshes present, 12 seed lessons")
    elif missing:
        record("G8 assets", "FAIL", f"missing meshes {missing} β run `make assets`", "G8")
    else:
        record("G8 assets", "WARN", f"seed count {n_seeds} != 12 β verify data/seed_lessons.jsonl", "G8")
def main() -> None:
    """Run every gate in order, print the GO/NO-GO verdict and set the exit code.

    Exit code is 1 when any gate recorded FAIL. Every other verdict exits 0,
    either through an explicit `sys.exit(0)` or by returning normally.
    """
    live_gates = ("G2 latency", "G3 contract", "G4 reasoning", "G4b novel-case", "G5 reflection")

    print(f"Chief Engineer preflight β model={llm.MODEL} ({time.strftime('%Y-%m-%d %H:%M')})")
    print("=" * 70)
    live = g1_environment()
    if live:
        g2_g4_live_calls()
    else:
        for g in live_gates:
            record(g, "SKIP", "no live backend (offline gates still checked below)")
    g6_spine()
    g7_app()
    g8_assets()
    print("=" * 70)

    fails = [g for g, s, _ in RESULTS if s == "FAIL"]
    warns = [g for g, s, _ in RESULTS if s == "WARN"]
    # Only skipped LIVE gates mean "offline-only". The informational Tiny Titan
    # check can also record SKIP; counting it would print the "Start `ollama
    # serve`" verdict even though every live gate actually ran.
    skips = [g for g, s, _ in RESULTS if s == "SKIP" and g in live_gates]

    if fails:
        print(f"π΄ NO-GO: {len(fails)} gate(s) failed: {', '.join(fails)}")
        print(f" Work {CONTINGENCY} top-to-bottom for each, then re-run.")
        sys.exit(1)
    if skips:
        # NOTE(review): g1_environment records a FAIL on every path that
        # returns False, so live gates are only skipped alongside a FAIL and
        # this branch looks unreachable today — confirm whether an unreachable
        # daemon is meant to be a NO-GO or an offline-only pass.
        print("π‘ OFFLINE-ONLY PASS β fallback demo is safe, but DO NOT record the video")
        print(" until the live gates run green. Start `ollama serve` and re-run.")
        sys.exit(0)
    if warns:
        print(f"π‘ GO with warnings ({', '.join(warns)}) β read them before recording.")
        sys.exit(0)
    print("π’ GO β all gates green. Record the demo today, not tomorrow.")


if __name__ == "__main__":
    main()
|