File size: 15,305 Bytes
6b09b49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
"""Preflight β€” gated GO/NO-GO check for the real local stack. Run this FIRST
thing when you sit down locally, before touching anything else:

    ollama serve &          (if not already running)
    make preflight          [CHIEF_ENGINEER_MODEL=gemma4:e2b make preflight]
                            (or: uv run python -m scripts.preflight)

It exercises the REAL model path (the thing the sandbox could never verify) and
grades every gate the demo depends on. Each FAIL points at the matching section
of docs/plan/06-CONTINGENCY.md β€” so a failure costs minutes, not a night.

Never touches demo state: uses a temp ledger copy. Offline gates still run
without Ollama (reported as SKIP for the live ones). Exit code 1 if any
REQUIRED gate fails β€” safe to wire into a pre-record ritual.
"""

from __future__ import annotations

import json
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path

# Directory two levels above this file. It is pushed to the front of sys.path so
# the `core` imports below (and the later `import app`) resolve when this file is
# run as a script.
HERE = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(HERE))

from core import llm                                            # noqa: E402
from core.ledger import LedgerManager                      # noqa: E402
from core.models import Advice, Environment, Job           # noqa: E402
from core.prompts import REFLECT_SYSTEM, build_reflect_prompt, build_system_prompt  # noqa: E402
from core.spine import SpineValidator                      # noqa: E402
from core.models import PrintSettings                      # noqa: E402

# Every gate outcome recorded during this run, in call order; record() appends
# to it and main() reads it to decide the final verdict.
RESULTS: list[tuple[str, str, str]] = []   # (gate, status, detail)
# Path of the contingency document that FAIL messages point the operator to.
CONTINGENCY = "docs/plan/06-CONTINGENCY.md"


def record(gate: str, status: str, detail: str, section: str = "") -> None:
    """Store one gate outcome in RESULTS and echo it as a single console line.

    Args:
        gate: Display name of the gate (e.g. "G1 env").
        status: One of "PASS", "WARN", "FAIL" or "SKIP". Any other value raises
            KeyError at the icon lookup, after the outcome has been stored.
        detail: Human-readable explanation printed after the status.
        section: Contingency-doc section to point at; only used when the
            status is "FAIL" and the section is non-empty.
    """
    RESULTS.append((gate, status, detail))
    icons = {"PASS": "βœ…", "WARN": "🟑", "FAIL": "πŸ”΄", "SKIP": "⏭"}
    icon = icons[status]
    pointer = ""
    if status == "FAIL" and section:
        pointer = f"  β†’ see {CONTINGENCY} Β§{section}"
    print(f"{icon} {gate}: {status} β€” {detail}{pointer}")


def _temp_ledger() -> LedgerManager:
    """Return a LedgerManager backed by a throwaway copy of the seed lessons.

    A fresh temp directory is created and `data/seed_lessons.jsonl` is copied
    into it as `lessons.jsonl` (an empty file is created when no seed file
    exists), so the ledger used by the live gates never touches the real data.

    The temp directory is removed at interpreter exit; previously every run
    left a `preflight_*` directory behind.
    """
    import atexit

    scratch_dir = Path(tempfile.mkdtemp(prefix="preflight_"))
    # mkdtemp() never cleans up after itself. ignore_errors keeps a failed
    # cleanup (e.g. a file still held open) from turning into a noisy exit.
    atexit.register(shutil.rmtree, scratch_dir, ignore_errors=True)

    tmp = scratch_dir / "lessons.jsonl"
    seeds = HERE / "data" / "seed_lessons.jsonl"
    if seeds.exists():
        shutil.copy(seeds, tmp)
    else:
        tmp.touch()
    return LedgerManager(path=tmp)


# --- G1: environment ---------------------------------------------------------
def g1_environment() -> bool:
    """G1: check that the live backend is usable and the configured model is pulled.

    Records exactly one FAIL and returns False when the model tag is "4b", when
    llm.is_available() is false, or when the daemon's model listing succeeded
    but does not contain llm.MODEL (an empty listing counts as "not pulled").
    If the listing call itself fails, a WARN is recorded and the gate passes
    with a detail that says presence could not be confirmed.

    Returns:
        True when the live gates should run, False otherwise.
    """
    if "4b" == llm.MODEL.split(":")[-1]:
        record("G1 env", "FAIL", f"model tag '{llm.MODEL}' β€” gemma4:4b DOES NOT EXIST (Kaggle landmine)", "G1")
        return False
    if not llm.is_available():
        record("G1 env", "FAIL", "Ollama daemon unreachable (is `ollama serve` running?)", "G1")
        return False

    # None  -> the listing call failed, so presence is unknown.
    # []    -> the listing worked and nothing is pulled, so the model is missing.
    # Conflating the two used to let an empty daemon pass as "model present".
    tags = None
    try:
        import ollama
        tags = [m.get("model") or m.get("name") for m in ollama.list().get("models", [])]
    except Exception as e:
        record("G1 env", "WARN", f"daemon up but list() odd: {e!r}")

    if tags is None:
        record("G1 env", "PASS", f"daemon up; could not confirm '{llm.MODEL}' is pulled (model listing failed)")
    elif not any(llm.MODEL in (t or "") for t in tags):
        record("G1 env", "FAIL", f"'{llm.MODEL}' not pulled. Available: {tags}", "G1")
        return False
    else:
        record("G1 env", "PASS", f"daemon up, model '{llm.MODEL}' present ({len(tags)} tags local)")
    _tiny_titan_check()
    return True


def _tiny_titan_check() -> None:
    """Report Tiny Titan ($1.5k <=4B special award) eligibility from `ollama show`.

    Informational only: it records PASS / WARN / SKIP and never FAIL, so it
    cannot block the demo on its own.

    Background kept from the original note (verified 6/10): the field guide's
    32B cap counts TOTAL params ("not just active"), and no ruling was found for
    MatFormer E-models (raw 5.1B/8.0B vs effective ~2B/~4B) on the <=4B award,
    so E-models are reported as ambiguous with advice to ASK in the org
    discussions before tagging.
    """
    import re

    try:
        import ollama
        info = ollama.show(llm.MODEL)
    except Exception as e:
        record("Tiny Titan", "SKIP", f"`ollama show` unavailable ({e!r:.60}) β€” run it by hand")
        return

    def _get(obj, *keys):
        # The response may be a plain dict or an attribute-style object;
        # return the first key that resolves either way.
        for k in keys:
            if isinstance(obj, dict) and k in obj:
                return obj[k]
            if hasattr(obj, k):
                return getattr(obj, k)
        return None

    details = _get(info, "details") or {}
    modelinfo = _get(info, "modelinfo", "model_info") or {}
    psize = _get(details, "parameter_size")  # e.g. "4.3B"

    # Preferred source: an exact "<arch>.parameter_count" style entry (last match wins).
    b = None
    if isinstance(modelinfo, dict):
        for k, v in modelinfo.items():
            if str(k).endswith("parameter_count") and isinstance(v, (int, float)):
                b = float(v) / 1e9

    # Fallback: the human-readable size string. Sub-billion models are reported
    # with an "M" suffix (e.g. "999.89M"), which the old "strip a trailing B"
    # parse rejected; convert every suffix to billions instead.
    if b is None and isinstance(psize, str):
        text = psize.strip().upper()
        to_billions = {"K": 1e-6, "M": 1e-3, "B": 1.0, "T": 1e3}
        factor = to_billions.get(text[-1:])
        try:
            b = float(text) if factor is None else float(text[:-1]) * factor
        except ValueError:
            b = None

    # E-models (name contains e<N>b) report RAW params via ollama but are
    # designed around an EFFECTIVE size. Which number the award counts is
    # unresolved, so detect them from the model NAME and warn instead of
    # issuing an eligibility verdict from the raw count.
    em = re.search(r"e(\d+)b", llm.MODEL.lower())
    eff = float(em.group(1)) if em else None
    raw = f"{b:.1f}B raw" if b is not None else "raw n/a"

    if eff is not None:
        if eff <= 4.0:
            # Effective size fits, raw size may not: genuinely ambiguous. Ask, don't tag.
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B but {raw} β€” $1.5k award counts params "
                   f"ambiguously for E-models (32B cap counts TOTAL). ASK in the org "
                   f"discussions before tagging tiny-titan")
        else:
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B > 4B β€” outside Tiny Titan either way")
    elif b is None:
        record("Tiny Titan", "WARN", f"couldn't parse params (details={psize!r}); check `ollama show {llm.MODEL}` by hand")
    elif b <= 4.0:
        record("Tiny Titan", "PASS", f"{b:.2f}B ≀ 4B β†’ ELIGIBLE; add the tag")
    else:
        record("Tiny Titan", "WARN", f"{b:.2f}B > 4B β€” outside Tiny Titan; skip that badge")


# --- G2-G4: the load-bearing live calls ---------------------------------------
def g2_g4_live_calls() -> None:
    """Run the live-model gates (G2, G3, G4, G4b, G5) and record each verdict.

    Sequence: three timed recommendation calls on a precedent-rich job feed
    G2 (latency), G3 (JSON + Advice schema) and G4 (heuristic reasoning check);
    one call on a job with no seeded precedent feeds G4b; one reflection call
    feeds G5. All work uses a temp ledger from _temp_ledger(). Outcomes are
    reported through record(); nothing is returned.

    Assumes llm.chat_json returns the parsed JSON value, or None when the reply
    could not be parsed - TODO confirm against core.llm.
    """
    lm = _temp_ledger()
    # Case A: precedent-rich (humid PETG stringing; per the original note seeds 007/008/012 match)
    job_a = Job(geometry_type="stringing", material="PETG", description="calibration tower, humid day")
    env_a = Environment(temp=25, humidity=65)
    retrieved = lm.retrieve("PETG", "stringing", 25, 65, k=3)
    sys_a = build_system_prompt(job_a, env_a, retrieved)
    # Case B: novel (TPU vase; per the original note there is no precedent in the seeds)
    job_b = Job(geometry_type="vase", material="TPU", description="flexible vase")
    env_b = Environment(temp=22, humidity=45)
    sys_b = build_system_prompt(job_b, env_b, lm.retrieve("TPU", "vase", 22, 45, k=3))

    # Prompt-length budget (GEMMA-STEERING Technique 5): small-Gemma attention
    # quality degrades past ~800 tokens. Informational only; ~4 chars per token
    # is a rough estimate.
    est = len(sys_a) // 4
    flag = "  ⚠ over the ~800-token small-Gemma budget β€” trim references / k" if est > 800 else ""
    print(f"   prompt size: ~{est} tokens (precedent-rich case){flag}")

    times, schemas = [], 0
    advice_a = None
    N = 3
    for i in range(N):
        # perf_counter is monotonic, so a wall-clock adjustment mid-call cannot
        # skew (or negate) the measured latency the way time.time() could.
        t0 = time.perf_counter()
        raw = llm.chat_json(sys_a, "Give your recommendation for THIS job now.")
        dt = time.perf_counter() - t0
        times.append(dt)
        # Label and logic use the same test: an empty-but-parsed reply is a
        # schema problem, not a parse failure.
        parsed = raw is not None
        print(f"   live call {i+1}/{N}: {dt:5.1f}s {'(json ok)' if parsed else '(parse FAIL)'}")
        if parsed:
            try:
                advice_a = Advice(**raw)
                schemas += 1
            except Exception as e:
                print(f"     schema reject: {e!s:.120}")

    # G2 latency: separate the one-time COLD model load from WARM steady state.
    # The cold call (first) only happens once; you pre-warm before recording, so
    # the demo experience is the warm number. Gate on warm, report cold as a tip.
    cold = times[0]
    warm = times[1:] if len(times) > 1 else times
    warm_avg = sum(warm) / len(warm)
    print(f"   cold-start {cold:5.1f}s (one-time model load) Β· warm avg {warm_avg:.1f}s "
          f"over {len(warm)} β€” pre-warm with one throwaway call before recording")
    # Bands calibrated against real cockpit driving (Kyle, 6/10): warm ~18s on
    # e4b reads fine in a narrated demo, so <20s is a PASS, not a warning.
    if warm_avg < 20:
        record("G2 latency", "PASS",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) β€” fine for a live narrated demo ({llm.MODEL}); pre-warm before recording")
    elif warm_avg < 35:
        record("G2 latency", "WARN",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) β€” long pauses; tighten prompt, or gemma4:e2b / ZeroGPU", "G2")
    else:
        record("G2 latency", "FAIL",
               f"warm avg {warm_avg:.1f}s β€” too slow even warm; use gemma4:e2b or ZeroGPU", "G2")

    # G3 contract: how many of the N replies were valid JSON that also fit Advice.
    if schemas == N:
        record("G3 contract", "PASS", f"{schemas}/{N} valid JSON + Advice schema")
    elif schemas >= 1:
        record("G3 contract", "WARN", f"only {schemas}/{N} schema-valid (fallback will cover, but video needs live)", "G3")
    else:
        record("G3 contract", "FAIL", f"0/{N} valid β€” live path unusable as-is", "G3")

    # G4 reasoning quality: keyword heuristics over the last schema-valid advice.
    if advice_a is not None:
        r = advice_a.reasoning.lower()
        checks = {
            "evaluates precedent (cites a job/precedent/prior)": any(w in r for w in ("precedent", "prior", "job", "seed-", "last time", "before")),
            "reasons about the room (humidity/temp/moisture/dry)": any(w in r for w in ("humid", "moisture", "temp", "Β°c", " rh", "dry", "wet")),
            "substantive (>120 chars)": len(advice_a.reasoning) > 120,
            "flags at least one risk region": len(advice_a.risks) >= 1,
        }
        failed = [k for k, ok in checks.items() if not ok]
        print(f"   reasoning sample: \"{advice_a.reasoning[:180]}...\"")
        if not failed:
            record("G4 reasoning", "PASS", "precedent-evaluation text present and substantive")
        else:
            record("G4 reasoning", "WARN", f"weak on: {'; '.join(failed)} β€” prompt-tune before recording", "G4")
    else:
        record("G4 reasoning", "FAIL", "no schema-valid advice to grade", "G3")

    # G4b novel case: the model must NOT hallucinate precedent.
    raw_b = llm.chat_json(sys_b, "Give your recommendation for THIS job now.")
    if raw_b:
        try:
            adv_b = Advice(**raw_b)
            rb = adv_b.reasoning.lower()
            honest = any(w in rb for w in ("no close precedent", "no precedent", "no prior", "novel", "material properties", "first "))
            cites_fake = "seed-" in rb
            if honest and not cites_fake:
                record("G4b novel-case", "PASS", "says no-precedent / reasons from material properties")
            else:
                record("G4b novel-case", "WARN", f"novel-job reasoning suspect (honest={honest}, cites_fake={cites_fake}) β€” check by eye", "G4")
            print(f"   novel sample: \"{adv_b.reasoning[:180]}...\"")
        except Exception:
            record("G4b novel-case", "WARN", "novel call returned but schema-invalid", "G3")
    else:
        record("G4b novel-case", "WARN", "novel call failed to parse", "G3")

    # G5 reflection
    raw_r = llm.chat_json(REFLECT_SYSTEM, build_reflect_prompt(
        job_a, env_a, "nozzle 230Β°C, bed 80Β°C, retraction 4.5mm, fan 40%, first-layer fan 0%", "success"))
    lesson = raw_r.get("lesson") if isinstance(raw_r, dict) else None
    # The model controls this value; a non-string (number, list, ...) would
    # crash len()/slicing below, so treat it the same as a missing lesson.
    if not isinstance(lesson, str):
        lesson = None
    if lesson and len(lesson) > 30:
        record("G5 reflection", "PASS", f"lesson distilled: \"{lesson[:100]}...\"")
    elif lesson:
        record("G5 reflection", "WARN", f"lesson thin: \"{lesson}\"", "G4")
    else:
        record("G5 reflection", "WARN", "reflect returned no lesson (deterministic fallback covers it)", "G3")


# --- G6: spine (offline, always) ----------------------------------------------
def g6_spine() -> None:
    """G6: confirm the SpineValidator reins in a deliberately unsafe PLA setting.

    Submits a 260-degree nozzle temperature for PLA and passes only when the
    validator returns at least one veto and a nozzle temperature below 260.
    """
    validator = SpineValidator()
    unsafe = PrintSettings(
        nozzle_temp=260, bed_temp=60, retraction_mm=5, fan_pct=100, first_layer_fan_pct=0)
    outcome = validator.check(unsafe, "PLA")

    was_clamped = bool(outcome.vetoes) and outcome.settings.nozzle_temp < 260
    if not was_clamped:
        record("G6 spine", "FAIL", "Spine did NOT clamp an unsafe setting β€” demo safety claim broken", "G6")
        return
    record("G6 spine", "PASS", f"unsafe PLA 260Β°C clamped to {outcome.settings.nozzle_temp:.0f}Β°C ({len(outcome.vetoes)} veto)")


# --- G7: app serves (offline, always) -------------------------------------------
def g7_app() -> None:
    """G7: build the app, serve it on 127.0.0.1:7991 and expect HTTP 200 from "/".

    Any exception (import, build, launch, request, shutdown) is reported as a
    FAIL rather than raised. The launched app is always closed, including when
    the HTTP request fails, so a failed probe no longer leaves the server
    holding the port.
    """
    try:
        import urllib.request
        import app as A
        d = A.build()
        try:
            d.launch(prevent_thread_lock=True, server_name="127.0.0.1", server_port=7991, quiet=True)
            # Context manager releases the HTTP connection once the status is read.
            with urllib.request.urlopen("http://127.0.0.1:7991/", timeout=15) as resp:
                code = resp.status
        finally:
            # Runs on success and on failure alike; without it a timeout or
            # HTTP error skipped the shutdown entirely.
            d.close()
        if code == 200:
            record("G7 app", "PASS", "build() + launch + HTTP 200")
        else:
            record("G7 app", "FAIL", f"HTTP {code}", "G7")
    except Exception as e:
        record("G7 app", "FAIL", f"{e!r:.140}", "G7")


# --- G8: assets + data (offline, always) ---------------------------------------
def g8_assets() -> None:
    """G8: check that the four demo meshes exist and that 12 seed lessons are present.

    A missing mesh is a FAIL; all meshes present but a seed count other than 12
    (including a missing seed file, counted as 0) is a WARN.
    """
    missing = [n for n in ("overhang.glb", "bridge.glb", "vase.glb", "cube.glb")
               if not (HERE / "assets" / n).exists()]
    seeds = HERE / "data" / "seed_lessons.jsonl"
    if seeds.exists():
        # Explicit UTF-8: the platform default encoding (e.g. a Windows code
        # page) can fail or mis-decode non-ASCII text in the lessons.
        lines = seeds.read_text(encoding="utf-8").splitlines()
        n_seeds = sum(1 for line in lines if line.strip())  # blank lines are not lessons
    else:
        n_seeds = 0
    if not missing and n_seeds == 12:
        record("G8 assets", "PASS", "4 meshes present, 12 seed lessons")
    elif missing:
        record("G8 assets", "FAIL", f"missing meshes {missing} β€” run `make assets`", "G8")
    else:
        record("G8 assets", "WARN", f"seed count {n_seeds} != 12 β€” verify data/seed_lessons.jsonl", "G8")


def main() -> None:
    """Run every gate in order, print the summary verdict and set the exit code.

    Live gates run only when g1_environment() returns True; otherwise they are
    recorded as SKIP. The offline gates (G6-G8) always run.

    Exit status: 1 when any gate recorded FAIL, otherwise 0 (explicitly for the
    offline-only and warning verdicts, implicitly for the all-green one).
    """
    print(f"Chief Engineer preflight β€” model={llm.MODEL}  ({time.strftime('%Y-%m-%d %H:%M')})")
    print("=" * 70)
    live = g1_environment()
    if live:
        g2_g4_live_calls()
    else:
        for g in ("G2 latency", "G3 contract", "G4 reasoning", "G4b novel-case", "G5 reflection"):
            record(g, "SKIP", "no live backend (offline gates still checked below)")
    g6_spine()
    g7_app()
    g8_assets()

    print("=" * 70)
    fails = [g for g, s, _ in RESULTS if s == "FAIL"]
    warns = [g for g, s, _ in RESULTS if s == "WARN"]
    if fails:
        print(f"πŸ”΄ NO-GO: {len(fails)} gate(s) failed: {', '.join(fails)}")
        print(f"   Work {CONTINGENCY} top-to-bottom for each, then re-run.")
        sys.exit(1)
    # Key the offline verdict off whether the live gates actually ran, not off
    # "any SKIP recorded": the informational Tiny Titan check can SKIP while
    # every live gate ran, which used to produce a bogus "start ollama serve"
    # verdict. (g1_environment currently records a FAIL on every False return,
    # so the NO-GO branch above normally handles the offline case first.)
    if not live:
        print("🟑 OFFLINE-ONLY PASS β€” fallback demo is safe, but DO NOT record the video")
        print("   until the live gates run green. Start `ollama serve` and re-run.")
        sys.exit(0)
    if warns:
        print(f"🟑 GO with warnings ({', '.join(warns)}) β€” read them before recording.")
        sys.exit(0)
    print("🟒 GO β€” all gates green. Record the demo today, not tomorrow.")

# Run the gates only when executed as a script or via `python -m`; importing
# this module does not trigger main().
if __name__ == "__main__":
    main()