Spaces:

build-small-hackathon
/

microfactory-lab

Runtime error

File size: 15,305 Bytes

6b09b49

"""Preflight — gated GO/NO-GO check for the real local stack. Run this FIRST
thing when you sit down locally, before touching anything else:

    ollama serve &          (if not already running)
    make preflight          [CHIEF_ENGINEER_MODEL=gemma4:e2b make preflight]
                            (or: uv run python -m scripts.preflight)

It exercises the REAL model path (the thing the sandbox could never verify) and
grades every gate the demo depends on. Each FAIL points at the matching section
of docs/plan/06-CONTINGENCY.md — so a failure costs minutes, not a night.

Never touches demo state: uses a temp ledger copy. Offline gates still run
without Ollama (reported as SKIP for the live ones). Exit code 1 if any
REQUIRED gate fails — safe to wire into a pre-record ritual.
"""

from __future__ import annotations

import json
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path

HERE = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(HERE))

from core import llm                                            # noqa: E402
from core.ledger import LedgerManager                      # noqa: E402
from core.models import Advice, Environment, Job           # noqa: E402
from core.prompts import REFLECT_SYSTEM, build_reflect_prompt, build_system_prompt  # noqa: E402
from core.spine import SpineValidator                      # noqa: E402
from core.models import PrintSettings                      # noqa: E402

RESULTS: list[tuple[str, str, str]] = []   # (gate, status, detail)
CONTINGENCY = "docs/plan/06-CONTINGENCY.md"


def record(gate: str, status: str, detail: str, section: str = "") -> None:
    ptr = f"  → see {CONTINGENCY} §{section}" if (section and status == "FAIL") else ""
    RESULTS.append((gate, status, detail))
    icon = {"PASS": "✅", "WARN": "🟡", "FAIL": "🔴", "SKIP": "⏭"}[status]
    print(f"{icon} {gate}: {status} — {detail}{ptr}")


def _temp_ledger() -> LedgerManager:
    tmp = Path(tempfile.mkdtemp(prefix="preflight_")) / "lessons.jsonl"
    seeds = HERE / "data" / "seed_lessons.jsonl"
    if seeds.exists():
        shutil.copy(seeds, tmp)
    else:
        tmp.touch()
    return LedgerManager(path=tmp)


# --- G1: environment ---------------------------------------------------------
def g1_environment() -> bool:
    if "4b" == llm.MODEL.split(":")[-1]:
        record("G1 env", "FAIL", f"model tag '{llm.MODEL}' — gemma4:4b DOES NOT EXIST (Kaggle landmine)", "G1")
        return False
    if not llm.is_available():
        record("G1 env", "FAIL", "Ollama daemon unreachable (is `ollama serve` running?)", "G1")
        return False
    try:
        import ollama
        tags = [m.get("model") or m.get("name") for m in ollama.list().get("models", [])]
    except Exception as e:
        tags = []
        record("G1 env", "WARN", f"daemon up but list() odd: {e!r}")
    if tags and not any(llm.MODEL in (t or "") or (t or "").startswith(llm.MODEL) for t in tags):
        record("G1 env", "FAIL", f"'{llm.MODEL}' not pulled. Available: {tags}", "G1")
        return False
    record("G1 env", "PASS", f"daemon up, model '{llm.MODEL}' present ({len(tags)} tags local)")
    _tiny_titan_check()
    return True


def _tiny_titan_check() -> None:
    """Report Tiny Titan ($1.5k ≤4B special award) eligibility from `ollama show`.
    Informational — never blocks the demo. Verified 6/10: the field guide's 32B cap
    counts TOTAL params ("not just active"); no ruling found for MatFormer E-models
    (raw 5.1B/8.0B vs effective ~2B/~4B) on the ≤4B award → treat as ambiguous and
    ASK in the org discussions before tagging."""
    try:
        import ollama
        info = ollama.show(llm.MODEL)
    except Exception as e:
        record("Tiny Titan", "SKIP", f"`ollama show` unavailable ({e!r:.60}) — run it by hand")
        return

    def _get(obj, *keys):
        for k in keys:
            if isinstance(obj, dict) and k in obj:
                return obj[k]
            if hasattr(obj, k):
                return getattr(obj, k)
        return None

    details = _get(info, "details") or {}
    modelinfo = _get(info, "modelinfo", "model_info") or {}
    psize = _get(details, "parameter_size")  # e.g. "4.3B"
    b = None
    if isinstance(modelinfo, dict):
        for k, v in modelinfo.items():
            if str(k).endswith("parameter_count") and isinstance(v, (int, float)):
                b = float(v) / 1e9
    if b is None and isinstance(psize, str):
        try:
            b = float(psize.strip().upper().rstrip("B"))
        except Exception:
            b = None

    # Gemma 3n E-models report RAW params via ollama (E4B~8B) but are designed as
    # EFFECTIVE 4B/2B (MatFormer + per-layer embeddings). The badge counts the
    # effective size, so key off the model NAME, not the raw count.
    import re
    em = re.search(r"e(\d+)b", llm.MODEL.lower())
    eff = float(em.group(1)) if em else None
    raw = f"{b:.1f}B raw" if b is not None else "raw n/a"

    if eff is not None:
        if eff <= 4.0:
            # Verified 6/10: the guide's 32B cap counts TOTAL params ("not just
            # active") and no ruling exists for E-models on the <=4B award — so
            # effective-params eligibility is genuinely AMBIGUOUS. Ask, don't tag.
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B but {raw} — $1.5k award counts params "
                   f"ambiguously for E-models (32B cap counts TOTAL). ASK in the org "
                   f"discussions before tagging tiny-titan")
        else:
            record("Tiny Titan", "WARN",
                   f"{llm.MODEL}: effective ~{eff:.0f}B > 4B — outside Tiny Titan either way")
    elif b is None:
        record("Tiny Titan", "WARN", f"couldn't parse params (details={psize!r}); check `ollama show {llm.MODEL}` by hand")
    elif b <= 4.0:
        record("Tiny Titan", "PASS", f"{b:.2f}B ≤ 4B → ELIGIBLE; add the tag")
    else:
        record("Tiny Titan", "WARN", f"{b:.2f}B > 4B — outside Tiny Titan; skip that badge")


# --- G2-G4: the load-bearing live calls ---------------------------------------
def g2_g4_live_calls() -> None:
    lm = _temp_ledger()
    # Case A: precedent-rich (humid PETG stringing — seeds 007/008/012 match)
    job_a = Job(geometry_type="stringing", material="PETG", description="calibration tower, humid day")
    env_a = Environment(temp=25, humidity=65)
    retrieved = lm.retrieve("PETG", "stringing", 25, 65, k=3)
    sys_a = build_system_prompt(job_a, env_a, retrieved)
    # Case B: novel (TPU vase — no precedent in seeds)
    job_b = Job(geometry_type="vase", material="TPU", description="flexible vase")
    env_b = Environment(temp=22, humidity=45)
    sys_b = build_system_prompt(job_b, env_b, lm.retrieve("TPU", "vase", 22, 45, k=3))

    # Prompt-length budget (GEMMA-STEERING Technique 5): small-Gemma attention
    # quality degrades past ~800 tokens. Informational — trim references/k if hot.
    est = len(sys_a) // 4
    flag = "  ⚠ over the ~800-token small-Gemma budget — trim references / k" if est > 800 else ""
    print(f"   prompt size: ~{est} tokens (precedent-rich case){flag}")

    times, parses, schemas = [], 0, 0
    advice_a = None
    N = 3
    for i in range(N):
        t0 = time.time()
        raw = llm.chat_json(sys_a, "Give your recommendation for THIS job now.")
        dt = time.time() - t0
        times.append(dt)
        print(f"   live call {i+1}/{N}: {dt:5.1f}s {'(json ok)' if raw else '(parse FAIL)'}")
        if raw is not None:
            parses += 1
            try:
                advice_a = Advice(**raw)
                schemas += 1
            except Exception as e:
                print(f"     schema reject: {e!s:.120}")

    # G2 latency — separate the one-time COLD model-load from WARM steady-state.
    # The cold call (first) only happens once; you pre-warm before recording, so
    # the demo experience is the warm number. Gate on warm, report cold as a tip.
    cold = times[0]
    warm = times[1:] if len(times) > 1 else times
    warm_avg = sum(warm) / len(warm)
    print(f"   cold-start {cold:5.1f}s (one-time model load) · warm avg {warm_avg:.1f}s "
          f"over {len(warm)} — pre-warm with one throwaway call before recording")
    # Bands calibrated against real cockpit driving (Kyle, 6/10): warm ~18s on
    # e4b reads fine in a narrated demo, so <20s is a PASS, not a warning.
    if warm_avg < 20:
        record("G2 latency", "PASS",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) — fine for a live narrated demo ({llm.MODEL}); pre-warm before recording")
    elif warm_avg < 35:
        record("G2 latency", "WARN",
               f"warm avg {warm_avg:.1f}s (cold {cold:.1f}s) — long pauses; tighten prompt, or gemma4:e2b / ZeroGPU", "G2")
    else:
        record("G2 latency", "FAIL",
               f"warm avg {warm_avg:.1f}s — too slow even warm; use gemma4:e2b or ZeroGPU", "G2")

    # G3 contract
    if schemas == N:
        record("G3 contract", "PASS", f"{schemas}/{N} valid JSON + Advice schema")
    elif schemas >= 1:
        record("G3 contract", "WARN", f"only {schemas}/{N} schema-valid (fallback will cover, but video needs live)", "G3")
    else:
        record("G3 contract", "FAIL", f"0/{N} valid — live path unusable as-is", "G3")

    # G4 reasoning quality — the load-bearing moment, heuristically graded
    if advice_a is not None:
        r = advice_a.reasoning.lower()
        checks = {
            "evaluates precedent (cites a job/precedent/prior)": any(w in r for w in ("precedent", "prior", "job", "seed-", "last time", "before")),
            "reasons about the room (humidity/temp/moisture/dry)": any(w in r for w in ("humid", "moisture", "temp", "°c", " rh", "dry", "wet")),
            "substantive (>120 chars)": len(advice_a.reasoning) > 120,
            "flags at least one risk region": len(advice_a.risks) >= 1,
        }
        failed = [k for k, ok in checks.items() if not ok]
        print(f"   reasoning sample: \"{advice_a.reasoning[:180]}...\"")
        if not failed:
            record("G4 reasoning", "PASS", "precedent-evaluation text present and substantive")
        else:
            record("G4 reasoning", "WARN", f"weak on: {'; '.join(failed)} — prompt-tune before recording", "G4")
    else:
        record("G4 reasoning", "FAIL", "no schema-valid advice to grade", "G3")

    # G4b novel case — must NOT hallucinate precedent
    raw_b = llm.chat_json(sys_b, "Give your recommendation for THIS job now.")
    if raw_b:
        try:
            adv_b = Advice(**raw_b)
            rb = adv_b.reasoning.lower()
            honest = any(w in rb for w in ("no close precedent", "no precedent", "no prior", "novel", "material properties", "first "))
            cites_fake = "seed-" in rb
            if honest and not cites_fake:
                record("G4b novel-case", "PASS", "says no-precedent / reasons from material properties")
            else:
                record("G4b novel-case", "WARN", f"novel-job reasoning suspect (honest={honest}, cites_fake={cites_fake}) — check by eye", "G4")
            print(f"   novel sample: \"{adv_b.reasoning[:180]}...\"")
        except Exception:
            record("G4b novel-case", "WARN", "novel call returned but schema-invalid", "G3")
    else:
        record("G4b novel-case", "WARN", "novel call failed to parse", "G3")

    # G5 reflection
    raw_r = llm.chat_json(REFLECT_SYSTEM, build_reflect_prompt(
        job_a, env_a, "nozzle 230°C, bed 80°C, retraction 4.5mm, fan 40%, first-layer fan 0%", "success"))
    lesson = (raw_r or {}).get("lesson") if isinstance(raw_r, dict) else None
    if lesson and len(lesson) > 30:
        record("G5 reflection", "PASS", f"lesson distilled: \"{lesson[:100]}...\"")
    elif lesson:
        record("G5 reflection", "WARN", f"lesson thin: \"{lesson}\"", "G4")
    else:
        record("G5 reflection", "WARN", "reflect returned no lesson (deterministic fallback covers it)", "G3")


# --- G6: spine (offline, always) ----------------------------------------------
def g6_spine() -> None:
    checked = SpineValidator().check(PrintSettings(
        nozzle_temp=260, bed_temp=60, retraction_mm=5, fan_pct=100, first_layer_fan_pct=0), "PLA")
    if checked.vetoes and checked.settings.nozzle_temp < 260:
        record("G6 spine", "PASS", f"unsafe PLA 260°C clamped to {checked.settings.nozzle_temp:.0f}°C ({len(checked.vetoes)} veto)")
    else:
        record("G6 spine", "FAIL", "Spine did NOT clamp an unsafe setting — demo safety claim broken", "G6")


# --- G7: app serves (offline, always) -------------------------------------------
def g7_app() -> None:
    try:
        import urllib.request
        import app as A
        d = A.build()
        d.launch(prevent_thread_lock=True, server_name="127.0.0.1", server_port=7991, quiet=True)
        code = urllib.request.urlopen("http://127.0.0.1:7991/", timeout=15).status
        d.close()
        if code == 200:
            record("G7 app", "PASS", "build() + launch + HTTP 200")
        else:
            record("G7 app", "FAIL", f"HTTP {code}", "G7")
    except Exception as e:
        record("G7 app", "FAIL", f"{e!r:.140}", "G7")


# --- G8: assets + data (offline, always) ---------------------------------------
def g8_assets() -> None:
    missing = [n for n in ("overhang.glb", "bridge.glb", "vase.glb", "cube.glb")
               if not (HERE / "assets" / n).exists()]
    seeds = HERE / "data" / "seed_lessons.jsonl"
    n_seeds = len([l for l in seeds.read_text().splitlines() if l.strip()]) if seeds.exists() else 0
    if not missing and n_seeds == 12:
        record("G8 assets", "PASS", "4 meshes present, 12 seed lessons")
    elif missing:
        record("G8 assets", "FAIL", f"missing meshes {missing} — run `make assets`", "G8")
    else:
        record("G8 assets", "WARN", f"seed count {n_seeds} != 12 — verify data/seed_lessons.jsonl", "G8")


def main() -> None:
    print(f"Chief Engineer preflight — model={llm.MODEL}  ({time.strftime('%Y-%m-%d %H:%M')})")
    print("=" * 70)
    live = g1_environment()
    if live:
        g2_g4_live_calls()
    else:
        for g in ("G2 latency", "G3 contract", "G4 reasoning", "G4b novel-case", "G5 reflection"):
            record(g, "SKIP", "no live backend (offline gates still checked below)")
    g6_spine()
    g7_app()
    g8_assets()

    print("=" * 70)
    fails = [g for g, s, _ in RESULTS if s == "FAIL"]
    warns = [g for g, s, _ in RESULTS if s == "WARN"]
    skips = [g for g, s, _ in RESULTS if s == "SKIP"]
    if fails:
        print(f"🔴 NO-GO: {len(fails)} gate(s) failed: {', '.join(fails)}")
        print(f"   Work {CONTINGENCY} top-to-bottom for each, then re-run.")
        sys.exit(1)
    if skips:
        print("🟡 OFFLINE-ONLY PASS — fallback demo is safe, but DO NOT record the video")
        print("   until the live gates run green. Start `ollama serve` and re-run.")
        sys.exit(0)
    if warns:
        print(f"🟡 GO with warnings ({', '.join(warns)}) — read them before recording.")
        sys.exit(0)
    print("🟢 GO — all gates green. Record the demo today, not tomorrow.")


if __name__ == "__main__":
    main()