Spaces:

build-small-hackathon
/

microfactory-lab

Runtime error

App Files Files Community

microfactory-lab / core /inspector.py

kylebrodeur

Upload folder using huggingface_hub

e9c4780 verified 20 days ago

Raw

History Blame Contribute Delete

10.8 kB

	"""The QA Inspector — a SEPARATE reviewer persona (the hybrid evaluator).

	Integrity, restated: the Chief Engineer never grades its own work. The
	deterministic simulated world (`sim/outcome.py`) produces the ground-truth
	pass/fail. The Inspector is a distinct voice — skeptical, conservative —
	that reads what the Engineer claimed and what actually happened and writes a
	verdict. The grade is therefore "hybrid": deterministic physics + a second LLM
	opinion, never the proposer marking its own homework.

	One persona, three sets of rules depending on where it stands in the workflow:
	• second_opinion(...) — BUILD: critique the PLAN before any print runs.
	• grade_outcome(...) — PRINT: grade one finished (simulated) print vs the
	Engineer's prediction — did the called risk hold?
	• summarize_run(...) — REVIEW: one verdict across a whole iteration run.

	second_opinion and grade_outcome are LLM-backed via `llm.chat_json` with a
	distinct system prompt; each has a deterministic fallback so the verdict is
	always present offline. summarize_run and grade_iteration never call the LLM.
	"""

	from __future__ import annotations

	from dataclasses import dataclass

	from . import llm
	from .models import Advice, Environment, Job, PrintSettings
	from sim.outcome import SimResult

	# System prompt for the Inspector persona. second_opinion and grade_outcome
	# prepend it to their own "Respond ONLY with JSON" format instructions.
	# The trailing backslashes join the physical lines into one paragraph.
	PERSONA = """You are La Forge, the QA Inspector: a skeptical, conservative print-shop \
	inspector. You did NOT propose these settings — Chief Engineer O'Brien did, and \
	O'Brien is an optimist. Your job is to second-guess, not to please. You are \
	terse and physical. You never flatter. You call out optimism, thin margins, and \
	unflagged risks, and you give credit only when the evidence earns it."""

	# predicted-risk vocabulary → simulated failure_mode it corresponds to
	_RISK_TO_MODE = {
	"sag": "sag", "stringing": "stringing", "adhesion": "adhesion",
	"warping": "warp", "warp": "warp", "delamination": "under_extrusion",
	}
	_MODE_HUMAN = {
	"sag": "sagging", "stringing": "stringing", "adhesion": "first-layer adhesion",
	"warp": "warping", "under_extrusion": "under-extrusion", "none": "no failure",
	}


	@dataclass
	class InspectorVerdict:
	    """One verdict written by the QA Inspector.

	    The same shape is returned by every Inspector mode; `agreement` is only
	    filled in by the modes that compare a prediction with a simulated outcome.
	    """

	    stance: str  # short label, e.g. "concur" / "caution" / "held" / "missed"
	    headline: str  # one-line verdict
	    detail: str  # 1-2 lines of rationale
	    # Outcome modes only: did the Engineer's prediction match reality?
	    # Left as None by modes that do not compare against an outcome.
	    agreement: bool | None = None

	    @property
	    def color(self) -> str:
	        """Return the CSS colour expression for this verdict's stance.

	        Matching is case-insensitive and ignores surrounding whitespace,
	        because the stance may come straight from an LLM reply. Stances that
	        are neither "bad" nor "warning" (including unknown ones) map to green.
	        """
	        s = self.stance.strip().lower()
	        if s in ("dispute", "missed", "fail"):
	            return "var(--ao-red, #d9534f)"
	        if s in ("caution", "overcautious", "watch"):
	            return "var(--ao-amber, #e0a458)"
	        return "var(--ao-green)"


	def _predicted_modes(advice: Advice) -> set[str]:
	    """Return the failure modes the Engineer flagged, in simulator vocabulary.

	    Each risk label on `advice.risks` is trimmed and lower-cased, then mapped
	    through `_RISK_TO_MODE`; labels with no mapping are kept as-is. Risks whose
	    label is empty or None are skipped, so a blank entry cannot make the
	    prediction set look non-empty to the graders.
	    """
	    labels = ((r.risk or "").strip().lower() for r in advice.risks)
	    return {_RISK_TO_MODE.get(label, label) for label in labels if label}


	def _settings_line(s: PrintSettings) -> str:
	return (f"nozzle {s.nozzle_temp:.0f}°C, bed {s.bed_temp:.0f}°C, fan {s.fan_pct:.0f}%, "
	f"first-layer fan {s.first_layer_fan_pct:.0f}%, retraction {s.retraction_mm:.1f}mm")


	# ── BUILD: a second opinion on the plan, before anything prints ───────────────
	def second_opinion(job: Job, env: Environment, settings: PrintSettings, advice: Advice) -> InspectorVerdict:
	    """Critique the Engineer's plan before it is printed.

	    Sends the job, room conditions, proposed settings, and the Engineer's
	    reasoning and flagged risks to `llm.chat_json` under the Inspector persona.

	    Returns:
	        The LLM's verdict when the reply is a dict carrying "stance",
	        "headline" and "detail"; otherwise the rule-based verdict from
	        `_second_opinion_fallback`.
	    """
	    system_prompt = (
	        PERSONA + "\n\nRespond ONLY with JSON: "
	        '{"stance":"concur|caution|dispute","headline":"<one line>","detail":"<1-2 lines>"}'
	    )
	    user_prompt = (
	        "Review this PLAN before it prints — do not re-propose, just critique.\n"
	        f"JOB: {job.material}/{job.geometry_type}, bed position {job.bed_position}, "
	        f"room {env.temp:.0f}°C/{env.humidity:.0f}%RH on a {env.printer}.\n"
	        f"ENGINEER PROPOSED: {_settings_line(settings)}.\n"
	        f"ENGINEER REASONING: {advice.reasoning}\n"
	        f"ENGINEER FLAGGED RISKS: {[r.risk for r in advice.risks] or 'none'}.\n"
	        "Where is the Engineer being optimistic? What would you watch?"
	    )
	    raw = llm.chat_json(system_prompt, user_prompt)
	    # Require a dict: a list or string reply could pass a bare key-subset check
	    # and then crash on raw["stance"] instead of reaching the fallback.
	    if isinstance(raw, dict) and {"stance", "headline", "detail"} <= set(raw):
	        return InspectorVerdict(str(raw["stance"]), str(raw["headline"]), str(raw["detail"]))
	    return _second_opinion_fallback(job, env, settings, advice)


	def _second_opinion_fallback(job: Job, env: Environment, settings: PrintSettings, advice: Advice) -> InspectorVerdict:
	    """Rule-based plan critique used when the LLM gives no usable reply.

	    Collects concerns from fixed thresholds on fan, bed position, humidity and
	    retraction, plus one if the Engineer flagged no risks at all. No concerns
	    gives "concur", exactly one gives "caution", two or more give "dispute".
	    The first concern becomes the headline; the rest are joined into the detail.
	    """
	    shape = job.geometry_type
	    material = job.material.upper()
	    concerns: list[str] = []

	    # Unsupported spans need cooling to hold their shape.
	    if shape in ("overhang", "bridge") and settings.fan_pct < 60:
	        concerns.append(
	            f"fan {settings.fan_pct:.0f}% is thin for a {shape} — sagging risk the Engineer may be underweighting"
	        )

	    if material == "ABS":
	        if job.bed_position in ("edge", "corner"):
	            concerns.append(
	                f"ABS off-center ({job.bed_position}) will pull at the edges — I'd second a warp watch and a brim"
	            )
	        if settings.fan_pct > 40:
	            concerns.append(f"fan {settings.fan_pct:.0f}% on ABS invites cracking/warp")

	    if env.humidity > 55 and material in ("PETG", "TPU", "ABS") and settings.retraction_mm < 3:
	        concerns.append(
	            f"humid air ({env.humidity:.0f}%RH) + {settings.retraction_mm:.1f}mm retraction → expect stringing"
	        )

	    if not advice.risks:
	        concerns.append("Engineer flagged NO failure regions — verify that's confidence, not optimism")

	    if not concerns:
	        return InspectorVerdict(
	            "concur",
	            "No red flags from a second look.",
	            "Plan sits inside sane bounds for this material and room. Cleared to print.",
	        )

	    lead, *others = concerns
	    stance = "caution" if len(concerns) < 2 else "dispute"
	    detail = " · ".join(others) or "Print it, but watch that region."
	    return InspectorVerdict(stance, f"Second opinion: {lead}.", detail)


	# ── PRINT: grade one finished (simulated) print against the prediction ────────
	def grade_outcome(job: Job, env: Environment, settings: PrintSettings,
	                  advice: Advice, result: SimResult) -> InspectorVerdict:
	    """Grade one finished (simulated) print against the Engineer's prediction.

	    The stance, headline and detail come from `llm.chat_json` when its reply
	    is a dict carrying all three keys; otherwise `_grade_fallback` writes them.
	    In both cases `agreement` is computed here by `_agreement`, never taken
	    from the LLM.

	    `settings` is accepted but not read by this function.
	    """
	    predicted = _predicted_modes(advice)
	    agreement = _agreement(predicted, result)

	    system_prompt = (
	        PERSONA + "\n\nRespond ONLY with JSON: "
	        '{"stance":"held|missed|overcautious|confirmed","headline":"<one line>","detail":"<1-2 lines>"}'
	    )
	    user_prompt = (
	        "Grade this finished print. The outcome below came from the deterministic "
	        "world, not from the Engineer — you are checking the Engineer's CALL against it.\n"
	        f"JOB: {job.material}/{job.geometry_type} @ {env.temp:.0f}°C/{env.humidity:.0f}%RH.\n"
	        f"ENGINEER PREDICTED RISKS: {[r.risk for r in advice.risks] or 'none'}.\n"
	        f"ACTUAL OUTCOME: {result.outcome} — {result.detail} "
	        f"(failure mode: {result.failure_mode}).\n"
	        "Did the Engineer's prediction hold? Be blunt."
	    )
	    raw = llm.chat_json(system_prompt, user_prompt)
	    # Require a dict: a list or string reply could pass a bare key-subset check
	    # and then crash on raw["stance"] instead of reaching the fallback.
	    if isinstance(raw, dict) and {"stance", "headline", "detail"} <= set(raw):
	        return InspectorVerdict(str(raw["stance"]), str(raw["headline"]), str(raw["detail"]), agreement)
	    return _grade_fallback(predicted, result, agreement)


	def _agreement(predicted: set[str], result: SimResult) -> bool:
	if result.failure_mode == "none":
	return True # clean print — nothing to have missed
	return result.failure_mode in predicted


	def _grade_fallback(predicted: set[str], result: SimResult, agreement: bool) -> InspectorVerdict:
	    """Write a deterministic grade for one print without consulting the LLM.

	    Four outcomes:
	      * failed and the mode was predicted      -> "confirmed"
	      * failed and the mode was not predicted  -> "missed"
	      * clean print, risks had been flagged    -> "overcautious"
	      * clean print, nothing flagged           -> "held"
	    """
	    actual = result.failure_mode
	    label = _MODE_HUMAN.get(actual, actual)
	    q = f"{result.quality:.2f}"

	    if actual != "none":
	        if agreement:
	            return InspectorVerdict(
	                "confirmed",
	                f"Failed on {label} — exactly as called.",
	                f"Quality {q}. The Engineer's risk flag was right; "
	                "the loop now has the lesson.",
	                True,
	            )
	        return InspectorVerdict(
	            "missed",
	            f"Failed on {label} — and it wasn't flagged.",
	            f"Quality {q}. The Engineer didn't predict this mode. "
	            "That gap is what the next iteration has to close.",
	            False,
	        )

	    # Clean print from here on.
	    if predicted:
	        flagged = ", ".join(sorted(predicted))
	        return InspectorVerdict(
	            "overcautious",
	            f"Print held (q={q}).",
	            f"Engineer flagged {flagged}; the settings covered it. "
	            "Credit the call — or it was conservative.",
	            True,
	        )
	    return InspectorVerdict(
	        "held",
	        f"Clean print (q={q}).",
	        "No failure flagged, none occurred. Plan and reality agree.",
	        True,
	    )


	# canonical failure mode each geometry is expected to risk (the loop's implicit
	# prediction — the deterministic policy loop carries no LLM Advice per iteration)
	_GEO_EXPECT = {"overhang": "sag", "bridge": "sag", "stringing": "stringing",
	"adhesion": "adhesion", "vase": "warp"}


	def grade_iteration(geometry_type: str, result: SimResult) -> InspectorVerdict:
	    """Grade a single loop iteration deterministically, with no LLM call.

	    The geometry's expected failure mode (looked up in `_GEO_EXPECT`, with
	    "sag" used for geometries not listed) is treated as the prediction and
	    compared against the simulated result.
	    """
	    assumed_mode = _GEO_EXPECT.get(geometry_type, "sag")
	    prediction = {assumed_mode}
	    matched = _agreement(prediction, result)
	    return _grade_fallback(prediction, result, matched)


	# ── REVIEW: one verdict across a whole iteration run ──────────────────────────
	def summarize_run(records: list, *, material: str, geometry: str) -> InspectorVerdict:
	    """Summarise a whole iteration run as a single verdict (no LLM call).

	    Args:
	        records: run records in order; each is read for `.n` and for
	            `.result.quality` / `.result.outcome`.
	        material: material name, used only in the verdict text.
	        geometry: geometry name, used only in the verdict text.

	    Returns:
	        "watch" when there are no records; "concur" when any record's outcome
	        is "success"; "caution" when quality rose by more than 0.05 from the
	        first to the last record; otherwise "dispute".
	    """
	    if not records:
	        return InspectorVerdict("watch", "No run to review yet.", "Run the Print loop first.")

	    qualities = [r.result.quality for r in records]
	    first_clean = next((r.n for r in records if r.result.outcome == "success"), None)
	    start, end = qualities[0], qualities[-1]
	    climbed = end - start

	    # Compare against None rather than truthiness: an iteration numbered 0
	    # is a valid first clean run and must not be read as "never clean".
	    if first_clean is not None:
	        stance, head = "concur", f"Converged to clean by iteration {first_clean}."
	        detail = (f"{material}/{geometry} climbed {start:.2f} → {end:.2f}. The compounding is real: "
	                  "each simulated outcome tightened the policy and the next run was better-informed.")
	    elif climbed > 0.05:
	        stance, head = "caution", f"Improving but not yet clean (best {max(qualities):.2f})."
	        detail = (f"Quality rose {start:.2f} → {end:.2f} over {len(records)} runs but never crossed the "
	                  "bar. More iterations or a different lever needed — the loop is learning, slowly.")
	    else:
	        stance, head = "dispute", "No real progress this run."
	        detail = (f"Quality stuck around {start:.2f}. Either the job is mis-specified or the policy is "
	                  "saturated for these conditions — worth a human look before trusting the trend.")
	    return InspectorVerdict(stance, head, detail)