// Puck's eyes. Capture what's on screen, hand it to the vision brain (Modal),
// which returns wire events that the daemon queues — so the existing poll picks
// them up and Puck reacts, no special handling here. Vision is just another
// event source that happens to come from pixels instead of a hook.
//
// Sim/Space: snapshot the rendered desktop scene (html-to-image). The overlay's
// real-screen path (ScreenCaptureKit via Rust) lands later; same /api/brain/see.

import { toPng } from "html-to-image";
import type { FairyState } from "../engine";
import { moodFor } from "../engine";
import { inTauri, lookAroundNative, type PeekResult, peekNative, type Region } from "./tauri";

// Puck's own surfaces — he observes the desktop, not himself. Reading his own
// feed back through vision would be a hall-of-mirrors (and trains on noise).
const PUCK_UI: readonly string[] = [
  "puck", // the sprite
  "comp", // companion (shows his own feed — the worst offender)
  "bubble",
  "toasts",
  "drop", // menu dropdown
  "settings",
  "interrupt-wrap",
  "bloom",
];

/** Upper bound for one vision round-trip: cloud VLM + possible cold start. */
const VISION_TIMEOUT_MS = 70000;

/** The slice of fairy state sent along with every frame. */
type VisionFairyState = {
  mischief: FairyState["mischief"];
  mood: ReturnType<typeof moodFor>;
};

/** Project the full fairy state down to what the vision endpoints receive. */
function visionStateOf(fs: FairyState): VisionFairyState {
  return { mischief: fs.mischief, mood: moodFor(fs) };
}

/** Runtime guard: a non-null object, safe to read properties from. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/** html-to-image filter: keep every node except elements carrying one of Puck's
 *  own UI classes. Non-HTMLElement nodes (text, SVG, …) are always kept. */
function isDesktopNode(node: Node): boolean {
  if (!(node instanceof HTMLElement)) return true;
  const classes = node.classList;
  return !PUCK_UI.some((cls) => classes.contains(cls));
}

/** Render the fake desktop (`#root`) to a PNG data URL with Puck's UI filtered out.
 *  Resolves null when `#root` is missing; rejects if html-to-image fails. */
async function renderDesktop(): Promise<string | null> {
  const root = document.getElementById("root");
  if (!root) return null;
  return toPng(root, {
    pixelRatio: 1, // the VLM needs legible content, not retina detail — keep it small
    filter: isDesktopNode,
  });
}

/** POST one frame plus fairy state to a brain endpoint as JSON.
 *  Resolves the parsed response body, or null on any failure (non-2xx status,
 *  offline, timeout, unparsable body) — vision is optional, so failure is quiet. */
async function postFrame(
  endpoint: string,
  image: string,
  fairyState: VisionFairyState,
): Promise<unknown> {
  try {
    const res = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ image, fairy_state: fairyState }),
      signal: AbortSignal.timeout(VISION_TIMEOUT_MS),
    });
    if (!res.ok) return null; // 503 = no vision endpoint configured; silent like the brain seam
    return await res.json();
  } catch {
    return null; // offline / timeout — vision is optional, the sim runs without it
  }
}

/** Sim: render the fake desktop and crop to the small region Puck is peering at
 *  (Puck's own UI filtered out). The overlay crops in Rust instead.
 *  Resolves a JPEG data URL, or null when there is nothing to capture or the
 *  render fails (failures are logged, never thrown). */
async function snapshotRegion(region: Region): Promise<string | null> {
  // A zero-area canvas serialises to the degenerate URL "data:," — don't ship that.
  if (!(region.w > 0 && region.h > 0)) return null;
  try {
    const full = await renderDesktop();
    if (!full) return null;

    const shot = new Image();
    shot.src = full;
    await shot.decode();

    const canvas = document.createElement("canvas");
    canvas.width = region.w;
    canvas.height = region.h;
    const ctx = canvas.getContext("2d");
    if (!ctx) return null;
    // Rendered at pixelRatio 1, so region coordinates map 1:1 onto image pixels.
    ctx.drawImage(shot, region.x, region.y, region.w, region.h, 0, 0, region.w, region.h);
    return canvas.toDataURL("image/jpeg", 0.85);
  } catch (e) {
    console.error("puck: region snapshot failed", e);
    return null;
  }
}

/** Nudge the vision backend awake on load (fire-and-forget). On a hosted Space the
 *  cloud 12B scales to zero; pinging now means it's warming before the first peek
 *  (~50-95s out). No-op cost locally (a warm /models call). */
export function warmVision(): void {
  void fetch("/api/brain/warm", { method: "POST" }).catch(() => {});
}

/** The companion loop's eye: peek at the patch under Puck → `{quip, emotion}` (or null).
 *  Overlay: Rust captures the region + the daemon voices it. Sim: crop the fake
 *  desktop and POST it. No events, no queue — a line for a bubble + a felt reaction.
 *
 *  @param fs      current fairy state; only mischief and mood are sent.
 *  @param region  the patch of screen to look at.
 *  @returns the quip and emotion, or null when vision is unavailable or had
 *           nothing to say. `emotion` defaults to "curious" when absent. */
export async function peekScene(fs: FairyState, region: Region): Promise<PeekResult | null> {
  const fairyState = visionStateOf(fs);
  if (inTauri()) return peekNative(region, fairyState);

  const image = await snapshotRegion(region);
  if (!image) return null;

  const reply = await postFrame("/api/brain/peek", image, fairyState);
  if (!isRecord(reply)) return null;
  const { quip, emotion } = reply;
  if (typeof quip !== "string" || !quip) return null;
  return { quip, emotion: typeof emotion === "string" ? emotion : "curious" };
}

/** Snapshot the SIMULATED desktop (sim/Space) to a PNG data URL, with Puck's own
 *  UI filtered out. The overlay's real-screen path lives in `perceive` (it's
 *  captured + posted entirely in Rust). Resolves null (and logs) on failure. */
async function snapshotScene(): Promise<string | null> {
  try {
    return await renderDesktop();
  } catch (e) {
    console.error("puck: scene snapshot failed", e);
    return null;
  }
}

/** Look at the screen; perceived events are queued daemon-side (the existing poll
 *  delivers them). Returns the count it queued for UI feedback, or **null when vision
 *  is unavailable** (no endpoint / offline / timeout) — caller treats null distinctly
 *  ("my eyes are shut") from 0 ("nothing worth a fuss"). Null is graceful by design;
 *  vision is optional and the sim runs without it.
 *
 *  @param fs  current fairy state; only mischief and mood are sent. */
export async function perceive(fs: FairyState): Promise<number | null> {
  const fairyState = visionStateOf(fs);

  // Overlay: Rust captures the real screen AND posts it (background) — the image
  // never enters the webview. Blank Puck for ~2 frames so he isn't in the shot;
  // lookAroundNative returns right after the capture (NOT after inference), so the
  // blank is brief. Perceived events arrive via the poll. 0 = dispatched, null = failed.
  if (inTauri()) {
    const html = document.documentElement;
    html.classList.add("capturing");
    await new Promise<void>((resolve) =>
      requestAnimationFrame(() => requestAnimationFrame(() => resolve())),
    );
    try {
      return (await lookAroundNative(fairyState)) ? 0 : null;
    } finally {
      html.classList.remove("capturing");
    }
  }

  const image = await snapshotScene();
  if (!image) return null;

  // daemon returns { observed, queued, events }; we only surface the queued count
  const reply = await postFrame("/api/brain/see", image, fairyState);
  if (!isRecord(reply)) return null;
  return typeof reply.queued === "number" ? reply.queued : 0;
}