puck / frontend /src /lib /vision.ts
vu1n's picture
Puck — desktop fairy familiar (HF Build Small)
3c124f3
Raw
History Blame Contribute Delete
5.95 kB
// Puck's eyes. Capture what's on screen, hand it to the vision brain (Modal),
// which returns wire events that the daemon queues — so the existing poll picks
// them up and Puck reacts, no special handling here. Vision is just another
// event source that happens to come from pixels instead of a hook.
//
// Sim/Space: snapshot the rendered desktop scene (html-to-image). The overlay's
// real-screen path (ScreenCaptureKit via Rust) lands later; same /api/brain/see.
import { toPng } from "html-to-image";
import type { FairyState } from "../engine";
import { moodFor } from "../engine";
import { inTauri, lookAroundNative, type PeekResult, peekNative, type Region } from "./tauri";
// Class names of Puck's own surfaces. He observes the desktop, not himself:
// reading his own feed back through vision would be a hall-of-mirrors (and
// trains on noise), so the snapshot filters drop elements carrying any of these.
const PUCK_UI: readonly string[] = [
  "puck", // the sprite
  "comp", // companion (shows his own feed — the worst offender)
  "bubble",
  "toasts",
  "drop", // menu dropdown
  "settings",
  "interrupt-wrap",
  "bloom",
];
/** Sim: render the fake desktop and crop to the small region Puck is peering at
 * (Puck's own UI filtered out). The overlay crops in Rust instead.
 *
 * @param region Crop rectangle; `x`/`y`/`w`/`h` are used directly as the source
 *   rectangle on the snapshot, which is rendered at `pixelRatio: 1`.
 * @returns A JPEG data URL of the cropped patch, or `null` when the region is
 *   degenerate (non-finite, or under one pixel wide/tall), `#root` is missing,
 *   no 2D context is available, or rendering fails (the failure is logged). */
async function snapshotRegion(region: Region): Promise<string | null> {
  const { x, y, w, h } = region;
  // A zero-area canvas serializes to the placeholder "data:," — a truthy string
  // the caller would POST as if it were an image. Canvas dimensions are integers,
  // so anything under one pixel collapses to zero too. Bail out before rendering.
  if (![x, y, w, h].every(Number.isFinite) || w < 1 || h < 1) return null;

  const root = document.getElementById("root");
  if (!root) return null;
  try {
    const full = await toPng(root, {
      pixelRatio: 1,
      // Keep non-HTMLElement nodes; drop any element carrying a Puck UI class.
      filter: (node) => !(node instanceof HTMLElement) || !PUCK_UI.some((c) => node.classList?.contains?.(c)),
    });
    const im = new Image();
    im.src = full;
    await im.decode(); // wait until the PNG is actually drawable
    const c = document.createElement("canvas");
    c.width = w;
    c.height = h;
    const ctx = c.getContext("2d");
    if (!ctx) return null;
    // Copy the region 1:1 into the top-left of the crop canvas.
    ctx.drawImage(im, x, y, w, h, 0, 0, w, h);
    return c.toDataURL("image/jpeg", 0.85);
  } catch (e) {
    console.error("puck: region snapshot failed", e);
    return null;
  }
}
/** Fire-and-forget wake-up ping for the vision backend, meant to be sent on load.
 * On a hosted Space the cloud 12B scales to zero, so pinging early means it is
 * already warming before the first peek (~50-95s out). Locally it costs next to
 * nothing (a warm /models call). Failures are deliberately ignored. */
export function warmVision(): void {
  const ping = fetch("/api/brain/warm", { method: "POST" });
  // Nobody awaits this; swallow rejections so an unreachable backend stays silent.
  void ping.catch(() => undefined);
}
/** The companion loop's eye: peek at the patch under Puck → `{quip, emotion}` (or null).
 * Overlay: Rust captures the region + the daemon voices it. Sim: crop the fake
 * desktop and POST it. No events, no queue — a line for a bubble + a felt reaction.
 *
 * @param fs Fairy state; only `mischief` and the mood derived by `moodFor` are sent.
 * @param region Patch of the desktop to look at.
 * @returns The quip and emotion (emotion defaults to `"curious"`), or `null` when
 *   there is nothing usable: snapshot failed, non-OK response, missing or
 *   non-string quip, or the request threw (offline / timeout). */
export async function peekScene(fs: FairyState, region: Region): Promise<PeekResult | null> {
  const fairyState = { mischief: fs.mischief, mood: moodFor(fs) };
  if (inTauri()) return peekNative(region, fairyState);
  const image = await snapshotRegion(region);
  if (!image) return null;
  try {
    const res = await fetch("/api/brain/peek", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ image, fairy_state: fairyState }),
      signal: AbortSignal.timeout(70000),
    });
    if (!res.ok) return null;
    // The body is external JSON: narrow it rather than asserting a shape, so a
    // non-string quip/emotion can never escape typed as a PeekResult.
    const data: unknown = await res.json();
    if (typeof data !== "object" || data === null) return null;
    const { quip, emotion } = data as { quip?: unknown; emotion?: unknown };
    if (typeof quip !== "string" || quip === "") return null;
    return { quip, emotion: typeof emotion === "string" ? emotion : "curious" };
  } catch {
    return null;
  }
}
/** Render the SIMULATED desktop (sim/Space) as a PNG data URL, leaving Puck's own
 * UI out of the picture. The overlay's real-screen path lives in `perceive` (it's
 * captured + posted entirely in Rust).
 *
 * @returns The PNG data URL, or `null` when `#root` is missing or rendering
 *   fails (the failure is logged). */
async function snapshotScene(): Promise<string | null> {
  const root = document.getElementById("root");
  if (root === null) return null;

  // Keep every non-HTMLElement node; drop elements carrying any Puck UI class.
  const keepNode = (node: Node): boolean => {
    if (!(node instanceof HTMLElement)) return true;
    return PUCK_UI.every((cls) => !node.classList?.contains?.(cls));
  };

  try {
    // pixelRatio 1: the VLM needs legible content, not retina detail — keep it small
    return await toPng(root, { pixelRatio: 1, filter: keepNode });
  } catch (err) {
    console.error("puck: scene snapshot failed", err);
    return null;
  }
}
/** Look at the screen; perceived events are queued daemon-side (the existing poll
 * delivers them). Returns the count it queued for UI feedback, or **null when vision
 * is unavailable** (no endpoint / offline / timeout) — caller treats null distinctly
 * ("my eyes are shut") from 0 ("nothing worth a fuss"). Null is graceful by design;
 * vision is optional and the sim runs without it.
 *
 * @param fs Fairy state; only `mischief` and the mood derived by `moodFor` are sent.
 * @returns Overlay: 0 once the capture was dispatched, `null` if it failed.
 *   Sim: the daemon's `queued` count (0 when the reply carries no usable count),
 *   or `null` when the snapshot, the request, or the HTTP status failed. */
export async function perceive(fs: FairyState): Promise<number | null> {
  const fairyState = { mischief: fs.mischief, mood: moodFor(fs) };
  // Overlay: Rust captures the real screen AND posts it (background) — the image
  // never enters the webview. Blank Puck for ~2 frames so he isn't in the shot;
  // lookAroundNative returns right after the capture (NOT after inference), so the
  // blank is brief. Perceived events arrive via the poll. 0 = dispatched, null = failed.
  if (inTauri()) {
    const html = document.documentElement;
    html.classList.add("capturing");
    await new Promise((r) => requestAnimationFrame(() => requestAnimationFrame(r)));
    try {
      return (await lookAroundNative(fairyState)) ? 0 : null;
    } finally {
      // Always un-blank Puck, even if the native call throws.
      html.classList.remove("capturing");
    }
  }
  const image = await snapshotScene();
  if (!image) return null;
  try {
    const res = await fetch("/api/brain/see", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ image, fairy_state: fairyState }),
      signal: AbortSignal.timeout(70000), // cloud VLM + possible cold start
    });
    if (!res.ok) return null; // 503 = no vision endpoint configured; silent like the brain seam
    // daemon returns { observed, queued, events }; we only surface the queued count.
    // The body is external JSON, so narrow it: a 2xx reply means vision is up, and a
    // missing or malformed count reads as "nothing queued" rather than leaking a
    // non-number through the `number | null` return type.
    const data: unknown = await res.json();
    const queued = typeof data === "object" && data !== null ? (data as { queued?: unknown }).queued : undefined;
    return typeof queued === "number" && Number.isFinite(queued) ? queued : 0;
  } catch {
    return null; // offline / timeout — vision is optional, the sim runs without it
  }
}