"""Visual fingerprint recognizer — a CLIP image encoder in ONNX, run in the daemon
(no torch / no sidecar; onnxruntime works on Python 3.14).

Embeds the labeled refs in recognize/refs/ and matches a peek crop by cosine
similarity → a tool/site label. Refs are input-area crops for terminals (see
recognize/README), so a peek of plain scrollback stays BELOW threshold → "unknown"
rather than confusing claude-code with codex.

Build the index once (downloads ~350MB model first time), then recognize() is fast
and offline.
"""

import base64
import io
import os
import urllib.request
from pathlib import Path

import numpy as np
from PIL import Image

ROOT = Path(__file__).resolve().parent.parent
REFS = ROOT / "recognize" / "refs"
MODEL_DIR = ROOT / "recognize" / "model"
MODEL_PATH = MODEL_DIR / "clip-vit-b32-vision.onnx"
INDEX_PATH = MODEL_DIR / "index.npz"

# Qdrant's ONNX export of CLIP ViT-B/32's image encoder (512-d). Image-only, no torch.
MODEL_URL = "https://huggingface.co/Qdrant/clip-ViT-B-32-vision/resolve/main/model.onnx"

# standard CLIP preprocessing constants
_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
_STD = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

# cosine floor: below this a match is "unknown" (CLIP UI shots run high; tune via env)
THRESHOLD = float(os.environ.get("PUCK_RECOGNIZE_THRESHOLD", "0.82"))

# file suffixes (lower-cased) that build_index() treats as reference images
_REF_SUFFIXES = (".png", ".jpg", ".jpeg")

# per-socket-operation timeout for the model download, so a stalled connection
# raises instead of hanging the caller forever
_DOWNLOAD_TIMEOUT_S = 60
_DOWNLOAD_CHUNK = 1 << 20  # stream the model in 1 MiB pieces

_session = None
_in_name = ""
_out_name = ""
_index: tuple[np.ndarray, np.ndarray] | None = None  # (vectors[N,512], labels[N])


def _download_model() -> None:
    """Fetch MODEL_URL into MODEL_PATH, streaming via a temporary ``.part`` file.

    The payload is written next to MODEL_PATH and renamed into place only once the
    transfer has completed, so an interrupted download never leaves a truncated
    file that a later run would mistake for a usable model.

    Raises:
        OSError: (including urllib.error.URLError) on network or disk failure.
    """
    part_path = MODEL_PATH.with_name(MODEL_PATH.name + ".part")
    req = urllib.request.Request(MODEL_URL, headers={"User-Agent": "puck/1.0"})
    try:
        with urllib.request.urlopen(req, timeout=_DOWNLOAD_TIMEOUT_S) as r, part_path.open("wb") as f:
            while chunk := r.read(_DOWNLOAD_CHUNK):
                f.write(chunk)
        os.replace(part_path, MODEL_PATH)
    finally:
        # no-op after a successful rename; removes the partial file on failure
        part_path.unlink(missing_ok=True)


def _ensure_model() -> None:
    """Create the module-wide ONNX inference session once, downloading the model if absent.

    Subsequent calls return immediately. onnxruntime is imported here rather than
    at module import time.
    """
    global _session, _in_name, _out_name
    if _session is not None:
        return
    import onnxruntime as ort

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    if not MODEL_PATH.exists():
        print(f"recognizer: downloading model → {MODEL_PATH} (~350MB, one time)…")
        _download_model()
    session = ort.InferenceSession(str(MODEL_PATH), providers=["CPUExecutionProvider"])
    in_name = session.get_inputs()[0].name
    out_name = session.get_outputs()[0].name
    # publish only after everything succeeded, so a failure above cannot leave a
    # session installed with empty input/output names
    _session, _in_name, _out_name = session, in_name, out_name


def _preprocess(img: Image.Image) -> np.ndarray:
    """Turn an image into the encoder input: float32 array of shape [1, 3, 224, 224].

    Converts to RGB, resizes so the shorter side is 224 (bicubic), center-crops
    to 224x224, scales to [0, 1] and normalizes with _MEAN / _STD.
    """
    img = img.convert("RGB")
    w, h = img.size
    s = 224 / min(w, h)
    # max(224, …) guards against rounding taking either side below the crop size
    img = img.resize((max(224, round(w * s)), max(224, round(h * s))), Image.BICUBIC)
    w, h = img.size
    left, top = (w - 224) // 2, (h - 224) // 2
    img = img.crop((left, top, left + 224, top + 224))
    arr = (np.asarray(img, dtype=np.float32) / 255.0 - _MEAN) / _STD
    return arr.transpose(2, 0, 1)[None].astype(np.float32)  # [1,3,224,224]


def _embed(img: Image.Image) -> np.ndarray:
    """Return the L2-normalized float32 embedding of *img* (loads the model on first use)."""
    _ensure_model()
    out = _session.run([_out_name], {_in_name: _preprocess(img)})[0][0].astype(np.float32)
    # epsilon keeps an all-zero output from dividing by zero
    return out / (np.linalg.norm(out) + 1e-8)


def _ref_files() -> list[tuple[str, Path]]:
    """List (label, image path) pairs under REFS; the label is the sub-directory name.

    Directories and the files inside them are visited in sorted order so the
    resulting index is the same from run to run.
    """
    if not REFS.exists():
        return []
    return [
        (d.name, f)
        for d in sorted(p for p in REFS.iterdir() if p.is_dir())
        for f in sorted(d.iterdir())
        if f.suffix.lower() in _REF_SUFFIXES
    ]


def build_index() -> dict:
    """Embed every labeled ref → cache vectors + labels. Call after collecting/labeling.

    Returns:
        {"refs": number of images embedded, "labels": sorted unique labels}.

    Raises:
        Whatever _ensure_model() raises if the model cannot be downloaded or
        loaded; unreadable individual images are skipped with a printed note.
    """
    global _index
    refs = _ref_files()
    if refs:
        # Load the model once, outside the per-image try: a model failure is not a
        # "bad image", and swallowing it would retry the download for every ref.
        _ensure_model()
    vecs: list[np.ndarray] = []
    labels: list[str] = []
    for label, f in refs:
        try:
            with Image.open(f) as img:  # close the file handle promptly
                vec = _embed(img)
        except Exception as e:  # noqa: BLE001 — skip a bad image, keep building
            print(f"recognizer: skip {f.name}: {e}")
            continue
        vecs.append(vec)
        labels.append(label)
    if vecs:
        vectors, label_arr = np.stack(vecs), np.array(labels)
        MODEL_DIR.mkdir(parents=True, exist_ok=True)
        np.savez(INDEX_PATH, vectors=vectors, labels=label_arr)
        _index = (vectors, label_arr)
    else:
        _index = None
        # drop any stale on-disk index, otherwise _load_index() would reload it
        # and recognize() would keep matching against refs that no longer exist
        INDEX_PATH.unlink(missing_ok=True)
    return {"refs": len(vecs), "labels": sorted(set(labels))}


def _load_index() -> tuple[np.ndarray, np.ndarray] | None:
    """Return the cached (vectors, labels) index, loading INDEX_PATH on first use.

    Returns None when no index is cached in memory and none exists on disk.
    """
    global _index
    if _index is None and INDEX_PATH.exists():
        # build_index() saves labels as a plain string array, so pickle support is
        # not needed; keeping it off avoids executing code from a tampered file.
        with np.load(INDEX_PATH, allow_pickle=False) as d:
            _index = (d["vectors"], d["labels"])
    return _index


def recognize(image_data_url: str) -> tuple[str | None, float]:
    """Match a crop against the fingerprint library → (label, score) or (None, score).

    Nearest reference by cosine; below THRESHOLD → unknown (so scrollback doesn't lie).

    Args:
        image_data_url: a ``data:…;base64,<payload>`` URL; a bare base64 string
            without the prefix is accepted too.

    Returns:
        (label, score) when the best cosine score is >= THRESHOLD, otherwise
        (None, score). (None, 0.0) when there is no index to match against.

    Raises:
        Errors from base64 decoding or image parsing propagate to the caller.
    """
    idx = _load_index()
    if idx is None:
        return None, 0.0
    vectors, labels = idx
    if len(labels) == 0:
        return None, 0.0
    head, sep, tail = image_data_url.partition(",")
    b64 = tail if sep else head  # no comma → treat the whole string as the payload
    with Image.open(io.BytesIO(base64.b64decode(b64))) as img:
        q = _embed(img)
    sims = vectors @ q  # both unit-normalized → cosine
    i = int(np.argmax(sims))
    score = float(sims[i])
    return (str(labels[i]), score) if score >= THRESHOLD else (None, score)