Spaces:

build-small-hackathon
/

puck

Running

File size: 4,729 Bytes

3c124f3

"""Visual fingerprint recognizer — a CLIP image encoder in ONNX, run in the daemon
(no torch / no sidecar; onnxruntime works on Python 3.14). Embeds the labeled refs
in recognize/refs/ and matches a peek crop by cosine similarity → a tool/site label.

Refs are input-area crops for terminals (see recognize/README), so a peek of plain
scrollback stays BELOW threshold → "unknown" rather than confusing claude-code with
codex. Build the index once (downloads ~350MB model first time), then recognize()
is fast and offline.
"""

import base64
import io
import os
import urllib.request
from pathlib import Path

import numpy as np
from PIL import Image

# Parent of the directory that contains this file; all recognizer data lives under it.
ROOT = Path(__file__).resolve().parent.parent
# Labeled reference images: build_index() uses each subdirectory's name as the label
# for the .png/.jpg/.jpeg files inside it.
REFS = ROOT / "recognize" / "refs"
# Holds the downloaded ONNX model and the cached embedding index.
MODEL_DIR = ROOT / "recognize" / "model"
MODEL_PATH = MODEL_DIR / "clip-vit-b32-vision.onnx"
INDEX_PATH = MODEL_DIR / "index.npz"
# Qdrant's ONNX export of CLIP ViT-B/32's image encoder (512-d). Image-only, no torch.
MODEL_URL = "https://huggingface.co/Qdrant/clip-ViT-B-32-vision/resolve/main/model.onnx"

# standard CLIP preprocessing constants: per-channel (R, G, B) mean and std, applied
# by _preprocess() after pixel values are scaled to [0, 1]
_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
_STD = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
# cosine floor: below this a match is "unknown" (CLIP UI shots run high; tune via env)
# Read once at import time; a non-numeric PUCK_RECOGNIZE_THRESHOLD raises ValueError here.
THRESHOLD = float(os.environ.get("PUCK_RECOGNIZE_THRESHOLD", "0.82"))

# Lazily-initialized ONNX Runtime state, populated by _ensure_model() on first use.
_session = None
_in_name = ""  # name of the model's first input
_out_name = ""  # name of the model's first output
# In-memory index cache, set by build_index() or loaded from INDEX_PATH by _load_index().
_index: tuple[np.ndarray, np.ndarray] | None = None  # (vectors[N,512], labels[N])


def _ensure_model() -> None:
    """Lazily create the shared ONNX Runtime session, downloading the model if absent.

    No-op once the session exists. Otherwise ensures MODEL_DIR exists, fetches
    MODEL_URL to MODEL_PATH when that file is missing, opens a CPU-only inference
    session, and caches the names of the model's first input and first output.

    The download is streamed to a sibling ``.part`` file and moved into place only
    after it completes. This keeps the ~350MB payload out of memory and guarantees
    that an interrupted download never leaves a truncated file at MODEL_PATH, which
    would otherwise be mistaken for a finished model on every later call.

    Raises:
        urllib.error.URLError, OSError: if the download or the file write fails.
        Any exception onnxruntime raises while loading the model.
    """
    global _session, _in_name, _out_name
    if _session is not None:
        return
    import shutil

    import onnxruntime as ort

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    if not MODEL_PATH.exists():
        print(f"recognizer: downloading model → {MODEL_PATH} (~350MB, one time)…")
        req = urllib.request.Request(MODEL_URL, headers={"User-Agent": "puck/1.0"})
        partial = MODEL_PATH.with_name(MODEL_PATH.name + ".part")
        try:
            with urllib.request.urlopen(req) as r, partial.open("wb") as f:
                shutil.copyfileobj(r, f)  # chunked copy; never holds the whole model
            partial.replace(MODEL_PATH)  # same directory → atomic rename
        except BaseException:
            # Includes KeyboardInterrupt: never leave a half-written file behind.
            partial.unlink(missing_ok=True)
            raise
    session = ort.InferenceSession(str(MODEL_PATH), providers=["CPUExecutionProvider"])
    _in_name = session.get_inputs()[0].name
    _out_name = session.get_outputs()[0].name
    # Publish last so a failure above cannot leave a session without its I/O names.
    _session = session


def _preprocess(img: Image.Image) -> np.ndarray:
    """Turn an image into the encoder's input tensor.

    Converts to RGB, resizes (bicubic) so the shorter side is 224 pixels,
    center-crops to 224×224, scales pixels to [0, 1], normalizes each channel
    with _MEAN / _STD, and returns a float32 array laid out as [1, 3, 224, 224].
    """
    side = 224
    rgb = img.convert("RGB")

    # Scale so the shorter edge lands on `side`; max() guards against rounding below it.
    width, height = rgb.size
    scale = side / min(width, height)
    target = (max(side, round(width * scale)), max(side, round(height * scale)))
    rgb = rgb.resize(target, Image.BICUBIC)

    # Center crop to a side×side square.
    width, height = rgb.size
    x0 = (width - side) // 2
    y0 = (height - side) // 2
    rgb = rgb.crop((x0, y0, x0 + side, y0 + side))

    pixels = np.asarray(rgb, dtype=np.float32) / 255.0
    normalized = (pixels - _MEAN) / _STD
    chw = normalized.transpose(2, 0, 1)  # HWC → CHW
    return chw[None].astype(np.float32)  # add batch axis → [1,3,224,224]


def _embed(img: Image.Image) -> np.ndarray:
    """Return the unit-length float32 embedding of *img* from the ONNX encoder.

    Loads the model on first use, runs the preprocessed image through the session,
    takes the first row of the first output, and L2-normalizes it (a small epsilon
    keeps the division safe for an all-zero vector).
    """
    _ensure_model()
    batch = _preprocess(img)
    outputs = _session.run([_out_name], {_in_name: batch})
    vec = outputs[0][0].astype(np.float32)
    norm = np.linalg.norm(vec) + 1e-8
    return vec / norm


def build_index() -> dict:
    """Embed every labeled ref → cache vectors + labels. Call after collecting/labeling.

    Walks REFS: each subdirectory name is a label, and every .png/.jpg/.jpeg file
    inside it is embedded. Directories and files are visited in sorted order so the
    resulting index is deterministic. An image that fails to open or embed is
    reported and skipped; the build continues.

    When at least one ref was embedded, the stacked vectors and labels are written
    to INDEX_PATH and kept in memory. When none were, the in-memory index is cleared
    AND any previously saved INDEX_PATH is removed — otherwise _load_index() would
    silently reload the stale file and recognize() would keep matching against refs
    this call just reported as absent.

    Returns:
        {"refs": number of embedded images, "labels": sorted unique label names}
    """
    global _index
    vecs: list[np.ndarray] = []
    labels: list[str] = []
    if REFS.exists():
        for d in sorted(p for p in REFS.iterdir() if p.is_dir()):
            for f in sorted(d.iterdir()):
                if f.suffix.lower() not in (".png", ".jpg", ".jpeg"):
                    continue
                try:
                    # Context manager closes the file handle once the embedding is done.
                    with Image.open(f) as im:
                        vec = _embed(im)
                except Exception as e:  # noqa: BLE001 — skip a bad image, keep building
                    print(f"recognizer: skip {f.name}: {e}")
                    continue
                vecs.append(vec)
                labels.append(d.name)
    if vecs:
        vectors = np.stack(vecs)
        label_arr = np.array(labels)
        MODEL_DIR.mkdir(parents=True, exist_ok=True)
        np.savez(INDEX_PATH, vectors=vectors, labels=label_arr)
        _index = (vectors, label_arr)
    else:
        INDEX_PATH.unlink(missing_ok=True)  # drop a stale on-disk index
        _index = None
    return {"refs": len(vecs), "labels": sorted(set(labels))}


def _load_index() -> tuple[np.ndarray, np.ndarray] | None:
    """Return the cached (vectors, labels) index, loading it from INDEX_PATH if needed.

    Returns None when no index is in memory and no index file exists.

    The archive is opened with pickling disabled: build_index() stores a float
    array and a fixed-width string array, neither of which needs pickle, and
    unpickling would let a tampered index file execute arbitrary code. An index
    whose arrays were saved with object dtype will therefore raise ValueError and
    must be rebuilt with build_index(). The archive is closed as soon as both
    arrays have been read into memory.
    """
    global _index
    if _index is None and INDEX_PATH.exists():
        with np.load(INDEX_PATH, allow_pickle=False) as d:
            # Indexing an NpzFile reads the member fully, so the arrays outlive `d`.
            _index = (d["vectors"], d["labels"])
    return _index


def recognize(image_data_url: str) -> tuple[str | None, float]:
    """Match a crop against the fingerprint library → (label, score) or (None, score).

    Nearest reference by cosine; below THRESHOLD → unknown (so scrollback doesn't lie).

    Args:
        image_data_url: a ``data:<mime>;base64,<payload>`` URL, or the bare base64
            payload with no prefix.

    Returns:
        (label, score) when the best cosine similarity is at least THRESHOLD,
        (None, score) when it is below, and (None, 0.0) when there is no index
        (or the index holds no vectors).

    Raises:
        Whatever base64 decoding or PIL raises for a payload that is not a valid
        base64-encoded image.
    """
    idx = _load_index()
    if idx is None:
        return None, 0.0
    vectors, labels = idx
    if len(vectors) == 0:  # argmax over an empty index would raise
        return None, 0.0
    head, comma, tail = image_data_url.partition(",")
    # Base64 never contains a comma, so no comma means the input is a bare payload.
    b64 = tail if comma else head
    with Image.open(io.BytesIO(base64.b64decode(b64))) as im:
        q = _embed(im)
    sims = vectors @ q  # both unit-normalized → cosine
    i = int(np.argmax(sims))
    score = float(sims[i])
    return (str(labels[i]), score) if score >= THRESHOLD else (None, score)