"""Self-contained inference example for the 15-class UI-element classifier. Run: pip install onnxruntime numpy pillow python inference_example.py path/to/element_crop.png Designed to drop into an LLM-orchestration loop where you have a screenshot, a list of detected element bounding boxes (from any detector — YOLOv8, OWL-ViT, SAM-then-filter, accessibility tree, etc.), and you need cheap, deterministic per-element type labels before passing them to a reasoning LLM. Inference is CPU-friendly (~5 ms per crop on a modern x86 laptop). Use it as a 'helper' that adds structure to the orchestrator's prompt — e.g., 'click the text_input near label "Username"' — instead of paying VLM tokens to look at every crop. """ from __future__ import annotations import json import sys from pathlib import Path import numpy as np import onnxruntime as ort from PIL import Image HERE = Path(__file__).parent ONNX_PATH = HERE / "mobilenetv3_small.onnx" CLASSES = json.loads((HERE / "classes.json").read_text())["classes"] IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32) IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) def pad_to_square(img: Image.Image) -> Image.Image: """Pad shorter side with gray (128, 128, 128) — must match training transform.""" w, h = img.size m = max(w, h) out = Image.new("RGB", (m, m), (128, 128, 128)) out.paste(img, ((m - w) // 2, (m - h) // 2)) return out def preprocess(img: Image.Image) -> np.ndarray: """PadToSquare -> Resize 224x224 BILINEAR -> /255 -> ImageNet normalize -> CHW.""" img = pad_to_square(img.convert("RGB")) img = img.resize((224, 224), Image.BILINEAR) arr = np.array(img, dtype=np.float32) / 255.0 arr = (arr - IMAGENET_MEAN) / IMAGENET_STD arr = arr.transpose(2, 0, 1) return arr[None, :, :, :].astype(np.float32) # (1, 3, 224, 224) def softmax(x: np.ndarray) -> np.ndarray: e = np.exp(x - np.max(x, axis=1, keepdims=True)) return e / np.sum(e, axis=1, keepdims=True) def classify(crop_path: str | Path) -> dict: """Classify a single element crop. Returns label, confidence, full score map.""" so = ort.SessionOptions() so.intra_op_num_threads = 4 sess = ort.InferenceSession(str(ONNX_PATH), sess_options=so, providers=["CPUExecutionProvider"]) img = Image.open(crop_path) batch = preprocess(img) logits = sess.run(None, {sess.get_inputs()[0].name: batch})[0] probs = softmax(logits)[0] idx = int(np.argmax(probs)) return { "label": CLASSES[idx], "confidence": float(probs[idx]), "scores": {c: float(probs[i]) for i, c in enumerate(CLASSES)}, } def classify_batch(crop_paths: list[str | Path]) -> list[dict]: """Convenience: per-crop loop. The shipped ONNX is fixed batch_size=1. For higher throughput on large batches, re-export with dynamic axes and run a single batched session.run() — kept simple here for clarity. """ so = ort.SessionOptions() so.intra_op_num_threads = 4 sess = ort.InferenceSession(str(ONNX_PATH), sess_options=so, providers=["CPUExecutionProvider"]) results = [] for p in crop_paths: img = Image.open(p) batch = preprocess(img) logits = sess.run(None, {sess.get_inputs()[0].name: batch})[0] probs = softmax(logits)[0] idx = int(np.argmax(probs)) results.append({ "label": CLASSES[idx], "confidence": float(probs[idx]), }) return results if __name__ == "__main__": if len(sys.argv) < 2: print("usage: python inference_example.py [ ...]") sys.exit(1) for path in sys.argv[1:]: result = classify(path) print(f"{path}: {result['label']} (confidence={result['confidence']:.3f})")