"""Compatibility and forward-parity sweep over the most-downloaded image models.

One repository is checked per architecture. For each model the script:

1. loads its image processor and model;
2. decides whether the kernel can stand in for the processor's
   resize(+crop)+normalize step;
3. for supported processors, feeds both the processor's and the kernel's
   ``pixel_values`` through the SAME vision tower and reports pixel and
   feature parity.

Unsupported models are listed together with the reason.

Run on the DGX (CUDA + working transformers):

    PYTHONPATH=../torch-ext python compat_check.py
"""
import sys

# This transformers worktree constructs kernels.LayerRepository without a
# version/revision, which newer `kernels` rejects at import. Hub LAYER kernels
# are not needed for preprocessing, so `kernels` is hidden from transformers:
# it then falls back to its no-hub-kernels stub path and imports cleanly.
# This assignment must stay ahead of any import that pulls in transformers.
sys.modules["kernels"] = None

import torch

from kernel_image_resize import resize_normalize  # local package, via PYTHONPATH=../torch-ext

# Resample codes (as stored on the processor) mapped to the kernel's mode names.
_PIL_RESAMPLE = {2: "bilinear", 3: "bicubic"}

# Top image models by HF downloads (June 2026), deduplicated to one repo per architecture family.
# Each entry is (repo id, download count); main() only reads the repo id.
MODELS = [
    ("openai/clip-vit-base-patch32", 20528683),  # clip
    ("google/vit-base-patch16-224", 4910416),  # vit
    ("apple/mobilevit-small", 3488074),  # mobilevit
    ("facebook/dinov2-small", 2602780),  # dinov2
    ("google/siglip-so400m-patch14-384", 1379598),  # siglip
    ("facebook/dinov3-vitb16-pretrain-lvd1689m", 467337),  # dinov3
    ("microsoft/swinv2-tiny-patch4-window16-256", 385713),  # swinv2
    ("google/siglip2-base-patch16-224", 336824),  # siglip2
    ("microsoft/resnet-50", 307057),  # resnet (convnext processor)
    ("nvidia/segformer-b0-finetuned-ade-512-512", 262459),  # segformer
    ("facebook/convnextv2-tiny-22k-384", 48614),  # convnextv2
    ("google/mobilenet_v2_1.0_224", 48342),  # mobilenet
    ("facebook/convnext-tiny-224", 16984),  # convnext
    ("google/efficientnet-b0", 8577),  # efficientnet
    ("microsoft/beit-base-patch16-224-pt22k-ft22k", 7529),  # beit
]


def unsupported_reason(p):
    """Return None if the kernel can stand in for this processor, else a short reason.

    The checks run in a fixed order and the first one that fails decides the
    reason string. Attributes missing from ``p`` fall back to the defaults
    passed to ``getattr`` below (resize and normalize on; channel flip, pad and
    center crop off; resample code 2).

    Args:
        p: Image processor object; only its attributes are read.

    Returns:
        ``None`` when the processor's configuration is supported, otherwise a
        short human-readable reason.
    """
    if not getattr(p, "do_resize", True):
        return "no resize"
    if not getattr(p, "do_normalize", True):
        return "no normalize (rescale only)"
    if getattr(p, "do_flip_channel_order", False):
        return "channel flip (BGR)"
    if getattr(p, "do_pad", False):
        return "pad"
    if int(getattr(p, "resample", 2)) not in _PIL_RESAMPLE:
        return f"resample {p.resample}"

    size = getattr(p, "size", {}) or {}
    crop = p.crop_size if getattr(p, "do_center_crop", False) else None
    if "shortest_edge" in size:
        # A shortest-edge resize is only accepted together with a center crop.
        return None if crop else "shortest_edge without crop (variable output)"
    if "height" in size and "width" in size:
        return None
    return f"size {size}"


def preprocess_with_kernel(p, images):
    """Run ``resize_normalize`` with settings read from processor ``p``.

    Intended for processors that ``unsupported_reason`` accepted. Three call
    shapes are used, chosen from ``p.size`` and the center-crop settings:

    * ``"shortest_edge"`` in size: shortest-edge resize plus center crop;
    * explicit height/width with a crop of a different size: ``"square"``
      resize mode plus center crop;
    * otherwise: resize to the explicit height/width with no crop argument.

    Args:
        p: Image processor providing ``size``, ``resample``, ``image_mean``,
            ``image_std`` and, optionally, ``antialias``, ``do_rescale``,
            ``rescale_factor``, ``do_center_crop`` and ``crop_size``.
        images: Images forwarded unchanged to ``resize_normalize``
            (``main`` passes a list of uint8 ``(3, H, W)`` tensors).

    Returns:
        Whatever ``resize_normalize`` returns; ``main`` uses it as the
        kernel-side ``pixel_values``.

    Raises:
        ValueError: If ``p.size`` uses ``"shortest_edge"`` but no center crop
            is configured.
        KeyError: If ``p.resample`` is not a key of ``_PIL_RESAMPLE``.
    """
    size = p.size
    resample = _PIL_RESAMPLE[int(p.resample)]
    antialias = bool(getattr(p, "antialias", True))
    # With rescaling disabled the factor degenerates to 1.0.
    rescale = float(p.rescale_factor) if getattr(p, "do_rescale", True) else 1.0
    mean, std = p.image_mean, p.image_std
    crop = p.crop_size if getattr(p, "do_center_crop", False) else None
    common = dict(rescale_factor=rescale, resample=resample, antialias=antialias)

    if "shortest_edge" in size:
        if not crop:
            # Previously this fell through to an opaque TypeError on crop["height"].
            raise ValueError(
                "shortest_edge resize needs a center crop (do_center_crop with crop_size)"
            )
        return resize_normalize(
            images,
            size["shortest_edge"],
            mean,
            std,
            crop_size=(crop["height"], crop["width"]),
            resize_mode="shortest_edge",
            **common,
        )

    target = (size["height"], size["width"])
    if crop is not None and (crop["height"] != size["height"] or crop["width"] != size["width"]):
        return resize_normalize(
            images,
            target,
            mean,
            std,
            crop_size=(crop["height"], crop["width"]),
            resize_mode="square",
            **common,
        )
    # No crop, or a crop equal to the resize target: no crop argument is passed.
    return resize_normalize(images, target, mean, std, **common)


def vision_features(model, pixel_values):
    """Run ``pixel_values`` through the model's vision tower and return one tensor.

    The tower is ``model.vision_model`` when that attribute exists, otherwise
    the model itself. From the output, the first of ``pooler_output`` and
    ``last_hidden_state`` that is a tensor is returned; if neither is,
    ``out[0]`` is returned.

    Args:
        model: Object exposing ``dtype`` and, optionally, ``vision_model``.
        pixel_values: Tensor cast to ``model.dtype`` before the forward call.

    Returns:
        The selected output tensor.
    """
    tower = getattr(model, "vision_model", model)
    out = tower(pixel_values=pixel_values.to(model.dtype))
    for attr in ("pooler_output", "last_hidden_state"):
        value = getattr(out, attr, None)
        if torch.is_tensor(value):  # False for None, so no separate None check
            return value
    return out[0]


def main():
    """Sweep ``MODELS`` and print one SKIP / OK / ERROR line per model.

    For every supported model the processor output and the kernel output are
    compared directly (max absolute pixel difference) and after a forward pass
    through the same vision tower (max absolute feature difference, plus that
    difference relative to the largest reference feature magnitude).
    Any exception raised while handling a model is reported on that model's
    line and the sweep continues.
    """
    from transformers import AutoImageProcessor, AutoModel  # lazy: avoids importing the kernels lib first

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    images = [
        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
        for h, w in [(640, 480), (800, 600), (512, 512), (384, 1024)]
    ]
    print(f"{'model':46s} {'verdict':10s} pixel max|Δ| feature max|Δ| (rel)")

    for model_id, _ in MODELS:
        model = None
        try:
            processor = AutoImageProcessor.from_pretrained(model_id)
            reason = unsupported_reason(processor)
            if reason is not None:
                print(f"{model_id:46s} SKIP: {reason}")
                continue

            model = AutoModel.from_pretrained(model_id).to(device).eval()
            reference_pv = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
            kernel_pv = preprocess_with_kernel(processor, images)
            pixel_delta = (kernel_pv - reference_pv).abs().max().item()

            with torch.no_grad():
                base = vision_features(model, reference_pv)
                feat_delta = (vision_features(model, kernel_pv) - base).abs().max().item()

            scale = base.abs().max().item()
            if scale != 0:
                rel = feat_delta / scale
            else:
                # All-zero reference features: avoid ZeroDivisionError so the
                # measured deltas are still reported instead of an ERROR line.
                rel = 0.0 if feat_delta == 0 else float("inf")
            print(f"{model_id:46s} OK {pixel_delta:.2e} {feat_delta:.2e} ({rel:.1%})")
        except Exception as e:
            # Sweep boundary: report the failure for this model and move on.
            print(f"{model_id:46s} ERROR: {type(e).__name__}: {str(e)[:55]}")
        finally:
            # Drop the model on every path (success, skip, error) so a failed
            # model is not still referenced while the next one is loaded.
            model = None
            torch.cuda.empty_cache()


if __name__ == "__main__":
    main()