# File size: 6,047 Bytes
#
# e199518

"""Compatibility + forward-parity sweep over the most-downloaded image models (one per architecture).

For each model: load its processor + model, decide whether the kernel can stand in for the
processor's resize(+crop)+normalize, and for supported ones run processor-vs-kernel pixel_values
through the SAME vision tower and report pixel + feature parity. Unsupported models list the reason.

Run on the DGX (CUDA + working transformers):
    PYTHONPATH=../torch-ext python compat_check.py
"""

import sys

# This transformers worktree constructs kernels.LayerRepository without a version/revision, which
# newer `kernels` rejects at import. We do not need hub LAYER kernels for preprocessing, so hide
# `kernels` from transformers — it falls back to its no-hub-kernels stub path and imports cleanly.
sys.modules["kernels"] = None

import torch

from kernel_image_resize import resize_normalize  # local package, via PYTHONPATH=../torch-ext


# Maps the integer value of a processor's `resample` attribute to the resample-mode name that is
# passed to `resize_normalize`. The keys match Pillow's filter codes (BILINEAR = 2, BICUBIC = 3);
# a processor whose resample code is not listed here is reported as unsupported.
_PIL_RESAMPLE = {2: "bilinear", 3: "bicubic"}

# Top image models by HF downloads (June 2026), deduplicated to one repo per architecture family.
# Each entry is (hub repo id, integer count — presumably the download count the ordering is based
# on). The sweep only reads the repo id; the trailing comment names the architecture family.
MODELS = [
    ("openai/clip-vit-base-patch32", 20528683),                  # clip
    ("google/vit-base-patch16-224", 4910416),                    # vit
    ("apple/mobilevit-small", 3488074),                          # mobilevit
    ("facebook/dinov2-small", 2602780),                          # dinov2
    ("google/siglip-so400m-patch14-384", 1379598),               # siglip
    ("facebook/dinov3-vitb16-pretrain-lvd1689m", 467337),        # dinov3
    ("microsoft/swinv2-tiny-patch4-window16-256", 385713),       # swinv2
    ("google/siglip2-base-patch16-224", 336824),                 # siglip2
    ("microsoft/resnet-50", 307057),                             # resnet (convnext processor)
    ("nvidia/segformer-b0-finetuned-ade-512-512", 262459),       # segformer
    ("facebook/convnextv2-tiny-22k-384", 48614),                 # convnextv2
    ("google/mobilenet_v2_1.0_224", 48342),                      # mobilenet
    ("facebook/convnext-tiny-224", 16984),                       # convnext
    ("google/efficientnet-b0", 8577),                            # efficientnet
    ("microsoft/beit-base-patch16-224-pt22k-ft22k", 7529),       # beit
]


def unsupported_reason(p):
    """Return None if the kernel can stand in for this processor, else a short reason.

    The checks mirror what ``preprocess_with_kernel`` reads from the processor, so a
    processor that passes here can be handed to it without hitting a missing or
    malformed setting.

    Args:
        p: An image processor object. Only attributes are read: ``do_resize``,
            ``do_normalize``, ``do_flip_channel_order``, ``do_pad``, ``resample``,
            ``size``, ``do_center_crop`` and ``crop_size``. Missing boolean flags
            fall back to the defaults used below.

    Returns:
        ``None`` when every check passes, otherwise a short human-readable string
        naming the first check that failed.
    """
    if not getattr(p, "do_resize", True):
        return "no resize"
    if not getattr(p, "do_normalize", True):
        return "no normalize (rescale only)"
    if getattr(p, "do_flip_channel_order", False):
        return "channel flip (BGR)"
    if getattr(p, "do_pad", False):
        return "pad"

    # A missing, None or non-integer resample is reported as a reason rather than
    # raised: preprocess_with_kernel needs int(p.resample) to be a known code.
    resample = getattr(p, "resample", None)
    try:
        resample_known = int(resample) in _PIL_RESAMPLE
    except (TypeError, ValueError):
        resample_known = False
    if not resample_known:
        return f"resample {resample}"

    size = getattr(p, "size", {}) or {}
    crop = p.crop_size if getattr(p, "do_center_crop", False) else None
    # preprocess_with_kernel indexes crop["height"] / crop["width"]; reject other layouts here.
    if crop and not ("height" in crop and "width" in crop):
        return f"crop_size {crop}"
    if "shortest_edge" in size:
        # Without a centre crop the output shape would depend on each input's aspect ratio.
        return None if crop else "shortest_edge without crop (variable output)"
    if "height" in size and "width" in size:
        return None
    return f"size {size}"


def preprocess_with_kernel(p, images):
    """Run ``resize_normalize`` on ``images`` with settings read from processor ``p``.

    Three call shapes are used, chosen from ``p.size`` and the centre-crop settings:

    * ``size`` has ``shortest_edge``: resize with ``resize_mode="shortest_edge"`` and
      crop to ``crop_size`` (a crop is required on this path).
    * ``size`` has ``height``/``width`` and an enabled crop of a different shape:
      resize with ``resize_mode="square"`` and crop to ``crop_size``.
    * otherwise: resize straight to ``(height, width)`` with no crop arguments.

    Args:
        p: Image processor; ``size``, ``resample``, ``image_mean`` and ``image_std``
            are read directly, the remaining settings via ``getattr`` with defaults.
        images: Forwarded unchanged as the first argument of ``resize_normalize``.

    Returns:
        Whatever ``resize_normalize`` returns.
    """
    target = p.size
    mode_name = _PIL_RESAMPLE[int(p.resample)]
    use_antialias = bool(getattr(p, "antialias", True))
    # With rescaling disabled the factor collapses to the identity 1.0.
    scale = float(p.rescale_factor) if getattr(p, "do_rescale", True) else 1.0
    mean = p.image_mean
    std = p.image_std
    crop_cfg = p.crop_size if getattr(p, "do_center_crop", False) else None
    shared = {"rescale_factor": scale, "resample": mode_name, "antialias": use_antialias}

    if "shortest_edge" in target:
        resize_to = target["shortest_edge"]
        mode = "shortest_edge"
    else:
        # A crop that matches the resize target changes nothing, so it is skipped.
        crop_is_noop = crop_cfg is None or (
            crop_cfg["height"] == target["height"] and crop_cfg["width"] == target["width"]
        )
        resize_to = (target["height"], target["width"])
        if crop_is_noop:
            return resize_normalize(images, resize_to, mean, std, **shared)
        mode = "square"

    return resize_normalize(
        images, resize_to, mean, std,
        crop_size=(crop_cfg["height"], crop_cfg["width"]), resize_mode=mode, **shared)


def vision_features(model, pixel_values):
    """Run ``pixel_values`` through the model's vision tower and return one feature tensor.

    The tower is ``model.vision_model`` when that attribute exists, otherwise the model
    itself. The input is cast to ``model.dtype`` before the call.

    Returns:
        The first of ``pooler_output`` / ``last_hidden_state`` on the tower output that
        is a tensor; if neither is, element ``0`` of the output.
    """
    encoder = getattr(model, "vision_model", model)
    cast_input = pixel_values.to(model.dtype)
    outputs = encoder(pixel_values=cast_input)

    # Lazily probe the preferred attributes in order; None is never a tensor, so a
    # single is_tensor test covers both "missing" and "not a tensor".
    candidates = (getattr(outputs, name, None) for name in ("pooler_output", "last_hidden_state"))
    chosen = next((c for c in candidates if torch.is_tensor(c)), None)
    return outputs[0] if chosen is None else chosen


def main():
    """Run the sweep and print one result line per entry in ``MODELS``.

    For each repo id the processor is loaded and checked with ``unsupported_reason``.
    Unsupported processors print a ``SKIP`` line. Supported ones load the model, build
    ``pixel_values`` once with the processor and once with ``preprocess_with_kernel``,
    and print the max absolute pixel difference plus the max absolute (and relative)
    difference of ``vision_features`` on the two inputs. Any exception for a model is
    reported as an ``ERROR`` line and the sweep continues with the next model.
    """
    from transformers import AutoImageProcessor, AutoModel  # lazy: avoids importing the kernels lib first

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Random uint8 (3, h, w) images in several sizes and aspect ratios.
    images = [
        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
        for h, w in [(640, 480), (800, 600), (512, 512), (384, 1024)]
    ]
    print(f"{'model':46s} {'verdict':10s} pixel max|Δ|   feature max|Δ| (rel)")
    for model_id, _ in MODELS:
        # Pre-bind so the cleanup in `finally` is valid on every exit path.
        model = reference_pv = kernel_pv = base = None
        try:
            processor = AutoImageProcessor.from_pretrained(model_id)
            reason = unsupported_reason(processor)
            if reason is not None:
                print(f"{model_id:46s} SKIP: {reason}")
                continue
            model = AutoModel.from_pretrained(model_id).to(device).eval()
            reference_pv = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
            kernel_pv = preprocess_with_kernel(processor, images)
            pixel_delta = (kernel_pv - reference_pv).abs().max().item()
            with torch.no_grad():
                base = vision_features(model, reference_pv)
                feat_delta = (vision_features(model, kernel_pv) - base).abs().max().item()
            base_max = base.abs().max().item()
            # All-zero reference features have no meaningful relative error; report NaN
            # instead of letting a ZeroDivisionError turn a valid run into an ERROR line.
            rel = feat_delta / base_max if base_max > 0 else float("nan")
            print(f"{model_id:46s} OK         {pixel_delta:.2e}      {feat_delta:.2e} ({rel:.1%})")
        except Exception as e:
            # Deliberately broad: this is the per-model boundary of the sweep, and one
            # failing model must not stop the remaining ones from being reported.
            print(f"{model_id:46s} ERROR: {type(e).__name__}: {str(e)[:55]}")
        finally:
            # Release the model and its tensors on failure paths too, so a model that
            # errored does not stay resident while the next one is being loaded.
            model = reference_pv = kernel_pv = base = None
            if device.type == "cuda":
                torch.cuda.empty_cache()


# Script entry point: run the sweep only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()