"""Compatibility + forward-parity sweep over the most-downloaded image models (one per architecture).

For each model: load its processor + model, decide whether the kernel can stand in for the
processor's resize(+crop)+normalize, and for supported ones run processor-vs-kernel pixel_values
through the SAME vision tower and report pixel + feature parity. Unsupported models list the reason.

Run on the DGX (CUDA + working transformers):
    PYTHONPATH=../torch-ext python compat_check.py
"""
|
|
import sys

# A ``None`` entry in ``sys.modules`` makes every later ``import kernels`` raise
# ImportError. The override is kept ahead of the remaining imports, matching the
# original statement order.
# NOTE(review): presumably this stops downstream libraries from picking up an
# installed ``kernels`` package during the sweep -- confirm the intent.
sys.modules["kernels"] = None

import torch  # noqa: E402  (must follow the sys.modules override)

from kernel_image_resize import resize_normalize  # noqa: E402
|
|
|
|
# Pillow resampling filter codes accepted by this script (2 = BILINEAR,
# 3 = BICUBIC), mapped to the interpolation name handed to ``resize_normalize``.
_PIL_RESAMPLE = {2: "bilinear", 3: "bicubic"}

# Checkpoints to sweep, as (Hugging Face model id, count) pairs listed in
# descending order of count.
# NOTE(review): the count looks like the download figure used to rank the
# models -- confirm. The sweep itself only reads the model id.
MODELS = [
    ("openai/clip-vit-base-patch32", 20528683),
    ("google/vit-base-patch16-224", 4910416),
    ("apple/mobilevit-small", 3488074),
    ("facebook/dinov2-small", 2602780),
    ("google/siglip-so400m-patch14-384", 1379598),
    ("facebook/dinov3-vitb16-pretrain-lvd1689m", 467337),
    ("microsoft/swinv2-tiny-patch4-window16-256", 385713),
    ("google/siglip2-base-patch16-224", 336824),
    ("microsoft/resnet-50", 307057),
    ("nvidia/segformer-b0-finetuned-ade-512-512", 262459),
    ("facebook/convnextv2-tiny-22k-384", 48614),
    ("google/mobilenet_v2_1.0_224", 48342),
    ("facebook/convnext-tiny-224", 16984),
    ("google/efficientnet-b0", 8577),
    ("microsoft/beit-base-patch16-224-pt22k-ft22k", 7529),
]
|
|
|
|
def unsupported_reason(p):
    """Return None if the kernel can stand in for this processor, else a short reason.

    Only attributes of ``p`` are inspected; missing attributes fall back to the
    defaults used below (resize and normalize on, no channel flip, no padding,
    resample code 2, empty size, no centre crop).

    Args:
        p: Image processor object exposing the usual ``do_*``, ``resample``,
            ``size`` and ``crop_size`` attributes.

    Returns:
        ``None`` when every check passes, otherwise a short human-readable
        string naming the first unsupported feature found.
    """
    if not getattr(p, "do_resize", True):
        return "no resize"
    if not getattr(p, "do_normalize", True):
        return "no normalize (rescale only)"
    if getattr(p, "do_flip_channel_order", False):
        return "channel flip (BGR)"
    if getattr(p, "do_pad", False):
        return "pad"

    # A resample value that cannot be read as an integer code (e.g. None) is
    # reported as unsupported instead of raising out of this predicate.
    resample = getattr(p, "resample", 2)
    try:
        resample_code = int(resample)
    except (TypeError, ValueError):
        return f"resample {resample}"
    if resample_code not in _PIL_RESAMPLE:
        return f"resample {resample}"

    size = getattr(p, "size", {}) or {}
    # The crop only counts when centre-cropping is switched on; a processor
    # without a ``crop_size`` attribute is treated as having no crop.
    crop = getattr(p, "crop_size", None) if getattr(p, "do_center_crop", False) else None
    if "shortest_edge" in size:
        return None if crop else "shortest_edge without crop (variable output)"
    if "height" in size and "width" in size:
        return None
    return f"size {size}"
|
|
|
|
def preprocess_with_kernel(p, images):
    """Run ``resize_normalize`` with settings read from image processor ``p``.

    Three call shapes are used, chosen from ``p.size`` and the active crop:

    * ``shortest_edge`` size: resize by shortest edge, then crop to ``crop_size``.
    * ``height``/``width`` size with a crop of a different shape: resize in
      ``"square"`` mode, then crop.
    * otherwise: resize straight to ``(height, width)`` with no crop argument.

    Args:
        p: Image processor supplying ``size``, ``resample``, ``image_mean``,
            ``image_std`` and, optionally, ``antialias``, ``do_rescale`` /
            ``rescale_factor`` and ``do_center_crop`` / ``crop_size``.
        images: Input images, passed to ``resize_normalize`` unchanged.

    Returns:
        Whatever ``resize_normalize`` returns for the selected call.

    Raises:
        KeyError: If ``p.resample`` is not a code listed in ``_PIL_RESAMPLE``
            or a required size/crop key is missing.
        ValueError: If ``p.size`` uses ``shortest_edge`` but no centre crop is
            configured.
    """
    size = p.size
    mean, std = p.image_mean, p.image_std
    crop = getattr(p, "crop_size", None) if getattr(p, "do_center_crop", False) else None
    common = dict(
        # With rescaling disabled the kernel receives a neutral factor of 1.0.
        rescale_factor=float(p.rescale_factor) if getattr(p, "do_rescale", True) else 1.0,
        resample=_PIL_RESAMPLE[int(p.resample)],
        antialias=bool(getattr(p, "antialias", True)),
    )

    if "shortest_edge" in size:
        if not crop:
            # Fail with a clear message instead of subscripting a missing crop.
            raise ValueError("shortest_edge resize requires do_center_crop with a crop_size")
        return resize_normalize(
            images, size["shortest_edge"], mean, std,
            crop_size=(crop["height"], crop["width"]), resize_mode="shortest_edge", **common)

    target = (size["height"], size["width"])
    if crop is not None:
        crop_hw = (crop["height"], crop["width"])
        if crop_hw != target:
            return resize_normalize(
                images, target, mean, std,
                crop_size=crop_hw, resize_mode="square", **common)
    # No crop, or a crop equal to the resize target: skip the crop argument.
    return resize_normalize(images, target, mean, std, **common)
|
|
|
|
def vision_features(model, pixel_values):
    """Run ``pixel_values`` through the model's vision tower and return one tensor.

    The tower is ``model.vision_model`` when that attribute exists, otherwise
    ``model`` itself. The input is cast to ``model.dtype`` before the call.

    Args:
        model: Object exposing ``dtype`` and either a callable ``vision_model``
            attribute or being callable itself with a ``pixel_values`` keyword.
        pixel_values: Tensor fed to the tower.

    Returns:
        The tower output if it is already a tensor; otherwise its
        ``pooler_output`` if that is a tensor, else its ``last_hidden_state``
        if that is a tensor, else the first element of the output.
    """
    tower = getattr(model, "vision_model", model)
    out = tower(pixel_values=pixel_values.to(model.dtype))
    # A bare tensor must be returned whole; indexing it with [0] would silently
    # keep only the first sample.
    if torch.is_tensor(out):
        return out
    for attr in ("pooler_output", "last_hidden_state"):
        value = getattr(out, attr, None)
        if torch.is_tensor(value):
            return value
    return out[0]
|
|
|
|
def main():
    """Sweep every entry of ``MODELS`` and print one result line per model.

    For each model id the processor is loaded and checked with
    ``unsupported_reason``. Unsupported processors print a ``SKIP`` line.
    Supported ones load the model, preprocess the same random images with both
    the processor and the kernel, and print the maximum absolute pixel
    difference, the maximum absolute feature difference, and that feature
    difference relative to the largest reference feature magnitude. Any
    exception for a model is reported as an ``ERROR`` line and the sweep
    continues with the next model.
    """
    from transformers import AutoImageProcessor, AutoModel

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Random uint8 (3, h, w) images: two portrait, one square, one wide.
    images = [
        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
        for h, w in [(640, 480), (800, 600), (512, 512), (384, 1024)]
    ]
    print(f"{'model':46s} {'verdict':10s} pixel max|Δ| feature max|Δ| (rel)")
    for model_id, _ in MODELS:
        model = None
        try:
            processor = AutoImageProcessor.from_pretrained(model_id)
            reason = unsupported_reason(processor)
            if reason is not None:
                print(f"{model_id:46s} SKIP: {reason}")
                continue
            model = AutoModel.from_pretrained(model_id).to(device).eval()
            reference_pv = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
            kernel_pv = preprocess_with_kernel(processor, images)
            pixel_delta = (kernel_pv - reference_pv).abs().max().item()
            with torch.no_grad():
                base = vision_features(model, reference_pv)
                feat_delta = (vision_features(model, kernel_pv) - base).abs().max().item()
            # An all-zero reference has no meaningful relative error; report
            # NaN rather than failing the model with a ZeroDivisionError.
            base_peak = base.abs().max().item()
            rel = feat_delta / base_peak if base_peak else float("nan")
            print(f"{model_id:46s} OK {pixel_delta:.2e} {feat_delta:.2e} ({rel:.1%})")
        except Exception as e:
            # Top-level boundary of the sweep: report and move on to the next model.
            print(f"{model_id:46s} ERROR: {type(e).__name__}: {str(e)[:55]}")
        finally:
            # Release the model on the error path too, so a failed checkpoint
            # is not still resident while the next one loads.
            if model is not None:
                model = None
                torch.cuda.empty_cache()
|
|
|
|
# Run the sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|