kernel_image_resize / benchmarks /compat_check.py
Molbap's picture
Molbap HF Staff
Upload folder using huggingface_hub
e199518 verified
Raw
History Blame Contribute Delete
6.05 kB
"""Compatibility + forward-parity sweep over the most-downloaded image models (one per architecture).
For each model: load its processor + model, decide whether the kernel can stand in for the
processor's resize(+crop)+normalize, and for supported ones run processor-vs-kernel pixel_values
through the SAME vision tower and report pixel + feature parity. Unsupported models list the reason.
Run on the DGX (CUDA + working transformers):
PYTHONPATH=../torch-ext python compat_check.py
"""
import sys
# This transformers worktree constructs kernels.LayerRepository without a version/revision, which
# newer `kernels` rejects at import. We do not need hub LAYER kernels for preprocessing, so hide
# `kernels` from transformers — it falls back to its no-hub-kernels stub path and imports cleanly.
sys.modules["kernels"] = None
import torch
from kernel_image_resize import resize_normalize # local package, via PYTHONPATH=../torch-ext
# PIL resampling filter id -> interpolation name handed to the kernel as `resample`
# (in Pillow, 2 is BILINEAR and 3 is BICUBIC). A processor whose filter id is not a key
# here is reported as unsupported by unsupported_reason().
_PIL_RESAMPLE = {2: "bilinear", 3: "bicubic"}
# Top image models by HF downloads (June 2026), deduplicated to one repo per architecture family.
# Each entry is (hub repo id, number); the number is presumably the download count the list is
# ranked by. main() only uses the repo id. The trailing comment names the architecture family.
MODELS = [
("openai/clip-vit-base-patch32", 20528683), # clip
("google/vit-base-patch16-224", 4910416), # vit
("apple/mobilevit-small", 3488074), # mobilevit
("facebook/dinov2-small", 2602780), # dinov2
("google/siglip-so400m-patch14-384", 1379598), # siglip
("facebook/dinov3-vitb16-pretrain-lvd1689m", 467337), # dinov3
("microsoft/swinv2-tiny-patch4-window16-256", 385713), # swinv2
("google/siglip2-base-patch16-224", 336824), # siglip2
("microsoft/resnet-50", 307057), # resnet (convnext processor)
("nvidia/segformer-b0-finetuned-ade-512-512", 262459), # segformer
("facebook/convnextv2-tiny-22k-384", 48614), # convnextv2
("google/mobilenet_v2_1.0_224", 48342), # mobilenet
("facebook/convnext-tiny-224", 16984), # convnext
("google/efficientnet-b0", 8577), # efficientnet
("microsoft/beit-base-patch16-224-pt22k-ft22k", 7529), # beit
]
def unsupported_reason(p):
    """Return None if the kernel can stand in for this processor, else a short reason.

    Checks run in a fixed order and the first one that fails decides the reason:
    resize and normalize must both be enabled, channel flipping and padding must be
    off, the resampling filter must be a key of ``_PIL_RESAMPLE``, and ``size`` must be
    either ``shortest_edge`` together with an enabled center crop, or an explicit
    ``height``/``width`` pair.

    Args:
        p: Image processor object. Flags are read with ``getattr`` so that a missing
            attribute falls back to the default used in the corresponding check.

    Returns:
        ``None`` when the processor is supported, otherwise a short human-readable
        reason string.
    """
    if not getattr(p, "do_resize", True):
        return "no resize"
    if not getattr(p, "do_normalize", True):
        return "no normalize (rescale only)"
    if getattr(p, "do_flip_channel_order", False):
        return "channel flip (BGR)"
    if getattr(p, "do_pad", False):
        return "pad"

    resample = getattr(p, "resample", 2)
    try:
        resample_supported = int(resample) in _PIL_RESAMPLE
    except (TypeError, ValueError):
        # The attribute exists but is unset (None) or not integer-like. Report it as an
        # unsupported filter instead of letting the conversion error escape.
        resample_supported = False
    if not resample_supported:
        return f"resample {resample}"

    size = getattr(p, "size", {}) or {}
    # A crop only counts when center cropping is enabled; a processor that enables it
    # without defining crop_size is treated the same as having no crop.
    crop = getattr(p, "crop_size", None) if getattr(p, "do_center_crop", False) else None
    if "shortest_edge" in size:
        # Without a crop the output shape follows the input aspect ratio.
        return None if crop else "shortest_edge without crop (variable output)"
    if "height" in size and "width" in size:
        return None
    return f"size {size}"
def preprocess_with_kernel(p, images):
    """Call ``resize_normalize`` on ``images`` with settings read from processor ``p``.

    Three call shapes are used, depending on the processor configuration:

    * ``size`` has ``shortest_edge``: resize by shortest edge and crop to ``crop_size``.
    * ``size`` is ``height``/``width`` and an enabled center crop has a different
      shape: resize to ``(height, width)`` in ``"square"`` mode, then crop.
    * otherwise: resize straight to ``(height, width)`` with no crop argument.

    The ``shortest_edge`` path indexes the crop unconditionally, so callers are
    expected to have screened ``p`` with ``unsupported_reason`` first.

    Args:
        p: Image processor providing ``size``, ``resample``, ``image_mean``,
            ``image_std`` and, where enabled, ``rescale_factor`` / ``crop_size``.
        images: Passed through unchanged as the first argument of ``resize_normalize``.

    Returns:
        Whatever ``resize_normalize`` returns.
    """
    target = p.size
    interpolation = _PIL_RESAMPLE[int(p.resample)]
    use_antialias = bool(getattr(p, "antialias", True))
    # A factor of 1.0 stands in for "rescaling disabled".
    scale = 1.0
    if getattr(p, "do_rescale", True):
        scale = float(p.rescale_factor)
    mean, std = p.image_mean, p.image_std
    crop = None
    if getattr(p, "do_center_crop", False):
        crop = p.crop_size
    shared = {"rescale_factor": scale, "resample": interpolation, "antialias": use_antialias}

    if "shortest_edge" in target:
        crop_hw = (crop["height"], crop["width"])
        return resize_normalize(
            images, target["shortest_edge"], mean, std,
            crop_size=crop_hw, resize_mode="shortest_edge", **shared)

    # A crop with the same shape as the resize target is a no-op, so it is dropped.
    if crop is None or (crop["height"] == target["height"] and crop["width"] == target["width"]):
        return resize_normalize(images, (target["height"], target["width"]), mean, std, **shared)

    return resize_normalize(
        images, (target["height"], target["width"]), mean, std,
        crop_size=(crop["height"], crop["width"]), resize_mode="square", **shared)
def vision_features(model, pixel_values):
    """Run ``pixel_values`` through the model's vision tower and return one feature tensor.

    The tower is ``model.vision_model`` when that attribute exists, otherwise the model
    itself. The input is cast to ``model.dtype`` before the call.

    Args:
        model: Object exposing ``dtype`` and, optionally, ``vision_model``.
        pixel_values: Tensor passed to the tower as the ``pixel_values`` keyword.

    Returns:
        The first of ``pooler_output`` / ``last_hidden_state`` on the output that is a
        tensor; if neither qualifies, the output's element at index 0.
    """
    encoder = getattr(model, "vision_model", model)
    outputs = encoder(pixel_values=pixel_values.to(model.dtype))
    # Lazy generator: attributes are probed in preference order and probing stops at
    # the first usable tensor.
    candidates = (
        getattr(outputs, name, None) for name in ("pooler_output", "last_hidden_state")
    )
    chosen = next(
        (c for c in candidates if c is not None and torch.is_tensor(c)),
        None,
    )
    return outputs[0] if chosen is None else chosen
def main():
    """Sweep ``MODELS`` and print one result row per model.

    For each repo the processor is loaded and screened with ``unsupported_reason``.
    Unsupported processors print a ``SKIP`` row with the reason. Supported ones load
    the model, preprocess the same random images with both the processor and the
    kernel, and print an ``OK`` row with the maximum absolute pixel difference, the
    maximum absolute feature difference, and that feature difference relative to the
    largest reference feature magnitude. Any exception while handling a model is
    reported as an ``ERROR`` row (message truncated to 55 characters) and the sweep
    moves on to the next model.
    """
    from transformers import AutoImageProcessor, AutoModel  # lazy: avoids importing the kernels lib first

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Four random uint8 test images with different (height, width) shapes.
    images = [
        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
        for h, w in [(640, 480), (800, 600), (512, 512), (384, 1024)]
    ]
    print(f"{'model':46s} {'verdict':10s} pixel max|Δ| feature max|Δ| (rel)")
    model = None
    for model_id, _ in MODELS:
        try:
            processor = AutoImageProcessor.from_pretrained(model_id)
            reason = unsupported_reason(processor)
            if reason is not None:
                print(f"{model_id:46s} SKIP: {reason}")
                continue
            model = AutoModel.from_pretrained(model_id).to(device).eval()
            reference_pv = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
            kernel_pv = preprocess_with_kernel(processor, images)
            pixel_delta = (kernel_pv - reference_pv).abs().max().item()
            with torch.no_grad():
                base = vision_features(model, reference_pv)
                feat_delta = (vision_features(model, kernel_pv) - base).abs().max().item()
            base_max = base.abs().max().item()
            # An all-zero reference has no meaningful relative error; report NaN
            # rather than turning a successful comparison into a ZeroDivisionError.
            rel = feat_delta / base_max if base_max else float("nan")
            print(f"{model_id:46s} OK {pixel_delta:.2e} {feat_delta:.2e} ({rel:.1%})")
        except Exception as e:
            print(f"{model_id:46s} ERROR: {type(e).__name__}: {str(e)[:55]}")
        finally:
            # Release the model on every path (success, skip, error) so a failed model
            # is not still held in memory while the next checkpoint is being loaded.
            model = None
            torch.cuda.empty_cache()


if __name__ == "__main__":
    main()