# /// script
# requires-python = ">=3.10"
# dependencies = ["torch", "triton", "kernels", "transformers", "torchvision"]
# ///
"""Drop-in: use the kernel as the resize+normalize stage of a transformers fast processor.

There is no `use_kernels=True` hook for image processors (that machinery swaps nn.Module
layer forwards inside the model, not processor code). So the usable path is to read the
processor's config and call the kernel directly. `preprocess_with_kernel` below is the
whole adapter — copy it into your code.

Run on a CUDA box:

    python example_transformers.py
"""

import torch
from kernels import get_kernel
from transformers import AutoImageProcessor, AutoModel

# NOTE(review): revision="main" together with trust_remote_code=True follows a moving
# branch of remote code; consider pinning a commit for reproducible runs.
kernel_image_resize = get_kernel("Molbap/kernel_image_resize", revision="main", trust_remote_code=True)

# Integer resample codes (as read from `processor.resample`) -> filter names passed to the kernel.
# NOTE(review): in PIL's numbering 0 is NEAREST, yet it maps to "bilinear" here —
# confirm this fallback is intended.
_PIL_RESAMPLE = {0: "bilinear", 2: "bilinear", 3: "bicubic"}


def preprocess_with_kernel(processor, images):
    """Run the kernel using `processor`'s own config; returns pixel_values like processor(images).

    Handles fixed-size resize, square-resize + center-crop, and shortest-edge resize +
    center-crop (CLIP / DINOv2). Does not handle padding processors.

    Args:
        processor: Image processor whose `size`, `resample`, `image_mean` and `image_std`
            are read, plus `rescale_factor` / `crop_size` when the matching `do_rescale` /
            `do_center_crop` flag is set. Missing `do_*` flags and `antialias` fall back to
            the defaults used below.
        images: Forwarded unchanged as the first argument of
            `kernel_image_resize.resize_normalize`.

    Returns:
        The result of `kernel_image_resize.resize_normalize` for the derived arguments.

    Raises:
        ValueError: If the processor pads, does not normalize, flips channel order, uses a
            resample filter missing from `_PIL_RESAMPLE`, uses shortest-edge resize without
            a center crop, or has a `size` without `height`/`width` (and no `shortest_edge`).
    """
    size = processor.size

    # Reject configurations the kernel cannot reproduce before doing any work.
    if getattr(processor, "do_pad", False):
        raise ValueError("kernel does not pad; this processor needs a pad step")
    if not getattr(processor, "do_normalize", True):
        raise ValueError("processor does not normalize (rescale only); kernel always normalizes")
    if getattr(processor, "do_flip_channel_order", False):
        raise ValueError("processor flips channels to BGR; kernel keeps RGB")

    resample_code = int(processor.resample)
    try:
        resample = _PIL_RESAMPLE[resample_code]
    except KeyError:
        # A bare `KeyError: 1` says nothing useful; report it like the other unsupported configs.
        raise ValueError(
            f"unsupported resample filter {resample_code}; kernel supports {sorted(_PIL_RESAMPLE)}"
        ) from None

    antialias = bool(getattr(processor, "antialias", True))
    # A rescale factor of 1.0 stands in for "no rescale" when the processor disables it.
    rescale = float(processor.rescale_factor) if getattr(processor, "do_rescale", True) else 1.0
    mean, std = processor.image_mean, processor.image_std
    crop = processor.crop_size if getattr(processor, "do_center_crop", False) else None
    common = dict(rescale_factor=rescale, resample=resample, antialias=antialias)

    if "shortest_edge" in size:
        if crop is None:
            raise ValueError("shortest-edge resize without a crop gives variable-size output")
        return kernel_image_resize.resize_normalize(
            images,
            size["shortest_edge"],
            mean,
            std,
            crop_size=(crop["height"], crop["width"]),
            resize_mode="shortest_edge",
            **common,
        )

    try:
        target = (size["height"], size["width"])
    except KeyError:
        # e.g. longest-edge or max-height/max-width schemas, which have no branch here.
        raise ValueError(
            f"unsupported processor size {size!r}; expected 'shortest_edge' or 'height' and 'width'"
        ) from None

    # A crop equal to the resize target is a no-op, so only crop when the sizes differ.
    if crop is not None and (crop["height"] != target[0] or crop["width"] != target[1]):
        return kernel_image_resize.resize_normalize(
            images,
            target,
            mean,
            std,
            crop_size=(crop["height"], crop["width"]),
            resize_mode="square",
            **common,
        )
    return kernel_image_resize.resize_normalize(images, target, mean, std, **common)


def main():
    """Preprocess three differently sized random images with the kernel and compare to the processor."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_id = "google/siglip2-base-patch16-224"
    processor = AutoImageProcessor.from_pretrained(model_id, backend="torchvision")
    model = AutoModel.from_pretrained(model_id).to(device).eval()

    images = [
        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
        for h, w in [(640, 480), (800, 600), (384, 1024)]
    ]
    pixel_values = preprocess_with_kernel(processor, images)
    print(f"{len(images)} ragged images -> pixel_values {tuple(pixel_values.shape)} {pixel_values.dtype}")

    with torch.no_grad():
        features = model.vision_model(pixel_values=pixel_values.to(model.dtype)).pooler_output
    print(f"vision features: {tuple(features.shape)}")

    # parity vs the real processor (float-vs-uint8 resize -> small, expected gap)
    reference = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
    print(f"max|Δ| pixel_values vs processor: {(pixel_values - reference).abs().max().item():.2e}")


if __name__ == "__main__":
    main()