Molbap HF Staff

Upload folder using huggingface_hub

e199518 verified 9 days ago

6.05 kB

	"""Compatibility + forward-parity sweep over the most-downloaded image models (one per architecture).

	For each model: load its processor + model, decide whether the kernel can stand in for the
	processor's resize(+crop)+normalize, and for supported ones run processor-vs-kernel pixel_values
	through the SAME vision tower and report pixel + feature parity. Unsupported models list the reason.

	Run on the DGX (CUDA + working transformers):
	PYTHONPATH=../torch-ext python compat_check.py
	"""

	import sys

	# This transformers worktree constructs kernels.LayerRepository without a version/revision, which
	# newer `kernels` rejects at import. We do not need hub LAYER kernels for preprocessing, so hide
	# `kernels` from transformers — it falls back to its no-hub-kernels stub path and imports cleanly.
	# A None entry in sys.modules makes any later `import kernels` fail with an ImportError, so this
	# assignment has to run before transformers is imported (main() imports transformers lazily).
	sys.modules["kernels"] = None

	import torch

	from kernel_image_resize import resize_normalize # local package, via PYTHONPATH=../torch-ext


	# Integer PIL resampling codes (the processor's `resample` value) mapped to the interpolation
	# name handed to resize_normalize. A processor whose code is not a key here is reported as
	# unsupported by unsupported_reason().
	_PIL_RESAMPLE = {2: "bilinear", 3: "bicubic"}

	# Top image models by HF downloads (June 2026), deduplicated to one repo per architecture family.
	# Each entry is (hub repo id, number); the trailing comment names the architecture family.
	# The number is presumably the download count behind the ranking — main() only reads the repo id.
	MODELS = [
	("openai/clip-vit-base-patch32", 20528683), # clip
	("google/vit-base-patch16-224", 4910416), # vit
	("apple/mobilevit-small", 3488074), # mobilevit
	("facebook/dinov2-small", 2602780), # dinov2
	("google/siglip-so400m-patch14-384", 1379598), # siglip
	("facebook/dinov3-vitb16-pretrain-lvd1689m", 467337), # dinov3
	("microsoft/swinv2-tiny-patch4-window16-256", 385713), # swinv2
	("google/siglip2-base-patch16-224", 336824), # siglip2
	("microsoft/resnet-50", 307057), # resnet (convnext processor)
	("nvidia/segformer-b0-finetuned-ade-512-512", 262459), # segformer
	("facebook/convnextv2-tiny-22k-384", 48614), # convnextv2
	("google/mobilenet_v2_1.0_224", 48342), # mobilenet
	("facebook/convnext-tiny-224", 16984), # convnext
	("google/efficientnet-b0", 8577), # efficientnet
	("microsoft/beit-base-patch16-224-pt22k-ft22k", 7529), # beit
	]


	def unsupported_reason(p):
	    """Return None if the kernel can stand in for this processor, else a short reason.

	    The processor's configuration attributes are checked in a fixed order and the
	    first failing check determines the reason. Attributes that are missing fall
	    back to permissive defaults (resize and normalize on, no channel flip, no pad,
	    resample code 2, empty size, no center crop).

	    Args:
	        p: Image processor object whose attributes describe its preprocessing.

	    Returns:
	        None when every check passes, otherwise a short human-readable string
	        naming the first unsupported setting.
	    """
	    if not getattr(p, "do_resize", True):
	        return "no resize"
	    if not getattr(p, "do_normalize", True):
	        return "no normalize (rescale only)"
	    if getattr(p, "do_flip_channel_order", False):
	        return "channel flip (BGR)"
	    if getattr(p, "do_pad", False):
	        return "pad"

	    # A resample value that cannot be converted to an int (e.g. None) is reported
	    # as unsupported instead of raising out of the compatibility check.
	    resample = getattr(p, "resample", 2)
	    try:
	        known_filter = int(resample) in _PIL_RESAMPLE
	    except (TypeError, ValueError):
	        known_filter = False
	    if not known_filter:
	        return f"resample {resample}"

	    size = getattr(p, "size", {}) or {}
	    crop = p.crop_size if getattr(p, "do_center_crop", False) else None
	    if "shortest_edge" in size:
	        # Shortest-edge resizing keeps the aspect ratio, so only a center crop
	        # yields a fixed output size.
	        return None if crop else "shortest_edge without crop (variable output)"
	    if "height" in size and "width" in size:
	        return None
	    return f"size {size}"


	def preprocess_with_kernel(p, images):
	    """Preprocess ``images`` with resize_normalize, configured from image processor ``p``.

	    Reads ``size``, ``resample``, ``image_mean`` and ``image_std`` from ``p``, plus
	    ``rescale_factor`` (when ``do_rescale`` is on or absent), ``crop_size`` (when
	    ``do_center_crop`` is on) and the optional ``antialias`` flag (default True).

	    Three call shapes are used:
	      * ``size`` has "shortest_edge": resize_mode="shortest_edge" with the crop size.
	      * ``size`` has height/width and a center crop of a different size:
	        resize_mode="square" with the crop size.
	      * otherwise: plain resize to (height, width), no crop arguments.

	    Args:
	        p: Image processor object; expected to have passed unsupported_reason().
	        images: Images forwarded unchanged to resize_normalize.

	    Returns:
	        Whatever resize_normalize returns for the chosen call shape.

	    Raises:
	        KeyError: if ``p.resample`` is not a code in _PIL_RESAMPLE.
	        ValueError: if ``size`` uses "shortest_edge" but no center crop is configured.
	    """
	    size = p.size
	    resample = _PIL_RESAMPLE[int(p.resample)]
	    antialias = bool(getattr(p, "antialias", True))
	    # A factor of 1.0 stands in for "no rescale" when the processor disables it.
	    rescale = float(p.rescale_factor) if getattr(p, "do_rescale", True) else 1.0
	    mean, std = p.image_mean, p.image_std
	    crop = p.crop_size if getattr(p, "do_center_crop", False) else None
	    crop_hw = None if crop is None else (crop["height"], crop["width"])
	    common = {"rescale_factor": rescale, "resample": resample, "antialias": antialias}

	    if "shortest_edge" in size:
	        if crop_hw is None:
	            # Fail with a clear message instead of subscripting None below.
	            raise ValueError(
	                "processor size uses 'shortest_edge' but no center crop is configured; "
	                "the output size would vary per image"
	            )
	        return resize_normalize(
	            images, size["shortest_edge"], mean, std,
	            crop_size=crop_hw, resize_mode="shortest_edge", **common)

	    target_hw = (size["height"], size["width"])
	    # A crop equal to the resize target would be a no-op, so it is not passed on.
	    if crop_hw is not None and crop_hw != target_hw:
	        return resize_normalize(
	            images, target_hw, mean, std,
	            crop_size=crop_hw, resize_mode="square", **common)
	    return resize_normalize(images, target_hw, mean, std, **common)


	def vision_features(model, pixel_values):
	    """Run ``pixel_values`` through the model's vision tower and return one feature tensor.

	    The tower is ``model.vision_model`` when that attribute exists, otherwise the
	    model itself. The input is cast to ``model.dtype`` before the call. From the
	    output, the first tensor-valued attribute among ``pooler_output`` and
	    ``last_hidden_state`` is returned; if neither is a tensor, ``output[0]`` is.
	    """
	    encoder = getattr(model, "vision_model", model)
	    outputs = encoder(pixel_values=pixel_values.to(model.dtype))

	    # torch.is_tensor(None) is False, so a missing attribute falls through.
	    pooled = getattr(outputs, "pooler_output", None)
	    if torch.is_tensor(pooled):
	        return pooled
	    hidden = getattr(outputs, "last_hidden_state", None)
	    if torch.is_tensor(hidden):
	        return hidden
	    return outputs[0]


	def main():
	    """Sweep MODELS and print one report line per model.

	    For every repo id in MODELS the image processor is loaded and checked with
	    unsupported_reason(). Unsupported processors print a SKIP line with the reason.
	    For supported ones the model is loaded, the same random images are preprocessed
	    by the processor and by the kernel, and the line reports the max absolute
	    pixel difference, the max absolute feature difference from vision_features(),
	    and that feature difference relative to the largest reference feature magnitude.
	    Any exception for a model is reported as an ERROR line and the sweep continues.
	    """
	    from transformers import AutoImageProcessor, AutoModel  # lazy: avoids importing the kernels lib first

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    # Random uint8 images of shape (3, h, w) in several sizes and aspect ratios.
	    images = [
	        torch.randint(0, 256, (3, h, w), dtype=torch.uint8, device=device)
	        for h, w in [(640, 480), (800, 600), (512, 512), (384, 1024)]
	    ]
	    print(f"{'model':46s} {'verdict':10s} pixel max|Δ| feature max|Δ| (rel)")
	    for model_id, _ in MODELS:
	        model = reference_pv = kernel_pv = base = None
	        try:
	            processor = AutoImageProcessor.from_pretrained(model_id)
	            reason = unsupported_reason(processor)
	            if reason is not None:
	                print(f"{model_id:46s} SKIP: {reason}")
	                continue
	            model = AutoModel.from_pretrained(model_id).to(device).eval()
	            reference_pv = processor(images, return_tensors="pt", device=device)["pixel_values"].to(device)
	            kernel_pv = preprocess_with_kernel(processor, images)
	            pixel_delta = (kernel_pv - reference_pv).abs().max().item()
	            with torch.no_grad():
	                base = vision_features(model, reference_pv)
	                feat_delta = (vision_features(model, kernel_pv) - base).abs().max().item()
	            base_peak = base.abs().max().item()
	            # All-zero reference features would divide by zero; report NaN instead.
	            rel = feat_delta / base_peak if base_peak else float("nan")
	            print(f"{model_id:46s} OK {pixel_delta:.2e} {feat_delta:.2e} ({rel:.1%})")
	        except Exception as e:
	            # Best-effort sweep: report the failure briefly and move on to the next model.
	            print(f"{model_id:46s} ERROR: {type(e).__name__}: {str(e)[:55]}")
	        finally:
	            # Drop the model and tensors on every path (including errors) so the
	            # next model is not loaded while the previous one is still referenced.
	            model = reference_pv = kernel_pv = base = None
	            torch.cuda.empty_cache()


	if __name__ == "__main__":
	    main()