ViT-B-16-SigLIP2-Image-CoreML / convert_to_coreml.py
batmac's picture
Upload folder using huggingface_hub
424bd46 verified
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10,<3.14"
# dependencies = [
# "coremltools",
# "open_clip_torch",
# "transformers",
# "torch",
# "torchvision",
# "pillow",
# "numpy",
# ]
# ///
"""Convert any open_clip image encoder to Core ML for ANE acceleration.
Used to produce the .mlpackage files that ./embed_media_mobileclip.py loads
when --force-coreml is passed. The indexer also has an inline copy of the
fp16 conversion logic for lazy auto-build on first use; this standalone
script adds palettization + correctness verification + benchmarking, suitable
for producing artifacts to publish on HuggingFace.
Tested model coverage:
- MobileCLIP2-B / dfndr2b → fp16: 0.999 cosine vs PyTorch (drop-in)
- ViT-B-16-SigLIP2 / webli → fp16: 0.976, 8-bit palettized: 0.966
Untested (expect to work but verify cosine):
- other MobileCLIP2-* (S0/S2/S3/S4, L-14)
- other SigLIP/SigLIP2 sizes (S0/S2/S3/S4/L-14)
- EVA02-*, ViTamin-*, PE-Core-*
Usage:
# fp16 conversion (default)
./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2
# 8-bit palettized — half the disk size, near-identical fidelity
./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --palettize 8
# Custom pretrained tag + output dir
./convert_to_coreml_mobileclip.py MobileCLIP2-B --pretrained dfndr2b -o ./out
# Skip the cosine verification + benchmark to convert faster
./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --no-verify
"""
import argparse
import sys
import time
from pathlib import Path
import numpy as np
import torch
from PIL import Image, ImageDraw
import open_clip
import coremltools as ct
def default_pretrained_for(model_name: str) -> str:
if model_name.startswith("MobileCLIP2-"):
return "dfndr2b"
if "SigLIP2" in model_name:
return "webli"
if "SigLIP" in model_name:
return "webli"
if "EVA02" in model_name:
return "merged2b_s8b_b131k"
return "datacompdr"
def preprocess_image_size(preprocess) -> int:
for tf in preprocess.transforms:
if hasattr(tf, "size"):
s = tf.size
return s if isinstance(s, int) else int(s[0])
sys.exit("could not determine input size from preprocess transform")
def preprocess_normalization(preprocess) -> tuple[float, list[float]]:
"""Derive ct.ImageType scale/bias from the model's Normalize transform.
For Normalize(mean, std), the math is:
normalized = (pixel/255 - mean) / std
= pixel * (1/(255*std)) + (-mean/std)
So Core ML's ImageType params are:
scale = 1 / (255 * std)
bias = -mean / std
Examples:
SigLIP2 (mean=0.5, std=0.5): scale=2/255, bias=[-1,-1,-1] → [-1, 1]
MobileCLIP2 (mean=0, std=1): scale=1/255, bias=[0, 0, 0] → [0, 1]
OpenAI CLIP (mean≈0.48, std≈0.27): scale ≈ 0.0146, bias varies → standard CLIP norm
Getting this wrong silently degrades the embedding. Our SigLIP2 was at
0.976 cosine vs PyTorch for weeks because we hardcoded the [0,1] mapping
that worked for MobileCLIP2 but not SigLIP2.
"""
for tf in preprocess.transforms:
if type(tf).__name__ == "Normalize":
mean = list(tf.mean)
std = list(tf.std)
# Channel-wise scale/bias. Core ML accepts a single scale + per-channel bias
# only when std is uniform across channels. For SigLIP2 (std=0.5,0.5,0.5)
# this works; for OpenAI CLIP (std varies) we'd need a different approach.
if not all(s == std[0] for s in std):
sys.exit(f"non-uniform std {std} not supported by ct.ImageType "
"(would need per-channel scale)")
scale = 1.0 / (255.0 * std[0])
bias = [-m / std[0] for m in mean]
return scale, bias
# No Normalize transform → assume [0, 1] direct
return 1.0 / 255.0, [0.0, 0.0, 0.0]
class L2NormImageEncoder(torch.nn.Module):
"""Wraps an open_clip model so the Core ML output is already L2-normalized.
Saves a normalization step at search time and matches the convention used
by Apple's pre-shipped Core ML packages.
"""
def __init__(self, m):
super().__init__()
self.m = m
def forward(self, x):
f = self.m.encode_image(x)
return f / f.norm(dim=-1, keepdim=True)
def convert(model_name: str, pretrained: str, output_dir: Path) -> tuple[Path, int]:
"""Trace open_clip image branch + convert to fp16 Core ML. Returns (path, size)."""
print(f"[1/3] loading {model_name} ({pretrained}) …", flush=True)
t0 = time.perf_counter()
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
model.eval()
print(f" {time.perf_counter()-t0:.1f}s", flush=True)
size = preprocess_image_size(preprocess)
scale, bias = preprocess_normalization(preprocess)
print(f"[2/3] tracing at {size}x{size} (input scale={scale:.5f}, bias={bias}) …", flush=True)
t0 = time.perf_counter()
with torch.no_grad():
traced = torch.jit.trace(L2NormImageEncoder(model).eval(),
torch.zeros(1, 3, size, size))
print(f" {time.perf_counter()-t0:.1f}s", flush=True)
print(f"[3/3] converting to Core ML (fp16) …", flush=True)
t0 = time.perf_counter()
ml = ct.convert(
traced,
inputs=[ct.ImageType(name="image", shape=(1, 3, size, size),
scale=scale, bias=bias)],
outputs=[ct.TensorType(name="embedding")],
compute_units=ct.ComputeUnit.CPU_AND_NE,
minimum_deployment_target=ct.target.macOS14,
)
print(f" {time.perf_counter()-t0:.1f}s", flush=True)
output_dir.mkdir(parents=True, exist_ok=True)
out_path = output_dir / f"{model_name}_image.mlpackage"
ml.save(str(out_path))
out_size = sum(f.stat().st_size for f in out_path.rglob("*") if f.is_file())
print(f" saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
return out_path, out_size
def palettize(src_path: Path, nbits: int) -> tuple[Path, int]:
"""Apply k-means palettization. Returns (path, size)."""
import coremltools.optimize.coreml as cto
print(f"[palettize] loading {src_path.name} …", flush=True)
src = ct.models.MLModel(str(src_path), compute_units=ct.ComputeUnit.CPU_ONLY)
print(f"[palettize] {nbits}-bit k-means clustering (this scales with model depth) …", flush=True)
t0 = time.perf_counter()
config = cto.OptimizationConfig(
global_config=cto.OpPalettizerConfig(nbits=nbits, mode="kmeans"),
)
compressed = cto.palettize_weights(src, config)
print(f" {time.perf_counter()-t0:.1f}s", flush=True)
out_path = src_path.parent / f"{src_path.stem}_{nbits}bit.mlpackage"
compressed.save(str(out_path))
out_size = sum(f.stat().st_size for f in out_path.rglob("*") if f.is_file())
print(f" saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
return out_path, out_size
def verify(coreml_path: Path, model_name: str, pretrained: str,
pytorch_model, preprocess) -> float:
"""Encode a synthetic test image both ways, return cosine similarity."""
img = Image.new("RGB", (224, 224), (40, 40, 40))
ImageDraw.Draw(img).ellipse([40, 40, 184, 184], fill=(0, 255, 0))
with torch.no_grad():
pt = pytorch_model.encode_image(preprocess(img).unsqueeze(0))
pt = (pt / pt.norm(dim=-1, keepdim=True))[0].numpy().astype(np.float32)
cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
cm_out = next(iter(cm.predict({"image": img}).values())).squeeze().astype(np.float32)
cm_out /= np.linalg.norm(cm_out)
return float(np.dot(pt, cm_out))
def benchmark(coreml_path: Path, n: int = 200) -> float:
"""Return throughput in images/sec for the converted model on ANE."""
cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
spec = cm.get_spec()
size = spec.description.input[0].type.imageType.width
imgs = [Image.new("RGB", (size, size), (i % 255, (i*3) % 255, (i*7) % 255)) for i in range(n)]
for _ in range(3):
cm.predict({"image": imgs[0]})
t0 = time.perf_counter()
cm.predict([{"image": img} for img in imgs])
return n / (time.perf_counter() - t0)
def main():
p = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__.split("\n\n")[0],
)
p.add_argument("model", help="open_clip model name (e.g. ViT-B-16-SigLIP2, MobileCLIP2-B)")
p.add_argument("--pretrained", default=None,
help="open_clip pretrained tag (auto-detected from model name if omitted)")
p.add_argument("-o", "--output-dir", type=Path,
default=Path.home() / ".cache" / "mobileclip-coreml",
help="Where to save the .mlpackage (default: ~/.cache/mobileclip-coreml)")
p.add_argument("--palettize", type=int, choices=[2, 4, 6, 8], default=None,
help="After fp16 conversion, also produce a palettized version "
"with this bit-depth. 8-bit ≈ 2x smaller, near-zero quality "
"loss (recommended). 6-bit ≈ 2.7x but degrades ViT models. "
"4/2-bit only for non-critical layers.")
p.add_argument("--no-verify", action="store_true",
help="Skip cosine-similarity verification vs PyTorch (saves ~30s).")
p.add_argument("--no-benchmark", action="store_true",
help="Skip throughput benchmark (saves ~5s).")
args = p.parse_args()
if args.pretrained is None:
args.pretrained = default_pretrained_for(args.model)
print(f"[setup] auto-detected pretrained tag: {args.pretrained}", file=sys.stderr)
fp16_path, fp16_size = convert(args.model, args.pretrained, args.output_dir)
pal_path, pal_size = (None, 0)
if args.palettize:
pal_path, pal_size = palettize(fp16_path, args.palettize)
if not args.no_verify:
print(f"\n[verify] cosine similarity vs PyTorch:", flush=True)
# Reload PyTorch model once for verification.
pt_model, _, preprocess = open_clip.create_model_and_transforms(
args.model, pretrained=args.pretrained)
pt_model.eval()
cos_fp16 = verify(fp16_path, args.model, args.pretrained, pt_model, preprocess)
print(f" fp16: {cos_fp16:.4f}", flush=True)
if pal_path is not None:
cos_pal = verify(pal_path, args.model, args.pretrained, pt_model, preprocess)
print(f" {args.palettize}-bit palettized: {cos_pal:.4f} (compounded vs PyTorch)", flush=True)
if not args.no_benchmark:
print(f"\n[benchmark] throughput on ANE (200 in-memory images):", flush=True)
fps_fp16 = benchmark(fp16_path)
print(f" fp16: {fps_fp16:6.1f} img/s", flush=True)
if pal_path is not None:
fps_pal = benchmark(pal_path)
print(f" {args.palettize}-bit palettized: {fps_pal:6.1f} img/s", flush=True)
print(f"\n[done] artifacts in {args.output_dir}", flush=True)
print(f" fp16: {fp16_path.name} ({fp16_size/1e6:.0f} MB)", flush=True)
if pal_path is not None:
print(f" {args.palettize}-bit palett.: {pal_path.name} ({pal_size/1e6:.0f} MB, "
f"{fp16_size/pal_size:.1f}x smaller)", flush=True)
print(f"\nNext: rename one to '{args.model}_image.mlpackage' inside the output dir to "
f"make embed_media_mobileclip.py use it.", flush=True)
if __name__ == "__main__":
main()