#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10,<3.14"
# dependencies = [
#     "coremltools",
#     "open_clip_torch",
#     "transformers",
#     "torch",
#     "torchvision",
#     "pillow",
#     "numpy",
# ]
# ///
"""Convert any open_clip image encoder to Core ML for ANE acceleration.

Used to produce the .mlpackage files that ./embed_media_mobileclip.py loads
when --force-coreml is passed. The indexer also has an inline copy of the fp16
conversion logic for lazy auto-build on first use; this standalone script adds
palettization + correctness verification + benchmarking, suitable for producing
artifacts to publish on HuggingFace.

Tested model coverage:
  - MobileCLIP2-B / dfndr2b → fp16: 0.999 cosine vs PyTorch (drop-in)
  - ViT-B-16-SigLIP2 / webli → fp16: 0.976, 8-bit palettized: 0.966

Untested (expect to work but verify cosine):
  - other MobileCLIP2-* (S0/S2/S3/S4, L-14)
  - other SigLIP/SigLIP2 sizes (S0/S2/S3/S4/L-14)
  - EVA02-*, ViTamin-*, PE-Core-*

Usage:
    # fp16 conversion (default)
    ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2

    # 8-bit palettized — half the disk size, near-identical fidelity
    ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --palettize 8

    # Custom pretrained tag + output dir
    ./convert_to_coreml_mobileclip.py MobileCLIP2-B --pretrained dfndr2b -o ./out

    # Skip the cosine verification + benchmark to convert faster
    ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --no-verify
"""

import argparse
import sys
import time
from pathlib import Path

import numpy as np
import torch
from PIL import Image, ImageDraw
import open_clip
import coremltools as ct


def default_pretrained_for(model_name: str) -> str:
    """Pick a default open_clip pretrained tag from the model name.

    Returns "dfndr2b" for names starting with "MobileCLIP2-", "webli" for any
    name containing "SigLIP" (which includes SigLIP2),
    "merged2b_s8b_b131k" for names containing "EVA02", and "datacompdr"
    for everything else.
    """
    if model_name.startswith("MobileCLIP2-"):
        return "dfndr2b"
    # "SigLIP" is a substring of "SigLIP2", so one check covers both families.
    if "SigLIP" in model_name:
        return "webli"
    if "EVA02" in model_name:
        return "merged2b_s8b_b131k"
    return "datacompdr"


def preprocess_image_size(preprocess) -> int:
    """Return the input edge length taken from the preprocess pipeline.

    Uses the first transform in ``preprocess.transforms`` that has a ``size``
    attribute. An int size is returned as-is; otherwise the first element of
    the size sequence is used. Exits the process if no transform has a size.
    """
    for tf in preprocess.transforms:
        if hasattr(tf, "size"):
            s = tf.size
            return s if isinstance(s, int) else int(s[0])
    sys.exit("could not determine input size from preprocess transform")


def preprocess_normalization(preprocess) -> tuple[float, list[float]]:
    """Derive ct.ImageType scale/bias from the model's Normalize transform.

    For Normalize(mean, std), the math is:
        normalized = (pixel/255 - mean) / std
                   = pixel * (1/(255*std)) + (-mean/std)

    So Core ML's ImageType params are:
        scale = 1 / (255 * std)
        bias  = -mean / std

    Examples:
        SigLIP2 (mean=0.5, std=0.5): scale=2/255, bias=[-1,-1,-1] → [-1, 1]
        MobileCLIP2 (mean=0, std=1): scale=1/255, bias=[0, 0, 0] → [0, 1]
        OpenAI CLIP (mean≈0.48, std≈0.27): scale ≈ 0.0146, bias varies
            → standard CLIP norm

    Getting this wrong silently degrades the embedding. Our SigLIP2 was at
    0.976 cosine vs PyTorch for weeks because we hardcoded the [0,1] mapping
    that worked for MobileCLIP2 but not SigLIP2.

    Returns:
        (scale, bias) where bias has one entry per channel of the transform's
        mean. When no Normalize transform is present, returns the plain [0, 1]
        mapping ``(1/255, [0.0, 0.0, 0.0])``.

    Exits the process when std differs between channels, because a single
    scalar scale cannot express that.
    """
    for tf in preprocess.transforms:
        if type(tf).__name__ == "Normalize":
            mean = list(tf.mean)
            std = list(tf.std)
            # Channel-wise scale/bias. Core ML accepts a single scale +
            # per-channel bias only when std is uniform across channels. For
            # SigLIP2 (std=0.5,0.5,0.5) this works; for OpenAI CLIP (std
            # varies) we'd need a different approach.
            if not all(s == std[0] for s in std):
                sys.exit(f"non-uniform std {std} not supported by ct.ImageType "
                         "(would need per-channel scale)")
            scale = 1.0 / (255.0 * std[0])
            bias = [-m / std[0] for m in mean]
            return scale, bias
    # No Normalize transform → assume [0, 1] direct
    return 1.0 / 255.0, [0.0, 0.0, 0.0]


class L2NormImageEncoder(torch.nn.Module):
    """Wraps an open_clip model so the Core ML output is already L2-normalized.

    Saves a normalization step at search time and matches the convention used
    by Apple's pre-shipped Core ML packages.
    """

    def __init__(self, m):
        super().__init__()
        self.m = m

    def forward(self, x):
        """Encode the image batch and divide by its norm along the last dim."""
        f = self.m.encode_image(x)
        return f / f.norm(dim=-1, keepdim=True)


def _package_size(path: Path) -> int:
    """Return the total size in bytes of all regular files under ``path``."""
    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())


def convert(model_name: str, pretrained: str, output_dir: Path) -> tuple[Path, int]:
    """Trace open_clip image branch + convert to fp16 Core ML. Returns (path, size).

    The model is loaded via open_clip, wrapped in L2NormImageEncoder, traced
    with a zero tensor at the preprocess-derived input size, and converted with
    an image input whose scale/bias come from preprocess_normalization. The
    package is saved as ``<output_dir>/<model_name>_image.mlpackage``;
    ``output_dir`` is created if missing. ``size`` is the package's total
    on-disk size in bytes.
    """
    print(f"[1/3] loading {model_name} ({pretrained}) …", flush=True)
    t0 = time.perf_counter()
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name, pretrained=pretrained)
    model.eval()
    print(f"  {time.perf_counter()-t0:.1f}s", flush=True)

    size = preprocess_image_size(preprocess)
    scale, bias = preprocess_normalization(preprocess)

    print(f"[2/3] tracing at {size}x{size} (input scale={scale:.5f}, bias={bias}) …",
          flush=True)
    t0 = time.perf_counter()
    with torch.no_grad():
        traced = torch.jit.trace(L2NormImageEncoder(model).eval(),
                                 torch.zeros(1, 3, size, size))
    print(f"  {time.perf_counter()-t0:.1f}s", flush=True)

    print("[3/3] converting to Core ML (fp16) …", flush=True)
    t0 = time.perf_counter()
    ml = ct.convert(
        traced,
        inputs=[ct.ImageType(name="image", shape=(1, 3, size, size),
                             scale=scale, bias=bias)],
        outputs=[ct.TensorType(name="embedding")],
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        minimum_deployment_target=ct.target.macOS14,
    )
    print(f"  {time.perf_counter()-t0:.1f}s", flush=True)

    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / f"{model_name}_image.mlpackage"
    ml.save(str(out_path))
    out_size = _package_size(out_path)
    print(f"  saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
    return out_path, out_size


def palettize(src_path: Path, nbits: int) -> tuple[Path, int]:
    """Apply k-means palettization. Returns (path, size).

    Loads the package at ``src_path``, palettizes its weights to ``nbits`` bits
    with a global k-means config, and saves the result next to the source as
    ``<stem>_<nbits>bit.mlpackage``. ``size`` is the new package's total
    on-disk size in bytes.
    """
    # Imported lazily so the submodule is only loaded when palettizing.
    import coremltools.optimize.coreml as cto

    print(f"[palettize] loading {src_path.name} …", flush=True)
    src = ct.models.MLModel(str(src_path), compute_units=ct.ComputeUnit.CPU_ONLY)

    print(f"[palettize] {nbits}-bit k-means clustering (this scales with model depth) …",
          flush=True)
    t0 = time.perf_counter()
    config = cto.OptimizationConfig(
        global_config=cto.OpPalettizerConfig(nbits=nbits, mode="kmeans"),
    )
    compressed = cto.palettize_weights(src, config)
    print(f"  {time.perf_counter()-t0:.1f}s", flush=True)

    out_path = src_path.parent / f"{src_path.stem}_{nbits}bit.mlpackage"
    compressed.save(str(out_path))
    out_size = _package_size(out_path)
    print(f"  saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
    return out_path, out_size


def verify(coreml_path: Path, model_name: str, pretrained: str,
           pytorch_model, preprocess) -> float:
    """Encode a synthetic test image both ways, return cosine similarity.

    The test image (grey background with a green ellipse) is created at the
    input size declared in the Core ML model's spec rather than a fixed
    224x224. The converted model's image input has a fixed shape, so an image
    of any other size would not match it, and the PyTorch preprocess would
    resample it, meaning the two paths would not see the same pixels.

    Args:
        coreml_path: Path to the .mlpackage to check.
        model_name: Unused; kept so existing callers keep working.
        pretrained: Unused; kept so existing callers keep working.
        pytorch_model: Reference model exposing ``encode_image``.
        preprocess: Transform applied to the PIL image for the PyTorch path.

    Returns:
        Cosine similarity between the L2-normalized PyTorch embedding and the
        L2-normalized Core ML output.
    """
    cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
    image_type = cm.get_spec().description.input[0].type.imageType
    width, height = image_type.width, image_type.height

    # Same picture as the historical 224x224 fixture (40 px margin), scaled to
    # the model's input size; at 224 the coordinates are exactly 40 and 184.
    margin_x = round(40 * width / 224)
    margin_y = round(40 * height / 224)
    img = Image.new("RGB", (width, height), (40, 40, 40))
    ImageDraw.Draw(img).ellipse(
        [margin_x, margin_y, width - margin_x, height - margin_y],
        fill=(0, 255, 0))

    with torch.no_grad():
        pt = pytorch_model.encode_image(preprocess(img).unsqueeze(0))
        pt = (pt / pt.norm(dim=-1, keepdim=True))[0].numpy().astype(np.float32)

    # Take the first (only expected) output regardless of its name.
    cm_out = next(iter(cm.predict({"image": img}).values())).squeeze().astype(np.float32)
    # Re-normalize so the dot product below is a true cosine similarity.
    cm_out /= np.linalg.norm(cm_out)
    return float(np.dot(pt, cm_out))


def benchmark(coreml_path: Path, n: int = 200) -> float:
    """Return throughput in images/sec for the converted model on ANE.

    Builds ``n`` solid-colour in-memory images at the width declared in the
    model spec, runs three warm-up predictions, then times one batched
    ``predict`` call over all images.
    """
    cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
    spec = cm.get_spec()
    size = spec.description.input[0].type.imageType.width
    imgs = [Image.new("RGB", (size, size), (i % 255, (i*3) % 255, (i*7) % 255))
            for i in range(n)]
    # Warm-up calls are excluded from the timed section.
    for _ in range(3):
        cm.predict({"image": imgs[0]})
    t0 = time.perf_counter()
    cm.predict([{"image": img} for img in imgs])
    return n / (time.perf_counter() - t0)


def main():
    """Parse CLI arguments, convert, then optionally palettize/verify/benchmark."""
    p = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # First paragraph of the module docstring.
        description=__doc__.split("\n\n")[0],
    )
    p.add_argument("model",
                   help="open_clip model name (e.g. ViT-B-16-SigLIP2, MobileCLIP2-B)")
    p.add_argument("--pretrained", default=None,
                   help="open_clip pretrained tag (auto-detected from model name if omitted)")
    p.add_argument("-o", "--output-dir", type=Path,
                   default=Path.home() / ".cache" / "mobileclip-coreml",
                   help="Where to save the .mlpackage (default: ~/.cache/mobileclip-coreml)")
    p.add_argument("--palettize", type=int, choices=[2, 4, 6, 8], default=None,
                   help="After fp16 conversion, also produce a palettized version "
                        "with this bit-depth. 8-bit ≈ 2x smaller, near-zero quality "
                        "loss (recommended). 6-bit ≈ 2.7x but degrades ViT models. "
                        "4/2-bit only for non-critical layers.")
    p.add_argument("--no-verify", action="store_true",
                   help="Skip cosine-similarity verification vs PyTorch (saves ~30s).")
    p.add_argument("--no-benchmark", action="store_true",
                   help="Skip throughput benchmark (saves ~5s).")
    args = p.parse_args()

    if args.pretrained is None:
        args.pretrained = default_pretrained_for(args.model)
        print(f"[setup] auto-detected pretrained tag: {args.pretrained}", file=sys.stderr)

    fp16_path, fp16_size = convert(args.model, args.pretrained, args.output_dir)

    pal_path, pal_size = (None, 0)
    if args.palettize:
        pal_path, pal_size = palettize(fp16_path, args.palettize)

    if not args.no_verify:
        print("\n[verify] cosine similarity vs PyTorch:", flush=True)
        # Reload PyTorch model once for verification.
        pt_model, _, preprocess = open_clip.create_model_and_transforms(
            args.model, pretrained=args.pretrained)
        pt_model.eval()
        cos_fp16 = verify(fp16_path, args.model, args.pretrained, pt_model, preprocess)
        print(f"  fp16: {cos_fp16:.4f}", flush=True)
        if pal_path is not None:
            cos_pal = verify(pal_path, args.model, args.pretrained, pt_model, preprocess)
            print(f"  {args.palettize}-bit palettized: {cos_pal:.4f} (compounded vs PyTorch)",
                  flush=True)

    if not args.no_benchmark:
        print("\n[benchmark] throughput on ANE (200 in-memory images):", flush=True)
        fps_fp16 = benchmark(fp16_path)
        print(f"  fp16: {fps_fp16:6.1f} img/s", flush=True)
        if pal_path is not None:
            fps_pal = benchmark(pal_path)
            print(f"  {args.palettize}-bit palettized: {fps_pal:6.1f} img/s", flush=True)

    print(f"\n[done] artifacts in {args.output_dir}", flush=True)
    print(f"  fp16: {fp16_path.name} ({fp16_size/1e6:.0f} MB)", flush=True)
    if pal_path is not None:
        print(f"  {args.palettize}-bit palett.: {pal_path.name} ({pal_size/1e6:.0f} MB, "
              f"{fp16_size/pal_size:.1f}x smaller)", flush=True)
    print(f"\nNext: rename one to '{args.model}_image.mlpackage' inside the output dir to "
          f"make embed_media_mobileclip.py use it.", flush=True)


if __name__ == "__main__":
    main()