| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """Convert any open_clip image encoder to Core ML for ANE acceleration. |
| |
| Used to produce the .mlpackage files that ./embed_media_mobileclip.py loads |
| when --force-coreml is passed. The indexer also has an inline copy of the |
| fp16 conversion logic for lazy auto-build on first use; this standalone |
| script adds palettization + correctness verification + benchmarking, suitable |
| for producing artifacts to publish on HuggingFace. |
| |
| Tested model coverage: |
| - MobileCLIP2-B / dfndr2b → fp16: 0.999 cosine vs PyTorch (drop-in) |
| - ViT-B-16-SigLIP2 / webli → fp16: 0.976, 8-bit palettized: 0.966 |
| |
| Untested (expect to work but verify cosine): |
| - other MobileCLIP2-* (S0/S2/S3/S4, L-14) |
| - other SigLIP/SigLIP2 sizes (S0/S2/S3/S4/L-14) |
| - EVA02-*, ViTamin-*, PE-Core-* |
| |
| Usage: |
| # fp16 conversion (default) |
| ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 |
| |
| # 8-bit palettized — half the disk size, near-identical fidelity |
| ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --palettize 8 |
| |
| # Custom pretrained tag + output dir |
| ./convert_to_coreml_mobileclip.py MobileCLIP2-B --pretrained dfndr2b -o ./out |
| |
| # Skip the cosine verification + benchmark to convert faster |
| ./convert_to_coreml_mobileclip.py ViT-B-16-SigLIP2 --no-verify --no-benchmark |
| """ |
|
|
| import argparse |
| import sys |
| import time |
| from pathlib import Path |
|
|
| import numpy as np |
| import torch |
| from PIL import Image, ImageDraw |
|
|
| import open_clip |
| import coremltools as ct |
|
|
|
|
def default_pretrained_for(model_name: str) -> str:
    """Pick the open_clip pretrained tag to use when the caller gave none.

    The decision is made from the model name alone:

    * names starting with ``MobileCLIP2-``  -> ``dfndr2b``
    * names containing ``SigLIP``           -> ``webli``
    * names containing ``EVA02``            -> ``merged2b_s8b_b131k``
    * anything else                         -> ``datacompdr``
    """
    if model_name.startswith("MobileCLIP2-"):
        return "dfndr2b"

    # Substring rules, checked in order. "SigLIP" also matches every
    # "SigLIP2" name, so a single entry covers both families.
    substring_rules = (
        ("SigLIP", "webli"),
        ("EVA02", "merged2b_s8b_b131k"),
    )
    for marker, tag in substring_rules:
        if marker in model_name:
            return tag

    return "datacompdr"
|
|
|
|
def preprocess_image_size(preprocess) -> int:
    """Return the input edge length taken from the preprocess pipeline.

    Scans ``preprocess.transforms`` in order and uses the first transform
    that exposes a ``size`` attribute. An ``int`` size is returned as-is;
    otherwise the first element of the size sequence is converted to ``int``.

    Exits the process (``sys.exit``) when no transform carries a ``size``.
    """
    sized = next((tf for tf in preprocess.transforms if hasattr(tf, "size")), None)
    if sized is None:
        sys.exit("could not determine input size from preprocess transform")

    dims = sized.size
    if isinstance(dims, int):
        return dims
    return int(dims[0])
|
|
|
|
def preprocess_normalization(preprocess) -> tuple[float, list[float]]:
    """Translate the pipeline's Normalize transform into ct.ImageType params.

    Normalize(mean, std) computes, per channel::

        normalized = (pixel / 255 - mean) / std
                   = pixel * (1 / (255 * std)) + (-mean / std)

    which maps onto Core ML's image preprocessing as::

        scale = 1 / (255 * std)
        bias  = -mean / std

    Worked examples:
        mean=0.5, std=0.5 (SigLIP2)   -> scale=2/255, bias=[-1, -1, -1]  (range [-1, 1])
        mean=0,   std=1 (MobileCLIP2) -> scale=1/255, bias=[0, 0, 0]     (range [0, 1])

    ``scale`` is a single number, so every channel must share the same std;
    a Normalize with differing per-channel std values makes this function
    exit via ``sys.exit``. When the pipeline has no transform whose class is
    named ``Normalize``, the plain [0, 1] mapping (scale=1/255, zero bias)
    is returned.

    Getting these values wrong does not fail loudly; it only lowers the
    cosine similarity against PyTorch. An earlier hardcoded [0, 1] mapping
    suited MobileCLIP2 but held SigLIP2 at 0.976.
    """
    # Match on the class name so this module needs no torchvision import.
    normalize = next(
        (tf for tf in preprocess.transforms if type(tf).__name__ == "Normalize"),
        None,
    )
    if normalize is None:
        return 1.0 / 255.0, [0.0, 0.0, 0.0]

    channel_means = list(normalize.mean)
    channel_stds = list(normalize.std)
    shared_std = channel_stds[0]
    if any(s != shared_std for s in channel_stds):
        sys.exit(f"non-uniform std {channel_stds} not supported by ct.ImageType "
                 "(would need per-channel scale)")

    return 1.0 / (255.0 * shared_std), [-m / shared_std for m in channel_means]
|
|
|
|
class L2NormImageEncoder(torch.nn.Module):
    """Image-only wrapper whose output embedding is already unit length.

    Baking the L2 normalization into the exported graph means consumers of
    the Core ML model do not have to normalize at search time, and it
    matches the convention used by Apple's pre-shipped Core ML packages.
    """

    def __init__(self, m):
        """Store the wrapped model; ``m`` must provide ``encode_image``."""
        super().__init__()
        self.m = m

    def forward(self, x):
        """Encode ``x`` and divide each embedding by its L2 norm over the last axis."""
        features = self.m.encode_image(x)
        lengths = features.norm(dim=-1, keepdim=True)
        return features / lengths
|
|
|
|
def convert(model_name: str, pretrained: str, output_dir: Path) -> tuple[Path, int]:
    """Export the image branch of an open_clip model as a Core ML package.

    Steps: load the model, trace it (wrapped so the output is L2-normalized)
    on a zero image of the model's input size, convert the trace with an
    image input whose scale/bias reproduce the model's normalization, and
    save it as ``<output_dir>/<model_name>_image.mlpackage``.

    Precision is whatever ``ct.convert`` defaults to for this deployment
    target; the progress output labels it fp16.

    Returns:
        ``(package_path, total_size_in_bytes)`` where the size is the sum of
        all files inside the saved package directory.
    """
    def report_elapsed(started: float) -> None:
        # Shared "took N seconds" line printed after each stage.
        print(f" {time.perf_counter()-started:.1f}s", flush=True)

    # Stage 1: load weights and the matching preprocessing pipeline.
    print(f"[1/3] loading {model_name} ({pretrained}) …", flush=True)
    started = time.perf_counter()
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name, pretrained=pretrained)
    model.eval()
    report_elapsed(started)

    # Stage 2: trace the normalized image encoder at the native input size.
    size = preprocess_image_size(preprocess)
    scale, bias = preprocess_normalization(preprocess)
    print(f"[2/3] tracing at {size}x{size} (input scale={scale:.5f}, bias={bias}) …", flush=True)
    started = time.perf_counter()
    with torch.no_grad():
        encoder = L2NormImageEncoder(model).eval()
        example_input = torch.zeros(1, 3, size, size)
        traced = torch.jit.trace(encoder, example_input)
    report_elapsed(started)

    # Stage 3: convert; normalization moves into the Core ML image input.
    print("[3/3] converting to Core ML (fp16) …", flush=True)
    started = time.perf_counter()
    image_input = ct.ImageType(name="image", shape=(1, 3, size, size),
                               scale=scale, bias=bias)
    embedding_output = ct.TensorType(name="embedding")
    ml = ct.convert(
        traced,
        inputs=[image_input],
        outputs=[embedding_output],
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        minimum_deployment_target=ct.target.macOS14,
    )
    report_elapsed(started)

    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / f"{model_name}_image.mlpackage"
    ml.save(str(out_path))

    # An .mlpackage is a directory, so its size is the sum of its files.
    package_files = (entry for entry in out_path.rglob("*") if entry.is_file())
    out_size = sum(entry.stat().st_size for entry in package_files)
    print(f" saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
    return out_path, out_size
|
|
|
|
def palettize(src_path: Path, nbits: int) -> tuple[Path, int]:
    """Write a k-means palettized copy of an existing Core ML package.

    The source package at ``src_path`` is loaded, its weights are palettized
    to ``nbits`` bits, and the result is saved next to the source as
    ``<source stem>_<nbits>bit.mlpackage``.

    Returns:
        ``(package_path, total_size_in_bytes)`` where the size is the sum of
        all files inside the saved package directory.
    """
    import coremltools.optimize.coreml as cto

    print(f"[palettize] loading {src_path.name} …", flush=True)
    source_model = ct.models.MLModel(str(src_path), compute_units=ct.ComputeUnit.CPU_ONLY)

    print(f"[palettize] {nbits}-bit k-means clustering (this scales with model depth) …", flush=True)
    started = time.perf_counter()
    op_config = cto.OpPalettizerConfig(nbits=nbits, mode="kmeans")
    compressed = cto.palettize_weights(
        source_model, cto.OptimizationConfig(global_config=op_config))
    print(f" {time.perf_counter()-started:.1f}s", flush=True)

    target_name = f"{src_path.stem}_{nbits}bit.mlpackage"
    out_path = src_path.parent / target_name
    compressed.save(str(out_path))

    # An .mlpackage is a directory, so its size is the sum of its files.
    package_files = (entry for entry in out_path.rglob("*") if entry.is_file())
    out_size = sum(entry.stat().st_size for entry in package_files)
    print(f" saved → {out_path} ({out_size/1e6:.1f} MB)", flush=True)
    return out_path, out_size
|
|
|
|
def verify(coreml_path: Path, model_name: str, pretrained: str,
           pytorch_model, preprocess) -> float:
    """Encode one synthetic image with PyTorch and Core ML; return their cosine.

    The test image (grey background, green ellipse) is created at the input
    size recorded in the Core ML model spec rather than a fixed 224x224.
    The converted model takes a fixed-size image, so a hard-coded 224 only
    works for 224-pixel models; other sizes would hand Core ML a wrong-sized
    image. For a 224-pixel model the image is identical to the previous
    hard-coded one.

    Args:
        coreml_path: Path to the .mlpackage to check.
        model_name: Unused; kept so existing callers keep working.
        pretrained: Unused; kept so existing callers keep working.
        pytorch_model: Reference model providing ``encode_image``.
        preprocess: Callable turning a PIL image into the model's input tensor.

    Returns:
        Cosine similarity between the two L2-normalized embeddings.
    """
    cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)

    # Read the expected image size from the model (same lookup benchmark()
    # uses). Fall back to the historical 224 if the spec reports no size.
    image_type = cm.get_spec().description.input[0].type.imageType
    width = int(image_type.width) or 224
    height = int(image_type.height) or 224

    # Keep the original 40-pixel-in-224 margin, scaled to the actual size.
    margin_x = round(width * 40 / 224)
    margin_y = round(height * 40 / 224)
    img = Image.new("RGB", (width, height), (40, 40, 40))
    ImageDraw.Draw(img).ellipse(
        [margin_x, margin_y, width - margin_x, height - margin_y],
        fill=(0, 255, 0))

    # Reference embedding from PyTorch, normalized to unit length.
    with torch.no_grad():
        pt = pytorch_model.encode_image(preprocess(img).unsqueeze(0))
        pt = (pt / pt.norm(dim=-1, keepdim=True))[0].numpy().astype(np.float32)

    # Core ML embedding: take the model's first output and re-normalize it
    # in float32 so both vectors are unit length before the dot product.
    cm_out = next(iter(cm.predict({"image": img}).values())).squeeze().astype(np.float32)
    cm_out /= np.linalg.norm(cm_out)
    return float(np.dot(pt, cm_out))
|
|
|
|
def benchmark(coreml_path: Path, n: int = 200) -> float:
    """Measure images/sec for one batched predict call over ``n`` images.

    The model is loaded with CPU + Neural Engine compute units. ``n``
    solid-colour images are generated at the width recorded in the model
    spec, three single-image warm-up predictions are run, and then one
    ``predict`` call over the whole batch is timed.
    """
    cm = ct.models.MLModel(str(coreml_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
    size = cm.get_spec().description.input[0].type.imageType.width

    def solid_image(i: int):
        # A different flat colour per index so the inputs are not identical.
        return Image.new("RGB", (size, size), (i % 255, (i*3) % 255, (i*7) % 255))

    imgs = [solid_image(i) for i in range(n)]

    # Warm-up runs are excluded from the measurement.
    for _ in range(3):
        cm.predict({"image": imgs[0]})

    started = time.perf_counter()
    cm.predict([{"image": img} for img in imgs])
    elapsed = time.perf_counter() - started
    return n / elapsed
|
|
|
|
def main():
    """Command-line entry point: convert, optionally palettize, verify, benchmark.

    Always produces the converted package via ``convert``. ``--palettize N``
    additionally writes an N-bit palettized copy. Cosine verification against
    PyTorch and the throughput benchmark both run by default and are disabled
    separately with ``--no-verify`` and ``--no-benchmark``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Only the first paragraph of the module docstring is the description.
        description=__doc__.split("\n\n")[0],
    )
    parser.add_argument(
        "model",
        help="open_clip model name (e.g. ViT-B-16-SigLIP2, MobileCLIP2-B)")
    parser.add_argument(
        "--pretrained", default=None,
        help="open_clip pretrained tag (auto-detected from model name if omitted)")
    parser.add_argument(
        "-o", "--output-dir", type=Path,
        default=Path.home() / ".cache" / "mobileclip-coreml",
        help="Where to save the .mlpackage (default: ~/.cache/mobileclip-coreml)")
    parser.add_argument(
        "--palettize", type=int, choices=[2, 4, 6, 8], default=None,
        help="After fp16 conversion, also produce a palettized version "
             "with this bit-depth. 8-bit ≈ 2x smaller, near-zero quality "
             "loss (recommended). 6-bit ≈ 2.7x but degrades ViT models. "
             "4/2-bit only for non-critical layers.")
    parser.add_argument(
        "--no-verify", action="store_true",
        help="Skip cosine-similarity verification vs PyTorch (saves ~30s).")
    parser.add_argument(
        "--no-benchmark", action="store_true",
        help="Skip throughput benchmark (saves ~5s).")
    args = parser.parse_args()

    # Resolve the pretrained tag once; the notice goes to stderr.
    pretrained = args.pretrained
    if pretrained is None:
        pretrained = default_pretrained_for(args.model)
        print(f"[setup] auto-detected pretrained tag: {pretrained}", file=sys.stderr)

    fp16_path, fp16_size = convert(args.model, pretrained, args.output_dir)

    pal_path, pal_size = None, 0
    if args.palettize:
        pal_path, pal_size = palettize(fp16_path, args.palettize)
    have_palettized = pal_path is not None

    run_verify = not args.no_verify
    if run_verify:
        print("\n[verify] cosine similarity vs PyTorch:", flush=True)
        # A fresh PyTorch model serves as the reference for every artifact.
        pt_model, _, preprocess = open_clip.create_model_and_transforms(
            args.model, pretrained=pretrained)
        pt_model.eval()
        cos_fp16 = verify(fp16_path, args.model, pretrained, pt_model, preprocess)
        print(f" fp16: {cos_fp16:.4f}", flush=True)
        if have_palettized:
            cos_pal = verify(pal_path, args.model, pretrained, pt_model, preprocess)
            print(f" {args.palettize}-bit palettized: {cos_pal:.4f} (compounded vs PyTorch)", flush=True)

    run_benchmark = not args.no_benchmark
    if run_benchmark:
        print("\n[benchmark] throughput on ANE (200 in-memory images):", flush=True)
        fps_fp16 = benchmark(fp16_path)
        print(f" fp16: {fps_fp16:6.1f} img/s", flush=True)
        if have_palettized:
            fps_pal = benchmark(pal_path)
            print(f" {args.palettize}-bit palettized: {fps_pal:6.1f} img/s", flush=True)

    # Final summary of what was written and what to do with it.
    print(f"\n[done] artifacts in {args.output_dir}", flush=True)
    print(f" fp16: {fp16_path.name} ({fp16_size/1e6:.0f} MB)", flush=True)
    if have_palettized:
        print(f" {args.palettize}-bit palett.: {pal_path.name} ({pal_size/1e6:.0f} MB, "
              f"{fp16_size/pal_size:.1f}x smaller)", flush=True)
    print(f"\nNext: rename one to '{args.model}_image.mlpackage' inside the output dir to "
          f"make embed_media_mobileclip.py use it.", flush=True)
|
|
|
|
# Run the converter only when this file is executed as a script, so that
# importing the module does not trigger a conversion.
if __name__ == "__main__":
    main()
|
|