# ViT-B-16-SigLIP2-Image-CoreML / example_usage.py
# batmac — uploaded via huggingface_hub (commit 424bd46, verified)
#!/usr/bin/env python3
"""Minimal end-to-end example: image-to-image similarity with this model.
Loads the 8-bit Core ML SigLIP2 image encoder, embeds two images, prints
cosine similarity. ANE-accelerated on Apple Silicon.
pip install coremltools pillow numpy
python example_usage.py path/to/image1.jpg path/to/image2.jpg
"""
import sys
from pathlib import Path
import coremltools as ct
import numpy as np
from PIL import Image
# Pick whichever variant suits you β€” see README "Available variants" table.
DEFAULT_MODEL = "ViT-B-16-SigLIP2_image_8bit.mlpackage"
def embed(model, image_path: Path) -> np.ndarray:
    """Embed one image: PIL β†’ 224Γ—224 RGB β†’ Core ML predict β†’ L2-normalized embedding.

    Args:
        model: Loaded ``coremltools.models.MLModel`` (any object whose
            ``predict`` accepts ``{"image": PIL.Image}`` works).
        image_path: Path to an image file readable by Pillow.

    Returns:
        1-D float32 embedding vector, L2-normalized. If the raw embedding is
        the zero vector it is returned unscaled instead of dividing by zero
        (which would yield a NaN-filled vector).
    """
    img = Image.open(image_path).convert("RGB").resize((224, 224), Image.BICUBIC)
    out = model.predict({"image": img})
    # The model exposes a single output; take it without hard-coding its name.
    emb = next(iter(out.values()))[0].astype(np.float32)
    # Model already L2-normalizes internally; this is belt-and-suspenders.
    # Guard the degenerate zero-norm case so we never divide by zero.
    norm = float(np.linalg.norm(emb))
    return emb / norm if norm > 0.0 else emb
def main():
    """CLI entry point: embed two images and print their cosine similarity."""
    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} <image1> <image2>")
    path_a = Path(sys.argv[1])
    path_b = Path(sys.argv[2])
    model_path = Path(__file__).parent / DEFAULT_MODEL
    if not model_path.exists():
        sys.exit(f"model not found: {model_path}")
    print(f"loading {model_path.name} on ANE …")
    model = ct.models.MLModel(str(model_path), compute_units=ct.ComputeUnit.CPU_AND_NE)
    print(f"embedding {path_a.name} + {path_b.name} …")
    vec_a = embed(model, path_a)
    vec_b = embed(model, path_b)
    similarity = float(np.dot(vec_a, vec_b))
    print(f"\ncosine similarity: {similarity:.4f}")
    # Map the score onto a coarse human-readable verdict, highest band first.
    bands = (
        (0.7, " β†’ very similar (likely same scene/subject)"),
        (0.4, " β†’ moderately similar"),
        (0.2, " β†’ loosely related"),
    )
    for threshold, verdict in bands:
        if similarity > threshold:
            print(verdict)
            break
    else:
        print(" β†’ unrelated")
# Standard script guard: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()