"""SigLIP2 zero-shot image/text matching with exported ONNX encoders.

Downloads a COCO test image, tokenizes candidate captions, runs the
separately exported vision and text ONNX encoders on CPU, and prints
per-caption match probabilities. SigLIP uses a sigmoid (not softmax)
contrastive head, so each probability is independent.
"""

import onnxruntime as ort
import numpy as np
from PIL import Image
from transformers import AutoProcessor
from io import BytesIO
import httpx

# Load the processor from the local tokenizer folder.
# (Original checkpoint: "google/siglip2-base-patch16-224".)
processor = AutoProcessor.from_pretrained('./tokenizer/')

# Download the test image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with httpx.stream("GET", url) as response:
    # Fail fast on HTTP errors instead of handing an error page to PIL.
    response.raise_for_status()
    image = Image.open(BytesIO(response.read()))

texts = ["a photo of 2 cats", "a photo of 2 dogs"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

# Run inference: two separate ONNX sessions, one per encoder.
# (Plain string paths — the originals were f-strings with no placeholders.)
onnx_image_encoder = ort.InferenceSession(
    './onnx/siglip2-base-patch16-224_vision.onnx',
    providers=['CPUExecutionProvider'],
)
onnx_text_encoder = ort.InferenceSession(
    './onnx/siglip2-base-patch16-224_text.onnx',
    providers=['CPUExecutionProvider'],
)

image_features = onnx_image_encoder.run(None, {'image': np.array(inputs.pixel_values)})[0]

# The exported text encoder is fed one prompt at a time (batch of 1 per call).
text_features = []
for input_ids in inputs.input_ids:
    text_feature = onnx_text_encoder.run(None, {'text': np.array([input_ids])})[0]
    text_features.append(text_feature)

# L2-normalize both sides so the dot product below is a cosine similarity.
text_features = np.array([t[0] for t in text_features])
image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
text_features /= np.linalg.norm(text_features, axis=-1, keepdims=True)

# Cosine similarity as logits. Scale/bias constants were read from
# model.logit_scale and model.logit_bias of the original checkpoint.
logit_scale = np.array(4.7244534)
logit_bias = np.array(-16.771725)
logits_per_text = np.dot(text_features, image_features.T)
logits_per_text = logits_per_text * np.exp(logit_scale) + logit_bias
logits_per_image = logits_per_text.T

# SigLIP probabilities: independent sigmoid per (image, text) pair.
probs = 1 / (1 + np.exp(-logits_per_image))
print(probs)
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")