"""SigLIP2 zero-shot image/text matching with exported ONNX encoders.

Downloads a COCO test image, tokenizes candidate captions, runs the
separately exported vision and text ONNX encoders on CPU, and prints
per-caption match probabilities. SigLIP uses a sigmoid (not softmax)
contrastive head, so each probability is independent.
"""

import onnxruntime as ort
import numpy as np
from PIL import Image
from transformers import AutoProcessor
from io import BytesIO
import httpx

# Load the processor from the local tokenizer folder.
# (Original checkpoint: "google/siglip2-base-patch16-224".)
processor = AutoProcessor.from_pretrained('./tokenizer/')

# Download the test image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with httpx.stream("GET", url) as response:
    # Fail fast on HTTP errors instead of handing an error page to PIL.
    response.raise_for_status()
    image = Image.open(BytesIO(response.read()))

texts = ["a photo of 2 cats", "a photo of 2 dogs"]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

# Run inference: two separate ONNX sessions, one per encoder.
# (Plain string paths — the originals were f-strings with no placeholders.)
onnx_image_encoder = ort.InferenceSession(
    './onnx/siglip2-base-patch16-224_vision.onnx',
    providers=['CPUExecutionProvider'],
)
onnx_text_encoder = ort.InferenceSession(
    './onnx/siglip2-base-patch16-224_text.onnx',
    providers=['CPUExecutionProvider'],
)

image_features = onnx_image_encoder.run(None, {'image': np.array(inputs.pixel_values)})[0]

# The exported text encoder is fed one prompt at a time (batch of 1 per call).
text_features = []
for input_ids in inputs.input_ids:
    text_feature = onnx_text_encoder.run(None, {'text': np.array([input_ids])})[0]
    text_features.append(text_feature)

# L2-normalize both sides so the dot product below is a cosine similarity.
text_features = np.array([t[0] for t in text_features])
image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
text_features /= np.linalg.norm(text_features, axis=-1, keepdims=True)

# Cosine similarity as logits. Scale/bias constants were read from
# model.logit_scale and model.logit_bias of the original checkpoint.
logit_scale = np.array(4.7244534)
logit_bias = np.array(-16.771725)
logits_per_text = np.dot(text_features, image_features.T)
logits_per_text = logits_per_text * np.exp(logit_scale) + logit_bias
logits_per_image = logits_per_text.T

# SigLIP probabilities: independent sigmoid per (image, text) pair.
probs = 1 / (1 + np.exp(-logits_per_image))
print(probs)
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")