|
|
import axengine as axe |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
from transformers import AutoProcessor |
|
|
from io import BytesIO |
|
|
import httpx |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the HF processor (tokenizer + image preprocessor) from the local directory.
processor = AutoProcessor.from_pretrained('./tokenizer/')

# Fetch a sample COCO image. The original wrapped this in httpx.stream() only
# to read the whole body anyway; a plain GET is equivalent and simpler.
# raise_for_status() fails loudly on a bad response instead of handing an
# HTML error page to PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = httpx.get(url)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

texts = ["a photo of 2 cats", "a photo of 2 dogs"]

# return_tensors="np" yields NumPy arrays directly, removing the implicit
# torch dependency the "pt" variant needed just to convert back to NumPy.
# padding="max_length" is required: the text .axmodel expects a fixed-length
# token sequence.
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="np")
|
|
|
|
|
# Load the two compiled SigLIP2 sub-models (AX650 NPU format). Renamed from
# the misleading onnx_* — these are .axmodel sessions, not ONNX — and the
# placeholder-free f-strings are now plain literals.
vision_encoder = axe.InferenceSession('./ax650/siglip2-base-patch16-224_vision.axmodel')
text_encoder = axe.InferenceSession('./ax650/siglip2-base-patch16-224_text.axmodel')

# np.asarray is a no-op for NumPy inputs and still converts torch tensors if
# the processor was run with return_tensors="pt".
image_features = vision_encoder.run(None, {'image': np.asarray(inputs.pixel_values)})[0]

# Encode prompts one at a time — presumably the text .axmodel is compiled for
# a fixed batch size of 1 (TODO confirm against the model's compile config).
text_features = []
for i in range(inputs.input_ids.shape[0]):
    ids = np.asarray([inputs.input_ids[i]], dtype=np.int32)
    text_features.append(text_encoder.run(None, {'text': ids})[0])
text_features = np.array([t[0] for t in text_features])

# L2-normalize both embedding sets so the dot product below is cosine similarity.
image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
text_features /= np.linalg.norm(text_features, axis=-1, keepdims=True)

# Checkpoint-specific SigLIP head constants; logit_scale is stored as a log,
# hence the exp() when applying it.
logit_scale = np.array(4.7244534)
logit_bias = np.array(-16.771725)

logits_per_text = text_features @ image_features.T
logits_per_text = logits_per_text * np.exp(logit_scale) + logit_bias
logits_per_image = logits_per_text.T

# SigLIP scores each (image, text) pair with an independent sigmoid — no
# softmax across prompts.
probs = 1 / (1 + np.exp(-logits_per_image))

print(probs)
# Report every prompt, not just the first one as the original did.
for idx, text in enumerate(texts):
    print(f"{probs[0][idx]:.1%} that image 0 is '{text}'")
|
|
|