siglip2-base-patch16-224 / python /axmodel_infer.py

wzf19947

Fix: use LFS for images

1038086 3 days ago

1.85 kB

	import axengine as axe
	import numpy as np
	from PIL import Image
	from transformers import AutoProcessor
	from io import BytesIO
	import httpx

	# Load the text/image processor from the local ./tokenizer/ folder.
	# To fetch it from the Hugging Face Hub instead, use:
	#   ckpt = "google/siglip2-base-patch16-224"
	#   processor = AutoProcessor.from_pretrained(ckpt)
	processor = AutoProcessor.from_pretrained('./tokenizer/')

	# Download the test image. The body is read inside the `with` block,
	# while the streamed response is still open.
	url = "http://images.cocodataset.org/val2017/000000039769.jpg"
	with httpx.stream("GET", url) as response:
		# Surface HTTP failures (4xx/5xx) as an explicit httpx error rather
		# than handing an error page to PIL and getting a decode failure.
		response.raise_for_status()
		image = Image.open(BytesIO(response.read()))

	# Candidate captions that will be scored against the image.
	texts = ["a photo of 2 cats", "a photo of 2 dogs"]
	# NOTE(review): padding="max_length" presumably keeps every prompt at the
	# fixed token length the exported text model expects -- confirm against
	# the model's input shape before changing it.
	inputs = processor(text=texts,images=image,padding="max_length",return_tensors="pt")
	# Run inference: one session per encoder, loaded from ./ax650/.
	onnx_image_encoder = axe.InferenceSession('./ax650/siglip2-base-patch16-224_vision.axmodel')
	onnx_text_encoder = axe.InferenceSession('./ax650/siglip2-base-patch16-224_text.axmodel')
	image_features = onnx_image_encoder.run(None,{'image':np.array(inputs.pixel_values)})[0]

	# Convert the token ids to int32 once (instead of once per prompt), then
	# feed the text encoder a single prompt at a time with a leading batch
	# axis of 1. The first row of each output is kept as that prompt's
	# embedding.
	input_ids = np.asarray(inputs.input_ids).astype(np.int32)
	text_features = np.array([
		onnx_text_encoder.run(None, {'text': ids[np.newaxis, :]})[0][0]
		for ids in input_ids
	])

	# L2-normalise every embedding along the feature axis so that a plain dot
	# product between text and image features is their cosine similarity.
	image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
	text_features /= np.linalg.norm(text_features, axis=-1, keepdims=True)

	# Turn the text/image dot products into logits:
	#   logits = similarity * exp(logit_scale) + logit_bias
	logit_bias = np.array(-16.771725)  # value copied from model.logit_bias
	logit_scale = np.array(4.7244534)  # value copied from model.logit_scale
	similarity = text_features.dot(image_features.T)
	logits_per_text = similarity * np.exp(logit_scale) + logit_bias

	# Transpose so the first axis indexes images and the second indexes
	# texts, then squash each logit through a sigmoid to get a probability.
	logits_per_image = logits_per_text.T
	exp_neg_logits = np.exp(-logits_per_image)
	probs = 1 / (1 + exp_neg_logits)

	print(probs)
	summary = f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'"
	print(summary)