import json from PIL import Image import requests from transformers import CLIPProcessor, CLIPModel from transformers.models.clip.modeling_clip import _get_vector_norm import torch import numpy as np import platform import sys import os processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text="two cats on a pink blanket", images=image, return_tensors="pt", padding="max_length", truncation=True) np_inputs = {k: v.numpy() for k, v in inputs.data.items()} class VisionModel(torch.nn.Module): def __init__(self, model): super(VisionModel, self).__init__() self.model = model def forward(self, x): model = self.model vision_outputs = model.vision_model.forward(x) pooled_output = vision_outputs.pooler_output image_features = self.model.visual_projection(pooled_output) image_features = image_features / _get_vector_norm(image_features) return image_features def eval(self): self.model.eval() self.model.vision_model.eval() self.model.visual_projection.eval() return super().eval() class TextModel(torch.nn.Module): def __init__(self, model): super(TextModel, self).__init__() self.model = model def forward(self, input_ids, attention_mask): model = self.model text_outputs = model.text_model.forward(input_ids, attention_mask) pooled_output = text_outputs.pooler_output text_features = self.model.text_projection(pooled_output) text_features = text_features / _get_vector_norm(text_features) return text_features def eval(self): self.model.eval() self.model.text_model.eval() self.model.text_projection.eval() return super().eval() torch.set_grad_enabled(False) ptmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") with torch.no_grad(): vision = VisionModel(ptmodel) vision.eval() traced_vision_model = torch.jit.trace(vision, inputs.data['pixel_values']) text = TextModel(ptmodel) text.eval() traced_text_model = torch.jit.trace(text, (inputs.data['input_ids'], inputs.data['attention_mask'])) def convert_coreml(): import coremltools as ct coreml_model = ct.convert(traced_vision_model, inputs=[ct.TensorType(shape=inputs.data['pixel_values'].shape)]) coreml_model.save('vision.mlpackage') coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)]) coreml_model.save('text.mlpackage') # convert_coreml() def infer_coreml(): import coremltools as ct coreml_vision_model = ct.models.MLModel('vision.mlpackage') coreml_text_model = ct.models.MLModel('text.mlpackage') vision_predictions = coreml_vision_model.predict({'x': np_inputs['pixel_values']}) text_predictions = coreml_text_model.predict({'input_ids_1': np_inputs['input_ids'].astype(np.float32), 'attention_mask_1': np_inputs['attention_mask'].astype(np.float32)}) image_embeds = vision_predictions['var_877'] text_embeds = text_predictions['var_1050'] # Compute logits logits_per_text = text_embeds @ image_embeds.T print("similarity:", logits_per_text.item()) def convert_onnx(): torch.onnx.export(traced_vision_model, inputs.data['pixel_values'], "vision.onnx") torch.onnx.export(traced_text_model, (inputs.data['input_ids'], inputs.data['input_ids']), "text.onnx") # convert_onnx() def infer_onnx(): import onnxruntime as ort providers: list[str] = [] if sys.platform == "darwin": providers.append("CoreMLExecutionProvider") if ("linux" in sys.platform or "win" in sys.platform) and ( platform.machine() == "x86_64" or platform.machine() == "AMD64" ): providers.append(("CUDAExecutionProvider", {"device_id": 0})) providers.append("CPUExecutionProvider") vision_session = ort.InferenceSession("vision.onnx", providers=providers) text_session = ort.InferenceSession("text.onnx", providers=providers) vision_inputs = {vision_session.get_inputs()[0].name: np_inputs['pixel_values']} text_inputs = { text_session.get_inputs()[0].name: np_inputs['input_ids'], text_session.get_inputs()[1].name: np_inputs['attention_mask'] } vision_predictions = vision_session.run(None, vision_inputs) text_predictions = text_session.run(None, text_inputs) image_embeds = vision_predictions[0] text_embeds = text_predictions[0] logits_per_text = text_embeds @ image_embeds.T print("similarity:", logits_per_text.item()) # infer_onnx() def convert_openvino(): import openvino as ov ov_vision_model = ov.convert_model(traced_vision_model, example_input=inputs.data['pixel_values']) ov.save_model(ov_vision_model, "openvino/vision.xml") ov_text_model = ov.convert_model(traced_text_model, example_input=(inputs.data['input_ids'], inputs.data['attention_mask'])) ov.save_model(ov_text_model, "openvino/text.xml") # convert_openvino() def infer_openvino(): import openvino as ov ov_vision_model = ov.Core().read_model("openvino/vision.xml") ov_text_model = ov.Core().read_model("openvino/text.xml") compiled_vision_model = ov.Core().compile_model(ov_vision_model, "CPU") compiled_text_model = ov.Core().compile_model(ov_text_model, "CPU") vision_predictions = compiled_vision_model(inputs.data['pixel_values']) text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask'])) image_embeds = vision_predictions[0] text_embeds = text_predictions[0] logits_per_text = text_embeds @ image_embeds.T print("similarity:", logits_per_text.item()) # infer_openvino() def export_openvino_int8(): import openvino as ov import text_calibration import image_calibration import nncf ov_vision_model = ov.Core().read_model("openvino/vision.xml") ov_text_model = ov.Core().read_model("openvino/text.xml") vision_calibration_dataset = image_calibration.get_image_calibration_data() text_calibration_dataset = text_calibration.get_text_calibration_data() vision_dataset = nncf.Dataset(vision_calibration_dataset) text_dataset = nncf.Dataset(text_calibration_dataset) quantized_vision_model = nncf.quantize(ov_vision_model, vision_dataset, preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER, # advanced_parameters=nncf.AdvancedQuantizationParameters(disable_bias_correction=True) ) quantized_text_model = nncf.quantize(ov_text_model, text_dataset, preset=nncf.QuantizationPreset.MIXED, model_type=nncf.ModelType.TRANSFORMER, # advanced_parameters=nncf.AdvancedQuantizationParameters(disable_bias_correction=True) ) ov.save_model(quantized_vision_model, "openvino/vision_int8.xml") ov.save_model(quantized_text_model, "openvino/text_int8.xml") export_openvino_int8() def infer_openvino_int8(): import openvino as ov ov_vision_model = ov.Core().read_model("openvino/vision_int8.xml") ov_text_model = ov.Core().read_model("openvino/text_int8.xml") compiled_vision_model = ov.Core().compile_model(ov_vision_model, "CPU") compiled_text_model = ov.Core().compile_model(ov_text_model, "CPU") vision_predictions = compiled_vision_model(inputs.data['pixel_values']) text_predictions = compiled_text_model((inputs.data['input_ids'], inputs.data['attention_mask'])) image_embeds = vision_predictions[0] text_embeds = text_predictions[0] logits_per_text = text_embeds @ image_embeds.T print("similarity:", logits_per_text.item()) infer_openvino_int8() def export_ncnn(): traced_vision_model.save(f"vision.pt") input_shape_str = json.dumps(list(inputs.data['pixel_values'].shape)).replace(" ", "") os.system(f"pnnx vision.pt 'inputshape={input_shape_str}'") traced_text_model.save(f"text.pt") input_shape_str = json.dumps(list(inputs.data['input_ids'].shape)).replace(" ", "") input_shape2_str = json.dumps(list(inputs.data['attention_mask'].shape)).replace(" ", "") os.system(f"pnnx text.pt 'inputshape={input_shape_str}i64,{input_shape2_str}i64'") # export_ncnn() def infer_ncnn(): import ncnn vision_extractor = ncnn.Net() vision_extractor.load_param("vision.ncnn.param") vision_extractor.load_model("vision.ncnn.bin") text_extractor = ncnn.Net() text_extractor.load_param("text.ncnn.param") text_extractor.load_model("text.ncnn.bin") vision_mat = ncnn.Mat(inputs.data['pixel_values'].numpy()) text_input_ids_mat = ncnn.Mat(inputs.data['input_ids'].numpy()) text_attention_mask_mat = ncnn.Mat(inputs.data['attention_mask'].numpy()) vision_extractor.input(vision_extractor.input_names()[0], vision_mat) text_extractor.input(text_extractor.input_names()[0], text_input_ids_mat) text_extractor.input(text_extractor.input_names()[1], text_attention_mask_mat) image_embeds = vision_extractor.extract("out0") text_embeds = text_extractor.extract("out0") logits_per_text = text_embeds @ image_embeds.T print("similarity:", logits_per_text[0]) # infer_ncnn() def infer_torch(): outputs = ptmodel(**inputs) logits_per_image = outputs.logits_per_image # this is the image-text similarity score probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities print(probs)