Koushik Dutta
committed on
Commit
·
af89e10
1
Parent(s):
94a263a
wip
Browse files- .gitignore +4 -0
- export.py +49 -11
.gitignore
CHANGED
|
@@ -1 +1,5 @@
|
|
| 1 |
.venv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.venv
|
| 2 |
+
vision.pt
|
| 3 |
+
text.pt
|
| 4 |
+
*pnnx*
|
| 5 |
+
*_ncnn.py
|
export.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from PIL import Image
|
| 2 |
import requests
|
| 3 |
from transformers import CLIPProcessor, CLIPModel
|
|
@@ -7,6 +8,7 @@ import torch
|
|
| 7 |
import numpy as np
|
| 8 |
import platform
|
| 9 |
import sys
|
|
|
|
| 10 |
|
| 11 |
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 12 |
|
|
@@ -73,6 +75,8 @@ def convert_coreml():
|
|
| 73 |
coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
|
| 74 |
coreml_model.save('text.mlpackage')
|
| 75 |
|
|
|
|
|
|
|
| 76 |
def infer_coreml():
|
| 77 |
coreml_vision_model = ct.models.MLModel('vision.mlpackage')
|
| 78 |
coreml_text_model = ct.models.MLModel('text.mlpackage')
|
|
@@ -95,7 +99,6 @@ def convert_onnx():
|
|
| 95 |
|
| 96 |
# convert_onnx()
|
| 97 |
|
| 98 |
-
|
| 99 |
def infer_onnx():
|
| 100 |
import onnxruntime as ort
|
| 101 |
|
|
@@ -129,8 +132,7 @@ def infer_onnx():
|
|
| 129 |
|
| 130 |
print("similarity:", logits_per_text.item())
|
| 131 |
|
| 132 |
-
infer_onnx()
|
| 133 |
-
|
| 134 |
|
| 135 |
def convert_openvino():
|
| 136 |
import openvino as ov
|
|
@@ -162,13 +164,49 @@ def infer_openvino():
|
|
| 162 |
|
| 163 |
# infer_openvino()
|
| 164 |
|
| 165 |
-
# convert_coreml()
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
from PIL import Image
|
| 3 |
import requests
|
| 4 |
from transformers import CLIPProcessor, CLIPModel
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import platform
|
| 10 |
import sys
|
| 11 |
+
import os
|
| 12 |
|
| 13 |
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 14 |
|
|
|
|
| 75 |
coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
|
| 76 |
coreml_model.save('text.mlpackage')
|
| 77 |
|
| 78 |
+
# convert_coreml()
|
| 79 |
+
|
| 80 |
def infer_coreml():
|
| 81 |
coreml_vision_model = ct.models.MLModel('vision.mlpackage')
|
| 82 |
coreml_text_model = ct.models.MLModel('text.mlpackage')
|
|
|
|
| 99 |
|
| 100 |
# convert_onnx()
|
| 101 |
|
|
|
|
| 102 |
def infer_onnx():
|
| 103 |
import onnxruntime as ort
|
| 104 |
|
|
|
|
| 132 |
|
| 133 |
print("similarity:", logits_per_text.item())
|
| 134 |
|
| 135 |
+
# infer_onnx()
|
|
|
|
| 136 |
|
| 137 |
def convert_openvino():
|
| 138 |
import openvino as ov
|
|
|
|
| 164 |
|
| 165 |
# infer_openvino()
|
| 166 |
|
|
|
|
| 167 |
|
| 168 |
+
def export_ncnn():
    """Export the traced CLIP vision/text TorchScript modules to ncnn via pnnx.

    Saves the traced modules to vision.pt / text.pt, then shells out to the
    ``pnnx`` converter with the shapes captured in ``inputs``.  The text
    inputs (input_ids, attention_mask) are tagged ``i64`` because they are
    integer tensors.

    NOTE(review): os.system return codes are ignored, so a failed pnnx run
    passes silently — confirm whether that is intentional for this WIP script.
    """
    def shape_str(tensor):
        # pnnx wants a compact, space-free shape string like "[1,3,224,224]";
        # json.dumps with tight separators produces that directly.
        return json.dumps(list(tensor.shape), separators=(",", ":"))

    traced_vision_model.save("vision.pt")
    vision_shape = shape_str(inputs.data["pixel_values"])
    os.system(f"pnnx vision.pt 'inputshape={vision_shape}'")

    traced_text_model.save("text.pt")
    ids_shape = shape_str(inputs.data["input_ids"])
    mask_shape = shape_str(inputs.data["attention_mask"])
    os.system(f"pnnx text.pt 'inputshape={ids_shape}i64,{mask_shape}i64'")
|
| 177 |
+
|
| 178 |
+
export_ncnn()
|
| 179 |
+
|
| 180 |
+
def infer_ncnn():
    """Run the pnnx-exported ncnn models and print the raw text/image similarity.

    Fixes vs. the original draft: pyncnn requires an ``Extractor`` obtained
    from ``net.create_extractor()`` — ``input``/``extract`` are not methods of
    ``ncnn.Net`` — and ``Extractor.extract`` returns a ``(ret, Mat)`` tuple,
    not a Mat.  The outputs are converted to numpy arrays before the matmul,
    since ``ncnn.Mat`` does not support ``@``.

    NOTE(review): this prints the bare ``text_embeds @ image_embeds.T``
    product with no normalization or logit scaling, matching the other
    backends' draft behavior — confirm against the torch reference.
    NOTE(review): ncnn.Mat conversion from the int64 input_ids /
    attention_mask arrays may require an explicit dtype cast — confirm.
    """
    import ncnn

    vision_net = ncnn.Net()
    vision_net.load_param("vision.param")
    vision_net.load_model("vision.bin")

    text_net = ncnn.Net()
    text_net.load_param("text.param")
    text_net.load_model("text.bin")

    vision_mat = ncnn.Mat(inputs.data['pixel_values'].numpy())
    text_input_ids_mat = ncnn.Mat(inputs.data['input_ids'].numpy())
    text_attention_mask_mat = ncnn.Mat(inputs.data['attention_mask'].numpy())

    vision_ex = vision_net.create_extractor()
    vision_ex.input(vision_net.input_names()[0], vision_mat)
    _, image_out = vision_ex.extract("out0")

    text_ex = text_net.create_extractor()
    text_ex.input(text_net.input_names()[0], text_input_ids_mat)
    text_ex.input(text_net.input_names()[1], text_attention_mask_mat)
    _, text_out = text_ex.extract("out0")

    image_embeds = np.array(image_out)
    text_embeds = np.array(text_out)

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text[0])
|
| 205 |
+
|
| 206 |
+
infer_ncnn()
|
| 207 |
|
| 208 |
+
def infer_torch():
    """Run the reference PyTorch CLIP model and print per-label probabilities."""
    result = ptmodel(**inputs)
    # logits_per_image holds the image-text similarity scores; softmax over
    # the label axis turns them into probabilities.
    print(result.logits_per_image.softmax(dim=1))
|