Koushik Dutta committed on
Commit
af89e10
·
1 Parent(s): 94a263a
Files changed (2) hide show
  1. .gitignore +4 -0
  2. export.py +49 -11
.gitignore CHANGED
@@ -1 +1,5 @@
1
  .venv
 
 
 
 
 
1
  .venv
2
+ vision.pt
3
+ text.pt
4
+ *pnnx*
5
+ *_ncnn.py
export.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from PIL import Image
2
  import requests
3
  from transformers import CLIPProcessor, CLIPModel
@@ -7,6 +8,7 @@ import torch
7
  import numpy as np
8
  import platform
9
  import sys
 
10
 
11
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
12
 
@@ -73,6 +75,8 @@ def convert_coreml():
73
  coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
74
  coreml_model.save('text.mlpackage')
75
 
 
 
76
  def infer_coreml():
77
  coreml_vision_model = ct.models.MLModel('vision.mlpackage')
78
  coreml_text_model = ct.models.MLModel('text.mlpackage')
@@ -95,7 +99,6 @@ def convert_onnx():
95
 
96
  # convert_onnx()
97
 
98
-
99
  def infer_onnx():
100
  import onnxruntime as ort
101
 
@@ -129,8 +132,7 @@ def infer_onnx():
129
 
130
  print("similarity:", logits_per_text.item())
131
 
132
- infer_onnx()
133
-
134
 
135
  def convert_openvino():
136
  import openvino as ov
@@ -162,13 +164,49 @@ def infer_openvino():
162
 
163
  # infer_openvino()
164
 
165
- # convert_coreml()
166
 
167
- # Create an ONNX Runtime session
168
- # session = ort.InferenceSession(model_path)
169
- # outputs = session.run(None, np_inputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- # outputs = ptmodel(**inputs)
172
- # logits_per_image = outputs.logits_per_image # this is the image-text similarity score
173
- # probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
174
- # print(logits_per_image)
 
 
1
+ import json
2
  from PIL import Image
3
  import requests
4
  from transformers import CLIPProcessor, CLIPModel
 
8
  import numpy as np
9
  import platform
10
  import sys
11
+ import os
12
 
13
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
14
 
 
75
  coreml_model = ct.convert(traced_text_model, inputs=[ct.TensorType(shape=inputs.data['input_ids'].shape), ct.TensorType(shape=inputs.data['attention_mask'].shape)])
76
  coreml_model.save('text.mlpackage')
77
 
78
+ # convert_coreml()
79
+
80
  def infer_coreml():
81
  coreml_vision_model = ct.models.MLModel('vision.mlpackage')
82
  coreml_text_model = ct.models.MLModel('text.mlpackage')
 
99
 
100
  # convert_onnx()
101
 
 
102
  def infer_onnx():
103
  import onnxruntime as ort
104
 
 
132
 
133
  print("similarity:", logits_per_text.item())
134
 
135
+ # infer_onnx()
 
136
 
137
  def convert_openvino():
138
  import openvino as ov
 
164
 
165
  # infer_openvino()
166
 
 
167
 
168
+ def export_ncnn():
169
+ traced_vision_model.save(f"vision.pt")
170
+ input_shape_str = json.dumps(list(inputs.data['pixel_values'].shape)).replace(" ", "")
171
+ os.system(f"pnnx vision.pt 'inputshape={input_shape_str}'")
172
+
173
+ traced_text_model.save(f"text.pt")
174
+ input_shape_str = json.dumps(list(inputs.data['input_ids'].shape)).replace(" ", "")
175
+ input_shape2_str = json.dumps(list(inputs.data['attention_mask'].shape)).replace(" ", "")
176
+ os.system(f"pnnx text.pt 'inputshape={input_shape_str}i64,{input_shape2_str}i64'")
177
+
178
+ export_ncnn()
179
+
180
+ def infer_ncnn():
181
+ import ncnn
182
+
183
+ vision_extractor = ncnn.Net()
184
+ vision_extractor.load_param("vision.param")
185
+ vision_extractor.load_model("vision.bin")
186
+
187
+ text_extractor = ncnn.Net()
188
+ text_extractor.load_param("text.param")
189
+ text_extractor.load_model("text.bin")
190
+
191
+ vision_mat = ncnn.Mat(inputs.data['pixel_values'].numpy())
192
+ text_input_ids_mat = ncnn.Mat(inputs.data['input_ids'].numpy())
193
+ text_attention_mask_mat = ncnn.Mat(inputs.data['attention_mask'].numpy())
194
+
195
+ vision_extractor.input(vision_extractor.input_names()[0], vision_mat)
196
+ text_extractor.input(text_extractor.input_names()[0], text_input_ids_mat)
197
+ text_extractor.input(text_extractor.input_names()[1], text_attention_mask_mat)
198
+
199
+ image_embeds = vision_extractor.extract("out0")
200
+ text_embeds = text_extractor.extract("out0")
201
+
202
+ logits_per_text = text_embeds @ image_embeds.T
203
+
204
+ print("similarity:", logits_per_text[0])
205
+
206
+ infer_ncnn()
207
 
208
+ def infer_torch():
209
+ outputs = ptmodel(**inputs)
210
+ logits_per_image = outputs.logits_per_image # this is the image-text similarity score
211
+ probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
212
+ print(probs)