Koushik Dutta committed on
Commit
94a263a
·
1 Parent(s): 4629fa5
Files changed (1) hide show
  1. export.py +39 -1
export.py CHANGED
@@ -5,6 +5,8 @@ from transformers.models.clip.modeling_clip import _get_vector_norm
5
  import coremltools as ct
6
  import torch
7
  import numpy as np
 
 
8
 
9
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
10
 
@@ -94,6 +96,42 @@ def convert_onnx():
94
  # convert_onnx()
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def convert_openvino():
98
  import openvino as ov
99
  ov_vision_model = ov.convert_model(traced_vision_model, example_input=inputs.data['pixel_values'])
@@ -122,7 +160,7 @@ def infer_openvino():
122
 
123
  print("similarity:", logits_per_text.item())
124
 
125
- infer_openvino()
126
 
127
  # convert_coreml()
128
 
 
5
  import coremltools as ct
6
  import torch
7
  import numpy as np
8
+ import platform
9
+ import sys
10
 
11
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
12
 
 
96
  # convert_onnx()
97
 
98
 
99
def _onnx_execution_providers():
    """Return the ONNX Runtime execution providers to request, in priority order.

    Entries are either a provider name or a ``(name, options)`` tuple, which
    is the form ``onnxruntime.InferenceSession(providers=...)`` accepts.
    """
    providers: list[str | tuple[str, dict[str, int]]] = []

    if sys.platform == "darwin":
        providers.append("CoreMLExecutionProvider")

    # Match on the platform prefix, not a substring: "win" is contained in
    # "darwin", so a substring test would also request CUDA on Intel Macs.
    is_linux_or_windows = sys.platform.startswith(("linux", "win"))
    is_x86_64 = platform.machine() in ("x86_64", "AMD64")
    if is_linux_or_windows and is_x86_64:
        providers.append(("CUDAExecutionProvider", {"device_id": 0}))

    # CPU is appended last so it is the lowest-priority entry in the list.
    providers.append("CPUExecutionProvider")
    return providers


def infer_onnx():
    """Run the exported ``vision.onnx`` and ``text.onnx`` models and print their similarity.

    Reads the module-level ``np_inputs`` mapping (defined outside this
    function) for the ``pixel_values``, ``input_ids`` and ``attention_mask``
    arrays, runs both ONNX sessions, and prints the product of the text
    output with the transposed image output.
    """
    import onnxruntime as ort

    providers = _onnx_execution_providers()

    vision_session = ort.InferenceSession("vision.onnx", providers=providers)
    text_session = ort.InferenceSession("text.onnx", providers=providers)

    # Feed inputs by the names recorded in each model, in declaration order.
    vision_inputs = {vision_session.get_inputs()[0].name: np_inputs['pixel_values']}
    text_inputs = {
        text_session.get_inputs()[0].name: np_inputs['input_ids'],
        text_session.get_inputs()[1].name: np_inputs['attention_mask'],
    }

    vision_predictions = vision_session.run(None, vision_inputs)
    text_predictions = text_session.run(None, text_inputs)

    # NOTE(review): assumes the first output of each model is its embedding
    # -- confirm against the export code.
    image_embeds = vision_predictions[0]
    text_embeds = text_predictions[0]

    logits_per_text = text_embeds @ image_embeds.T

    # .item() requires a single-element result, i.e. one text and one image.
    print("similarity:", logits_per_text.item())
131
+
132
+ infer_onnx()
133
+
134
+
135
  def convert_openvino():
136
  import openvino as ov
137
  ov_vision_model = ov.convert_model(traced_vision_model, example_input=inputs.data['pixel_values'])
 
160
 
161
  print("similarity:", logits_per_text.item())
162
 
163
+ # infer_openvino()
164
 
165
  # convert_coreml()
166