Add ONNX exports and ONNXRuntime examples

- .gitattributes +1 -0
- README.md +69 -0
- onnx/image_encoder.onnx +3 -0
- onnx/text_encoder.onnx +3 -0
.gitattributes CHANGED

@@ -3,5 +3,6 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
 sp.model filter=lfs diff=lfs merge=lfs -text
 vlmo/tokenizer/sp.model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED

@@ -63,6 +63,75 @@ print(probs)
 `model(**inputs)` also returns `logits_per_image` and `logits_per_text`, which use the model's learned `logit_scale`.
 Those logits are useful, but they are not the same computation as the raw dot product in the original ModelScope demo.
 
+### ONNXRuntime
+
+This repo also includes two ONNX exports:
+
+- `onnx/text_encoder.onnx`
+- `onnx/image_encoder.onnx`
+
+Example:
+
+```python
+import importlib
+import json
+import os
+import sys
+
+import onnxruntime as ort
+from huggingface_hub import snapshot_download
+from PIL import Image
+
+repo_id = "malusama/M2-Encoder-0.4B"
+model_dir = snapshot_download(repo_id=repo_id)
+sys.path.insert(0, model_dir)
+
+tokenizer_config = json.load(open(os.path.join(model_dir, "tokenizer_config.json"), "r", encoding="utf-8"))
+GLMChineseTokenizer = importlib.import_module("tokenization_glm").GLMChineseTokenizer
+M2EncoderImageProcessor = importlib.import_module("image_processing_m2_encoder").M2EncoderImageProcessor
+
+tokenizer = GLMChineseTokenizer(
+    vocab_file=os.path.join(model_dir, "sp.model"),
+    eos_token=tokenizer_config.get("eos_token"),
+    pad_token=tokenizer_config.get("pad_token"),
+    cls_token=tokenizer_config.get("cls_token"),
+    mask_token=tokenizer_config.get("mask_token"),
+    unk_token=tokenizer_config.get("unk_token"),
+)
+image_processor = M2EncoderImageProcessor.from_pretrained(model_dir)
+
+text_inputs = tokenizer(
+    ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"],
+    padding="max_length",
+    truncation=True,
+    max_length=52,
+    return_special_tokens_mask=True,
+    return_tensors="np",
+)
+image_inputs = image_processor(Image.open("pokemon.jpeg").convert("RGB"), return_tensors="np")
+
+text_session = ort.InferenceSession(
+    os.path.join(model_dir, "onnx", "text_encoder.onnx"),
+    providers=["CPUExecutionProvider"],
+)
+image_session = ort.InferenceSession(
+    os.path.join(model_dir, "onnx", "image_encoder.onnx"),
+    providers=["CPUExecutionProvider"],
+)
+
+text_embeds = text_session.run(
+    None,
+    {
+        "input_ids": text_inputs["input_ids"],
+        "attention_mask": text_inputs["attention_mask"],
+    },
+)[0]
+image_embeds = image_session.run(
+    None,
+    {"pixel_values": image_inputs["pixel_values"]},
+)[0]
+```
+
 ## Upload
 
 Option 1:
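The new README example stops at the raw `text_embeds` and `image_embeds` outputs. Continuing from that snippet, here is a minimal sketch of how one might score them with the plain dot product the README contrasts against `logits_per_image` / `logits_per_text`; it assumes the exported encoders return one embedding row per input and that the features still need L2 normalization.

```python
import numpy as np

# Continuing from the README snippet: `text_embeds` and `image_embeds`
# are the raw ONNX outputs. Assumption: they may not be L2-normalized yet.
def l2_normalize(x: np.ndarray, axis: int = -1) -> np.ndarray:
    return x / np.linalg.norm(x, axis=axis, keepdims=True)

text_embeds = l2_normalize(text_embeds)
image_embeds = l2_normalize(image_embeds)

# Plain dot-product similarity: each image against each text prompt.
# This mirrors the raw dot product the README mentions, not the
# logit_scale-multiplied logits_per_image / logits_per_text.
sims = image_embeds @ text_embeds.T  # shape: (num_images, num_texts)

# Softmax over the text axis for per-image probabilities.
shifted = sims - sims.max(axis=-1, keepdims=True)
probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
print(probs)
```

Skipping `logit_scale` here is deliberate: it keeps the result on the same footing as the raw dot product in the original ModelScope demo; multiplying `sims` by the model's learned scale would reproduce the scaled logits instead.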
onnx/image_encoder.onnx ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94e83a83b64205d24ea242817ea1742165f317a9847f3ae0d6d9643fdfe41b81
+size 346249795
onnx/text_encoder.onnx ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69e17059411d962f013178ec080f928b6b5e0020da8a601219f12289e9aa2f68
+size 700439482
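Both encoder files are stored as Git LFS objects, and the pointers above record their sha256 and byte size. As a quick sanity check that `snapshot_download` materialized the real weights rather than pointer stubs, one could compare the files on disk against those values; the sketch below reuses `model_dir` from the README example, and the expected values are copied from this commit's pointers.

```python
import hashlib
import os

# Expected oid/size values copied from the LFS pointers in this commit.
expected = {
    "onnx/image_encoder.onnx": (
        "94e83a83b64205d24ea242817ea1742165f317a9847f3ae0d6d9643fdfe41b81",
        346249795,
    ),
    "onnx/text_encoder.onnx": (
        "69e17059411d962f013178ec080f928b6b5e0020da8a601219f12289e9aa2f68",
        700439482,
    ),
}

for rel_path, (expected_sha, expected_size) in expected.items():
    path = os.path.join(model_dir, rel_path)  # model_dir from snapshot_download above
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    ok = os.path.getsize(path) == expected_size and digest.hexdigest() == expected_sha
    print(f"{rel_path}: ok={ok}")
```

A file that is only a few hundred bytes here is usually an unresolved LFS pointer text file rather than the exported ONNX graph.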