Upload folder using huggingface_hub
- .gitattributes +2 -0
- README.md +41 -7
- convert_to_onnx.py +107 -0
- requirements-convert.txt +9 -0
- textual.onnx +3 -0
- textual.onnx.data +3 -0
- verify_onnx.py +80 -0
- visual.onnx +3 -0
- visual.onnx.data +3 -0
.gitattributes
CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+textual.onnx.data filter=lfs diff=lfs merge=lfs -text
+visual.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -20,9 +20,24 @@ Model was trained by [Sber AI](https://github.com/sberbank-ai) and [SberDevices]
 * Vision Width: `1024`
 * Vision Patch Size: `14`

-##
+## Files

-
+| File | Purpose |
+|------|---------|
+| `config.json` | Hyperparameters (resolution 336, embed 768, etc.) |
+| `bpe.model` | BPE tokenizer for text |
+| `pytorch_model.bin` | PyTorch weights |
+| `visual.onnx` | Vision encoder (ONNX) — input `[N,3,336,336]` float32, output `[N,768]` |
+| `textual.onnx` | Text encoder (ONNX) — input `[N,77]` int64, output `[N,768]` |
+| `convert_to_onnx.py` | Export PyTorch → ONNX |
+| `verify_onnx.py` | Sanity check ONNX; `--compare` checks against PyTorch (needs `pytorch_model.bin`) |
+| `requirements-convert.txt` | Dependencies for conversion |
+
+## Usage
+
+### PyTorch
+
+```bash
 pip install ruclip
 ```

@@ -30,6 +45,27 @@ pip install ruclip
 clip, processor = ruclip.load("ruclip-vit-large-patch14-336", device="cuda")
 ```

+### ONNX
+
+Image: normalized `[N,3,336,336]` (config mean/std). Text: tokenized via `bpe.model` to `[N,77]`.
+
+```python
+import onnxruntime as ort
+
+v = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
+t = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])
+img_emb = v.run(None, {v.get_inputs()[0].name: image})[0]
+txt_emb = t.run(None, {t.get_inputs()[0].name: text})[0]
+```
+
+### Regenerating ONNX
+
+1. `pip install Cython && pip install --no-build-isolation ruclip`
+2. `pip install torch onnx onnxruntime Pillow numpy`
+3. `pip install --no-build-isolation git+https://github.com/Lednik7/CLIP-ONNX.git`
+4. `python convert_to_onnx.py --output-dir .`
+5. `python verify_onnx.py` (optionally `--compare` if `pytorch_model.bin` is present)
+
 ## Performance
 We have evaluated the performance on the following datasets:

@@ -55,9 +91,7 @@ We have evaluated the performance on the following datasets:
 | HatefulMemes | roc-auc | 0.519 |


-# Authors
+## Authors

-
-
-+ Denis Dimitrov: [Github](https://github.com/denndimitrov)
-+ Igor Pavlov: [Github](https://github.com/boomb0om)
+RuCLIP: Alex Shonenkov, Daniil Chesakov, Denis Dimitrov, Igor Pavlov — [ai-forever/ru-clip](https://github.com/ai-forever/ru-clip).
+ONNX conversion: [Tim Tkachev](https://github.com/timuchen).
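Note on the README's ONNX snippet: it assumes `image` and `text` are already preprocessed. A minimal end-to-end sketch (illustrative only, not a file in this repo), assuming the repository files sit in the current directory, `ruclip` is installed so its `RuCLIPProcessor` can handle resizing/normalization and BPE tokenization, and `example.jpg` is a placeholder image path:

```python
# Illustrative sketch: preprocess with RuCLIPProcessor, run the ONNX encoders,
# then rank prompts by cosine similarity. "example.jpg" is a placeholder path.
import numpy as np
import onnxruntime as ort
from PIL import Image
from ruclip.processor import RuCLIPProcessor

processor = RuCLIPProcessor.from_pretrained(".")  # needs bpe.model + config.json
batch = processor(text=["собака", "кошка"], images=[Image.open("example.jpg")],
                  return_tensors="pt", padding=True)
image = batch["pixel_values"].numpy().astype(np.float32)  # [1, 3, 336, 336]
text = batch["input_ids"].numpy().astype(np.int64)        # [2, 77]

v = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
t = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])
img_emb = v.run(None, {v.get_inputs()[0].name: image})[0]  # [1, 768]
txt_emb = t.run(None, {t.get_inputs()[0].name: text})[0]   # [2, 768]

# L2-normalize and compare; a higher cosine similarity means a better match
img_emb = img_emb / np.linalg.norm(img_emb, axis=-1, keepdims=True)
txt_emb = txt_emb / np.linalg.norm(txt_emb, axis=-1, keepdims=True)
print(img_emb @ txt_emb.T)  # [1, 2] similarity scores
```

Scaling by the model's logit scale and applying a softmax would reproduce ruclip's zero-shot probabilities; plain cosine similarity is enough for ranking.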
convert_to_onnx.py
ADDED
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
Convert RuCLIP vit-large-patch14-336 from PyTorch to ONNX.
"""
import os
import sys
import argparse
import torch
from torch import nn


# RuCLIP-specific Textual wrapper (uses eos_id instead of argmax)
class RuCLIPTextual(nn.Module):
    """Text encoder wrapper for RuCLIP - uses eos_id for pooling."""

    def __init__(self, model):
        super().__init__()
        self.eos_id = model.eos_id
        self.transformer = model.transformer
        self.positional_embedding = model.positional_embedding
        self.ln_final = model.ln_final
        self.text_projection = model.text_projection
        self.token_embedding = model.token_embedding

    def forward(self, text):
        x = self.token_embedding(text)
        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # RuCLIP: take features at EOS position (eos_id=3)
        eos_positions = torch.where(text == self.eos_id)[1]
        x = x[torch.arange(x.shape[0], device=x.device), eos_positions] @ self.text_projection
        return x


def convert(model_dir: str, output_dir: str):
    # Install clip_onnx if not present
    try:
        from clip_onnx import clip_onnx
    except ImportError:
        print("Installing clip-onnx...")
        os.system(f"{sys.executable} -m pip install git+https://github.com/Lednik7/CLIP-ONNX.git -q")
        from clip_onnx import clip_onnx

    import ruclip
    from PIL import Image
    import numpy as np

    print("Loading RuCLIP model (device=cpu for ONNX export)...")
    # Use local dir if pytorch_model.bin exists, else ruclip loads from ai-forever/ruclip-vit-large-patch14-336
    if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "pytorch_model.bin")):
        from ruclip.model import CLIP
        from ruclip.processor import RuCLIPProcessor
        model = CLIP.from_pretrained(model_dir).eval().to("cpu")
        processor = RuCLIPProcessor.from_pretrained(model_dir)
    else:
        model, processor = ruclip.load("ruclip-vit-large-patch14-336", device="cpu")

    # Create dummy inputs
    # Image: 336x336, batch=1
    dummy_image = Image.new("RGB", (336, 336), color="gray")
    labels = ["тест", "собака", "кошка"]
    dummy_input = processor(text=labels, images=[dummy_image], return_tensors="pt", padding=True)

    image = dummy_input["pixel_values"]  # [1, 3, 336, 336]
    text = dummy_input["input_ids"]      # [3, 77]

    os.makedirs(output_dir, exist_ok=True)
    visual_path = os.path.join(output_dir, "visual.onnx")
    textual_path = os.path.join(output_dir, "textual.onnx")

    print("Converting to ONNX...")
    onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)
    onnx_model.convert2onnx(
        image,
        text,
        verbose=True,
        textual_wrapper=RuCLIPTextual,
    )

    print(f"\nDone! ONNX models saved to {output_dir}/")
    print(f"  - {visual_path}")
    print(f"  - {textual_path}")
    return visual_path, textual_path


def main():
    parser = argparse.ArgumentParser(description="Convert RuCLIP to ONNX")
    parser.add_argument(
        "--model-dir",
        default=".",
        help="Path to model (local dir or HF model id). Default: current dir or ttkacheff/ruclip-vit-large-patch14-336-onnx",
    )
    parser.add_argument(
        "--output-dir",
        default=".",
        help="Output directory for ONNX files. Default: current dir",
    )
    args = parser.parse_args()
    convert(args.model_dir, args.output_dir)


if __name__ == "__main__":
    main()
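Note on convert_to_onnx.py: the actual export is delegated to `clip_onnx`, with `RuCLIPTextual` passed as the text-side wrapper. As an untested fallback sketch, roughly the same result can be obtained with plain `torch.onnx.export` on the two sub-modules. This assumes the ruclip `CLIP` module exposes its vision tower as `model.visual` (mirroring OpenAI CLIP), reuses `model`, `image`, `text`, and `RuCLIPTextual` as prepared inside `convert()`, and picks illustrative input/output names and opset (the shipped ONNX files may use different names, which is why the README reads them via `get_inputs()`):

```python
# Untested fallback sketch: export the vision tower and the RuCLIPTextual wrapper
# directly with torch.onnx.export. model, image, text, RuCLIPTextual are assumed
# to exist as in convert(); names and opset_version are illustrative choices.
import torch

torch.onnx.export(
    model.visual, (image,), "visual.onnx",
    input_names=["pixel_values"], output_names=["image_embedding"],
    dynamic_axes={"pixel_values": {0: "batch"}, "image_embedding": {0: "batch"}},
    opset_version=15,
)
torch.onnx.export(
    RuCLIPTextual(model).eval(), (text,), "textual.onnx",
    input_names=["input_ids"], output_names=["text_embedding"],
    dynamic_axes={"input_ids": {0: "batch"}, "text_embedding": {0: "batch"}},
    opset_version=15,
)
```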
requirements-convert.txt
ADDED
@@ -0,0 +1,9 @@
# Install: pip install Cython && pip install --no-build-isolation ruclip
# Then: pip install torch onnx onnxruntime Pillow numpy
# And: pip install git+https://github.com/Lednik7/CLIP-ONNX.git
Cython
torch>=1.9.0
onnx>=1.10.0
onnxruntime>=1.10.0
Pillow
numpy
textual.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ff9e5a7869c3fa354dee602559d27c43b8b6d1f94656d75b8d903a9e3ec6ca8
size 1167191
textual.onnx.data
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f93e69dab9d8b57b5c92c7c5b65c98fee1725238afadf5951bbc37ba56f6c4de
size 494927872
verify_onnx.py
ADDED
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Verify ONNX models: run inference and optionally compare with PyTorch.
Usage: python verify_onnx.py [--compare]  # --compare needs local pytorch_model.bin
"""
import argparse
import os
import numpy as np
import onnxruntime as ort


def verify_onnx_only():
    """Basic check: load ONNX models and run inference."""
    visual_session = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
    textual_session = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])

    # Dummy inputs: image [1,3,336,336], text [3,77]
    image_np = np.random.randn(1, 3, 336, 336).astype(np.float32) * 0.1
    text_np = np.zeros((3, 77), dtype=np.int64)
    text_np[:, 0] = 49406  # BOS
    text_np[:, 1] = 3      # EOS (RuCLIP eos_id)

    img_out = visual_session.run(None, {visual_session.get_inputs()[0].name: image_np})[0]
    txt_out = textual_session.run(None, {textual_session.get_inputs()[0].name: text_np})[0]

    assert img_out.shape == (1, 768), f"visual: expected (1,768), got {img_out.shape}"
    assert txt_out.shape == (3, 768), f"textual: expected (3,768), got {txt_out.shape}"
    assert not np.any(np.isnan(img_out)) and not np.any(np.isnan(txt_out)), "NaNs in output"
    print("ONNX-only check: PASS (models load and produce valid outputs)")


def verify_with_pytorch():
    """Compare ONNX vs PyTorch outputs (requires local pytorch_model.bin)."""
    import torch
    from PIL import Image

    model_dir = "."
    if not os.path.exists(os.path.join(model_dir, "pytorch_model.bin")):
        print("pytorch_model.bin not found. Run without --compare for ONNX-only verification.")
        return
    from ruclip.model import CLIP
    from ruclip.processor import RuCLIPProcessor
    model_pt = CLIP.from_pretrained(model_dir).eval().to("cpu")
    processor = RuCLIPProcessor.from_pretrained(model_dir)

    visual_session = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
    textual_session = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])

    dummy_image = Image.new("RGB", (336, 336), color="gray")
    texts = ["тест", "собака", "кошка"]
    batch = processor(text=texts, images=[dummy_image], return_tensors="pt", padding=True)
    image_pt = batch["pixel_values"]
    text_pt = batch["input_ids"]
    image_np = image_pt.numpy().astype(np.float32)
    text_np = text_pt.numpy().astype(np.int64)

    with torch.no_grad():
        img_feat_pt = model_pt.encode_image(image_pt)
        txt_feat_pt = model_pt.encode_text(text_pt)

    img_feat_onnx = visual_session.run(None, {visual_session.get_inputs()[0].name: image_np})[0]
    txt_feat_onnx = textual_session.run(None, {textual_session.get_inputs()[0].name: text_np})[0]

    def compare(name, a, b):
        a, b = np.asarray(a), np.asarray(b)
        diff = np.abs(a - b)
        print(f"{name}: max_diff={diff.max():.6f}, mean_diff={diff.mean():.6f}")

    compare("Image features", img_feat_pt.numpy(), img_feat_onnx)
    compare("Text features", txt_feat_pt.numpy(), txt_feat_onnx)

    tol = 1e-3
    ok = np.allclose(img_feat_pt.numpy(), img_feat_onnx, atol=tol) and np.allclose(txt_feat_pt.numpy(), txt_feat_onnx, atol=tol)
    print(f"\n{'PASS' if ok else 'FAIL'}: ONNX matches PyTorch within atol={tol}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--compare", action="store_true", help="Compare with PyTorch (needs pytorch_model.bin)")
    args = p.parse_args()
    verify_onnx_only()
    if args.compare:
        verify_with_pytorch()
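Note on verify_onnx.py's `--compare` path: it checks raw features against an absolute tolerance. Since CLIP-style embeddings are normally L2-normalized before use, a per-row cosine similarity between the PyTorch and ONNX features makes a useful scale-free complement; a small hypothetical helper (not part of the script):

```python
# Hypothetical helper: mean cosine similarity between matching rows of two
# [N, D] feature arrays; values near 1.0 indicate the two exports agree.
import numpy as np

def cosine_agreement(a: np.ndarray, b: np.ndarray) -> float:
    a = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return float((a * b).sum(axis=-1).mean())

# e.g. cosine_agreement(img_feat_pt.numpy(), img_feat_onnx) alongside the atol check
```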
visual.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e57c551ade9dbf802fc0ad63b7ea39c8b76082a332c2d7703da96bbeb9aba992
size 1920447
visual.onnx.data
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b82be69273aede44958a87543fb3c3ea5de21ce9f2a3458871e9505e4a9135d
size 1217265664