Upload folder using huggingface_hub
- .gitattributes +2 -0
- README.md +41 -7
- convert_to_onnx.py +107 -0
- requirements-convert.txt +9 -0
- textual.onnx +3 -0
- textual.onnx.data +3 -0
- verify_onnx.py +80 -0
- visual.onnx +3 -0
- visual.onnx.data +3 -0
.gitattributes
CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+textual.onnx.data filter=lfs diff=lfs merge=lfs -text
+visual.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -20,9 +20,24 @@ Model was trained by [Sber AI](https://github.com/sberbank-ai) and [SberDevices]
 * Vision Width: `1024`
 * Vision Patch Size: `14`

-##
+## Files

-
+| File | Purpose |
+|------|---------|
+| `config.json` | Hyperparameters (resolution 336, embed 768, etc.) |
+| `bpe.model` | BPE tokenizer for text |
+| `pytorch_model.bin` | PyTorch weights |
+| `visual.onnx` | Vision encoder (ONNX) — input `[N,3,336,336]` float32, output `[N,768]` |
+| `textual.onnx` | Text encoder (ONNX) — input `[N,77]` int64, output `[N,768]` |
+| `convert_to_onnx.py` | Export PyTorch → ONNX |
+| `verify_onnx.py` | Sanity check ONNX; `--compare` checks against PyTorch (needs `pytorch_model.bin`) |
+| `requirements-convert.txt` | Dependencies for conversion |
+
+## Usage
+
+### PyTorch
+
+```bash
 pip install ruclip
 ```

@@ -30,6 +45,27 @@ pip install ruclip
 clip, processor = ruclip.load("ruclip-vit-large-patch14-336", device="cuda")
 ```

+### ONNX
+
+Image: normalized `[N,3,336,336]` (config mean/std). Text: tokenized via `bpe.model` to `[N,77]`.
+
+```python
+import onnxruntime as ort
+
+v = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
+t = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])
+img_emb = v.run(None, {v.get_inputs()[0].name: image})[0]
+txt_emb = t.run(None, {t.get_inputs()[0].name: text})[0]
+```
+
+### Regenerating ONNX
+
+1. `pip install Cython && pip install --no-build-isolation ruclip`
+2. `pip install torch onnx onnxruntime Pillow numpy`
+3. `pip install --no-build-isolation git+https://github.com/Lednik7/CLIP-ONNX.git`
+4. `python convert_to_onnx.py --output-dir .`
+5. `python verify_onnx.py` (optionally `--compare` if `pytorch_model.bin` is present)
+
 ## Performance
 We have evaluated the performance on the following datasets:

@@ -55,9 +91,7 @@ We have evaluated the performance on the following datasets:
 | HatefulMemes | roc-auc | 0.519 |


-# Authors
+## Authors

-
-
-+ Denis Dimitrov: [Github](https://github.com/denndimitrov)
-+ Igor Pavlov: [Github](https://github.com/boomb0om)
+RuCLIP: Alex Shonenkov, Daniil Chesakov, Denis Dimitrov, Igor Pavlov — [ai-forever/ru-clip](https://github.com/ai-forever/ru-clip).
+ONNX conversion: [Tim Tkachev](https://github.com/timuchen).
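Note on the README's ONNX snippet: it assumes `image` and `text` are already preprocessed. A minimal end-to-end sketch (illustrative only, not a file in this repo), assuming the repository files sit in the current directory, `ruclip` is installed so its `RuCLIPProcessor` can handle resizing/normalization and BPE tokenization, and `example.jpg` is a placeholder image path:

```python
# Illustrative sketch: preprocess with RuCLIPProcessor, run the ONNX encoders,
# then rank prompts by cosine similarity. "example.jpg" is a placeholder path.
import numpy as np
import onnxruntime as ort
from PIL import Image
from ruclip.processor import RuCLIPProcessor

processor = RuCLIPProcessor.from_pretrained(".")  # needs bpe.model + config.json
batch = processor(text=["собака", "кошка"], images=[Image.open("example.jpg")],
                  return_tensors="pt", padding=True)
image = batch["pixel_values"].numpy().astype(np.float32)  # [1, 3, 336, 336]
text = batch["input_ids"].numpy().astype(np.int64)        # [2, 77]

v = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
t = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])
img_emb = v.run(None, {v.get_inputs()[0].name: image})[0]  # [1, 768]
txt_emb = t.run(None, {t.get_inputs()[0].name: text})[0]   # [2, 768]

# L2-normalize and compare; a higher cosine similarity means a better match
img_emb = img_emb / np.linalg.norm(img_emb, axis=-1, keepdims=True)
txt_emb = txt_emb / np.linalg.norm(txt_emb, axis=-1, keepdims=True)
print(img_emb @ txt_emb.T)  # [1, 2] similarity scores
```

Scaling by the model's logit scale and applying a softmax would reproduce ruclip's zero-shot probabilities; plain cosine similarity is enough for ranking.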
convert_to_onnx.py
ADDED
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
Convert RuCLIP vit-large-patch14-336 from PyTorch to ONNX.
"""
import os
import sys
import argparse
import torch
from torch import nn


# RuCLIP-specific Textual wrapper (uses eos_id instead of argmax)
class RuCLIPTextual(nn.Module):
    """Text encoder wrapper for RuCLIP - uses eos_id for pooling."""

    def __init__(self, model):
        super().__init__()
        self.eos_id = model.eos_id
        self.transformer = model.transformer
        self.positional_embedding = model.positional_embedding
        self.ln_final = model.ln_final
        self.text_projection = model.text_projection
        self.token_embedding = model.token_embedding

    def forward(self, text):
        x = self.token_embedding(text)
        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # RuCLIP: take features at EOS position (eos_id=3)
        eos_positions = torch.where(text == self.eos_id)[1]
        x = x[torch.arange(x.shape[0], device=x.device), eos_positions] @ self.text_projection
        return x


def convert(model_dir: str, output_dir: str):
    # Install clip_onnx if not present
    try:
        from clip_onnx import clip_onnx
    except ImportError:
        print("Installing clip-onnx...")
        os.system(f"{sys.executable} -m pip install git+https://github.com/Lednik7/CLIP-ONNX.git -q")
        from clip_onnx import clip_onnx

    import ruclip
    from PIL import Image
    import numpy as np

    print("Loading RuCLIP model (device=cpu for ONNX export)...")
    # Use local dir if pytorch_model.bin exists, else ruclip loads from ai-forever/ruclip-vit-large-patch14-336
    if os.path.isdir(model_dir) and os.path.exists(os.path.join(model_dir, "pytorch_model.bin")):
        from ruclip.model import CLIP
        from ruclip.processor import RuCLIPProcessor
        model = CLIP.from_pretrained(model_dir).eval().to("cpu")
        processor = RuCLIPProcessor.from_pretrained(model_dir)
    else:
        model, processor = ruclip.load("ruclip-vit-large-patch14-336", device="cpu")

    # Create dummy inputs
    # Image: 336x336, batch=1
    dummy_image = Image.new("RGB", (336, 336), color="gray")
    labels = ["тест", "собака", "кошка"]
    dummy_input = processor(text=labels, images=[dummy_image], return_tensors="pt", padding=True)

    image = dummy_input["pixel_values"]  # [1, 3, 336, 336]
    text = dummy_input["input_ids"]      # [3, 77]

    os.makedirs(output_dir, exist_ok=True)
    visual_path = os.path.join(output_dir, "visual.onnx")
    textual_path = os.path.join(output_dir, "textual.onnx")

    print("Converting to ONNX...")
    onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)
    onnx_model.convert2onnx(
        image,
        text,
        verbose=True,
        textual_wrapper=RuCLIPTextual,
    )

    print(f"\nDone! ONNX models saved to {output_dir}/")
    print(f"  - {visual_path}")
    print(f"  - {textual_path}")
    return visual_path, textual_path


def main():
    parser = argparse.ArgumentParser(description="Convert RuCLIP to ONNX")
    parser.add_argument(
        "--model-dir",
        default=".",
        help="Path to model (local dir or HF model id). Default: current dir or ttkacheff/ruclip-vit-large-patch14-336-onnx",
    )
    parser.add_argument(
        "--output-dir",
        default=".",
        help="Output directory for ONNX files. Default: current dir",
    )
    args = parser.parse_args()
    convert(args.model_dir, args.output_dir)


if __name__ == "__main__":
    main()
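Note on convert_to_onnx.py: the actual export is delegated to `clip_onnx`, with `RuCLIPTextual` passed as the text-side wrapper. As an untested fallback sketch, roughly the same result can be obtained with plain `torch.onnx.export` on the two sub-modules. This assumes the ruclip `CLIP` module exposes its vision tower as `model.visual` (mirroring OpenAI CLIP), reuses `model`, `image`, `text`, and `RuCLIPTextual` as prepared inside `convert()`, and picks illustrative input/output names and opset (the shipped ONNX files may use different names, which is why the README reads them via `get_inputs()`):

```python
# Untested fallback sketch: export the vision tower and the RuCLIPTextual wrapper
# directly with torch.onnx.export. model, image, text, RuCLIPTextual are assumed
# to exist as in convert(); names and opset_version are illustrative choices.
import torch

torch.onnx.export(
    model.visual, (image,), "visual.onnx",
    input_names=["pixel_values"], output_names=["image_embedding"],
    dynamic_axes={"pixel_values": {0: "batch"}, "image_embedding": {0: "batch"}},
    opset_version=15,
)
torch.onnx.export(
    RuCLIPTextual(model).eval(), (text,), "textual.onnx",
    input_names=["input_ids"], output_names=["text_embedding"],
    dynamic_axes={"input_ids": {0: "batch"}, "text_embedding": {0: "batch"}},
    opset_version=15,
)
```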
requirements-convert.txt
ADDED
@@ -0,0 +1,9 @@
# Install: pip install Cython && pip install --no-build-isolation ruclip
# Then: pip install torch onnx onnxruntime Pillow numpy
# And: pip install git+https://github.com/Lednik7/CLIP-ONNX.git
Cython
torch>=1.9.0
onnx>=1.10.0
onnxruntime>=1.10.0
Pillow
numpy
textual.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ff9e5a7869c3fa354dee602559d27c43b8b6d1f94656d75b8d903a9e3ec6ca8
size 1167191
textual.onnx.data
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f93e69dab9d8b57b5c92c7c5b65c98fee1725238afadf5951bbc37ba56f6c4de
size 494927872
verify_onnx.py
ADDED
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Verify ONNX models: run inference and optionally compare with PyTorch.
Usage: python verify_onnx.py [--compare]  # --compare needs local pytorch_model.bin
"""
import argparse
import os
import numpy as np
import onnxruntime as ort


def verify_onnx_only():
    """Basic check: load ONNX models and run inference."""
    visual_session = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
    textual_session = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])

    # Dummy inputs: image [1,3,336,336], text [3,77]
    image_np = np.random.randn(1, 3, 336, 336).astype(np.float32) * 0.1
    text_np = np.zeros((3, 77), dtype=np.int64)
    text_np[:, 0] = 49406  # BOS
    text_np[:, 1] = 3      # EOS (RuCLIP eos_id)

    img_out = visual_session.run(None, {visual_session.get_inputs()[0].name: image_np})[0]
    txt_out = textual_session.run(None, {textual_session.get_inputs()[0].name: text_np})[0]

    assert img_out.shape == (1, 768), f"visual: expected (1,768), got {img_out.shape}"
    assert txt_out.shape == (3, 768), f"textual: expected (3,768), got {txt_out.shape}"
    assert not np.any(np.isnan(img_out)) and not np.any(np.isnan(txt_out)), "NaNs in output"
    print("ONNX-only check: PASS (models load and produce valid outputs)")


def verify_with_pytorch():
    """Compare ONNX vs PyTorch outputs (requires local pytorch_model.bin)."""
    import torch
    from PIL import Image

    model_dir = "."
    if not os.path.exists(os.path.join(model_dir, "pytorch_model.bin")):
        print("pytorch_model.bin not found. Run without --compare for ONNX-only verification.")
        return
    from ruclip.model import CLIP
    from ruclip.processor import RuCLIPProcessor
    model_pt = CLIP.from_pretrained(model_dir).eval().to("cpu")
    processor = RuCLIPProcessor.from_pretrained(model_dir)

    visual_session = ort.InferenceSession("visual.onnx", providers=["CPUExecutionProvider"])
    textual_session = ort.InferenceSession("textual.onnx", providers=["CPUExecutionProvider"])

    dummy_image = Image.new("RGB", (336, 336), color="gray")
    texts = ["тест", "собака", "кошка"]
    batch = processor(text=texts, images=[dummy_image], return_tensors="pt", padding=True)
    image_pt = batch["pixel_values"]
    text_pt = batch["input_ids"]
    image_np = image_pt.numpy().astype(np.float32)
    text_np = text_pt.numpy().astype(np.int64)

    with torch.no_grad():
        img_feat_pt = model_pt.encode_image(image_pt)
        txt_feat_pt = model_pt.encode_text(text_pt)

    img_feat_onnx = visual_session.run(None, {visual_session.get_inputs()[0].name: image_np})[0]
    txt_feat_onnx = textual_session.run(None, {textual_session.get_inputs()[0].name: text_np})[0]

    def compare(name, a, b):
        a, b = np.asarray(a), np.asarray(b)
        diff = np.abs(a - b)
        print(f"{name}: max_diff={diff.max():.6f}, mean_diff={diff.mean():.6f}")

    compare("Image features", img_feat_pt.numpy(), img_feat_onnx)
    compare("Text features", txt_feat_pt.numpy(), txt_feat_onnx)

    tol = 1e-3
    ok = np.allclose(img_feat_pt.numpy(), img_feat_onnx, atol=tol) and np.allclose(txt_feat_pt.numpy(), txt_feat_onnx, atol=tol)
    print(f"\n{'PASS' if ok else 'FAIL'}: ONNX matches PyTorch within atol={tol}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--compare", action="store_true", help="Compare with PyTorch (needs pytorch_model.bin)")
    args = p.parse_args()
    verify_onnx_only()
    if args.compare:
        verify_with_pytorch()
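Note on verify_onnx.py's `--compare` path: it checks raw features against an absolute tolerance. Since CLIP-style embeddings are normally L2-normalized before use, a per-row cosine similarity between the PyTorch and ONNX features makes a useful scale-free complement; a small hypothetical helper (not part of the script):

```python
# Hypothetical helper: mean cosine similarity between matching rows of two
# [N, D] feature arrays; values near 1.0 indicate the two exports agree.
import numpy as np

def cosine_agreement(a: np.ndarray, b: np.ndarray) -> float:
    a = a / np.linalg.norm(a, axis=-1, keepdims=True)
    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return float((a * b).sum(axis=-1).mean())

# e.g. cosine_agreement(img_feat_pt.numpy(), img_feat_onnx) alongside the atol check
```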
visual.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e57c551ade9dbf802fc0ad63b7ea39c8b76082a332c2d7703da96bbeb9aba992
size 1920447
visual.onnx.data
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b82be69273aede44958a87543fb3c3ea5de21ce9f2a3458871e9505e4a9135d
size 1217265664