Ryenhails committed on
Commit
905e386
·
verified ·
1 Parent(s): c69ea8e

Upload NanoVDR-M-Multi model (multilingual variant)

Browse files
0_Transformer/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
0_Transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba37632d781081d8ec828f55a5e4581863f5385663026d48bacd8f36de58c27
3
+ size 437951328
0_Transformer/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": true
4
+ }
0_Transformer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
0_Transformer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
0_Transformer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
0_Transformer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false
9
+ }
2_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 768,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.activation.GELU"
6
+ }
2_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:263be3f66ae6f965511e0ffc7b8189655d2f26b27fbc5aa1159e58d92cf09ecb
3
+ size 2364309
3_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 2048,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.linear.Identity"
6
+ }
3_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55758286f9a35831511a1afa2428bcbfb5d32688a12b0982342d9240a229f88d
3
+ size 6301589
README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: sentence-transformers
3
+ pipeline_tag: sentence-similarity
4
+ tags:
5
+ - sentence-transformers
6
+ - feature-extraction
7
+ - visual-document-retrieval
8
+ - cross-modal-distillation
9
+ - multilingual
10
+ - nanovdr
11
+ base_model: google-bert/bert-base-uncased
12
+ language:
13
+ - en
14
+ - de
15
+ - fr
16
+ - es
17
+ - it
18
+ - pt
19
+ license: apache-2.0
20
+ ---
21
+
22
+ > **Paper**: [NanoVDR: Distilling a 2B Vision-Language Retriever into a 70M Text-Only Encoder for Visual Document Retrieval](https://arxiv.org/abs/2603.12824) | [Blog](https://huggingface.co/blog/Ryenhails/nanovdr)
23
+
24
+ # NanoVDR-M-Multi: Multilingual Query Encoder for Visual Document Retrieval
25
+
26
+ **NanoVDR-M-Multi** is a 116M-parameter multilingual text-only query encoder for visual document retrieval. It retrieves document page images as effectively as Vision-Language Models 30-100x its size, with strong cross-lingual transfer across 6 languages.
27
+
28
+ Built on [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) and further trained with multilingual query augmentation (English + German, French, Spanish, Italian, Portuguese), it is the recommended model for production use with multilingual or mixed-language queries.
29
+
30
+ ## Results
31
+
32
+ | Model | Params | ViDoRe v1 (en) | ViDoRe v2 (multi) | ViDoRe v3 (multi) |
33
+ |-------|--------|----------------|--------------------|--------------------|
34
+ | Qwen3-VL-Emb (Teacher) | 2.0B | 84.3 | 65.3 | 50.0 |
35
+ | **NanoVDR-M-Multi** | **116M** | **82.5** | **62.8** | **47.5** |
36
+ | NanoVDR-S-Multi | 69M | 82.2 | 61.9 | 46.5 |
37
+ | ColPali | ~3B | 84.2 | 54.7 | 42.0 |
38
+
39
+ ### Per-Language Teacher Retention
40
+
41
+ | Language | NDCG@5 | Teacher Retention |
42
+ |----------|--------|-------------------|
43
+ | English | 50.7 | 93.0% |
44
+ | French | 47.8 | 93.6% |
45
+ | Spanish | 47.8 | 93.1% |
46
+ | Italian | 45.7 | 93.3% |
47
+ | German | 45.4 | 92.0% |
48
+ | Portuguese | 46.1 | 94.6% |
49
+
50
+ All 6 languages achieve >92% of the 2B teacher's performance.
51
+
52
+ ## How It Works
53
+
54
+ NanoVDR decouples query encoding from document encoding in visual document retrieval:
55
+ - **Offline indexing**: The VLM teacher (Qwen3-VL-Embedding-2B) encodes document page images into single-vector embeddings. This is a one-time cost.
56
+ - **Online querying**: NanoVDR-M-Multi encodes text queries in any supported language into the same embedding space via a lightweight text encoder + MLP projector. No vision model needed at query time.
57
+
58
+ Retrieval uses standard cosine similarity between query and document embeddings.
59
+
60
+ ## Usage
61
+
62
+ ```python
63
+ from sentence_transformers import SentenceTransformer
64
+
65
+ # Load the multilingual query encoder
66
+ model = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
67
+
68
+ # Encode queries in any supported language
69
+ queries = [
70
+ "What was the revenue growth in Q3 2024?", # English
71
+ "Quel est le chiffre d'affaires du trimestre?", # French
72
+ "Wie hoch war das Umsatzwachstum im dritten Quartal?", # German
73
+ "¿Cuál fue el crecimiento de ingresos en el Q3?", # Spanish
74
+ ]
75
+ query_embeddings = model.encode(queries)
76
+ print(query_embeddings.shape) # (4, 2048)
77
+
78
+ # Retrieve against pre-indexed document embeddings from the VLM teacher
79
+ # scores = query_embeddings @ doc_embeddings.T
80
+ ```
81
+
82
+ ### Full Retrieval Pipeline
83
+
84
+ ```python
85
+ from sentence_transformers import SentenceTransformer
86
+
87
+ # Step 1: Index documents with the VLM teacher (one-time, offline)
88
+ from transformers import AutoModel
89
+ teacher = AutoModel.from_pretrained("Qwen/Qwen3-VL-Embedding-2B")
90
+ # doc_embeddings = teacher.encode(document_images) # See Qwen3-VL-Embedding docs
91
+
92
+ # Step 2: Query with NanoVDR-M-Multi (online, fast, CPU-only)
93
+ student = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
94
+ query_emb = student.encode("Quel est le chiffre d'affaires?")
95
+
96
+ # Step 3: Retrieve
97
+ scores = query_emb @ doc_embeddings.T
98
+ top_k = scores.argsort()[-5:][::-1]
99
+ ```
100
+
101
+ ## Training Details
102
+
103
+ - **Architecture**: google-bert/bert-base-uncased + 2-layer MLP projector (768 → 768 → 2048)
104
+ - **Training objective**: Pointwise cosine alignment with teacher query embeddings
105
+ - **Training data**: 1.49M query-document pairs — 711K original (4 public sources) + 778K machine-translated queries in 5 languages (DE, FR, ES, IT, PT) via Helsinki-NLP Opus-MT models
106
+ - **Training cost**: ~15 GPU-hours on a single H200
107
+ - **Epochs**: 10, lr=3e-4, batch size 1024 (effective)
108
+
109
+ ### Multilingual Augmentation Pipeline
110
+
111
+ 1. Extract 489K English queries from training data
112
+ 2. Translate to 5 target languages using [Helsinki-NLP Opus-MT](https://huggingface.co/Helsinki-NLP) models (~200K per language)
113
+ 3. Re-encode translated queries with the frozen teacher in text mode to produce target embeddings
114
+ 4. Combine with original 711K pairs → 1.49M total training samples
115
+
116
+ ## Key Properties
117
+
118
+ - **Output dimension**: 2048 (aligned with Qwen3-VL-Embedding-2B)
119
+ - **Max sequence length**: 512 tokens
120
+ - **Supported languages**: English, German, French, Spanish, Italian, Portuguese
121
+ - **Similarity function**: Cosine similarity
122
+ - **Pooling**: Mean pooling
123
+ - **Normalization**: L2-normalized output
124
+
125
+ ## Efficiency
126
+
127
+ | Metric | NanoVDR-M-Multi | ColPali (3B) | Teacher (2B) |
128
+ |--------|------------|--------------|--------------|
129
+ | Query latency (CPU, B=1) | 51 ms | 7,300 ms | GPU only |
130
+ | Model size | 116M | ~3B | 2B |
131
+ | Index type | Single-vector | Multi-vector | Single-vector |
132
+ | Scoring | Cosine | MaxSim | Cosine |
133
+
134
+ ## Related Models
135
+
136
+ - [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) — English-focused, same architecture
137
+ - [NanoVDR-M](https://huggingface.co/nanovdr/NanoVDR-M) — BERT-base backbone (116M)
138
+ - [NanoVDR-L](https://huggingface.co/nanovdr/NanoVDR-L) — ModernBERT backbone (155M)
139
+
140
+ ## Citation
141
+
142
+ ```bibtex
143
+ @article{nanovdr2026,
144
+ title={NanoVDR: Asymmetric Cross-Modal Distillation for Efficient Visual Document Retrieval},
145
+ author={...},
146
+ journal={arXiv preprint},
147
+ year={2026}
148
+ }
149
+ ```
150
+
151
+ ## License
152
+
153
+ Apache 2.0
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "prompts": {},
3
+ "default_prompt_name": null,
4
+ "similarity_fn_name": "cosine"
5
+ }
modules.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0_Transformer",
5
+ "path": "0_Transformer",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1_Pooling",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2_Dense",
17
+ "path": "2_Dense",
18
+ "type": "sentence_transformers.models.Dense"
19
+ },
20
+ {
21
+ "idx": 3,
22
+ "name": "3_Dense",
23
+ "path": "3_Dense",
24
+ "type": "sentence_transformers.models.Dense"
25
+ },
26
+ {
27
+ "idx": 4,
28
+ "name": "4_Normalize",
29
+ "path": "4_Normalize",
30
+ "type": "sentence_transformers.models.Normalize"
31
+ }
32
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff