Ryenhails committed on
Commit
905e386
·
verified ·
1 Parent(s): c69ea8e

Upload NanoVDR-M-Multi model (multilingual variant)

Browse files
0_Transformer/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
0_Transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba37632d781081d8ec828f55a5e4581863f5385663026d48bacd8f36de58c27
3
+ size 437951328
0_Transformer/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": true
4
+ }
0_Transformer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
0_Transformer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
0_Transformer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
0_Transformer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false
9
+ }
2_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 768,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.activation.GELU"
6
+ }
2_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:263be3f66ae6f965511e0ffc7b8189655d2f26b27fbc5aa1159e58d92cf09ecb
3
+ size 2364309
3_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 2048,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.linear.Identity"
6
+ }
3_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55758286f9a35831511a1afa2428bcbfb5d32688a12b0982342d9240a229f88d
3
+ size 6301589
README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: sentence-transformers
3
+ pipeline_tag: sentence-similarity
4
+ tags:
5
+ - sentence-transformers
6
+ - feature-extraction
7
+ - visual-document-retrieval
8
+ - cross-modal-distillation
9
+ - multilingual
10
+ - nanovdr
11
+ base_model: google-bert/bert-base-uncased
12
+ language:
13
+ - en
14
+ - de
15
+ - fr
16
+ - es
17
+ - it
18
+ - pt
19
+ license: apache-2.0
20
+ ---
21
+
22
+ > **Paper**: [NanoVDR: Distilling a 2B Vision-Language Retriever into a 70M Text-Only Encoder for Visual Document Retrieval](https://arxiv.org/abs/2603.12824) | [Blog](https://huggingface.co/blog/Ryenhails/nanovdr)
23
+
24
+ # NanoVDR-M-Multi: Multilingual Query Encoder for Visual Document Retrieval
25
+
26
+ **NanoVDR-M-Multi** is a 116M-parameter multilingual text-only query encoder for visual document retrieval. It retrieves document page images as effectively as Vision-Language Models 30-100x its size, with strong cross-lingual transfer across 6 languages.
27
+
28
+ Built on [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) and further trained with multilingual query augmentation (English + German, French, Spanish, Italian, Portuguese), it is the recommended model for production use with multilingual or mixed-language queries.
29
+
30
+ ## Results
31
+
32
+ | Model | Params | ViDoRe v1 (en) | ViDoRe v2 (multi) | ViDoRe v3 (multi) |
33
+ |-------|--------|----------------|--------------------|--------------------|
34
+ | Qwen3-VL-Emb (Teacher) | 2.0B | 84.3 | 65.3 | 50.0 |
35
+ | **NanoVDR-M-Multi** | **116M** | **82.5** | **62.8** | **47.5** |
36
+ | NanoVDR-S-Multi | 69M | 82.2 | 61.9 | 46.5 |
37
+ | ColPali | ~3B | 84.2 | 54.7 | 42.0 |
38
+
39
+ ### Per-Language Teacher Retention
40
+
41
+ | Language | NDCG@5 | Teacher Retention |
42
+ |----------|--------|-------------------|
43
+ | English | 50.7 | 93.0% |
44
+ | French | 47.8 | 93.6% |
45
+ | Spanish | 47.8 | 93.1% |
46
+ | Italian | 45.7 | 93.3% |
47
+ | German | 45.4 | 92.0% |
48
+ | Portuguese | 46.1 | 94.6% |
49
+
50
+ All 6 languages achieve >92% of the 2B teacher's performance.
51
+
52
+ ## How It Works
53
+
54
+ NanoVDR decouples query encoding from document encoding in visual document retrieval:
55
+ - **Offline indexing**: The VLM teacher (Qwen3-VL-Embedding-2B) encodes document page images into single-vector embeddings. This is a one-time cost.
56
+ - **Online querying**: NanoVDR-M-Multi encodes text queries in any supported language into the same embedding space via a lightweight text encoder + MLP projector. No vision model needed at query time.
57
+
58
+ Retrieval uses standard cosine similarity between query and document embeddings.
59
+
60
+ ## Usage
61
+
62
+ ```python
63
+ from sentence_transformers import SentenceTransformer
64
+
65
+ # Load the multilingual query encoder
66
+ model = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
67
+
68
+ # Encode queries in any supported language
69
+ queries = [
70
+ "What was the revenue growth in Q3 2024?", # English
71
+ "Quel est le chiffre d'affaires du trimestre?", # French
72
+ "Wie hoch war das Umsatzwachstum im dritten Quartal?", # German
73
+ "¿Cuál fue el crecimiento de ingresos en el Q3?", # Spanish
74
+ ]
75
+ query_embeddings = model.encode(queries)
76
+ print(query_embeddings.shape) # (4, 2048)
77
+
78
+ # Retrieve against pre-indexed document embeddings from the VLM teacher
79
+ # scores = query_embeddings @ doc_embeddings.T
80
+ ```
81
+
82
+ ### Full Retrieval Pipeline
83
+
84
+ ```python
85
+ from sentence_transformers import SentenceTransformer
86
+
87
+ # Step 1: Index documents with the VLM teacher (one-time, offline)
88
+ from transformers import AutoModel
89
+ teacher = AutoModel.from_pretrained("Qwen/Qwen3-VL-Embedding-2B")
90
+ # doc_embeddings = teacher.encode(document_images) # See Qwen3-VL-Embedding docs
91
+
92
+ # Step 2: Query with NanoVDR-M-Multi (online, fast, CPU-only)
93
+ student = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
94
+ query_emb = student.encode("Quel est le chiffre d'affaires?")
95
+
96
+ # Step 3: Retrieve
97
+ scores = query_emb @ doc_embeddings.T
98
+ top_k = scores.argsort()[-5:][::-1]
99
+ ```
100
+
101
+ ## Training Details
102
+
103
+ - **Architecture**: google-bert/bert-base-uncased + 2-layer MLP projector (768 → 768 → 2048)
104
+ - **Training objective**: Pointwise cosine alignment with teacher query embeddings
105
+ - **Training data**: 1.49M query-document pairs — 711K original (4 public sources) + 778K machine-translated queries in 5 languages (DE, FR, ES, IT, PT) via Helsinki-NLP Opus-MT models
106
+ - **Training cost**: ~15 GPU-hours on a single H200
107
+ - **Epochs**: 10, lr=3e-4, batch size 1024 (effective)
108
+
109
+ ### Multilingual Augmentation Pipeline
110
+
111
+ 1. Extract 489K English queries from training data
112
+ 2. Translate to 5 target languages using [Helsinki-NLP Opus-MT](https://huggingface.co/Helsinki-NLP) models (~200K per language)
113
+ 3. Re-encode translated queries with the frozen teacher in text mode to produce target embeddings
114
+ 4. Combine with original 711K pairs → 1.49M total training samples
115
+
116
+ ## Key Properties
117
+
118
+ - **Output dimension**: 2048 (aligned with Qwen3-VL-Embedding-2B)
119
+ - **Max sequence length**: 512 tokens
120
+ - **Supported languages**: English, German, French, Spanish, Italian, Portuguese
121
+ - **Similarity function**: Cosine similarity
122
+ - **Pooling**: Mean pooling
123
+ - **Normalization**: L2-normalized output
124
+
125
+ ## Efficiency
126
+
127
+ | Metric | NanoVDR-M-Multi | ColPali (3B) | Teacher (2B) |
128
+ |--------|------------|--------------|--------------|
129
+ | Query latency (CPU, B=1) | 51 ms | 7,300 ms | GPU only |
130
+ | Model size | 116M | ~3B | 2B |
131
+ | Index type | Single-vector | Multi-vector | Single-vector |
132
+ | Scoring | Cosine | MaxSim | Cosine |
133
+
134
+ ## Related Models
135
+
136
+ - [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) — English-focused, same architecture
137
+ - [NanoVDR-M](https://huggingface.co/nanovdr/NanoVDR-M) — BERT-base backbone (116M)
138
+ - [NanoVDR-L](https://huggingface.co/nanovdr/NanoVDR-L) — ModernBERT backbone (155M)
139
+
140
+ ## Citation
141
+
142
+ ```bibtex
143
+ @article{nanovdr2026,
144
+ title={NanoVDR: Asymmetric Cross-Modal Distillation for Efficient Visual Document Retrieval},
145
+ author={...},
146
+ journal={arXiv preprint},
147
+ year={2026}
148
+ }
149
+ ```
150
+
151
+ ## License
152
+
153
+ Apache 2.0
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.56.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "prompts": {},
3
+ "default_prompt_name": null,
4
+ "similarity_fn_name": "cosine"
5
+ }
modules.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0_Transformer",
5
+ "path": "0_Transformer",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1_Pooling",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2_Dense",
17
+ "path": "2_Dense",
18
+ "type": "sentence_transformers.models.Dense"
19
+ },
20
+ {
21
+ "idx": 3,
22
+ "name": "3_Dense",
23
+ "path": "3_Dense",
24
+ "type": "sentence_transformers.models.Dense"
25
+ },
26
+ {
27
+ "idx": 4,
28
+ "name": "4_Normalize",
29
+ "path": "4_Normalize",
30
+ "type": "sentence_transformers.models.Normalize"
31
+ }
32
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff