Ryenhails committed on
Commit
452b0ab
·
verified ·
1 Parent(s): 868056a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ banner.png filter=lfs diff=lfs merge=lfs -text
0_Transformer/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "dtype": "float32",
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "pad_token_id": 0,
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "transformers_version": "4.57.6",
22
+ "vocab_size": 30522
23
+ }
0_Transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94a6e24feffa39d1bb68301894d7df0d8f79303246e2b2f41ce2442145164d7d
3
+ size 265462608
0_Transformer/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": true
4
+ }
0_Transformer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
0_Transformer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
0_Transformer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
0_Transformer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false
9
+ }
2_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 768,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.activation.GELU"
6
+ }
2_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:878c5b354f0f87e2f3bbde25c161790eb0332d496b34dc7dce503adb307e9645
3
+ size 2364309
3_Dense/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "in_features": 768,
3
+ "out_features": 2048,
4
+ "bias": true,
5
+ "activation_function": "torch.nn.modules.linear.Identity"
6
+ }
3_Dense/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ae433824a6c8d050d21e70f11118f3269d765a87790d19343bd5fa65334083a
3
+ size 6301589
README.md ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: sentence-transformers
3
+ pipeline_tag: sentence-similarity
4
+ tags:
5
+ - sentence-transformers
6
+ - feature-extraction
7
+ - sentence-similarity
8
+ - visual-document-retrieval
9
+ - cross-modal-distillation
10
+ - knowledge-distillation
11
+ - nanovdr
12
+ base_model: distilbert/distilbert-base-uncased
13
+ language:
14
+ - en
15
+ license: apache-2.0
16
+ datasets:
17
+ - openbmb/VisRAG-Ret-Train-Synthetic-data
18
+ - openbmb/VisRAG-Ret-Train-In-domain-data
19
+ - vidore/colpali_train_set
20
+ - llamaindex/vdr-multilingual-train
21
+ model-index:
22
+ - name: NanoVDR-S
23
+ results:
24
+ - task:
25
+ type: retrieval
26
+ dataset:
27
+ name: ViDoRe v1
28
+ type: vidore/vidore-benchmark-667173f98e70a1c0fa4d
29
+ metrics:
30
+ - name: NDCG@5
31
+ type: ndcg_at_5
32
+ value: 82.2
33
+ - task:
34
+ type: retrieval
35
+ dataset:
36
+ name: ViDoRe v2
37
+ type: vidore/vidore-benchmark-v2
38
+ metrics:
39
+ - name: NDCG@5
40
+ type: ndcg_at_5
41
+ value: 60.5
42
+ ---
43
+
44
+ <p align="center">
45
+ <img width="560" src="banner.png" alt="NanoVDR"/>
46
+ </p>
47
+
48
+ # NanoVDR-S
49
+
50
+ **English-only baseline variant.** For production use (especially with multilingual queries), we recommend **[NanoVDR-S-Multi](https://huggingface.co/nanovdr/NanoVDR-S-Multi)**.
51
+
52
+ NanoVDR-S is a 69M-parameter text-only query encoder for visual document retrieval, trained via asymmetric cross-modal distillation from [Qwen3-VL-Embedding-2B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B). It uses DistilBERT + a 2-layer MLP projector to encode text queries into the teacher's embedding space.
53
+
54
+ ## Results
55
+
56
+ | Model | Params | ViDoRe v1 | ViDoRe v2 | ViDoRe v3 | Avg Retention |
57
+ |-------|--------|-----------|-----------|-----------|---------------|
58
+ | Qwen3-VL-Emb (Teacher) | 2.0B | 84.3 | 65.3 | 50.0 | — |
59
+ | **NanoVDR-S** | **69M** | **82.2** | **60.5** | **43.5** | **92.4%** |
60
+ | NanoVDR-S-Multi | 69M | 82.2 | 61.9 | 46.5 | 95.1% |
61
+
62
+ <sub>NDCG@5 (×100). Retention = Student / Teacher averaged across v1/v2/v3.</sub>
63
+
64
+ ## Usage
65
+
66
+ > **Prerequisite:** Documents must be indexed offline using [Qwen3-VL-Embedding-2B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-2B) (the teacher model). See the [NanoVDR-S-Multi model page](https://huggingface.co/nanovdr/NanoVDR-S-Multi#prerequisites-document-indexing-with-teacher-model) for a complete indexing guide.
67
+
68
+ ```python
69
+ from sentence_transformers import SentenceTransformer
70
+ import numpy as np
71
+
72
+ # doc_embeddings: (N, 2048) from teacher indexing (see prerequisite above)
73
+
74
+ model = SentenceTransformer("nanovdr/NanoVDR-S")
75
+ query_embeddings = model.encode(["What was the revenue growth in Q3?"]) # (1, 2048)
76
+
77
+ scores = query_embeddings @ doc_embeddings.T
78
+ top_k_indices = np.argsort(scores[0])[-5:][::-1]
79
+ ```
80
+
81
+ ## Training Details
82
+
83
+ | | Value |
84
+ |--|-------|
85
+ | Architecture | DistilBERT (66M) + MLP projector (768 → 768 → 2048, 2.4M) = 69M |
86
+ | Objective | Pointwise cosine alignment with teacher query embeddings |
87
+ | Data | 711K query-document pairs |
88
+ | Epochs / lr | 20 / 2e-4 |
89
+ | Training cost | ~10 GPU-hours (1× H200) |
90
+ | CPU query latency | 51 ms |
91
+
92
+ ## All NanoVDR Models
93
+
94
+ | Model | Backbone | Params | v1 | v2 | v3 | Retention |
95
+ |-------|----------|--------|----|----|----| ----------|
96
+ | **[NanoVDR-S-Multi](https://huggingface.co/nanovdr/NanoVDR-S-Multi)** | **DistilBERT** | **69M** | **82.2** | **61.9** | **46.5** | **95.1%** |
97
+ | [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) | DistilBERT | 69M | 82.2 | 60.5 | 43.5 | 92.4% |
98
+ | [NanoVDR-M](https://huggingface.co/nanovdr/NanoVDR-M) | BERT-base | 112M | 82.1 | 62.2 | 44.7 | 94.0% |
99
+ | [NanoVDR-L](https://huggingface.co/nanovdr/NanoVDR-L) | ModernBERT | 151M | 82.4 | 61.5 | 44.2 | 93.4% |
100
+
101
+ ## Citation
102
+
103
+ ```bibtex
104
+ @article{nanovdr2026,
105
+ title={NanoVDR: Distilling a 2B Vision-Language Retriever into a 70M Text-Only Encoder for Visual Document Retrieval},
106
+ author={Liu, Zhuchenyang and Zhang, Yao and Xiao, Yu},
107
+ journal={arXiv preprint arXiv:2502.XXXXX},
108
+ year={2026}
109
+ }
110
+ ```
111
+
112
+ ## License
113
+
114
+ Apache 2.0
banner.png ADDED

Git LFS Details

  • SHA256: caec09fa91e56deeb2d53e7ca2613573fc67c4e5af50817437f67a606d8fe200
  • Pointer size: 132 Bytes
  • Size of remote file: 7.47 MB
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "dtype": "float32",
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "pad_token_id": 0,
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "transformers_version": "4.57.6",
22
+ "vocab_size": 30522
23
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "prompts": {},
3
+ "default_prompt_name": null,
4
+ "similarity_fn_name": "cosine"
5
+ }
modules.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0_Transformer",
5
+ "path": "0_Transformer",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1_Pooling",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2_Dense",
17
+ "path": "2_Dense",
18
+ "type": "sentence_transformers.models.Dense"
19
+ },
20
+ {
21
+ "idx": 3,
22
+ "name": "3_Dense",
23
+ "path": "3_Dense",
24
+ "type": "sentence_transformers.models.Dense"
25
+ },
26
+ {
27
+ "idx": 4,
28
+ "name": "4_Normalize",
29
+ "path": "4_Normalize",
30
+ "type": "sentence_transformers.models.Normalize"
31
+ }
32
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff