Upload NanoVDR-M-Multi model (multilingual variant)
Browse files- 0_Transformer/config.json +25 -0
- 0_Transformer/model.safetensors +3 -0
- 0_Transformer/sentence_bert_config.json +4 -0
- 0_Transformer/special_tokens_map.json +7 -0
- 0_Transformer/tokenizer.json +0 -0
- 0_Transformer/tokenizer_config.json +56 -0
- 0_Transformer/vocab.txt +0 -0
- 1_Pooling/config.json +9 -0
- 2_Dense/config.json +6 -0
- 2_Dense/pytorch_model.bin +3 -0
- 3_Dense/config.json +6 -0
- 3_Dense/pytorch_model.bin +3 -0
- README.md +153 -0
- config.json +25 -0
- config_sentence_transformers.json +5 -0
- modules.json +32 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +56 -0
- vocab.txt +0 -0
0_Transformer/config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"dtype": "float32",
|
| 8 |
+
"gradient_checkpointing": false,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"position_embedding_type": "absolute",
|
| 21 |
+
"transformers_version": "4.56.2",
|
| 22 |
+
"type_vocab_size": 2,
|
| 23 |
+
"use_cache": true,
|
| 24 |
+
"vocab_size": 30522
|
| 25 |
+
}
|
0_Transformer/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ba37632d781081d8ec828f55a5e4581863f5385663026d48bacd8f36de58c27
|
| 3 |
+
size 437951328
|
0_Transformer/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 512,
|
| 3 |
+
"do_lower_case": true
|
| 4 |
+
}
|
0_Transformer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
0_Transformer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
0_Transformer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": true,
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "[MASK]",
|
| 49 |
+
"model_max_length": 512,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "[SEP]",
|
| 52 |
+
"strip_accents": null,
|
| 53 |
+
"tokenize_chinese_chars": true,
|
| 54 |
+
"tokenizer_class": "BertTokenizer",
|
| 55 |
+
"unk_token": "[UNK]"
|
| 56 |
+
}
|
0_Transformer/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
1_Pooling/config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 768,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false
|
| 9 |
+
}
|
2_Dense/config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"in_features": 768,
|
| 3 |
+
"out_features": 768,
|
| 4 |
+
"bias": true,
|
| 5 |
+
"activation_function": "torch.nn.modules.activation.GELU"
|
| 6 |
+
}
|
2_Dense/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:263be3f66ae6f965511e0ffc7b8189655d2f26b27fbc5aa1159e58d92cf09ecb
|
| 3 |
+
size 2364309
|
3_Dense/config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"in_features": 768,
|
| 3 |
+
"out_features": 2048,
|
| 4 |
+
"bias": true,
|
| 5 |
+
"activation_function": "torch.nn.modules.linear.Identity"
|
| 6 |
+
}
|
3_Dense/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55758286f9a35831511a1afa2428bcbfb5d32688a12b0982342d9240a229f88d
|
| 3 |
+
size 6301589
|
README.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: sentence-transformers
|
| 3 |
+
pipeline_tag: sentence-similarity
|
| 4 |
+
tags:
|
| 5 |
+
- sentence-transformers
|
| 6 |
+
- feature-extraction
|
| 7 |
+
- visual-document-retrieval
|
| 8 |
+
- cross-modal-distillation
|
| 9 |
+
- multilingual
|
| 10 |
+
- nanovdr
|
| 11 |
+
base_model: google-bert/bert-base-uncased
|
| 12 |
+
language:
|
| 13 |
+
- en
|
| 14 |
+
- de
|
| 15 |
+
- fr
|
| 16 |
+
- es
|
| 17 |
+
- it
|
| 18 |
+
- pt
|
| 19 |
+
license: apache-2.0
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
> **Paper**: [NanoVDR: Distilling a 2B Vision-Language Retriever into a 70M Text-Only Encoder for Visual Document Retrieval](https://arxiv.org/abs/2603.12824) | [Blog](https://huggingface.co/blog/Ryenhails/nanovdr)
|
| 23 |
+
|
| 24 |
+
# NanoVDR-M-Multi: Multilingual Query Encoder for Visual Document Retrieval
|
| 25 |
+
|
| 26 |
+
**NanoVDR-M-Multi** is a 116M-parameter multilingual text-only query encoder for visual document retrieval. It retrieves document page images as effectively as Vision-Language Models 30-100x its size, with strong cross-lingual transfer across 6 languages.
|
| 27 |
+
|
| 28 |
+
Built on [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) and further trained with multilingual query augmentation (English + German, French, Spanish, Italian, Portuguese), it is the recommended model for production use with multilingual or mixed-language queries.
|
| 29 |
+
|
| 30 |
+
## Results
|
| 31 |
+
|
| 32 |
+
| Model | Params | ViDoRe v1 (en) | ViDoRe v2 (multi) | ViDoRe v3 (multi) |
|
| 33 |
+
|-------|--------|----------------|--------------------|--------------------|
|
| 34 |
+
| Qwen3-VL-Emb (Teacher) | 2.0B | 84.3 | 65.3 | 50.0 |
|
| 35 |
+
| **NanoVDR-M-Multi** | **116M** | **82.5** | **62.8** | **47.5** |
|
| 36 |
+
| NanoVDR-S-Multi | 69M | 82.2 | 61.9 | 46.5 |
|
| 37 |
+
| ColPali | ~3B | 84.2 | 54.7 | 42.0 |
|
| 38 |
+
|
| 39 |
+
### Per-Language Teacher Retention
|
| 40 |
+
|
| 41 |
+
| Language | NDCG@5 | Teacher Retention |
|
| 42 |
+
|----------|--------|-------------------|
|
| 43 |
+
| English | 50.7 | 93.0% |
|
| 44 |
+
| French | 47.8 | 93.6% |
|
| 45 |
+
| Spanish | 47.8 | 93.1% |
|
| 46 |
+
| Italian | 45.7 | 93.3% |
|
| 47 |
+
| German | 45.4 | 92.0% |
|
| 48 |
+
| Portuguese | 46.1 | 94.6% |
|
| 49 |
+
|
| 50 |
+
All 6 languages achieve >92% of the 2B teacher's performance.
|
| 51 |
+
|
| 52 |
+
## How It Works
|
| 53 |
+
|
| 54 |
+
NanoVDR decouples query encoding from document encoding in visual document retrieval:
|
| 55 |
+
- **Offline indexing**: The VLM teacher (Qwen3-VL-Embedding-2B) encodes document page images into single-vector embeddings. This is a one-time cost.
|
| 56 |
+
- **Online querying**: NanoVDR-M-Multi encodes text queries in any supported language into the same embedding space via a lightweight text encoder + MLP projector. No vision model needed at query time.
|
| 57 |
+
|
| 58 |
+
Retrieval uses standard cosine similarity between query and document embeddings.
|
| 59 |
+
|
| 60 |
+
## Usage
|
| 61 |
+
|
| 62 |
+
```python
|
| 63 |
+
from sentence_transformers import SentenceTransformer
|
| 64 |
+
|
| 65 |
+
# Load the multilingual query encoder
|
| 66 |
+
model = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
|
| 67 |
+
|
| 68 |
+
# Encode queries in any supported language
|
| 69 |
+
queries = [
|
| 70 |
+
"What was the revenue growth in Q3 2024?", # English
|
| 71 |
+
"Quel est le chiffre d'affaires du trimestre?", # French
|
| 72 |
+
"Wie hoch war das Umsatzwachstum im dritten Quartal?", # German
|
| 73 |
+
"Cual fue el crecimiento de ingresos en el Q3?", # Spanish
|
| 74 |
+
]
|
| 75 |
+
query_embeddings = model.encode(queries)
|
| 76 |
+
print(query_embeddings.shape) # (4, 2048)
|
| 77 |
+
|
| 78 |
+
# Retrieve against pre-indexed document embeddings from the VLM teacher
|
| 79 |
+
# scores = query_embeddings @ doc_embeddings.T
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
### Full Retrieval Pipeline
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
from sentence_transformers import SentenceTransformer
|
| 86 |
+
|
| 87 |
+
# Step 1: Index documents with the VLM teacher (one-time, offline)
|
| 88 |
+
from transformers import AutoModel
|
| 89 |
+
teacher = AutoModel.from_pretrained("Qwen/Qwen3-VL-Embedding-2B")
|
| 90 |
+
# doc_embeddings = teacher.encode(document_images) # See Qwen3-VL-Embedding docs
|
| 91 |
+
|
| 92 |
+
# Step 2: Query with NanoVDR-M-Multi (online, fast, CPU-only)
|
| 93 |
+
student = SentenceTransformer("nanovdr/NanoVDR-M-Multi")
|
| 94 |
+
query_emb = student.encode("Quel est le chiffre d'affaires?")
|
| 95 |
+
|
| 96 |
+
# Step 3: Retrieve
|
| 97 |
+
scores = query_emb @ doc_embeddings.T
|
| 98 |
+
top_k = scores.argsort()[-5:][::-1]
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Training Details
|
| 102 |
+
|
| 103 |
+
- **Architecture**: google-bert/bert-base-uncased + 2-layer MLP projector (768 → 768 → 2048)
|
| 104 |
+
- **Training objective**: Pointwise cosine alignment with teacher query embeddings
|
| 105 |
+
- **Training data**: 1.49M query-document pairs — 711K original (4 public sources) + 778K machine-translated queries in 5 languages (DE, FR, ES, IT, PT) via Helsinki-NLP Opus-MT models
|
| 106 |
+
- **Training cost**: ~15 GPU-hours on a single H200
|
| 107 |
+
- **Epochs**: 10, lr=3e-4, batch size 1024 (effective)
|
| 108 |
+
|
| 109 |
+
### Multilingual Augmentation Pipeline
|
| 110 |
+
|
| 111 |
+
1. Extract 489K English queries from training data
|
| 112 |
+
2. Translate to 5 target languages using [Helsinki-NLP Opus-MT](https://huggingface.co/Helsinki-NLP) models (~200K per language)
|
| 113 |
+
3. Re-encode translated queries with the frozen teacher in text mode to produce target embeddings
|
| 114 |
+
4. Combine with original 711K pairs → 1.49M total training samples
|
| 115 |
+
|
| 116 |
+
## Key Properties
|
| 117 |
+
|
| 118 |
+
- **Output dimension**: 2048 (aligned with Qwen3-VL-Embedding-2B)
|
| 119 |
+
- **Max sequence length**: 512 tokens
|
| 120 |
+
- **Supported languages**: English, German, French, Spanish, Italian, Portuguese
|
| 121 |
+
- **Similarity function**: Cosine similarity
|
| 122 |
+
- **Pooling**: Mean pooling
|
| 123 |
+
- **Normalization**: L2-normalized output
|
| 124 |
+
|
| 125 |
+
## Efficiency
|
| 126 |
+
|
| 127 |
+
| Metric | NanoVDR-M-Multi | ColPali (3B) | Teacher (2B) |
|
| 128 |
+
|--------|------------|--------------|--------------|
|
| 129 |
+
| Query latency (CPU, B=1) | 51 ms | 7,300 ms | GPU only |
|
| 130 |
+
| Model size | 116M | ~3B | 2B |
|
| 131 |
+
| Index type | Single-vector | Multi-vector | Single-vector |
|
| 132 |
+
| Scoring | Cosine | MaxSim | Cosine |
|
| 133 |
+
|
| 134 |
+
## Related Models
|
| 135 |
+
|
| 136 |
+
- [NanoVDR-S](https://huggingface.co/nanovdr/NanoVDR-S) — English-focused, same architecture
|
| 137 |
+
- [NanoVDR-M](https://huggingface.co/nanovdr/NanoVDR-M) — BERT-base backbone (116M)
|
| 138 |
+
- [NanoVDR-L](https://huggingface.co/nanovdr/NanoVDR-L) — ModernBERT backbone (155M)
|
| 139 |
+
|
| 140 |
+
## Citation
|
| 141 |
+
|
| 142 |
+
```bibtex
|
| 143 |
+
@article{nanovdr2026,
|
| 144 |
+
title={NanoVDR: Asymmetric Cross-Modal Distillation for Efficient Visual Document Retrieval},
|
| 145 |
+
author={...},
|
| 146 |
+
journal={arXiv preprint},
|
| 147 |
+
year={2026}
|
| 148 |
+
}
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
## License
|
| 152 |
+
|
| 153 |
+
Apache 2.0
|
config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"dtype": "float32",
|
| 8 |
+
"gradient_checkpointing": false,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_norm_eps": 1e-12,
|
| 15 |
+
"max_position_embeddings": 512,
|
| 16 |
+
"model_type": "bert",
|
| 17 |
+
"num_attention_heads": 12,
|
| 18 |
+
"num_hidden_layers": 12,
|
| 19 |
+
"pad_token_id": 0,
|
| 20 |
+
"position_embedding_type": "absolute",
|
| 21 |
+
"transformers_version": "4.56.2",
|
| 22 |
+
"type_vocab_size": 2,
|
| 23 |
+
"use_cache": true,
|
| 24 |
+
"vocab_size": 30522
|
| 25 |
+
}
|
config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"prompts": {},
|
| 3 |
+
"default_prompt_name": null,
|
| 4 |
+
"similarity_fn_name": "cosine"
|
| 5 |
+
}
|
modules.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0_Transformer",
|
| 5 |
+
"path": "0_Transformer",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1_Pooling",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2_Dense",
|
| 17 |
+
"path": "2_Dense",
|
| 18 |
+
"type": "sentence_transformers.models.Dense"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"idx": 3,
|
| 22 |
+
"name": "3_Dense",
|
| 23 |
+
"path": "3_Dense",
|
| 24 |
+
"type": "sentence_transformers.models.Dense"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"idx": 4,
|
| 28 |
+
"name": "4_Normalize",
|
| 29 |
+
"path": "4_Normalize",
|
| 30 |
+
"type": "sentence_transformers.models.Normalize"
|
| 31 |
+
}
|
| 32 |
+
]
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": true,
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "[MASK]",
|
| 49 |
+
"model_max_length": 512,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "[SEP]",
|
| 52 |
+
"strip_accents": null,
|
| 53 |
+
"tokenize_chinese_chars": true,
|
| 54 |
+
"tokenizer_class": "BertTokenizer",
|
| 55 |
+
"unk_token": "[UNK]"
|
| 56 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|