Upload model
Browse files- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +238 -0
- added_tokens.json +28 -0
- chat_template.jinja +85 -0
- config.json +60 -0
- config_sentence_transformers.json +9 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- modules.json +20 -0
- mteb_v2_eval_prompts.json +133 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +239 -0
- vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": false,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": true,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- mteb
|
| 4 |
+
- sentence-transformers
|
| 5 |
+
- transformers
|
| 6 |
+
language:
|
| 7 |
+
- multilingual
|
| 8 |
+
- af
|
| 9 |
+
- am
|
| 10 |
+
- ar
|
| 11 |
+
- as
|
| 12 |
+
- az
|
| 13 |
+
- be
|
| 14 |
+
- bg
|
| 15 |
+
- bn
|
| 16 |
+
- br
|
| 17 |
+
- bs
|
| 18 |
+
- ca
|
| 19 |
+
- cs
|
| 20 |
+
- cy
|
| 21 |
+
- da
|
| 22 |
+
- de
|
| 23 |
+
- el
|
| 24 |
+
- en
|
| 25 |
+
- eo
|
| 26 |
+
- es
|
| 27 |
+
- et
|
| 28 |
+
- eu
|
| 29 |
+
- fa
|
| 30 |
+
- fi
|
| 31 |
+
- fr
|
| 32 |
+
- fy
|
| 33 |
+
- ga
|
| 34 |
+
- gd
|
| 35 |
+
- gl
|
| 36 |
+
- gu
|
| 37 |
+
- ha
|
| 38 |
+
- he
|
| 39 |
+
- hi
|
| 40 |
+
- hr
|
| 41 |
+
- hu
|
| 42 |
+
- hy
|
| 43 |
+
- id
|
| 44 |
+
- is
|
| 45 |
+
- it
|
| 46 |
+
- ja
|
| 47 |
+
- jv
|
| 48 |
+
- ka
|
| 49 |
+
- kk
|
| 50 |
+
- km
|
| 51 |
+
- kn
|
| 52 |
+
- ko
|
| 53 |
+
- ku
|
| 54 |
+
- ky
|
| 55 |
+
- la
|
| 56 |
+
- lo
|
| 57 |
+
- lt
|
| 58 |
+
- lv
|
| 59 |
+
- mg
|
| 60 |
+
- mk
|
| 61 |
+
- ml
|
| 62 |
+
- mn
|
| 63 |
+
- mr
|
| 64 |
+
- ms
|
| 65 |
+
- my
|
| 66 |
+
- ne
|
| 67 |
+
- nl
|
| 68 |
+
- 'no'
|
| 69 |
+
- om
|
| 70 |
+
- or
|
| 71 |
+
- pa
|
| 72 |
+
- pl
|
| 73 |
+
- ps
|
| 74 |
+
- pt
|
| 75 |
+
- ro
|
| 76 |
+
- ru
|
| 77 |
+
- sa
|
| 78 |
+
- sd
|
| 79 |
+
- si
|
| 80 |
+
- sk
|
| 81 |
+
- sl
|
| 82 |
+
- so
|
| 83 |
+
- sq
|
| 84 |
+
- sr
|
| 85 |
+
- su
|
| 86 |
+
- sv
|
| 87 |
+
- sw
|
| 88 |
+
- ta
|
| 89 |
+
- te
|
| 90 |
+
- th
|
| 91 |
+
- tl
|
| 92 |
+
- tr
|
| 93 |
+
- ug
|
| 94 |
+
- uk
|
| 95 |
+
- ur
|
| 96 |
+
- uz
|
| 97 |
+
- vi
|
| 98 |
+
- xh
|
| 99 |
+
- yi
|
| 100 |
+
- zh
|
| 101 |
+
license: mit
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## harrier-oss-v1
|
| 105 |
+
|
| 106 |
+
harrier-oss-v1 is a family of multilingual text embedding models developed by Microsoft.
|
| 107 |
+
The models use decoder-only architectures with last-token pooling and L2 normalization to produce dense text embeddings.
|
| 108 |
+
They can be applied to a wide range of tasks, including but not limited to **retrieval**, **clustering**, **semantic similarity**, **classification**, **bitext mining**, and **reranking**.
|
| 109 |
+
The models achieve state-of-the-art results on the [Multilingual MTEB v2](https://huggingface.co/spaces/mteb/leaderboard) benchmark as of the release date.
|
| 110 |
+
|
| 111 |
+
| Model | Parameters | Embedding Dimension | Max Tokens | MTEB v2 Score |
|
| 112 |
+
|-----------------------------------------------------------------------------|------------|---------------------|------------|---------------|
|
| 113 |
+
| [harrier-oss-v1-270m](https://huggingface.co/microsoft/harrier-oss-v1-270m) | 270M | 640 | 32,768 | 66.5 |
|
| 114 |
+
| [harrier-oss-v1-0.6b](https://huggingface.co/microsoft/harrier-oss-v1-0.6b) | 0.6B | 1,024 | 32,768 | 69.0 |
|
| 115 |
+
| [harrier-oss-v1-27b](https://huggingface.co/microsoft/harrier-oss-v1-27b) | 27B | 5,376 | 32,768 | **74.3** |
|
| 116 |
+
|
| 117 |
+
## Training
|
| 118 |
+
|
| 119 |
+
All models are trained with contrastive learning objectives on a large-scale mixture of multilingual datasets covering diverse tasks.
|
| 120 |
+
The 270m and 0.6b variants are additionally trained with knowledge distillation from larger embedding models.
|
| 121 |
+
|
| 122 |
+
## Usage
|
| 123 |
+
|
| 124 |
+
Below is an example of how to encode queries and passages from the MS-MARCO passage ranking dataset.
|
| 125 |
+
|
| 126 |
+
### Sentence Transformers
|
| 127 |
+
|
| 128 |
+
```python
|
| 129 |
+
from sentence_transformers import SentenceTransformer
|
| 130 |
+
|
| 131 |
+
model = SentenceTransformer("microsoft/harrier-oss-v1-0.6b", model_kwargs={"dtype": "auto"})
|
| 132 |
+
|
| 133 |
+
queries = [
|
| 134 |
+
"how much protein should a female eat",
|
| 135 |
+
"summit define",
|
| 136 |
+
]
|
| 137 |
+
documents = [
|
| 138 |
+
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
|
| 139 |
+
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
query_embeddings = model.encode(queries, prompt_name="web_search_query")
|
| 143 |
+
document_embeddings = model.encode(documents)
|
| 144 |
+
|
| 145 |
+
scores = (query_embeddings @ document_embeddings.T) * 100
|
| 146 |
+
print(scores.tolist())
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
Have a look at [config_sentence_transformers.json](config_sentence_transformers.json) for the prompts that are pre-configured, such as `web_search_query`, `sts_query`, and `bitext_query`. You can also use a custom instruction directly via e.g. `model.encode(queries, prompt="Instruct: Retrieve semantically similar text\nQuery: ")`.
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
### Transformers
|
| 153 |
+
|
| 154 |
+
```python
|
| 155 |
+
import torch
|
| 156 |
+
import torch.nn.functional as F
|
| 157 |
+
|
| 158 |
+
from torch import Tensor
|
| 159 |
+
from transformers import AutoTokenizer, AutoModel
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
|
| 163 |
+
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
|
| 164 |
+
if left_padding:
|
| 165 |
+
return last_hidden_states[:, -1]
|
| 166 |
+
else:
|
| 167 |
+
sequence_lengths = attention_mask.sum(dim=1) - 1
|
| 168 |
+
batch_size = last_hidden_states.shape[0]
|
| 169 |
+
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def get_detailed_instruct(task_description: str, query: str) -> str:
|
| 173 |
+
return f'Instruct: {task_description}\nQuery: {query}'
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# Each query must come with a one-sentence instruction that describes the task
|
| 177 |
+
task = 'Given a web search query, retrieve relevant passages that answer the query'
|
| 178 |
+
queries = [
|
| 179 |
+
get_detailed_instruct(task, 'how much protein should a female eat'),
|
| 180 |
+
get_detailed_instruct(task, 'summit define')
|
| 181 |
+
]
|
| 182 |
+
# No need to add instruction for retrieval documents
|
| 183 |
+
documents = [
|
| 184 |
+
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
|
| 185 |
+
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."
|
| 186 |
+
]
|
| 187 |
+
input_texts = queries + documents
|
| 188 |
+
|
| 189 |
+
tokenizer = AutoTokenizer.from_pretrained('microsoft/harrier-oss-v1-0.6b')
|
| 190 |
+
model = AutoModel.from_pretrained('microsoft/harrier-oss-v1-0.6b', dtype='auto')
|
| 191 |
+
model.eval()
|
| 192 |
+
model.cuda()
|
| 193 |
+
|
| 194 |
+
max_length = 32768
|
| 195 |
+
# Tokenize the input texts
|
| 196 |
+
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
|
| 197 |
+
batch_dict = {k: v.cuda() for k, v in batch_dict.items()}
|
| 198 |
+
|
| 199 |
+
outputs = model(**batch_dict)
|
| 200 |
+
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
|
| 201 |
+
|
| 202 |
+
# normalize embeddings
|
| 203 |
+
embeddings = F.normalize(embeddings, p=2, dim=1)
|
| 204 |
+
scores = (embeddings[:2] @ embeddings[2:].T) * 100
|
| 205 |
+
print(scores.tolist())
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
## Supported Languages
|
| 209 |
+
|
| 210 |
+
The models are trained on multilingual data and support a wide range of languages,
|
| 211 |
+
including but not limited to: Arabic, Bulgarian, Catalan, Czech, Danish, German, Greek, English, Spanish,
|
| 212 |
+
Estonian, Persian, Finnish, French, Hebrew, Hindi, Croatian, Hungarian, Indonesian, Italian, Japanese,
|
| 213 |
+
Korean, Lithuanian, Latvian, Macedonian, Malay, Dutch, Norwegian, Polish, Portuguese, Romanian, Russian,
|
| 214 |
+
Slovak, Slovenian, Albanian, Serbian, Swedish, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Chinese.
|
| 215 |
+
|
| 216 |
+
## Evaluation
|
| 217 |
+
|
| 218 |
+
Please refer to the [mteb](https://github.com/embeddings-benchmark/mteb) repository for instructions on how to reproduce our scores.
|
| 219 |
+
The evaluation prompts used for each task are also available at [mteb_v2_eval_prompts.json](mteb_v2_eval_prompts.json).
|
| 220 |
+
|
| 221 |
+
## FAQ
|
| 222 |
+
|
| 223 |
+
**1. Do I need to add instructions to the query?**
|
| 224 |
+
|
| 225 |
+
Yes, this is how the model is trained; otherwise, you will see a performance degradation.
|
| 226 |
+
The task definition should be a one-sentence instruction that describes the task.
|
| 227 |
+
This is a way to customize text embeddings for different scenarios through natural language instructions.
|
| 228 |
+
|
| 229 |
+
On the other hand, there is no need to add instructions to the document side.
|
| 230 |
+
|
| 231 |
+
**2. Why are my reproduced results slightly different from those reported in the model card?**
|
| 232 |
+
|
| 233 |
+
Different versions of `transformers` and `pytorch` could cause negligible but non-zero performance differences.
|
| 234 |
+
|
| 235 |
+
**3. What pooling strategy does this model use?**
|
| 236 |
+
|
| 237 |
+
The model uses **last-token pooling** — the embedding of the last non-padding token is used as the sentence representation.
|
| 238 |
+
The embedding is then L2-normalized. This is handled automatically when using Sentence Transformers.
|
added_tokens.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</think>": 151668,
|
| 3 |
+
"</tool_call>": 151658,
|
| 4 |
+
"</tool_response>": 151666,
|
| 5 |
+
"<think>": 151667,
|
| 6 |
+
"<tool_call>": 151657,
|
| 7 |
+
"<tool_response>": 151665,
|
| 8 |
+
"<|box_end|>": 151649,
|
| 9 |
+
"<|box_start|>": 151648,
|
| 10 |
+
"<|endoftext|>": 151643,
|
| 11 |
+
"<|file_sep|>": 151664,
|
| 12 |
+
"<|fim_middle|>": 151660,
|
| 13 |
+
"<|fim_pad|>": 151662,
|
| 14 |
+
"<|fim_prefix|>": 151659,
|
| 15 |
+
"<|fim_suffix|>": 151661,
|
| 16 |
+
"<|im_end|>": 151645,
|
| 17 |
+
"<|im_start|>": 151644,
|
| 18 |
+
"<|image_pad|>": 151655,
|
| 19 |
+
"<|object_ref_end|>": 151647,
|
| 20 |
+
"<|object_ref_start|>": 151646,
|
| 21 |
+
"<|quad_end|>": 151651,
|
| 22 |
+
"<|quad_start|>": 151650,
|
| 23 |
+
"<|repo_name|>": 151663,
|
| 24 |
+
"<|video_pad|>": 151656,
|
| 25 |
+
"<|vision_end|>": 151653,
|
| 26 |
+
"<|vision_pad|>": 151654,
|
| 27 |
+
"<|vision_start|>": 151652
|
| 28 |
+
}
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3Model"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"dtype": "bfloat16",
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 3072,
|
| 14 |
+
"layer_types": [
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention"
|
| 43 |
+
],
|
| 44 |
+
"max_position_embeddings": 32768,
|
| 45 |
+
"max_window_layers": 28,
|
| 46 |
+
"model_type": "qwen3",
|
| 47 |
+
"num_attention_heads": 16,
|
| 48 |
+
"num_hidden_layers": 28,
|
| 49 |
+
"num_key_value_heads": 8,
|
| 50 |
+
"pad_token_id": 151643,
|
| 51 |
+
"rms_norm_eps": 1e-06,
|
| 52 |
+
"rope_scaling": null,
|
| 53 |
+
"rope_theta": 1000000,
|
| 54 |
+
"sliding_window": null,
|
| 55 |
+
"tie_word_embeddings": true,
|
| 56 |
+
"transformers_version": "4.57.6",
|
| 57 |
+
"use_cache": false,
|
| 58 |
+
"use_sliding_window": false,
|
| 59 |
+
"vocab_size": 151936
|
| 60 |
+
}
|
config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"prompts": {
|
| 3 |
+
"web_search_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ",
|
| 4 |
+
"sts_query": "Instruct: Retrieve semantically similar text\nQuery: ",
|
| 5 |
+
"bitext_query": "Instruct: Retrieve parallel sentences\nQuery: "
|
| 6 |
+
},
|
| 7 |
+
"default_prompt_name": null,
|
| 8 |
+
"similarity_fn_name": "cosine"
|
| 9 |
+
}
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bb124227f33c3dbf7fbbd38119b2afa8be959e93666d3c9be7142b66708b66c
|
| 3 |
+
size 1192133232
|
modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
mteb_v2_eval_prompts.json
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"AILAStatutes-query": "Identifying the most relevant statutes for a given situation",
|
| 3 |
+
"AfriSentiClassification": "Given a text, categorized by sentiment into positive, negative, or neutral",
|
| 4 |
+
"AlloProfClusteringS2S.v2": "Identify the topic of document titles from Allo Prof dataset",
|
| 5 |
+
"AlloprofReranking-query": "Given a question, retrieve passages that answer the question",
|
| 6 |
+
"AmazonCounterfactualClassification": "Given an Amazon review, judge whether it is counterfactual.",
|
| 7 |
+
"ArXivHierarchicalClusteringP2P": "Identify the main and secondary category of Arxiv papers based on the titles and abstracts",
|
| 8 |
+
"ArXivHierarchicalClusteringS2S": "Identify the main and secondary category of Arxiv papers based on the titles",
|
| 9 |
+
"ArguAna-query": "Given a claim, find documents that refute the claim",
|
| 10 |
+
"ArmenianParaphrasePC": "Retrieve semantically similar text",
|
| 11 |
+
"BUCC.v2": "Retrieve parallel sentences",
|
| 12 |
+
"BelebeleRetrieval-query": "Retrieval the relevant passage for the given query",
|
| 13 |
+
"BibleNLPBitextMining": "Retrieve parallel sentences",
|
| 14 |
+
"BigPatentClustering.v2": "Identify the category of documents from the Big Patent dataset",
|
| 15 |
+
"BiorxivClusteringP2P.v2": "Identify the main category of Biorxiv papers based on the titles and abstracts",
|
| 16 |
+
"BornholmBitextMining": "Retrieve parallel sentences",
|
| 17 |
+
"BrazilianToxicTweetsClassification": "Classify the toxic tweets in Brazilian Portuguese into one of the six categories: LGBTQ+phobia, Xenophobia, Obscene, Insult, Misogyny and Racism.",
|
| 18 |
+
"BulgarianStoreReviewSentimentClassfication": "Classify user reviews into positive, negative or mixed sentiment",
|
| 19 |
+
"CEDRClassification": "Given a comment as query, classify expressed emotions into joy, sadness, surprise, fear, and anger",
|
| 20 |
+
"CLSClusteringP2P.v2": "Identify the main category of scholar papers based on the titles and abstracts",
|
| 21 |
+
"CSFDSKMovieReviewSentimentClassification": "Given a movie review, classify its rating on a scale from 0 to 5",
|
| 22 |
+
"CTKFactsNLI": "Retrieve semantically similar text",
|
| 23 |
+
"CataloniaTweetClassification": "Given a tweet, classify its sentiment into AGAINST, FAVOR or NEUTRAL towards Catalonia's independence.",
|
| 24 |
+
"Core17InstructionRetrieval-query": "Retrieve relevant passages for the given query with conditions",
|
| 25 |
+
"CovidRetrieval-query": "Given a question on COVID-19, retrieve news articles that answer the question",
|
| 26 |
+
"CyrillicTurkicLangClassification": "Given a text, classify its language",
|
| 27 |
+
"CzechProductReviewSentimentClassification": "Classify product reviews into positive, neutral, or negative sentiment",
|
| 28 |
+
"DBpediaClassification": "Given the following text, retrieve the appropriate DBpedia category including Company, EducationalInstitution, Artist, Athlete, OfficeHolder, MeanOfTransportation, Building, NaturalPlace, Village, Animal, Plant, Album, Film, WrittenWork.",
|
| 29 |
+
"DalajClassification": "Classify texts based on linguistic acceptability in Swedish",
|
| 30 |
+
"DiaBlaBitextMining": "Retrieve parallel sentences",
|
| 31 |
+
"EstonianValenceClassification": "Given a news article, categorized by sentiment into negatiivne, positiivne, neutraalne or vastuolulin",
|
| 32 |
+
"FaroeseSTS": "Retrieve semantically similar text",
|
| 33 |
+
"FilipinoShopeeReviewsClassification": "Given a shop review, classify its rating on a scale from 1 to 5",
|
| 34 |
+
"FinParaSTS": "Retrieve semantically similar text",
|
| 35 |
+
"FinancialPhrasebankClassification": "Given financial news, categorized by sentiment into positive, negative, or neutral",
|
| 36 |
+
"FloresBitextMining": "Retrieve parallel sentences",
|
| 37 |
+
"GermanSTSBenchmark": "Retrieve semantically similar text",
|
| 38 |
+
"GreekLegalCodeClassification": "Given a greek legal text, classify its topic",
|
| 39 |
+
"GujaratiNewsClassification": "Given a Gujarati news articles, classify ist topic",
|
| 40 |
+
"HALClusteringS2S.v2": "Identify the topic of titles from HAL",
|
| 41 |
+
"HagridRetrieval-query": "Given a question, retrieve relevant responses",
|
| 42 |
+
"IN22GenBitextMining": "Retrieve parallel sentences",
|
| 43 |
+
"IndicCrosslingualSTS": "Retrieve semantically similar text",
|
| 44 |
+
"IndicGenBenchFloresBitextMining": "Retrieve parallel sentences",
|
| 45 |
+
"IndicLangClassification": "Given a text, classify its language",
|
| 46 |
+
"IndonesianIdClickbaitClassification": "Given an Indonesian news headlines, classify its into clickbait or non-clickbait",
|
| 47 |
+
"IsiZuluNewsClassification": "Given a news article, classify its topic",
|
| 48 |
+
"ItaCaseholdClassification": "Given a judgments, classify its topic",
|
| 49 |
+
"JSICK": "Retrieve semantically similar text",
|
| 50 |
+
"KorHateSpeechMLClassification": "Given a Korean online news comments, classify its fine-grained hate speech classes",
|
| 51 |
+
"KorSarcasmClassification": "Given a twitter, categorized it into sarcasm or not_sarcasm",
|
| 52 |
+
"KurdishSentimentClassification": "Given a text, categorized by sentiment into positive or negative",
|
| 53 |
+
"LEMBPasskeyRetrieval-query": "Retrieval the relevant passage for the given query",
|
| 54 |
+
"LegalBenchCorporateLobbying-query": "Given a query, retrieve relevant legal bill summaries",
|
| 55 |
+
"MIRACLRetrievalHardNegatives-query": "Retrieve Wikipedia passages that answer the question",
|
| 56 |
+
"MLQARetrieval-query": "Retrieval the relevant passage for the given query",
|
| 57 |
+
"MacedonianTweetSentimentClassification": "Given a Macedonian tweet, categorized by sentiment into positive, negative, or neutral",
|
| 58 |
+
"MalteseNewsClassification": "Given a maltese new, classify its topic",
|
| 59 |
+
"MasakhaNEWSClassification": "Classify the News in the given texts into one of the seven category: politics,sports,health,business,entertainment,technology,religion ",
|
| 60 |
+
"MasakhaNEWSClusteringS2S": "Identify the topic or theme of the given news articles based on the titles",
|
| 61 |
+
"MassiveIntentClassification": "Given a user utterance as query, find the user intents",
|
| 62 |
+
"MedrxivClusteringP2P.v2": "Identify the main category of Medrxiv papers based on the titles and abstracts",
|
| 63 |
+
"MultiEURLEXMultilabelClassification": "Given a text, classify its topic",
|
| 64 |
+
"MultiHateClassification": "Given a text, categorized by sentiment into hate or non-hate",
|
| 65 |
+
"NTREXBitextMining": "Retrieve parallel sentences",
|
| 66 |
+
"NepaliNewsClassification": "Given a news article, categorized it into business, entertainment or sports",
|
| 67 |
+
"News21InstructionRetrieval-query": "Retrieve relevant passages for the given query with conditions",
|
| 68 |
+
"NollySentiBitextMining": "Retrieve parallel sentences",
|
| 69 |
+
"NordicLangClassification": "Given a text in a Nordic language, classify the language into one of the following categories: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokmål), Faroese, Icelandic.",
|
| 70 |
+
"NorwegianCourtsBitextMining": "Retrieve parallel sentences",
|
| 71 |
+
"NusaParagraphEmotionClassification": "Classify the emotion into one of the following categories: fear, sadness, anger, happy, love, surprise, shame.",
|
| 72 |
+
"NusaTranslationBitextMining": "Retrieve parallel sentences",
|
| 73 |
+
"NusaX-senti": "Given a text, categorized by sentiment into positive or negative",
|
| 74 |
+
"NusaXBitextMining": "Retrieve parallel sentences",
|
| 75 |
+
"OdiaNewsClassification": "Given a news article, categorized it into business, entertainment or sports",
|
| 76 |
+
"OpusparcusPC": "Retrieve semantically similar text",
|
| 77 |
+
"PAC": "Classify Polish contract clauses into one of the following two types: \"Safe Contract Clauses\" and \"Unfair Contract Clauses\".",
|
| 78 |
+
"PawsXPairClassification": "Retrieve semantically similar text",
|
| 79 |
+
"PlscClusteringP2P.v2": "Identify the category of titles+abstracts from Library of Science",
|
| 80 |
+
"PoemSentimentClassification": "Given the following verse from a poem, classify its sentiment as negative, neutral, positive, or mixed.",
|
| 81 |
+
"PolEmo2.0-OUT": "Classify the sentiment of products and school online reviews",
|
| 82 |
+
"PpcPC": "Retrieve semantically similar text",
|
| 83 |
+
"PunjabiNewsClassification": "Given a news article, categorized it into two-classes",
|
| 84 |
+
"RTE3": "Retrieve semantically similar text",
|
| 85 |
+
"Robust04InstructionRetrieval-query": "Retrieve relevant passages for the given query with conditions",
|
| 86 |
+
"RomaniBibleClustering": "Identify verses from the Bible in Kalderash Romani by book.",
|
| 87 |
+
"RuBQReranking-query": "Given a question, retrieve Wikipedia passages that answer the question",
|
| 88 |
+
"SCIDOCS-query": "Given a scientific paper title, retrieve paper abstracts that are cited by the given paper",
|
| 89 |
+
"SIB200ClusteringS2S": "Identify the category of documents",
|
| 90 |
+
"SICK-R": "Retrieve semantically similar text",
|
| 91 |
+
"STS12": "Retrieve semantically related sentences",
|
| 92 |
+
"STS13": "Retrieve semantically similar text",
|
| 93 |
+
"STS14": "Retrieve semantically similar text",
|
| 94 |
+
"STS15": "Retrieve semantically similar text",
|
| 95 |
+
"STS17": "Retrieve semantically similar text",
|
| 96 |
+
"STS22.v2": "Given a document, retrieve semantically related documents",
|
| 97 |
+
"STSB": "Retrieve semantically similar text",
|
| 98 |
+
"STSBenchmark": "Retrieve semantically similar text",
|
| 99 |
+
"STSES": "Given a Spanish sentence, retrieve semantically related Spanish sentences",
|
| 100 |
+
"ScalaClassification": "Classify passages into correct or correct in Scandinavian Languages based on linguistic acceptability",
|
| 101 |
+
"SemRel24STS": "Retrieve semantically similar text",
|
| 102 |
+
"SentimentAnalysisHindi": "Given a hindi text, categorized by sentiment into positive, negative or neutral",
|
| 103 |
+
"SinhalaNewsClassification": "Given a news article, categorized it into political, business, technology, sports and Entertainment",
|
| 104 |
+
"SiswatiNewsClassification": "Identify fine-grained news categories in Siswati language.",
|
| 105 |
+
"SlovakMovieReviewSentimentClassification": "Given a movie review, categorized it into positive or negative",
|
| 106 |
+
"SpartQA-query": "Given the following spatial reasoning question, retrieve the right answer.",
|
| 107 |
+
"SprintDuplicateQuestions": "Find questions that have the same meaning as the input question",
|
| 108 |
+
"StackExchangeClustering.v2": "Identify the topic or theme of StackExchange posts based on the titles",
|
| 109 |
+
"StackOverflowQA-query": "Given a question about coding, retrieval code or passage that can solve user's question",
|
| 110 |
+
"StatcanDialogueDatasetRetrieval-query": "Retrieval the relevant passage for the given query",
|
| 111 |
+
"SwahiliNewsClassification": "Given a news article, classify its domain",
|
| 112 |
+
"SwednClusteringP2P": "Identify news categories in Swedish passages",
|
| 113 |
+
"SwissJudgementClassification": "Given a news article, categorized it into approval or dismissal",
|
| 114 |
+
"T2Reranking-query": "Given a Chinese search query, retrieve web passages that answer the question",
|
| 115 |
+
"TERRa": "Given a premise, retrieve a hypothesis that is entailed by the premise",
|
| 116 |
+
"TRECCOVID-query": "Given a medical query, retrieve documents that answer the query",
|
| 117 |
+
"Tatoeba": "Retrieve parallel sentences",
|
| 118 |
+
"TempReasonL1-query": "Given the following question about time, retrieve the correct answer.",
|
| 119 |
+
"ToxicConversationsClassification": "Classify the given comments as either toxic or not toxic",
|
| 120 |
+
"TswanaNewsClassification": "Given a news article, classify its topic",
|
| 121 |
+
"TweetTopicSingleClassification": "Gvien a twitter, classify its topic",
|
| 122 |
+
"TwitterHjerneRetrieval-query": "Retrieve answers to questions asked in Danish tweets",
|
| 123 |
+
"TwitterURLCorpus": "Find tweets that have the same meaning as the input tweet",
|
| 124 |
+
"VoyageMMarcoReranking-query": "Given a Japanese search query, retrieve web passages that answer the question",
|
| 125 |
+
"WebLINXCandidatesReranking-query": "Retrieval the relevant passage for the given query",
|
| 126 |
+
"WikiCitiesClustering": "Identify of Wikipedia articles of cities by country",
|
| 127 |
+
"WikiClusteringP2P.v2": "Identify the category of wiki passages",
|
| 128 |
+
"WikipediaRerankingMultilingual-query": "Retrieval the relevant passage for the given query",
|
| 129 |
+
"WikipediaRetrievalMultilingual-query": "Retrieval the relevant passage for the given query",
|
| 130 |
+
"WinoGrande-query": "Given the following sentence, retrieve an appropriate answer to fill in the missing underscored part.",
|
| 131 |
+
"XNLI": "Retrieve semantically similar text",
|
| 132 |
+
"indonli": "Retrieve semantically similar text"
|
| 133 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
|
| 3 |
+
size 11423705
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"additional_special_tokens": [
|
| 215 |
+
"<|im_start|>",
|
| 216 |
+
"<|im_end|>",
|
| 217 |
+
"<|object_ref_start|>",
|
| 218 |
+
"<|object_ref_end|>",
|
| 219 |
+
"<|box_start|>",
|
| 220 |
+
"<|box_end|>",
|
| 221 |
+
"<|quad_start|>",
|
| 222 |
+
"<|quad_end|>",
|
| 223 |
+
"<|vision_start|>",
|
| 224 |
+
"<|vision_end|>",
|
| 225 |
+
"<|vision_pad|>",
|
| 226 |
+
"<|image_pad|>",
|
| 227 |
+
"<|video_pad|>"
|
| 228 |
+
],
|
| 229 |
+
"bos_token": null,
|
| 230 |
+
"clean_up_tokenization_spaces": false,
|
| 231 |
+
"eos_token": "<|im_end|>",
|
| 232 |
+
"errors": "replace",
|
| 233 |
+
"extra_special_tokens": {},
|
| 234 |
+
"model_max_length": 131072,
|
| 235 |
+
"pad_token": "<|endoftext|>",
|
| 236 |
+
"split_special_tokens": false,
|
| 237 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 238 |
+
"unk_token": null
|
| 239 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|