Sentence Similarity
sentence-transformers
Safetensors
Hebrew
hebrew
semantic-retrieval
information-retrieval
dense-retrieval
reranking
ensemble
competition
Instructions to use HebArabNlpProject/Semantic-Retrieval-1st-place with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use HebArabNlpProject/Semantic-Retrieval-1st-place with sentence-transformers:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("HebArabNlpProject/Semantic-Retrieval-1st-place")
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)  # [3, 3]

- Notebooks
- Google Colab
- Kaggle
Uploading the 1st-place model
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +7 -0
- victord/sub19/model.py +1712 -0
- victord/sub19/models/Solon-embeddings-large-0.1/1_Pooling/config.json +10 -0
- victord/sub19/models/Solon-embeddings-large-0.1/README.md +810 -0
- victord/sub19/models/Solon-embeddings-large-0.1/config.json +27 -0
- victord/sub19/models/Solon-embeddings-large-0.1/config_sentence_transformers.json +14 -0
- victord/sub19/models/Solon-embeddings-large-0.1/model.safetensors +3 -0
- victord/sub19/models/Solon-embeddings-large-0.1/modules.json +20 -0
- victord/sub19/models/Solon-embeddings-large-0.1/sentence_bert_config.json +4 -0
- victord/sub19/models/Solon-embeddings-large-0.1/sentencepiece.bpe.model +3 -0
- victord/sub19/models/Solon-embeddings-large-0.1/special_tokens_map.json +51 -0
- victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json +3 -0
- victord/sub19/models/Solon-embeddings-large-0.1/tokenizer_config.json +56 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/1_Pooling/config.json +10 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/README.md +93 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config.json +27 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config_sentence_transformers.json +14 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/model.safetensors +3 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/modules.json +20 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/sentence_bert_config.json +4 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/special_tokens_map.json +51 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json +3 -0
- victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer_config.json +62 -0
- victord/sub19/models/bge-m3/1_Pooling/config.json +10 -0
- victord/sub19/models/bge-m3/README.md +300 -0
- victord/sub19/models/bge-m3/config.json +27 -0
- victord/sub19/models/bge-m3/config_sentence_transformers.json +14 -0
- victord/sub19/models/bge-m3/model.safetensors +3 -0
- victord/sub19/models/bge-m3/modules.json +20 -0
- victord/sub19/models/bge-m3/sentence_bert_config.json +4 -0
- victord/sub19/models/bge-m3/sentencepiece.bpe.model +3 -0
- victord/sub19/models/bge-m3/special_tokens_map.json +51 -0
- victord/sub19/models/bge-m3/tokenizer.json +3 -0
- victord/sub19/models/bge-m3/tokenizer_config.json +56 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/README.md +503 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/config.json +37 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/model.safetensors +3 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/sentencepiece.bpe.model +3 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/special_tokens_map.json +51 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json +3 -0
- victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer_config.json +57 -0
- victord/sub19/models/multilingual-e5-large-instruct/1_Pooling/config.json +10 -0
- victord/sub19/models/multilingual-e5-large-instruct/README.md +0 -0
- victord/sub19/models/multilingual-e5-large-instruct/config.json +27 -0
- victord/sub19/models/multilingual-e5-large-instruct/config_sentence_transformers.json +14 -0
- victord/sub19/models/multilingual-e5-large-instruct/model.safetensors +3 -0
- victord/sub19/models/multilingual-e5-large-instruct/modules.json +20 -0
- victord/sub19/models/multilingual-e5-large-instruct/sentence_bert_config.json +4 -0
- victord/sub19/models/multilingual-e5-large-instruct/sentencepiece.bpe.model +3 -0
- victord/sub19/models/multilingual-e5-large-instruct/special_tokens_map.json +51 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
victord/sub19/models/bge-m3/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
victord/sub19/models/multilingual-e5-large-instruct/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
victord/sub19/model.py
ADDED
|
@@ -0,0 +1,1712 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 4 |
+
os.environ.setdefault("HF_HUB_OFFLINE", "1") # never hit the Hub
|
| 5 |
+
os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") # Transformers offline
|
| 6 |
+
os.environ.setdefault("HF_DATASETS_OFFLINE", "1") # Datasets offline
|
| 7 |
+
os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
|
| 8 |
+
os.environ.setdefault("DISABLE_TQDM", '1')
|
| 9 |
+
os.environ.setdefault("TQDM_DISABLE", '1')
|
| 10 |
+
os.environ["TQDM_DISABLE"] = "1"
|
| 11 |
+
os.environ["DISABLE_TQDM"] = "1"
|
| 12 |
+
import json
|
| 13 |
+
import torch
|
| 14 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 15 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 16 |
+
import torch.nn.functional as F
|
| 17 |
+
import numpy as np
|
| 18 |
+
from transformers import AutoTokenizer, AutoModel
|
| 19 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 20 |
+
import string
|
| 21 |
+
from sklearn.feature_extraction import _stop_words
|
| 22 |
+
|
| 23 |
+
import time
|
| 24 |
+
|
| 25 |
+
import bm25s
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class BM25sRetriever:
    """Lexical (BM25) retriever over a fixed passage corpus, backed by the bm25s library."""

    def __init__(self, passages, ids):
        """Tokenize and index `passages`.

        passages: iterable of passage strings to index.
        ids: corpus ids aligned one-to-one with `passages`.
        """
        self.corpus_tokens = bm25s.tokenize(passages, show_progress=False)
        self.retriever = bm25s.BM25()
        self.retriever.index(self.corpus_tokens, show_progress=False, leave_progress=False)
        self.corpus_ids = ids
        # Map corpus id -> row position in the index, so scores can be looked up by id.
        self.id2idx = {cid: i for i, cid in enumerate(self.corpus_ids)}

    def get_scores(self, query_text):
        """Return BM25 scores for `query_text` against the entire corpus.

        Scores come back unsorted (sorted=False), aligned with the retrieved
        document ids in element [0] of bm25s's (ids, scores) result.
        """
        # show_progress=False for consistency with indexing — the surrounding
        # module globally disables tqdm, and the original call here leaked a bar.
        query_tokens = bm25s.tokenize(query_text, show_progress=False)
        # retrieve() returns (doc_ids, scores); take the scores row of the single query.
        return self.retriever.retrieve(
            query_tokens,
            sorted=False,
            k=len(self.corpus_ids),
            show_progress=False,
            leave_progress=False,
        )[1][0]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class E5Retriever:
    """Dense retriever built on a local multilingual-E5-large checkpoint.

    E5 models require "query: " / "passage: " input prefixes. Embeddings are
    attention-mask-weighted mean pools of the last hidden state, L2-normalized
    (both are part of the E5 usage contract).
    """

    # Hidden size of multilingual-e5-large; used for the zero-vector fallback
    # when even single-item encoding fails.
    _EMBED_DIM = 1024

    def __init__(self, model_name=None, device=None):
        """Load the tokenizer and bf16 model.

        model_name: explicit model path/name. When None, falls back to the
            bundled 'models/multilingual-e5-large_pseudo_full' directory next
            to this file (the module runs fully offline).
        device: torch device string; defaults to cuda when available.
        """
        if model_name is None:
            local_model_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'models', 'multilingual-e5-large_pseudo_full')
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local E5 model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Free as much GPU memory as possible before loading the weights.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Loading E5 multilingual model on device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16).to(self.device)
        self.model.eval()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.corpus_ids = []
        self.corpus_embeddings = None

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Mask-weighted mean pooling followed by L2 normalization (E5 convention).

        The float() on the mask promotes the bf16 hidden states to float32,
        so downstream .numpy() conversion works. Extracted here because the
        original class repeated this snippet four times.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against all-zero masks
        return F.normalize(summed / counts, p=2, dim=1)

    def _encode_batch(self, encoded):
        """Forward one tokenized batch and return pooled embeddings on CPU."""
        with torch.no_grad():
            model_output = self.model(**encoded)
        return self._mean_pool(
            model_output.last_hidden_state, encoded['attention_mask']).cpu()

    def embed_texts(self, texts, is_query=False, batch_size=32):
        """Embed `texts` with the proper E5 prefix.

        texts: iterable of raw strings.
        is_query: True -> "query: " prefix, False -> "passage: " prefix.
        batch_size: texts per forward pass; halves to single items on CUDA OOM.

        Returns a (len(texts), 1024) numpy array of L2-normalized embeddings.
        Texts that fail even the single-item fallback become zero rows, so the
        output stays aligned with the input order.
        """
        prefix = "query: " if is_query else "passage: "
        prefixed_texts = [f"{prefix}{text.strip()}" for text in texts]

        all_embeddings = []
        total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size

        for i in range(0, len(prefixed_texts), batch_size):
            batch_num = i // batch_size + 1
            if not is_query and batch_num % 50 == 0:
                # Periodic progress + cache trim for long corpus runs.
                print(f"Processing batch {batch_num}/{total_batches} "
                      f"({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_texts = prefixed_texts[i:i + batch_size]
            try:
                encoded = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt',
                ).to(self.device)
                all_embeddings.append(self._encode_batch(encoded))
                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry one text at a time; keep output rows aligned with input.
                for single_text in batch_texts:
                    try:
                        encoded = self.tokenizer(
                            [single_text],
                            padding=True,
                            truncation=True,
                            max_length=512,
                            return_tensors='pt',
                        ).to(self.device)
                        all_embeddings.append(self._encode_batch(encoded))
                        del encoded
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        all_embeddings.append(
                            torch.zeros(1, self._EMBED_DIM).float())

        return torch.cat(all_embeddings, dim=0).numpy()

    def prepare_corpus(self, texts, ids, batch_size=32):
        """Tokenize, chunk, and embed a passage corpus.

        Passages longer than 512 tokens are split into overlapping windows of
        up to 512 - prefix - suffix body tokens (50% overlap); each window is
        embedded separately with the prefix/suffix tokens re-attached.

        Returns:
            (embeddings, all_ids, id2idx) where
            embeddings: (num_chunks, 1024) tensor on the model's device;
            all_ids: list of (passage_id, window_info) per chunk — window_info
                is None for unsplit passages, else (start, end, total_body_tokens);
            id2idx: passage_id -> list of chunk row indices in `embeddings`.
        """
        prefixed_texts = [f"passage: {text.strip()}" for text in texts]

        all_embeddings = []
        all_tokens = []
        all_ids = []
        all_len = []

        # "passage: " tokenizes to 3 leading tokens (incl. BOS); 1 trailing EOS.
        # NOTE(review): these counts are tokenizer-specific — confirm for any
        # checkpoint other than the bundled one.
        prefix_len = 3
        suffix_len = 1
        max_length = 512
        max_passage = max_length - prefix_len - suffix_len
        overlap = max_passage // 2

        for i in range(len(prefixed_texts)):
            encoded = self.tokenizer(
                [prefixed_texts[i]], padding=False, truncation=False)
            input_ids = encoded['input_ids'][0]
            attn = encoded['attention_mask'][0]

            if len(input_ids) > max_length:
                # Slide a half-overlapping window over the body tokens,
                # re-attaching the prefix and suffix to every chunk.
                body_positions = list(range(prefix_len, len(input_ids) - suffix_len))
                i0 = 0
                while i0 < len(body_positions) - overlap:
                    i1 = min(i0 + max_passage, len(body_positions))
                    all_ids.append((ids[i], (i0, i1, len(body_positions))))
                    all_tokens.append({
                        'input_ids': input_ids[:prefix_len]
                                     + input_ids[i0 + prefix_len:i1 + prefix_len]
                                     + input_ids[-suffix_len:],
                        'attention_mask': attn[:prefix_len]
                                          + attn[i0 + prefix_len:i1 + prefix_len]
                                          + attn[-suffix_len:],
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((ids[i], None))
                all_tokens.append(
                    {'input_ids': input_ids, 'attention_mask': attn})
                all_len.append(len(input_ids) - prefix_len - suffix_len)

        total_batches = (len(all_tokens) + batch_size - 1) // batch_size

        # Sort chunks by body length so each batch pads to a similar length
        # (less wasted compute on padding).
        all_len, all_tokens, all_ids = zip(
            *sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0]))

        # Fix: pad with the tokenizer's real pad id. The original hard-coded 0,
        # which is a live token (<s>) in XLM-R-style vocabularies; harmless for
        # pooling (mask is 0 there) but wrong in principle.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        for i in range(0, len(all_tokens), batch_size):
            batch_num = i // batch_size + 1
            if batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} "
                      f"({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_tokens = all_tokens[i:i + batch_size]
            batch_max = max(len(s['input_ids']) for s in batch_tokens)
            encoded = {
                'input_ids': torch.tensor(
                    [s['input_ids'] + [pad_id] * (batch_max - len(s['input_ids']))
                     for s in batch_tokens],
                    dtype=torch.long).to(self.device),
                'attention_mask': torch.tensor(
                    [s['attention_mask'] + [0] * (batch_max - len(s['attention_mask']))
                     for s in batch_tokens],
                    dtype=torch.long).to(self.device),
            }

            try:
                all_embeddings.append(self._encode_batch(encoded))
                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Bug fix: the original fallback did `encoded[j:j+1]` on the
                # dict itself, which raises TypeError (unhashable slice); the
                # broad except then silently replaced whole rows with zeros.
                # Slice each tensor row instead and retry one item at a time.
                n_rows = encoded['input_ids'].size(0)
                for j in range(n_rows):
                    try:
                        single = {k: v[j:j + 1] for k, v in encoded.items()}
                        all_embeddings.append(self._encode_batch(single))
                        del single
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        all_embeddings.append(
                            torch.zeros(1, self._EMBED_DIM).float())
                del encoded

        # Group chunk row indices by their passage id for later score merging.
        id2idx = {}
        for idx, (passage_id, _window) in enumerate(all_ids):
            id2idx.setdefault(passage_id, []).append(idx)

        return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
class E5InstructRetriever:
    """Dense retriever backed by the multilingual-e5-large-instruct model.

    Queries are wrapped in the E5-instruct ``Instruct: <task>\\nQuery: <text>``
    template; passages are embedded without a prefix. Token embeddings are
    mean-pooled over the attention mask and L2-normalized, both of which E5
    requires for meaningful cosine similarities.
    """

    def __init__(self, model_name=None, device=None):
        """Load tokenizer and model, preferring a local checkpoint if present.

        Args:
            model_name: HF model id or local path. When None, falls back to
                ``./models/multilingual-e5-large-instruct`` next to this file.
            device: torch device string; defaults to CUDA when available.
        """
        if model_name is None:
            local_model_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'models', 'multilingual-e5-large-instruct')
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local E5 Instruct model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Free cached GPU memory before loading the (large) weights.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Loading E5 Instruct multilingual model on device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # bfloat16 halves memory versus fp32 for inference-only use.
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16).to(self.device)
        self.model.eval()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.corpus_ids = []
        self.corpus_embeddings = None

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Mean-pool token vectors over non-padding positions and L2-normalize.

        Args:
            last_hidden_state: float tensor (batch, seq, hidden).
            attention_mask: int tensor (batch, seq); 1 marks real tokens.

        Returns:
            L2-normalized (batch, hidden) float tensor.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against all-zero masks
        return torch.nn.functional.normalize(summed / counts, p=2, dim=1)

    def _embed_encoded(self, encoded):
        """Forward one tokenized batch and return pooled embeddings on CPU."""
        with torch.no_grad():
            model_output = self.model(**encoded)
        return self._mean_pool(model_output.last_hidden_state,
                               encoded['attention_mask']).cpu()

    def embed_texts(self, texts, is_query=False, batch_size=32):
        """Embed a list of texts, applying the E5-instruct query template.

        Args:
            texts: list of strings.
            is_query: when True, wraps each text in the instruct/query template.
            batch_size: forward-pass batch size; halved to single items on OOM.

        Returns:
            numpy array of shape (len(texts), hidden), L2-normalized rows.
            Texts that fail even single-item embedding get a zero row.
        """
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        if is_query:
            prefixed_texts = [f"Instruct: {task}\nQuery: {text.strip()}" for text in texts]
        else:
            prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size

        for i in range(0, len(prefixed_texts), batch_size):
            batch_num = i // batch_size + 1
            if not is_query and batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_texts = prefixed_texts[i:i + batch_size]

            try:
                encoded = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ).to(self.device)

                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry the failed batch one text at a time.
                for single_text in batch_texts:
                    try:
                        encoded = self.tokenizer(
                            [single_text],
                            padding=True,
                            truncation=True,
                            max_length=512,
                            return_tensors='pt'
                        ).to(self.device)

                        all_embeddings.append(self._embed_encoded(encoded))

                        del encoded
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment: E5-large embeds into 1024 dims.
                        all_embeddings.append(torch.zeros(1, 1024).float())

        return torch.cat(all_embeddings, dim=0).numpy()

    def prepare_corpus(self, texts, ids, batch_size=32):
        """Chunk long passages with 50% overlap and embed every chunk.

        Passages longer than 512 tokens are split into overlapping windows of
        content tokens; each window keeps the tokenizer's special prefix and
        suffix tokens so the model sees well-formed sequences.

        Args:
            texts: passage strings.
            ids: parallel passage ids.
            batch_size: forward-pass batch size.

        Returns:
            (embeddings, all_ids, id2idx):
                embeddings — (num_chunks, hidden) tensor on the model device;
                all_ids    — per chunk: (doc_id, (start, end, total) or None);
                id2idx     — doc_id -> list of row indices into embeddings.
        """
        prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        all_tokens = []
        all_ids = []
        all_len = []

        prefix_len = 1   # leading special token (e.g. <s>)
        suffix_len = 1   # trailing special token (e.g. </s>)
        max_length = 512
        max_passage = max_length - prefix_len - suffix_len
        overlap = max_passage // 2  # consecutive windows share half their tokens

        for i in range(len(prefixed_texts)):
            encoded = self.tokenizer(
                [prefixed_texts[i]],
                padding=False,
                truncation=False,
            )

            if len(encoded['input_ids'][0]) > max_length:
                # Window over content tokens only (special tokens excluded).
                _idxs = list(range(prefix_len, len(encoded['input_ids'][0]) - suffix_len))

                i0 = 0
                while i0 < len(_idxs) - overlap:
                    i1 = min(i0 + max_passage, len(_idxs))
                    all_ids.append((ids[i], (i0, i1, len(_idxs))))
                    all_tokens.append({
                        'input_ids': encoded['input_ids'][0][:prefix_len]
                                     + encoded['input_ids'][0][i0 + prefix_len:i1 + prefix_len]
                                     + encoded['input_ids'][0][-suffix_len:],
                        'attention_mask': encoded['attention_mask'][0][:prefix_len]
                                          + encoded['attention_mask'][0][i0 + prefix_len:i1 + prefix_len]
                                          + encoded['attention_mask'][0][-suffix_len:]
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((ids[i], None))
                all_tokens.append({'input_ids': encoded['input_ids'][0],
                                   'attention_mask': encoded['attention_mask'][0]})
                all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len)

        total_batches = (len(all_tokens) + batch_size - 1) // batch_size

        # Sort chunks by token length so batches are length-homogeneous,
        # minimizing wasted padding compute.
        all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids),
                                                   key=lambda x: x[0]))

        for i in range(0, len(all_tokens), batch_size):
            batch_num = i // batch_size + 1
            if batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_tokens = all_tokens[i:i + batch_size]

            # Right-pad the batch to its longest member.
            # NOTE(review): pads input_ids with 0, which may differ from the
            # tokenizer's pad id; harmless because attention_mask zeroes
            # those positions out of the pooling.
            batch_max = max(len(t['input_ids']) for t in batch_tokens)
            encoded = {
                'attention_mask': torch.tensor(
                    [t['attention_mask'] + (batch_max - len(t['attention_mask'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
                'input_ids': torch.tensor(
                    [t['input_ids'] + (batch_max - len(t['input_ids'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
            }

            try:
                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry one row at a time. Fixes the original fallback, which
                # iterated over len(encoded) (the number of dict KEYS) and
                # sliced the dict itself (encoded[j:j+1] -> TypeError).
                for j in range(encoded['input_ids'].size(0)):
                    try:
                        encoded0 = {k: v[j:j + 1] for k, v in encoded.items()}
                        all_embeddings.append(self._embed_encoded(encoded0))

                        del encoded0
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment with a zero placeholder (1024 dims).
                        all_embeddings.append(torch.zeros(1, 1024).float())

        id2idx = {}
        for idx in range(len(all_ids)):
            id2idx.setdefault(all_ids[idx][0], []).append(idx)

        return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
class SnowflakeInstructRetriever:
    """Dense retriever backed by snowflake-arctic-embed-l-v2.0.

    Queries get the ``query: `` prefix; passages are embedded without a
    prefix. Embeddings are taken from the CLS (first) token and L2-normalized,
    the pooling scheme used by the arctic-embed family.
    """

    def __init__(self, model_name=None, device=None):
        """Load tokenizer and model, preferring a local checkpoint if present.

        Args:
            model_name: HF model id or local path. When None, falls back to
                ``./models/snowflake-arctic-embed-l-v2.0`` next to this file.
            device: torch device string; defaults to CUDA when available.
        """
        if model_name is None:
            local_model_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'models', 'snowflake-arctic-embed-l-v2.0')
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local snowflake-arctic-embed-l-v2.0 model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Free cached GPU memory before loading the (large) weights.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Loading snowflake-arctic-embed-l-v2.0 multilingual model on device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # bfloat16 halves memory versus fp32 for inference-only use.
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16).to(self.device)
        self.model.eval()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.corpus_ids = []
        self.corpus_embeddings = None

    @staticmethod
    def _cls_pool(last_hidden_state):
        """Take the CLS (first-position) vector and L2-normalize it.

        Args:
            last_hidden_state: float tensor (batch, seq, hidden).

        Returns:
            L2-normalized (batch, hidden) float32 tensor.
        """
        return torch.nn.functional.normalize(last_hidden_state[:, 0].float(), p=2, dim=1)

    def _embed_encoded(self, encoded):
        """Forward one tokenized batch and return CLS embeddings on CPU."""
        with torch.no_grad():
            model_output = self.model(**encoded)
        # model_output[0] is the last hidden state.
        return self._cls_pool(model_output[0]).cpu()

    def embed_texts(self, texts, is_query=False, batch_size=32):
        """Embed a list of texts with the arctic-embed query prefix.

        Args:
            texts: list of strings.
            is_query: when True, prepends "query: " to each text.
            batch_size: forward-pass batch size; falls back to single items on OOM.

        Returns:
            numpy array of shape (len(texts), hidden), L2-normalized rows.
            Texts that fail even single-item embedding get a zero row.
        """
        if is_query:
            prefixed_texts = [f"query: {text.strip()}" for text in texts]
        else:
            prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size

        for i in range(0, len(prefixed_texts), batch_size):
            batch_num = i // batch_size + 1
            if not is_query and batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_texts = prefixed_texts[i:i + batch_size]

            try:
                encoded = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=1024,
                    return_tensors='pt'
                ).to(self.device)

                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry the failed batch one text at a time.
                for single_text in batch_texts:
                    try:
                        encoded = self.tokenizer(
                            [single_text],
                            padding=True,
                            truncation=True,
                            # Fixed: was 512 while the batch path used 1024,
                            # so OOM-retried texts were truncated differently.
                            max_length=1024,
                            return_tensors='pt'
                        ).to(self.device)

                        all_embeddings.append(self._embed_encoded(encoded))

                        del encoded
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment: the large model embeds into 1024 dims.
                        all_embeddings.append(torch.zeros(1, 1024).float())

        return torch.cat(all_embeddings, dim=0).numpy()

    def prepare_corpus(self, texts, ids, batch_size=32):
        """Chunk long passages with 50% overlap and embed every chunk.

        Passages longer than 1024 tokens are split into overlapping windows of
        content tokens; each window keeps the tokenizer's special prefix and
        suffix tokens so the model sees well-formed sequences.

        Args:
            texts: passage strings.
            ids: parallel passage ids.
            batch_size: forward-pass batch size.

        Returns:
            (embeddings, all_ids, id2idx):
                embeddings — (num_chunks, hidden) tensor on the model device;
                all_ids    — per chunk: (doc_id, (start, end, total) or None);
                id2idx     — doc_id -> list of row indices into embeddings.
        """
        prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        all_tokens = []
        all_ids = []
        all_len = []

        prefix_len = 1   # leading special token (e.g. <s>)
        suffix_len = 1   # trailing special token (e.g. </s>)
        max_length = 1024
        max_passage = max_length - prefix_len - suffix_len
        overlap = max_passage // 2  # consecutive windows share half their tokens

        for i in range(len(prefixed_texts)):
            encoded = self.tokenizer(
                [prefixed_texts[i]],
                padding=False,
                truncation=False,
            )

            if len(encoded['input_ids'][0]) > max_length:
                # Window over content tokens only (special tokens excluded).
                _idxs = list(range(prefix_len, len(encoded['input_ids'][0]) - suffix_len))

                i0 = 0
                while i0 < len(_idxs) - overlap:
                    i1 = min(i0 + max_passage, len(_idxs))
                    all_ids.append((ids[i], (i0, i1, len(_idxs))))
                    all_tokens.append({
                        'input_ids': encoded['input_ids'][0][:prefix_len]
                                     + encoded['input_ids'][0][i0 + prefix_len:i1 + prefix_len]
                                     + encoded['input_ids'][0][-suffix_len:],
                        'attention_mask': encoded['attention_mask'][0][:prefix_len]
                                          + encoded['attention_mask'][0][i0 + prefix_len:i1 + prefix_len]
                                          + encoded['attention_mask'][0][-suffix_len:]
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((ids[i], None))
                all_tokens.append({'input_ids': encoded['input_ids'][0],
                                   'attention_mask': encoded['attention_mask'][0]})
                all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len)

        total_batches = (len(all_tokens) + batch_size - 1) // batch_size

        # Sort chunks by token length so batches are length-homogeneous,
        # minimizing wasted padding compute.
        all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids),
                                                   key=lambda x: x[0]))

        for i in range(0, len(all_tokens), batch_size):
            batch_num = i // batch_size + 1
            if batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_tokens = all_tokens[i:i + batch_size]

            # Right-pad the batch to its longest member.
            # NOTE(review): pads input_ids with 0, which may differ from the
            # tokenizer's pad id; the attention_mask keeps those positions
            # from being attended to.
            batch_max = max(len(t['input_ids']) for t in batch_tokens)
            encoded = {
                'attention_mask': torch.tensor(
                    [t['attention_mask'] + (batch_max - len(t['attention_mask'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
                'input_ids': torch.tensor(
                    [t['input_ids'] + (batch_max - len(t['input_ids'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
            }

            try:
                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry one row at a time. Fixes the original fallback, which
                # iterated over len(encoded) (the number of dict KEYS) and
                # sliced the dict itself (encoded[j:j+1] -> TypeError).
                for j in range(encoded['input_ids'].size(0)):
                    try:
                        encoded0 = {k: v[j:j + 1] for k, v in encoded.items()}
                        all_embeddings.append(self._embed_encoded(encoded0))

                        del encoded0
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment with a zero placeholder (1024 dims).
                        all_embeddings.append(torch.zeros(1, 1024).float())

        id2idx = {}
        for idx in range(len(all_ids)):
            id2idx.setdefault(all_ids[idx][0], []).append(idx)

        return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx
|
| 804 |
+
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
class SolonRetriever:
    """Dense retriever backed by Solon-embeddings-large-0.1.

    Queries get the ``query : `` prefix (note the spaced colon used by this
    model); passages are embedded without a prefix. Token embeddings are
    mean-pooled over the attention mask and L2-normalized.
    """

    def __init__(self, model_name=None, device=None):
        """Load tokenizer and model, preferring a local checkpoint if present.

        Args:
            model_name: HF model id or local path. When None, falls back to
                ``./models/Solon-embeddings-large-0.1`` next to this file.
            device: torch device string; defaults to CUDA when available.
        """
        if model_name is None:
            local_model_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'models', 'Solon-embeddings-large-0.1')
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local Solon-embeddings-large-0.1 model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Free cached GPU memory before loading the (large) weights.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Loading Solon-embeddings-large-0.1 multilingual model on device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # bfloat16 halves memory versus fp32 for inference-only use.
        self.model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16).to(self.device)
        self.model.eval()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.corpus_ids = []
        self.corpus_embeddings = None

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Mean-pool token vectors over non-padding positions and L2-normalize.

        Args:
            last_hidden_state: float tensor (batch, seq, hidden).
            attention_mask: int tensor (batch, seq); 1 marks real tokens.

        Returns:
            L2-normalized (batch, hidden) float tensor.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)  # guard against all-zero masks
        return torch.nn.functional.normalize(summed / counts, p=2, dim=1)

    def _embed_encoded(self, encoded):
        """Forward one tokenized batch and return pooled embeddings on CPU."""
        with torch.no_grad():
            model_output = self.model(**encoded)
        return self._mean_pool(model_output.last_hidden_state,
                               encoded['attention_mask']).cpu()

    def embed_texts(self, texts, is_query=False, batch_size=32):
        """Embed a list of texts with the Solon query prefix.

        Args:
            texts: list of strings.
            is_query: when True, prepends "query : " to each text.
            batch_size: forward-pass batch size; falls back to single items on OOM.

        Returns:
            numpy array of shape (len(texts), hidden), L2-normalized rows.
            Texts that fail even single-item embedding get a zero row.
        """
        if is_query:
            prefixed_texts = [f"query : {text.strip()}" for text in texts]
        else:
            prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size

        for i in range(0, len(prefixed_texts), batch_size):
            batch_num = i // batch_size + 1
            if not is_query and batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_texts = prefixed_texts[i:i + batch_size]

            try:
                encoded = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ).to(self.device)

                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry the failed batch one text at a time.
                for single_text in batch_texts:
                    try:
                        encoded = self.tokenizer(
                            [single_text],
                            padding=True,
                            truncation=True,
                            max_length=512,
                            return_tensors='pt'
                        ).to(self.device)

                        all_embeddings.append(self._embed_encoded(encoded))

                        del encoded
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment: the large model embeds into 1024 dims.
                        all_embeddings.append(torch.zeros(1, 1024).float())

        return torch.cat(all_embeddings, dim=0).numpy()

    def prepare_corpus(self, texts, ids, batch_size=32):
        """Chunk long passages with 50% overlap and embed every chunk.

        Passages longer than 512 tokens are split into overlapping windows of
        content tokens; each window keeps the tokenizer's special prefix and
        suffix tokens so the model sees well-formed sequences.

        Args:
            texts: passage strings.
            ids: parallel passage ids.
            batch_size: forward-pass batch size.

        Returns:
            (embeddings, all_ids, id2idx):
                embeddings — (num_chunks, hidden) tensor on the model device;
                all_ids    — per chunk: (doc_id, (start, end, total) or None);
                id2idx     — doc_id -> list of row indices into embeddings.
        """
        prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        all_tokens = []
        all_ids = []
        all_len = []

        prefix_len = 1   # leading special token (e.g. <s>)
        suffix_len = 1   # trailing special token (e.g. </s>)
        max_length = 512
        max_passage = max_length - prefix_len - suffix_len
        overlap = max_passage // 2  # consecutive windows share half their tokens

        for i in range(len(prefixed_texts)):
            encoded = self.tokenizer(
                [prefixed_texts[i]],
                padding=False,
                truncation=False,
            )

            if len(encoded['input_ids'][0]) > max_length:
                # Window over content tokens only (special tokens excluded).
                _idxs = list(range(prefix_len, len(encoded['input_ids'][0]) - suffix_len))

                i0 = 0
                while i0 < len(_idxs) - overlap:
                    i1 = min(i0 + max_passage, len(_idxs))
                    all_ids.append((ids[i], (i0, i1, len(_idxs))))
                    all_tokens.append({
                        'input_ids': encoded['input_ids'][0][:prefix_len]
                                     + encoded['input_ids'][0][i0 + prefix_len:i1 + prefix_len]
                                     + encoded['input_ids'][0][-suffix_len:],
                        'attention_mask': encoded['attention_mask'][0][:prefix_len]
                                          + encoded['attention_mask'][0][i0 + prefix_len:i1 + prefix_len]
                                          + encoded['attention_mask'][0][-suffix_len:]
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((ids[i], None))
                all_tokens.append({'input_ids': encoded['input_ids'][0],
                                   'attention_mask': encoded['attention_mask'][0]})
                all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len)

        total_batches = (len(all_tokens) + batch_size - 1) // batch_size

        # Sort chunks by token length so batches are length-homogeneous,
        # minimizing wasted padding compute.
        all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids),
                                                   key=lambda x: x[0]))

        for i in range(0, len(all_tokens), batch_size):
            batch_num = i // batch_size + 1
            if batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_tokens = all_tokens[i:i + batch_size]

            # Right-pad the batch to its longest member.
            # NOTE(review): pads input_ids with 0, which may differ from the
            # tokenizer's pad id; the attention_mask keeps those positions
            # out of the mean pooling.
            batch_max = max(len(t['input_ids']) for t in batch_tokens)
            encoded = {
                'attention_mask': torch.tensor(
                    [t['attention_mask'] + (batch_max - len(t['attention_mask'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
                'input_ids': torch.tensor(
                    [t['input_ids'] + (batch_max - len(t['input_ids'])) * [0]
                     for t in batch_tokens], dtype=torch.long).to(self.device),
            }

            try:
                all_embeddings.append(self._embed_encoded(encoded))

                del encoded
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Retry one row at a time. Fixes the original fallback, which
                # iterated over len(encoded) (the number of dict KEYS) and
                # sliced the dict itself (encoded[j:j+1] -> TypeError).
                for j in range(encoded['input_ids'].size(0)):
                    try:
                        encoded0 = {k: v[j:j + 1] for k, v in encoded.items()}
                        all_embeddings.append(self._embed_encoded(encoded0))

                        del encoded0
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        # Keep row alignment with a zero placeholder (1024 dims).
                        all_embeddings.append(torch.zeros(1, 1024).float())

        id2idx = {}
        for idx in range(len(all_ids)):
            id2idx.setdefault(all_ids[idx][0], []).append(idx)

        return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
class M3Retriever:
    """Dense retriever backed by the multilingual BAAI bge-m3 encoder.

    Uses CLS-token pooling + L2 normalization. Long corpus passages are
    split into overlapping token windows in ``prepare_corpus`` so every
    token is covered by at least one window.
    """

    def __init__(self, model_name=None, device=None):
        """Load the bge-m3 tokenizer/model, preferring a bundled local copy.

        Args:
            model_name: HF model id or local path; when None, a local copy
                under ``./models/bge-m3`` is used if present.
            device: torch device string; defaults to cuda when available.
        """
        # Use local model
        if model_name is None:
            # local_model_path = os.path.join('sub/models', 'bge-m3')
            local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'bge-m3')
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local bge-m3 model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Clear GPU cache before loading model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Loading bge-m3 multilingual model on device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # bfloat16 halves memory; embeddings are cast back to float32 after pooling.
        self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device)  # torch.float16
        self.model.eval()

        # Clear cache after model loading
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self.corpus_ids = []           # set by preprocess(); replaced by prepare_corpus()
        self.corpus_embeddings = None  # set by prepare_corpus()

    def embed_texts(self, texts, is_query=False, batch_size=32):
        """Embed ``texts`` with bge-m3 (CLS pooling, L2-normalized).

        Unlike E5-style models, bge-m3 needs no query/passage prefixes,
        so texts are only stripped. On CUDA OOM the failing batch is
        retried one item at a time; items that still fail contribute a
        1024-dim zero vector so output stays aligned with the input.

        Returns:
            numpy array of shape (len(texts), 1024).
        """
        prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []
        total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size

        for i in range(0, len(prefixed_texts), batch_size):
            batch_num = i // batch_size + 1
            if not is_query and batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_texts = prefixed_texts[i:i + batch_size]

            try:
                encoded = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ).to(self.device)

                with torch.no_grad():
                    model_output = self.model(**encoded)

                # CLS-token pooling; cast back to float32.
                embeddings = model_output[0][:, 0].float()

                # L2 normalize embeddings
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                # Move to CPU immediately
                all_embeddings.append(embeddings.cpu())

                # Clear GPU memory
                del encoded, model_output, embeddings
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Process one item at a time
                for single_text in batch_texts:
                    try:
                        encoded = self.tokenizer(
                            [single_text],
                            padding=True,
                            truncation=True,
                            max_length=512,
                            return_tensors='pt'
                        ).to(self.device)

                        with torch.no_grad():
                            model_output = self.model(**encoded)

                        embeddings = model_output[0][:, 0].float()
                        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                        all_embeddings.append(embeddings.cpu())

                        del encoded, model_output, embeddings
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        zero_embedding = torch.zeros(1, 1024).float()
                        all_embeddings.append(zero_embedding)

        return torch.cat(all_embeddings, dim=0).numpy()

    def prepare_corpus(self, texts, ids, batch_size=32):
        """Tokenize and embed the corpus, windowing long documents.

        Documents longer than 512 tokens are split into overlapping
        windows (50% overlap) that each keep the original BOS/EOS
        tokens. Windows are sorted by length before batching to reduce
        padding waste.

        Args:
            texts: list of passage strings.
            ids: list of document ids aligned with ``texts``.
            batch_size: encoder batch size.

        Returns:
            (embeddings, all_ids, id2idx) where ``embeddings`` is a
            (num_windows, 1024) tensor on the model device, ``all_ids``
            is a list of (doc_id, window_span_or_None) tuples aligned
            with embedding rows, and ``id2idx`` maps
            doc_id -> list of row indices.
        """
        prefixed_texts = [f"{text.strip()}" for text in texts]

        all_embeddings = []

        all_tokens = []
        all_ids = []
        all_len = []

        # One special token on each side (BOS/EOS) survives windowing.
        prefix_len = 1
        suffix_len = 1
        max_length = 512
        max_passage = max_length - prefix_len - suffix_len
        overlap = max_passage // 2

        for i in range(len(prefixed_texts)):
            encoded = self.tokenizer(
                [prefixed_texts[i]],
                padding=False,
                truncation=False,
            )

            if len(encoded['input_ids'][0]) > max_length:
                # Window the document body (indices excluding BOS/EOS).
                _idxs = list(range(prefix_len, len(encoded['input_ids'][0]) - suffix_len))

                i0 = 0
                while i0 < len(_idxs) - overlap:
                    i1 = min(i0 + max_passage, len(_idxs))
                    all_ids.append((ids[i], (i0, i1, len(_idxs))))
                    # Re-attach BOS and EOS around each window.
                    all_tokens.append({
                        'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:],
                        'attention_mask': encoded['attention_mask'][0][:prefix_len] + encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:]
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((ids[i], None))
                all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]})
                all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len)

        total_batches = (len(all_tokens) + batch_size - 1) // batch_size

        # Sort windows by length so each batch pads to a similar size.
        all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0]))

        for i in range(0, len(all_tokens), batch_size):
            batch_num = i // batch_size + 1
            if batch_num % 50 == 0:
                print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            batch_tokens = all_tokens[i:i + batch_size]

            # Manual right-padding to the longest sequence in the batch.
            batch_max = max(len(s['input_ids']) for s in batch_tokens)
            encoded = dict()
            encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens]
            encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['input_ids'])) * [0] for s in batch_tokens]
            encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device)
            encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device)

            try:
                with torch.no_grad():
                    model_output = self.model(**encoded)

                embeddings = model_output[0][:, 0].float()

                # L2 normalize embeddings (important for cosine retrieval)
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                # Move to CPU immediately
                all_embeddings.append(embeddings.cpu())

                # Clear GPU memory
                del encoded, model_output, embeddings
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except torch.cuda.OutOfMemoryError:
                print(f"CUDA OOM at batch {batch_num}, reducing batch size...")
                # Process one item at a time.
                # BUG FIX: the original iterated `range(len(encoded))` (the dict's
                # two keys, i.e. only 2 iterations) and sliced the dict itself
                # (`encoded[j:j+1]` -> TypeError). Slice each tensor row-wise.
                for j in range(encoded["input_ids"].shape[0]):
                    try:
                        encoded0 = {k: v[j:j+1] for k, v in encoded.items()}

                        with torch.no_grad():
                            model_output = self.model(**encoded0)

                        embeddings = model_output[0][:, 0].float()
                        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                        all_embeddings.append(embeddings.cpu())

                        del encoded0, model_output, embeddings
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e2:
                        print(f"Failed to process single text: {e2}")
                        zero_embedding = torch.zeros(1, 1024).float()
                        all_embeddings.append(zero_embedding)

        # Map each document id to the embedding rows of its windows.
        id2idx = {}
        for i in range(len(all_ids)):
            if all_ids[i][0] not in id2idx:
                id2idx[all_ids[i][0]] = []
            id2idx[all_ids[i][0]].append(i)

        return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx  # numpy()
|
| 1295 |
+
|
| 1296 |
+
|
| 1297 |
+
|
| 1298 |
+
class BGEReranker:
    """Cross-encoder reranker (bge-reranker-v2-m3 family) for (query, passage) pairs.

    Corpus passages are tokenized once via ``prepare_corpus``; ``rerank``
    then splices query + passage token ids together (windowing passages
    that are too long) and scores them under a wall-clock time budget.
    """

    def __init__(self, model_name=None, device=None):
        """
        Initializes the BGE reranker for fine-grained relevance scoring.

        Args:
            model_name: HF model id or local path; when None, a bundled
                fine-tuned copy under ./models is used if present.
            device: torch device string; defaults to cuda when available.
        """
        # Use local model
        if model_name is None:
            # local_model_path = os.path.join('sub/models', 'bge-reranker-v2-m3')  # 'bge-reranker-v2-m3'
            local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'bge-reranker-v2-m3_pseudo_tune_full')  # 'bge-reranker-v2-m3'
            if os.path.isdir(local_model_path):
                model_name = local_model_path
                print(f"Using local BGE model from: {model_name}")

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        print(f"Loading BGE reranker on device: {self.device}")

        # BGE reranker is a sequence-classification model (relevance logit head).
        from transformers import AutoModelForSequenceClassification

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to(self.device)
        self.model.eval()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def prepare_corpus(self, texts, ids):
        """Tokenize every passage once (no padding/truncation) and cache by id.

        The cached token lists are re-sliced per query in ``rerank``.
        """
        self.corpus_tokens = {}
        for i in range(len(texts)):
            encoded = self.tokenizer(
                [texts[i].strip()],
                padding=False,
                truncation=False,
                # max_length=512,
                # return_tensors='pt'
            )
            self.corpus_tokens[ids[i]] = {'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}

    def rerank(self, query_text, passages, passage_ids, max_time, top_k=20):
        """
        Rerank the passages using BGE reranker - CORRECTED VERSION.

        Long passages are split into overlapping windows; a document's
        final score is its best-scoring window. Scoring stops once
        ``max_time`` (epoch seconds) is exceeded, and windows not scored
        in time are dropped.

        Returns:
            List of (passage_id, score) sorted descending, at most top_k.
        """
        if not passages:
            return []
        if time.time() > max_time - 0.1:
            print(f'No time for bge rerank!!')
            return []

        max_length = 2048

        # Query is capped so at least 1024 tokens remain for the passage.
        query_encoded = self.tokenizer(
            [query_text.strip()],
            padding=False,
            truncation=True,
            max_length = max_length - 1024,
        )

        all_tokens = []
        all_ids = []
        all_len = []

        suffix_len = 1
        prefix_len = 1

        # Room left for passage tokens after query + special tokens.
        max_passage = max_length - len(query_encoded['input_ids'][0]) - suffix_len - prefix_len
        overlap = max_passage // 2

        for i in range(len(passage_ids)):
            encoded = self.corpus_tokens[passage_ids[i]]

            if len(encoded['input_ids']) - suffix_len - prefix_len > max_passage:
                # Window the passage body (indices excluding its BOS/EOS).
                _idxs = list(range(prefix_len, len(encoded['input_ids']) - suffix_len))

                i0 = 0
                while i0 < len(_idxs) - overlap:
                    i1 = min(i0 + max_passage, len(_idxs))
                    all_ids.append((passage_ids[i], (i0, i1, len(_idxs))))
                    # Hard-coded token id 2 is presumably the tokenizer's </s>
                    # separator — TODO confirm against the tokenizer config.
                    all_tokens.append({
                        'input_ids': query_encoded['input_ids'][0] + [2] + encoded['input_ids'][i0+prefix_len:i1+prefix_len] + [2],
                        'attention_mask': query_encoded['attention_mask'][0] + [1] + encoded['attention_mask'][i0+prefix_len:i1+prefix_len] + [1]
                    })
                    all_len.append(i1 - i0)
                    i0 += max_passage - overlap
            else:
                all_ids.append((passage_ids[i], None))
                # Short passage: drop only its leading BOS, keep the rest.
                all_tokens.append({'input_ids': query_encoded['input_ids'][0] + [2] + encoded['input_ids'][1:], 'attention_mask': query_encoded['attention_mask'][0] + [1] + encoded['attention_mask'][1:]})
                all_len.append(len(encoded['input_ids']) - prefix_len - suffix_len)

        # all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0]))

        scores = []
        batch_size = 4  # 4 # Conservative batch size

        for i in range(0, len(all_tokens), batch_size):
            # Respect the per-query wall-clock budget.
            if time.time() > max_time:
                print(f'bge rerank time limit! processed {len(scores)} of {len(all_tokens)}')
                break

            batch_tokens = all_tokens[i:i + batch_size]

            # Manual right-padding to the longest sequence in the batch.
            batch_max = max([len(ids['input_ids']) for ids in batch_tokens])
            encoded = dict()
            encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens]
            encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens]
            encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device)
            encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device)

            try:
                with torch.no_grad():
                    # Get relevance scores from sequence classification model
                    outputs = self.model(**encoded)

                # BGE reranker outputs logits for relevance classification
                logits = outputs.logits.float()

                # Handle different output shapes
                if len(logits.shape) == 1:
                    # Single score per pair
                    batch_scores = logits.cpu().numpy()
                elif logits.shape[1] == 1:
                    # Single column output
                    batch_scores = logits.squeeze(-1).cpu().numpy()
                else:
                    # Binary classification - take positive class (index 1)
                    batch_scores = logits[:, 1].cpu().numpy()

                scores.extend(batch_scores.tolist())

                # Cleanup
                del encoded, outputs
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"Error in reranking batch {i//batch_size + 1}: {e}")
                # Fallback: Use neutral scores for this batch
                fallback_scores = [0.5] * len(batch_tokens)
                scores.extend(fallback_scores)

        # Windows not scored before the time limit are discarded.
        all_ids = all_ids[:len(scores)]

        # Combine results and sort by reranking score
        results = list(zip(all_ids, scores))
        results.sort(key=lambda x: x[1], reverse=True)

        # Deduplicate windows: keep the best-scoring window per document id.
        new_res = []
        _used = set([])
        for i in range(len(results)):
            if results[i][0][0] not in _used:
                _used.add(results[i][0][0])
                new_res.append((results[i][0][0], results[i][1]))

        return new_res[:min(top_k, len(new_res))]
|
| 1467 |
+
|
| 1468 |
+
|
| 1469 |
+
|
| 1470 |
+
|
| 1471 |
+
|
| 1472 |
+
|
| 1473 |
+
|
| 1474 |
+
# Global instances — populated once by preprocess() and reused by predict()
# so repeated predictions avoid re-loading models.
retrievers = None    # list of dense retriever instances (order matters in predict)
reranker = None      # BGEReranker instance
retrieverBM5 = None  # BM25 lexical retriever
corpus_texts = {}  # Store original passage texts for reranking
|
| 1479 |
+
|
| 1480 |
+
def preprocess(corpus_dict):
    """
    Preprocessing function using a dense-retriever ensemble + BGE reranker.

    Input: corpus_dict - dict mapping document IDs to document objects with 'passage'/'text' field
    Output: dict containing initialized models, embeddings, and corpus data

    Note: Uses global variables (retrievers, reranker, corpus_texts) for efficiency,
    but also returns all required data via preprocessed_data for function interface.
    """
    global retrievers, reranker, corpus_texts, retrieverBM5

    start_time = time.time()

    print("=" * 60)
    print("PREPROCESSING: Initializing E5 + BGE Reranker Pipeline...")
    print("=" * 60)

    # Set GPU memory optimization
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Initialize the six dense retrievers that form the ensemble.
    print("Loading retrievers...")

    retrieverE5 = E5Retriever()

    retrieverE5Ins = E5InstructRetriever()

    retrieverM3 = M3Retriever()

    retrieverSnowflake = SnowflakeInstructRetriever()

    retrieverSolon = SolonRetriever()

    # E5-architecture retriever loaded with local Hebrew KolZchut QA weights.
    retrieverRAGbot = E5Retriever(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0') )

    # NOTE: order matters — predict() indexes per-retriever normalization
    # constants and ensemble weights by position in this list.
    retrievers = [retrieverE5, retrieverE5Ins, retrieverM3, retrieverSnowflake, retrieverSolon, retrieverRAGbot]

    # Initialize BGE reranker
    print("Loading rerankers...")
    reranker = BGEReranker()

    print(f"Preparing corpus with {len(corpus_dict)} documents...")

    # Store corpus IDs, passages, and original texts
    corpus_ids = list(corpus_dict.keys())
    passages = [doc.get('passage', doc.get('text', '')) for doc in corpus_dict.values()]

    for ret in retrievers:
        ret.corpus_ids = list(corpus_dict.keys())
        print("Computing embeddings...")
        # prepare_corpus replaces corpus_ids with (doc_id, window_span) tuples
        # and returns an id -> embedding-row-indices map.
        ret.corpus_embeddings, ret.corpus_ids, ret.id2idx = ret.prepare_corpus(passages, ret.corpus_ids, batch_size=32)

        print("✓ Corpus preprocessing complete!")
        print(f"✓ Generated embeddings for {len(ret.corpus_ids)} documents")
        print(f"✓ Embedding matrix shape: {ret.corpus_embeddings.shape}")

    # Store original texts for reranking
    corpus_texts = {doc_id: passages[i] for i, doc_id in enumerate(corpus_ids)}

    reranker.corpus_ids = list(corpus_dict.keys())
    reranker.prepare_corpus(passages, reranker.corpus_ids)

    # Lexical BM25 retriever complements the dense ensemble.
    retrieverBM5 = BM25sRetriever(passages, corpus_ids)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")

    return {
        'retrievers': retrievers,
        'retrieverBM5': retrieverBM5,
        'reranker': reranker,
        'corpus_ids': corpus_ids,
        'corpus_texts': corpus_texts,
        'num_documents': len(corpus_dict)
    }
|
| 1563 |
+
|
| 1564 |
+
|
| 1565 |
+
def predict(query, preprocessed_data):
    """
    Two-stage prediction: ensemble (dense + BM25) retrieval + BGE reranking.

    Input:
      - query: dict with 'query' field containing query text
      - preprocessed_data: dict from preprocess() containing models and corpus data

    Output: list of dicts with 'paragraph_uuid' and 'score' fields, ranked by relevance

    Note: Uses global variables for efficiency but can also extract required data
    from preprocessed_data parameter for proper function interface.
    """
    global retrievers, reranker, corpus_texts, retrieverBM5

    start_time = time.time()
    # Hard per-query wall-clock budget (seconds); the reranker stops early past it.
    max_time = start_time + 1.85

    # Extract query text
    query_text = query.get('query', '')
    if not query_text:
        return []

    # Use global instances or get from preprocessed_data
    if retrievers is None:
        retrievers = preprocessed_data.get('retrievers')
        reranker = preprocessed_data.get('reranker')
        corpus_texts = preprocessed_data.get('corpus_texts', {})
        retrieverBM5 = preprocessed_data.get('retrieverBM5')

    if retrievers is None or reranker is None:
        print("Error: Missing retriever or reranker in preprocessed data")
        return []

    TopNCandidates0 = 250  # per-retriever candidate pool size
    TopNCandidates = 250   # merged pool handed to the reranker

    # Per-retriever score normalization constants (mean, std) — positionally
    # aligned with the `retrievers` list built in preprocess().
    retriever_mean_std = [(-1.4131863134104607, 1.055066495990117),
                          (0.8814187049865723, 0.017973395064473152),
                          (0.5294753313064575, 0.06463246047496796),
                          (0.4171469509601593, 0.0699099600315094),
                          (0.49514809250831604, 0.06277099251747131),
                          (0.530211865901947, 0.0704670324921608)]

    # Ensemble weights per retriever, normalized to sum to 1.
    retriever_w = np.array([1.1, 0.25, 0.2, 0.3, 0.3, 0.3])
    retriever_w /= retriever_w.sum()

    try:
        # STAGE 1: Retrieval — union of each retriever's top candidates
        candidate_ids = []
        candidate_scores0 = []
        candidate_passages = []
        candidate_scores_bm5 = []

        ret_scores = []
        for retriever in retrievers:
            query_embedding = retriever.embed_texts([query_text], is_query=True, batch_size=1)
            query_embedding = torch.from_numpy(query_embedding).cuda()
            # Compute cosine similarity with precomputed corpus embeddings
            _scores = F.cosine_similarity(query_embedding, retriever.corpus_embeddings, 1, 1e-6)
            _scores = _scores.cpu().numpy()
            ret_scores.append(_scores)
            _argsort = np.argsort(_scores)[::-1]

            # corpus_ids rows are (doc_id, window_span) tuples; dedupe by doc_id.
            for idx in _argsort[:TopNCandidates0]:
                if retriever.corpus_ids[idx][0] not in candidate_ids:
                    candidate_ids.append(retriever.corpus_ids[idx][0])

        _scores = retrieverBM5.get_scores(query_text)
        _argsort = np.argsort(_scores)[::-1]
        for idx in _argsort[:TopNCandidates0]:
            if retrieverBM5.corpus_ids[idx] not in candidate_ids:
                candidate_ids.append(retrieverBM5.corpus_ids[idx])

        # Fuse z-normalized dense scores (weighted) with the BM25 z-score.
        for i in range(len(candidate_ids)):
            doc_id = candidate_ids[i]
            _bm5_sc = (_scores[retrieverBM5.id2idx[doc_id]] - 5.919949) / 2.7885008
            candidate_scores_bm5.append(_bm5_sc)
            _sc = 0
            for j in range(len(retrievers)):
                # A document's dense score is the max over its windows.
                _sc0 = np.max(ret_scores[j][retrievers[j].id2idx[doc_id]])
                _sc0 = (_sc0 - retriever_mean_std[j][0]) / retriever_mean_std[j][1]
                _sc += retriever_w[j] * _sc0
            candidate_scores0.append(0.85 * _sc + 0.15 * _bm5_sc)
            candidate_passages.append(corpus_texts.get(doc_id, ''))

        # Sort candidates by fused score, keep the top TopNCandidates.
        candidate_scores, candidate_ids, candidate_passages, candidate_scores_bm5 = zip(*sorted(zip(candidate_scores0, candidate_ids, candidate_passages, candidate_scores_bm5))[::-1][:min(len(candidate_ids), TopNCandidates)])

        # STAGE 2: Reranking (cross-encoder under the time budget)
        reranked_results = reranker.rerank(
            query_text,
            candidate_passages,
            candidate_ids,
            max_time,
            top_k=len(candidate_ids)
        )

        if len(reranked_results) >= 20:
            reranked_scores0 = [rerank_score for (passage_id, rerank_score) in reranked_results]
            reranked_results = [passage_id for (passage_id, rerank_score) in reranked_results]
            scores = []
            for _i, _p in enumerate(reranked_results):
                # Blend the normalized reranker score with the stage-1 fused score.
                scores.append(0.35 * (reranked_scores0[_i] + 1.8669906079283858) / 1.207719509974457
                              + 0.65 * candidate_scores[candidate_ids.index(_p)])

            reranked_results = [(rerank_score, passage_id) for rerank_score, passage_id in sorted(zip(scores, reranked_results))][::-1]  # [:TopNCandidates2]
        else:
            # Reranker ran out of time: fall back to stage-1 ordering.
            # BUG FIX: the original used range(20), which raises IndexError
            # when fewer than 20 candidates were retrieved.
            reranked_results = [(candidate_scores[i], candidate_ids[i]) for i in range(min(20, len(candidate_ids)))]

        # Build final results with ACTUAL reranking scores
        results = []
        for rank, (rerank_score, passage_id) in enumerate(reranked_results[:20]):
            results.append({
                'paragraph_uuid': passage_id,
                'score': float(rerank_score)  # Use actual BGE reranker score!
            })

        end_time = time.time()
        elapsed_time = end_time - start_time
        # print(f"✓ Returned {len(results)} results with reranker scores. Elapsed time: {elapsed_time} seconds")

        return results

    except Exception as e:
        print(f"Error in prediction: {e}")
        # Fallback to first-retriever-only retrieval with cosine scores
        try:
            query_embedding = retrievers[0].embed_texts([query_text], is_query=True, batch_size=1)
            query_embedding = torch.from_numpy(query_embedding).cuda()
            e5_scores = F.cosine_similarity(query_embedding, retrievers[0].corpus_embeddings, 1, 1e-6)
            e5_scores = e5_scores.cpu().numpy()
            top_indices = np.argsort(e5_scores)[::-1][:20]

            results = []
            for idx in top_indices:
                # BUG FIX: the original referenced `retriever` here — a stale
                # loop variable that is undefined if the exception fired before
                # the retriever loop ran — and omitted the [0] on the
                # (doc_id, window_span) tuple used everywhere else.
                results.append({
                    'paragraph_uuid': retrievers[0].corpus_ids[idx][0],
                    'score': float(e5_scores[idx])  # Use actual cosine similarity score
                })

            return results
        except Exception:
            return []
|
victord/sub19/models/Solon-embeddings-large-0.1/1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
victord/sub19/models/Solon-embeddings-large-0.1/README.md
ADDED
|
@@ -0,0 +1,810 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- mteb
|
| 4 |
+
model-index:
|
| 5 |
+
- name: Solon-embeddings-large-0.1
|
| 6 |
+
results:
|
| 7 |
+
- task:
|
| 8 |
+
type: sentence-similarity
|
| 9 |
+
name: Passage Retrieval
|
| 10 |
+
dataset:
|
| 11 |
+
type: unicamp-dl/mmarco
|
| 12 |
+
name: mMARCO-fr
|
| 13 |
+
config: french
|
| 14 |
+
split: validation
|
| 15 |
+
metrics:
|
| 16 |
+
- type: recall_at_500
|
| 17 |
+
name: Recall@500
|
| 18 |
+
value: 92.7
|
| 19 |
+
- type: recall_at_100
|
| 20 |
+
name: Recall@100
|
| 21 |
+
value: 82.7
|
| 22 |
+
- type: recall_at_10
|
| 23 |
+
name: Recall@10
|
| 24 |
+
value: 55.5
|
| 25 |
+
- type: map_at_10
|
| 26 |
+
name: MAP@10
|
| 27 |
+
value: 29.4
|
| 28 |
+
- type: ndcg_at_10
|
| 29 |
+
name: nDCG@10
|
| 30 |
+
value: 35.8
|
| 31 |
+
- type: mrr_at_10
|
| 32 |
+
name: MRR@10
|
| 33 |
+
value: 29.9
|
| 34 |
+
- task:
|
| 35 |
+
type: Clustering
|
| 36 |
+
dataset:
|
| 37 |
+
type: lyon-nlp/alloprof
|
| 38 |
+
name: MTEB AlloProfClusteringP2P
|
| 39 |
+
config: default
|
| 40 |
+
split: test
|
| 41 |
+
revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b
|
| 42 |
+
metrics:
|
| 43 |
+
- type: v_measure
|
| 44 |
+
value: 64.16942168287153
|
| 45 |
+
- task:
|
| 46 |
+
type: Clustering
|
| 47 |
+
dataset:
|
| 48 |
+
type: lyon-nlp/alloprof
|
| 49 |
+
name: MTEB AlloProfClusteringS2S
|
| 50 |
+
config: default
|
| 51 |
+
split: test
|
| 52 |
+
revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b
|
| 53 |
+
metrics:
|
| 54 |
+
- type: v_measure
|
| 55 |
+
value: 38.17076313383054
|
| 56 |
+
- task:
|
| 57 |
+
type: Reranking
|
| 58 |
+
dataset:
|
| 59 |
+
type: lyon-nlp/mteb-fr-reranking-alloprof-s2p
|
| 60 |
+
name: MTEB AlloprofReranking
|
| 61 |
+
config: default
|
| 62 |
+
split: test
|
| 63 |
+
revision: 666fdacebe0291776e86f29345663dfaf80a0db9
|
| 64 |
+
metrics:
|
| 65 |
+
- type: map
|
| 66 |
+
value: 64.8770878097632
|
| 67 |
+
- type: mrr
|
| 68 |
+
value: 66.39132423169396
|
| 69 |
+
- task:
|
| 70 |
+
type: Retrieval
|
| 71 |
+
dataset:
|
| 72 |
+
type: lyon-nlp/alloprof
|
| 73 |
+
name: MTEB AlloprofRetrieval
|
| 74 |
+
config: default
|
| 75 |
+
split: test
|
| 76 |
+
revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b
|
| 77 |
+
metrics:
|
| 78 |
+
- type: map_at_1
|
| 79 |
+
value: 29.62
|
| 80 |
+
- type: map_at_10
|
| 81 |
+
value: 40.963
|
| 82 |
+
- type: map_at_100
|
| 83 |
+
value: 41.894
|
| 84 |
+
- type: map_at_1000
|
| 85 |
+
value: 41.939
|
| 86 |
+
- type: map_at_3
|
| 87 |
+
value: 37.708999999999996
|
| 88 |
+
- type: map_at_5
|
| 89 |
+
value: 39.696999999999996
|
| 90 |
+
- type: mrr_at_1
|
| 91 |
+
value: 29.62
|
| 92 |
+
- type: mrr_at_10
|
| 93 |
+
value: 40.963
|
| 94 |
+
- type: mrr_at_100
|
| 95 |
+
value: 41.894
|
| 96 |
+
- type: mrr_at_1000
|
| 97 |
+
value: 41.939
|
| 98 |
+
- type: mrr_at_3
|
| 99 |
+
value: 37.708999999999996
|
| 100 |
+
- type: mrr_at_5
|
| 101 |
+
value: 39.696999999999996
|
| 102 |
+
- type: ndcg_at_1
|
| 103 |
+
value: 29.62
|
| 104 |
+
- type: ndcg_at_10
|
| 105 |
+
value: 46.942
|
| 106 |
+
- type: ndcg_at_100
|
| 107 |
+
value: 51.629999999999995
|
| 108 |
+
- type: ndcg_at_1000
|
| 109 |
+
value: 52.927
|
| 110 |
+
- type: ndcg_at_3
|
| 111 |
+
value: 40.333999999999996
|
| 112 |
+
- type: ndcg_at_5
|
| 113 |
+
value: 43.922
|
| 114 |
+
- type: precision_at_1
|
| 115 |
+
value: 29.62
|
| 116 |
+
- type: precision_at_10
|
| 117 |
+
value: 6.589
|
| 118 |
+
- type: precision_at_100
|
| 119 |
+
value: 0.882
|
| 120 |
+
- type: precision_at_1000
|
| 121 |
+
value: 0.099
|
| 122 |
+
- type: precision_at_3
|
| 123 |
+
value: 15.976
|
| 124 |
+
- type: precision_at_5
|
| 125 |
+
value: 11.33
|
| 126 |
+
- type: recall_at_1
|
| 127 |
+
value: 29.62
|
| 128 |
+
- type: recall_at_10
|
| 129 |
+
value: 65.889
|
| 130 |
+
- type: recall_at_100
|
| 131 |
+
value: 88.212
|
| 132 |
+
- type: recall_at_1000
|
| 133 |
+
value: 98.575
|
| 134 |
+
- type: recall_at_3
|
| 135 |
+
value: 47.927
|
| 136 |
+
- type: recall_at_5
|
| 137 |
+
value: 56.64900000000001
|
| 138 |
+
- task:
|
| 139 |
+
type: Classification
|
| 140 |
+
dataset:
|
| 141 |
+
type: mteb/amazon_reviews_multi
|
| 142 |
+
name: MTEB AmazonReviewsClassification (fr)
|
| 143 |
+
config: fr
|
| 144 |
+
split: test
|
| 145 |
+
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
|
| 146 |
+
metrics:
|
| 147 |
+
- type: accuracy
|
| 148 |
+
value: 42.077999999999996
|
| 149 |
+
- type: f1
|
| 150 |
+
value: 40.64511241732637
|
| 151 |
+
- task:
|
| 152 |
+
type: Retrieval
|
| 153 |
+
dataset:
|
| 154 |
+
type: maastrichtlawtech/bsard
|
| 155 |
+
name: MTEB BSARDRetrieval
|
| 156 |
+
config: default
|
| 157 |
+
split: test
|
| 158 |
+
revision: 5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59
|
| 159 |
+
metrics:
|
| 160 |
+
- type: map_at_1
|
| 161 |
+
value: 0.901
|
| 162 |
+
- type: map_at_10
|
| 163 |
+
value: 1.524
|
| 164 |
+
- type: map_at_100
|
| 165 |
+
value: 1.833
|
| 166 |
+
- type: map_at_1000
|
| 167 |
+
value: 1.916
|
| 168 |
+
- type: map_at_3
|
| 169 |
+
value: 1.276
|
| 170 |
+
- type: map_at_5
|
| 171 |
+
value: 1.276
|
| 172 |
+
- type: mrr_at_1
|
| 173 |
+
value: 0.901
|
| 174 |
+
- type: mrr_at_10
|
| 175 |
+
value: 1.524
|
| 176 |
+
- type: mrr_at_100
|
| 177 |
+
value: 1.833
|
| 178 |
+
- type: mrr_at_1000
|
| 179 |
+
value: 1.916
|
| 180 |
+
- type: mrr_at_3
|
| 181 |
+
value: 1.276
|
| 182 |
+
- type: mrr_at_5
|
| 183 |
+
value: 1.276
|
| 184 |
+
- type: ndcg_at_1
|
| 185 |
+
value: 0.901
|
| 186 |
+
- type: ndcg_at_10
|
| 187 |
+
value: 2.085
|
| 188 |
+
- type: ndcg_at_100
|
| 189 |
+
value: 3.805
|
| 190 |
+
- type: ndcg_at_1000
|
| 191 |
+
value: 6.704000000000001
|
| 192 |
+
- type: ndcg_at_3
|
| 193 |
+
value: 1.41
|
| 194 |
+
- type: ndcg_at_5
|
| 195 |
+
value: 1.41
|
| 196 |
+
- type: precision_at_1
|
| 197 |
+
value: 0.901
|
| 198 |
+
- type: precision_at_10
|
| 199 |
+
value: 0.40499999999999997
|
| 200 |
+
- type: precision_at_100
|
| 201 |
+
value: 0.126
|
| 202 |
+
- type: precision_at_1000
|
| 203 |
+
value: 0.037
|
| 204 |
+
- type: precision_at_3
|
| 205 |
+
value: 0.601
|
| 206 |
+
- type: precision_at_5
|
| 207 |
+
value: 0.36
|
| 208 |
+
- type: recall_at_1
|
| 209 |
+
value: 0.901
|
| 210 |
+
- type: recall_at_10
|
| 211 |
+
value: 4.054
|
| 212 |
+
- type: recall_at_100
|
| 213 |
+
value: 12.613
|
| 214 |
+
- type: recall_at_1000
|
| 215 |
+
value: 36.937
|
| 216 |
+
- type: recall_at_3
|
| 217 |
+
value: 1.802
|
| 218 |
+
- type: recall_at_5
|
| 219 |
+
value: 1.802
|
| 220 |
+
- task:
|
| 221 |
+
type: BitextMining
|
| 222 |
+
dataset:
|
| 223 |
+
type: rbawden/DiaBLa
|
| 224 |
+
name: MTEB DiaBLaBitextMining (fr-en)
|
| 225 |
+
config: fr-en
|
| 226 |
+
split: test
|
| 227 |
+
revision: 5345895c56a601afe1a98519ce3199be60a27dba
|
| 228 |
+
metrics:
|
| 229 |
+
- type: accuracy
|
| 230 |
+
value: 88.90048712595686
|
| 231 |
+
- type: f1
|
| 232 |
+
value: 86.94952864886115
|
| 233 |
+
- type: precision
|
| 234 |
+
value: 86.20344379175826
|
| 235 |
+
- type: recall
|
| 236 |
+
value: 88.90048712595686
|
| 237 |
+
- task:
|
| 238 |
+
type: Clustering
|
| 239 |
+
dataset:
|
| 240 |
+
type: lyon-nlp/clustering-hal-s2s
|
| 241 |
+
name: MTEB HALClusteringS2S
|
| 242 |
+
config: default
|
| 243 |
+
split: test
|
| 244 |
+
revision: e06ebbbb123f8144bef1a5d18796f3dec9ae2915
|
| 245 |
+
metrics:
|
| 246 |
+
- type: v_measure
|
| 247 |
+
value: 24.087988843991155
|
| 248 |
+
- task:
|
| 249 |
+
type: Clustering
|
| 250 |
+
dataset:
|
| 251 |
+
type: mlsum
|
| 252 |
+
name: MTEB MLSUMClusteringP2P
|
| 253 |
+
config: default
|
| 254 |
+
split: test
|
| 255 |
+
revision: b5d54f8f3b61ae17845046286940f03c6bc79bc7
|
| 256 |
+
metrics:
|
| 257 |
+
- type: v_measure
|
| 258 |
+
value: 43.79603865728535
|
| 259 |
+
- task:
|
| 260 |
+
type: Clustering
|
| 261 |
+
dataset:
|
| 262 |
+
type: mlsum
|
| 263 |
+
name: MTEB MLSUMClusteringS2S
|
| 264 |
+
config: default
|
| 265 |
+
split: test
|
| 266 |
+
revision: b5d54f8f3b61ae17845046286940f03c6bc79bc7
|
| 267 |
+
metrics:
|
| 268 |
+
- type: v_measure
|
| 269 |
+
value: 37.746550373003
|
| 270 |
+
- task:
|
| 271 |
+
type: Classification
|
| 272 |
+
dataset:
|
| 273 |
+
type: mteb/mtop_domain
|
| 274 |
+
name: MTEB MTOPDomainClassification (fr)
|
| 275 |
+
config: fr
|
| 276 |
+
split: test
|
| 277 |
+
revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
|
| 278 |
+
metrics:
|
| 279 |
+
- type: accuracy
|
| 280 |
+
value: 89.26088318196052
|
| 281 |
+
- type: f1
|
| 282 |
+
value: 88.95811185929033
|
| 283 |
+
- task:
|
| 284 |
+
type: Classification
|
| 285 |
+
dataset:
|
| 286 |
+
type: mteb/mtop_intent
|
| 287 |
+
name: MTEB MTOPIntentClassification (fr)
|
| 288 |
+
config: fr
|
| 289 |
+
split: test
|
| 290 |
+
revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
|
| 291 |
+
metrics:
|
| 292 |
+
- type: accuracy
|
| 293 |
+
value: 68.55308487316003
|
| 294 |
+
- type: f1
|
| 295 |
+
value: 48.2936682439785
|
| 296 |
+
- task:
|
| 297 |
+
type: Classification
|
| 298 |
+
dataset:
|
| 299 |
+
type: masakhane/masakhanews
|
| 300 |
+
name: MTEB MasakhaNEWSClassification (fra)
|
| 301 |
+
config: fra
|
| 302 |
+
split: test
|
| 303 |
+
revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60
|
| 304 |
+
metrics:
|
| 305 |
+
- type: accuracy
|
| 306 |
+
value: 81.51658767772511
|
| 307 |
+
- type: f1
|
| 308 |
+
value: 77.695234448912
|
| 309 |
+
- task:
|
| 310 |
+
type: Clustering
|
| 311 |
+
dataset:
|
| 312 |
+
type: masakhane/masakhanews
|
| 313 |
+
name: MTEB MasakhaNEWSClusteringP2P (fra)
|
| 314 |
+
config: fra
|
| 315 |
+
split: test
|
| 316 |
+
revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60
|
| 317 |
+
metrics:
|
| 318 |
+
- type: v_measure
|
| 319 |
+
value: 40.80377094681114
|
| 320 |
+
- task:
|
| 321 |
+
type: Clustering
|
| 322 |
+
dataset:
|
| 323 |
+
type: masakhane/masakhanews
|
| 324 |
+
name: MTEB MasakhaNEWSClusteringS2S (fra)
|
| 325 |
+
config: fra
|
| 326 |
+
split: test
|
| 327 |
+
revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60
|
| 328 |
+
metrics:
|
| 329 |
+
- type: v_measure
|
| 330 |
+
value: 28.79703837416241
|
| 331 |
+
- task:
|
| 332 |
+
type: Classification
|
| 333 |
+
dataset:
|
| 334 |
+
type: mteb/amazon_massive_intent
|
| 335 |
+
name: MTEB MassiveIntentClassification (fr)
|
| 336 |
+
config: fr
|
| 337 |
+
split: test
|
| 338 |
+
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
|
| 339 |
+
metrics:
|
| 340 |
+
- type: accuracy
|
| 341 |
+
value: 67.40080699394755
|
| 342 |
+
- type: f1
|
| 343 |
+
value: 65.60793135686376
|
| 344 |
+
- task:
|
| 345 |
+
type: Classification
|
| 346 |
+
dataset:
|
| 347 |
+
type: mteb/amazon_massive_scenario
|
| 348 |
+
name: MTEB MassiveScenarioClassification (fr)
|
| 349 |
+
config: fr
|
| 350 |
+
split: test
|
| 351 |
+
revision: 7d571f92784cd94a019292a1f45445077d0ef634
|
| 352 |
+
metrics:
|
| 353 |
+
- type: accuracy
|
| 354 |
+
value: 71.29455279085406
|
| 355 |
+
- type: f1
|
| 356 |
+
value: 70.80876673828983
|
| 357 |
+
- task:
|
| 358 |
+
type: Retrieval
|
| 359 |
+
dataset:
|
| 360 |
+
type: jinaai/mintakaqa
|
| 361 |
+
name: MTEB MintakaRetrieval (fr)
|
| 362 |
+
config: fr
|
| 363 |
+
split: test
|
| 364 |
+
revision: efa78cc2f74bbcd21eff2261f9e13aebe40b814e
|
| 365 |
+
metrics:
|
| 366 |
+
- type: map_at_1
|
| 367 |
+
value: 16.625999999999998
|
| 368 |
+
- type: map_at_10
|
| 369 |
+
value: 25.224999999999998
|
| 370 |
+
- type: map_at_100
|
| 371 |
+
value: 26.291999999999998
|
| 372 |
+
- type: map_at_1000
|
| 373 |
+
value: 26.395000000000003
|
| 374 |
+
- type: map_at_3
|
| 375 |
+
value: 22.378999999999998
|
| 376 |
+
- type: map_at_5
|
| 377 |
+
value: 24.009
|
| 378 |
+
- type: mrr_at_1
|
| 379 |
+
value: 16.625999999999998
|
| 380 |
+
- type: mrr_at_10
|
| 381 |
+
value: 25.224999999999998
|
| 382 |
+
- type: mrr_at_100
|
| 383 |
+
value: 26.291999999999998
|
| 384 |
+
- type: mrr_at_1000
|
| 385 |
+
value: 26.395000000000003
|
| 386 |
+
- type: mrr_at_3
|
| 387 |
+
value: 22.378999999999998
|
| 388 |
+
- type: mrr_at_5
|
| 389 |
+
value: 24.009
|
| 390 |
+
- type: ndcg_at_1
|
| 391 |
+
value: 16.625999999999998
|
| 392 |
+
- type: ndcg_at_10
|
| 393 |
+
value: 30.074
|
| 394 |
+
- type: ndcg_at_100
|
| 395 |
+
value: 35.683
|
| 396 |
+
- type: ndcg_at_1000
|
| 397 |
+
value: 38.714999999999996
|
| 398 |
+
- type: ndcg_at_3
|
| 399 |
+
value: 24.188000000000002
|
| 400 |
+
- type: ndcg_at_5
|
| 401 |
+
value: 27.124
|
| 402 |
+
- type: precision_at_1
|
| 403 |
+
value: 16.625999999999998
|
| 404 |
+
- type: precision_at_10
|
| 405 |
+
value: 4.566
|
| 406 |
+
- type: precision_at_100
|
| 407 |
+
value: 0.729
|
| 408 |
+
- type: precision_at_1000
|
| 409 |
+
value: 0.097
|
| 410 |
+
- type: precision_at_3
|
| 411 |
+
value: 9.801
|
| 412 |
+
- type: precision_at_5
|
| 413 |
+
value: 7.305000000000001
|
| 414 |
+
- type: recall_at_1
|
| 415 |
+
value: 16.625999999999998
|
| 416 |
+
- type: recall_at_10
|
| 417 |
+
value: 45.659
|
| 418 |
+
- type: recall_at_100
|
| 419 |
+
value: 72.85000000000001
|
| 420 |
+
- type: recall_at_1000
|
| 421 |
+
value: 97.42
|
| 422 |
+
- type: recall_at_3
|
| 423 |
+
value: 29.402
|
| 424 |
+
- type: recall_at_5
|
| 425 |
+
value: 36.527
|
| 426 |
+
- task:
|
| 427 |
+
type: PairClassification
|
| 428 |
+
dataset:
|
| 429 |
+
type: GEM/opusparcus
|
| 430 |
+
name: MTEB OpusparcusPC (fr)
|
| 431 |
+
config: fr
|
| 432 |
+
split: test
|
| 433 |
+
revision: 9e9b1f8ef51616073f47f306f7f47dd91663f86a
|
| 434 |
+
metrics:
|
| 435 |
+
- type: cos_sim_accuracy
|
| 436 |
+
value: 83.58310626702998
|
| 437 |
+
- type: cos_sim_ap
|
| 438 |
+
value: 94.01979957812989
|
| 439 |
+
- type: cos_sim_f1
|
| 440 |
+
value: 88.70135958743555
|
| 441 |
+
- type: cos_sim_precision
|
| 442 |
+
value: 84.01420959147424
|
| 443 |
+
- type: cos_sim_recall
|
| 444 |
+
value: 93.94240317775571
|
| 445 |
+
- type: dot_accuracy
|
| 446 |
+
value: 83.58310626702998
|
| 447 |
+
- type: dot_ap
|
| 448 |
+
value: 94.01979957812989
|
| 449 |
+
- type: dot_f1
|
| 450 |
+
value: 88.70135958743555
|
| 451 |
+
- type: dot_precision
|
| 452 |
+
value: 84.01420959147424
|
| 453 |
+
- type: dot_recall
|
| 454 |
+
value: 93.94240317775571
|
| 455 |
+
- type: euclidean_accuracy
|
| 456 |
+
value: 83.58310626702998
|
| 457 |
+
- type: euclidean_ap
|
| 458 |
+
value: 94.01979957812989
|
| 459 |
+
- type: euclidean_f1
|
| 460 |
+
value: 88.70135958743555
|
| 461 |
+
- type: euclidean_precision
|
| 462 |
+
value: 84.01420959147424
|
| 463 |
+
- type: euclidean_recall
|
| 464 |
+
value: 93.94240317775571
|
| 465 |
+
- type: manhattan_accuracy
|
| 466 |
+
value: 83.58310626702998
|
| 467 |
+
- type: manhattan_ap
|
| 468 |
+
value: 93.99936024003892
|
| 469 |
+
- type: manhattan_f1
|
| 470 |
+
value: 88.6924150767799
|
| 471 |
+
- type: manhattan_precision
|
| 472 |
+
value: 83.45008756567425
|
| 473 |
+
- type: manhattan_recall
|
| 474 |
+
value: 94.63753723932473
|
| 475 |
+
- type: max_accuracy
|
| 476 |
+
value: 83.58310626702998
|
| 477 |
+
- type: max_ap
|
| 478 |
+
value: 94.01979957812989
|
| 479 |
+
- type: max_f1
|
| 480 |
+
value: 88.70135958743555
|
| 481 |
+
- task:
|
| 482 |
+
type: PairClassification
|
| 483 |
+
dataset:
|
| 484 |
+
type: paws-x
|
| 485 |
+
name: MTEB PawsX (fr)
|
| 486 |
+
config: fr
|
| 487 |
+
split: test
|
| 488 |
+
revision: 8a04d940a42cd40658986fdd8e3da561533a3646
|
| 489 |
+
metrics:
|
| 490 |
+
- type: cos_sim_accuracy
|
| 491 |
+
value: 60.6
|
| 492 |
+
- type: cos_sim_ap
|
| 493 |
+
value: 60.18915797975459
|
| 494 |
+
- type: cos_sim_f1
|
| 495 |
+
value: 62.491349480968864
|
| 496 |
+
- type: cos_sim_precision
|
| 497 |
+
value: 45.44539506794162
|
| 498 |
+
- type: cos_sim_recall
|
| 499 |
+
value: 100
|
| 500 |
+
- type: dot_accuracy
|
| 501 |
+
value: 60.6
|
| 502 |
+
- type: dot_ap
|
| 503 |
+
value: 60.091135216056024
|
| 504 |
+
- type: dot_f1
|
| 505 |
+
value: 62.491349480968864
|
| 506 |
+
- type: dot_precision
|
| 507 |
+
value: 45.44539506794162
|
| 508 |
+
- type: dot_recall
|
| 509 |
+
value: 100
|
| 510 |
+
- type: euclidean_accuracy
|
| 511 |
+
value: 60.6
|
| 512 |
+
- type: euclidean_ap
|
| 513 |
+
value: 60.18915797975459
|
| 514 |
+
- type: euclidean_f1
|
| 515 |
+
value: 62.491349480968864
|
| 516 |
+
- type: euclidean_precision
|
| 517 |
+
value: 45.44539506794162
|
| 518 |
+
- type: euclidean_recall
|
| 519 |
+
value: 100
|
| 520 |
+
- type: manhattan_accuracy
|
| 521 |
+
value: 60.650000000000006
|
| 522 |
+
- type: manhattan_ap
|
| 523 |
+
value: 60.2082343915352
|
| 524 |
+
- type: manhattan_f1
|
| 525 |
+
value: 62.491349480968864
|
| 526 |
+
- type: manhattan_precision
|
| 527 |
+
value: 45.44539506794162
|
| 528 |
+
- type: manhattan_recall
|
| 529 |
+
value: 100
|
| 530 |
+
- type: max_accuracy
|
| 531 |
+
value: 60.650000000000006
|
| 532 |
+
- type: max_ap
|
| 533 |
+
value: 60.2082343915352
|
| 534 |
+
- type: max_f1
|
| 535 |
+
value: 62.491349480968864
|
| 536 |
+
- task:
|
| 537 |
+
type: STS
|
| 538 |
+
dataset:
|
| 539 |
+
type: Lajavaness/SICK-fr
|
| 540 |
+
name: MTEB SICKFr
|
| 541 |
+
config: default
|
| 542 |
+
split: test
|
| 543 |
+
revision: e077ab4cf4774a1e36d86d593b150422fafd8e8a
|
| 544 |
+
metrics:
|
| 545 |
+
- type: cos_sim_pearson
|
| 546 |
+
value: 79.77067200230256
|
| 547 |
+
- type: cos_sim_spearman
|
| 548 |
+
value: 76.7445532523278
|
| 549 |
+
- type: euclidean_pearson
|
| 550 |
+
value: 76.34017074673956
|
| 551 |
+
- type: euclidean_spearman
|
| 552 |
+
value: 76.7453011027832
|
| 553 |
+
- type: manhattan_pearson
|
| 554 |
+
value: 76.19578084197778
|
| 555 |
+
- type: manhattan_spearman
|
| 556 |
+
value: 76.56293456459228
|
| 557 |
+
- task:
|
| 558 |
+
type: STS
|
| 559 |
+
dataset:
|
| 560 |
+
type: mteb/sts22-crosslingual-sts
|
| 561 |
+
name: MTEB STS22 (fr)
|
| 562 |
+
config: fr
|
| 563 |
+
split: test
|
| 564 |
+
revision: eea2b4fe26a775864c896887d910b76a8098ad3f
|
| 565 |
+
metrics:
|
| 566 |
+
- type: cos_sim_pearson
|
| 567 |
+
value: 81.2564160237984
|
| 568 |
+
- type: cos_sim_spearman
|
| 569 |
+
value: 83.30552085410882
|
| 570 |
+
- type: euclidean_pearson
|
| 571 |
+
value: 82.00494560507786
|
| 572 |
+
- type: euclidean_spearman
|
| 573 |
+
value: 83.30552085410882
|
| 574 |
+
- type: manhattan_pearson
|
| 575 |
+
value: 81.93132229157803
|
| 576 |
+
- type: manhattan_spearman
|
| 577 |
+
value: 83.04357992939353
|
| 578 |
+
- task:
|
| 579 |
+
type: STS
|
| 580 |
+
dataset:
|
| 581 |
+
type: stsb_multi_mt
|
| 582 |
+
name: MTEB STSBenchmarkMultilingualSTS (fr)
|
| 583 |
+
config: fr
|
| 584 |
+
split: test
|
| 585 |
+
revision: 93d57ef91790589e3ce9c365164337a8a78b7632
|
| 586 |
+
metrics:
|
| 587 |
+
- type: cos_sim_pearson
|
| 588 |
+
value: 80.34931905288978
|
| 589 |
+
- type: cos_sim_spearman
|
| 590 |
+
value: 79.99372771100049
|
| 591 |
+
- type: euclidean_pearson
|
| 592 |
+
value: 78.37976845123443
|
| 593 |
+
- type: euclidean_spearman
|
| 594 |
+
value: 79.99452356550658
|
| 595 |
+
- type: manhattan_pearson
|
| 596 |
+
value: 78.24434042082316
|
| 597 |
+
- type: manhattan_spearman
|
| 598 |
+
value: 79.87248340061164
|
| 599 |
+
- task:
|
| 600 |
+
type: Summarization
|
| 601 |
+
dataset:
|
| 602 |
+
type: lyon-nlp/summarization-summeval-fr-p2p
|
| 603 |
+
name: MTEB SummEvalFr
|
| 604 |
+
config: default
|
| 605 |
+
split: test
|
| 606 |
+
revision: b385812de6a9577b6f4d0f88c6a6e35395a94054
|
| 607 |
+
metrics:
|
| 608 |
+
- type: cos_sim_pearson
|
| 609 |
+
value: 30.476001473421586
|
| 610 |
+
- type: cos_sim_spearman
|
| 611 |
+
value: 29.687350195905456
|
| 612 |
+
- type: dot_pearson
|
| 613 |
+
value: 30.476000875190685
|
| 614 |
+
- type: dot_spearman
|
| 615 |
+
value: 29.662224660056562
|
| 616 |
+
- task:
|
| 617 |
+
type: Reranking
|
| 618 |
+
dataset:
|
| 619 |
+
type: lyon-nlp/mteb-fr-reranking-syntec-s2p
|
| 620 |
+
name: MTEB SyntecReranking
|
| 621 |
+
config: default
|
| 622 |
+
split: test
|
| 623 |
+
revision: b205c5084a0934ce8af14338bf03feb19499c84d
|
| 624 |
+
metrics:
|
| 625 |
+
- type: map
|
| 626 |
+
value: 88.28333333333333
|
| 627 |
+
- type: mrr
|
| 628 |
+
value: 88.28333333333333
|
| 629 |
+
- task:
|
| 630 |
+
type: Retrieval
|
| 631 |
+
dataset:
|
| 632 |
+
type: lyon-nlp/mteb-fr-retrieval-syntec-s2p
|
| 633 |
+
name: MTEB SyntecRetrieval
|
| 634 |
+
config: default
|
| 635 |
+
split: test
|
| 636 |
+
revision: 77f7e271bf4a92b24fce5119f3486b583ca016ff
|
| 637 |
+
metrics:
|
| 638 |
+
- type: map_at_1
|
| 639 |
+
value: 69
|
| 640 |
+
- type: map_at_10
|
| 641 |
+
value: 79.906
|
| 642 |
+
- type: map_at_100
|
| 643 |
+
value: 79.982
|
| 644 |
+
- type: map_at_1000
|
| 645 |
+
value: 79.982
|
| 646 |
+
- type: map_at_3
|
| 647 |
+
value: 77.667
|
| 648 |
+
- type: map_at_5
|
| 649 |
+
value: 79.51700000000001
|
| 650 |
+
- type: mrr_at_1
|
| 651 |
+
value: 69
|
| 652 |
+
- type: mrr_at_10
|
| 653 |
+
value: 79.906
|
| 654 |
+
- type: mrr_at_100
|
| 655 |
+
value: 79.982
|
| 656 |
+
- type: mrr_at_1000
|
| 657 |
+
value: 79.982
|
| 658 |
+
- type: mrr_at_3
|
| 659 |
+
value: 77.667
|
| 660 |
+
- type: mrr_at_5
|
| 661 |
+
value: 79.51700000000001
|
| 662 |
+
- type: ndcg_at_1
|
| 663 |
+
value: 69
|
| 664 |
+
- type: ndcg_at_10
|
| 665 |
+
value: 84.60499999999999
|
| 666 |
+
- type: ndcg_at_100
|
| 667 |
+
value: 84.868
|
| 668 |
+
- type: ndcg_at_1000
|
| 669 |
+
value: 84.868
|
| 670 |
+
- type: ndcg_at_3
|
| 671 |
+
value: 80.333
|
| 672 |
+
- type: ndcg_at_5
|
| 673 |
+
value: 83.647
|
| 674 |
+
- type: precision_at_1
|
| 675 |
+
value: 69
|
| 676 |
+
- type: precision_at_10
|
| 677 |
+
value: 9.9
|
| 678 |
+
- type: precision_at_100
|
| 679 |
+
value: 1
|
| 680 |
+
- type: precision_at_1000
|
| 681 |
+
value: 0.1
|
| 682 |
+
- type: precision_at_3
|
| 683 |
+
value: 29.333
|
| 684 |
+
- type: precision_at_5
|
| 685 |
+
value: 19.2
|
| 686 |
+
- type: recall_at_1
|
| 687 |
+
value: 69
|
| 688 |
+
- type: recall_at_10
|
| 689 |
+
value: 99
|
| 690 |
+
- type: recall_at_100
|
| 691 |
+
value: 100
|
| 692 |
+
- type: recall_at_1000
|
| 693 |
+
value: 100
|
| 694 |
+
- type: recall_at_3
|
| 695 |
+
value: 88
|
| 696 |
+
- type: recall_at_5
|
| 697 |
+
value: 96
|
| 698 |
+
- task:
|
| 699 |
+
type: Retrieval
|
| 700 |
+
dataset:
|
| 701 |
+
type: jinaai/xpqa
|
| 702 |
+
name: MTEB XPQARetrieval (fr)
|
| 703 |
+
config: fr
|
| 704 |
+
split: test
|
| 705 |
+
revision: c99d599f0a6ab9b85b065da6f9d94f9cf731679f
|
| 706 |
+
metrics:
|
| 707 |
+
- type: map_at_1
|
| 708 |
+
value: 42.027
|
| 709 |
+
- type: map_at_10
|
| 710 |
+
value: 64.331
|
| 711 |
+
- type: map_at_100
|
| 712 |
+
value: 65.657
|
| 713 |
+
- type: map_at_1000
|
| 714 |
+
value: 65.7
|
| 715 |
+
- type: map_at_3
|
| 716 |
+
value: 57.967999999999996
|
| 717 |
+
- type: map_at_5
|
| 718 |
+
value: 62.33800000000001
|
| 719 |
+
- type: mrr_at_1
|
| 720 |
+
value: 65.688
|
| 721 |
+
- type: mrr_at_10
|
| 722 |
+
value: 72.263
|
| 723 |
+
- type: mrr_at_100
|
| 724 |
+
value: 72.679
|
| 725 |
+
- type: mrr_at_1000
|
| 726 |
+
value: 72.69099999999999
|
| 727 |
+
- type: mrr_at_3
|
| 728 |
+
value: 70.405
|
| 729 |
+
- type: mrr_at_5
|
| 730 |
+
value: 71.587
|
| 731 |
+
- type: ndcg_at_1
|
| 732 |
+
value: 65.688
|
| 733 |
+
- type: ndcg_at_10
|
| 734 |
+
value: 70.221
|
| 735 |
+
- type: ndcg_at_100
|
| 736 |
+
value: 74.457
|
| 737 |
+
- type: ndcg_at_1000
|
| 738 |
+
value: 75.178
|
| 739 |
+
- type: ndcg_at_3
|
| 740 |
+
value: 65.423
|
| 741 |
+
- type: ndcg_at_5
|
| 742 |
+
value: 67.05499999999999
|
| 743 |
+
- type: precision_at_1
|
| 744 |
+
value: 65.688
|
| 745 |
+
- type: precision_at_10
|
| 746 |
+
value: 16.208
|
| 747 |
+
- type: precision_at_100
|
| 748 |
+
value: 1.975
|
| 749 |
+
- type: precision_at_1000
|
| 750 |
+
value: 0.207
|
| 751 |
+
- type: precision_at_3
|
| 752 |
+
value: 39.831
|
| 753 |
+
- type: precision_at_5
|
| 754 |
+
value: 28.652
|
| 755 |
+
- type: recall_at_1
|
| 756 |
+
value: 42.027
|
| 757 |
+
- type: recall_at_10
|
| 758 |
+
value: 78.803
|
| 759 |
+
- type: recall_at_100
|
| 760 |
+
value: 95.051
|
| 761 |
+
- type: recall_at_1000
|
| 762 |
+
value: 99.75500000000001
|
| 763 |
+
- type: recall_at_3
|
| 764 |
+
value: 62.62799999999999
|
| 765 |
+
- type: recall_at_5
|
| 766 |
+
value: 70.975
|
| 767 |
+
license: mit
|
| 768 |
+
language:
|
| 769 |
+
- fr
|
| 770 |
+
---
|
| 771 |
+
|
| 772 |
+
# Solon Embeddings — large 0.1
|
| 773 |
+
|
| 774 |
+
SOTA Open source french embedding model.
|
| 775 |
+
|
| 776 |
+
**Instructions :**
|
| 777 |
+
Add "query : " before the *query* to retrieve to increase performance of retrieval.
|
| 778 |
+
No instructions needed for *passages*.
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
| Model | Mean Score |
|
| 782 |
+
| --- | --- |
|
| 783 |
+
| **OrdalieTech/Solon-embeddings-large-0.1** | 0.7490 |
|
| 784 |
+
| cohere/embed-multilingual-v3 | 0.7402 |
|
| 785 |
+
| **OrdalieTech/Solon-embeddings-base-0.1** | 0.7306 |
|
| 786 |
+
| openai/ada-002 | 0.7290 |
|
| 787 |
+
| cohere/embed-multilingual-light-v3 | 0.6945 |
|
| 788 |
+
| antoinelouis/biencoder-camembert-base-mmarcoFR | 0.6826 |
|
| 789 |
+
| dangvantuan/sentence-camembert-large | 0.6756 |
|
| 790 |
+
| voyage/voyage-01 | 0.6753 |
|
| 791 |
+
| intfloat/multilingual-e5-large | 0.6660 |
|
| 792 |
+
| intfloat/multilingual-e5-base | 0.6597 |
|
| 793 |
+
| Sbert/paraphrase-multilingual-mpnet-base-v2 | 0.5975 |
|
| 794 |
+
| dangvantuan/sentence-camembert-base | 0.5456 |
|
| 795 |
+
| EuropeanParliament/eubert_embedding_v1 | 0.5063 |
|
| 796 |
+
|
| 797 |
+
These results have been obtained through 9 french benchmarks on a variety of text similarity tasks (classification, reranking, STS) :
|
| 798 |
+
- AmazonReviewsClassification (MTEB)
|
| 799 |
+
- MassiveIntentClassification (MTEB)
|
| 800 |
+
- MassiveScenarioClassification (MTEB)
|
| 801 |
+
- MTOPDomainClassification (MTEB)
|
| 802 |
+
- MTOPIntentClassification (MTEB)
|
| 803 |
+
- STS22 (MTEB)
|
| 804 |
+
- MiraclFRRerank (Miracl)
|
| 805 |
+
- OrdalieFRSTS (Ordalie)
|
| 806 |
+
- OrdalieFRReranking (Ordalie)
|
| 807 |
+
|
| 808 |
+
We created OrdalieFRSTS and OrdalieFRReranking to enhance the benchmarking capabilities of French STS and reranking assessments.
|
| 809 |
+
|
| 810 |
+
(evaluation script available here : github.com/OrdalieTech/mteb)
|
victord/sub19/models/Solon-embeddings-large-0.1/config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 4096,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"max_position_embeddings": 514,
|
| 16 |
+
"model_type": "xlm-roberta",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 24,
|
| 19 |
+
"output_past": true,
|
| 20 |
+
"pad_token_id": 1,
|
| 21 |
+
"position_embedding_type": "absolute",
|
| 22 |
+
"torch_dtype": "bfloat16",
|
| 23 |
+
"transformers_version": "4.53.2",
|
| 24 |
+
"type_vocab_size": 1,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"vocab_size": 250002
|
| 27 |
+
}
|
victord/sub19/models/Solon-embeddings-large-0.1/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "5.1.1",
|
| 4 |
+
"transformers": "4.53.2",
|
| 5 |
+
"pytorch": "2.9.0+cu128"
|
| 6 |
+
},
|
| 7 |
+
"model_type": "SentenceTransformer",
|
| 8 |
+
"prompts": {
|
| 9 |
+
"query": "",
|
| 10 |
+
"document": ""
|
| 11 |
+
},
|
| 12 |
+
"default_prompt_name": null,
|
| 13 |
+
"similarity_fn_name": "cosine"
|
| 14 |
+
}
|
victord/sub19/models/Solon-embeddings-large-0.1/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93f3e71f01d33409b21e3c4ff37d249cb3c27c883b24a90afd352fb95cbd67fe
|
| 3 |
+
size 1119826072
|
victord/sub19/models/Solon-embeddings-large-0.1/modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
victord/sub19/models/Solon-embeddings-large-0.1/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 512,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
victord/sub19/models/Solon-embeddings-large-0.1/sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
victord/sub19/models/Solon-embeddings-large-0.1/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:249df0778f236f6ece390de0de746838ef25b9d6954b68c2ee71249e0a9d8fd4
|
| 3 |
+
size 17082799
|
victord/sub19/models/Solon-embeddings-large-0.1/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"model_max_length": 512,
|
| 51 |
+
"pad_token": "<pad>",
|
| 52 |
+
"sep_token": "</s>",
|
| 53 |
+
"sp_model_kwargs": {},
|
| 54 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 55 |
+
"unk_token": "<unk>"
|
| 56 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/README.md
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: sentence-transformers
|
| 3 |
+
pipeline_tag: sentence-similarity
|
| 4 |
+
tags:
|
| 5 |
+
- sentence-transformers
|
| 6 |
+
- feature-extraction
|
| 7 |
+
- sentence-similarity
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# {MODEL_NAME}
|
| 12 |
+
|
| 13 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 1024 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
| 14 |
+
|
| 15 |
+
<!--- Describe your model here -->
|
| 16 |
+
|
| 17 |
+
## Usage (Sentence-Transformers)
|
| 18 |
+
|
| 19 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
| 20 |
+
|
| 21 |
+
```
|
| 22 |
+
pip install -U sentence-transformers
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
Then you can use the model like this:
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
from sentence_transformers import SentenceTransformer
|
| 29 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
| 30 |
+
|
| 31 |
+
model = SentenceTransformer('{MODEL_NAME}')
|
| 32 |
+
embeddings = model.encode(sentences)
|
| 33 |
+
print(embeddings)
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
## Evaluation Results
|
| 39 |
+
|
| 40 |
+
<!--- Describe how your model was evaluated -->
|
| 41 |
+
|
| 42 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
## Training
|
| 46 |
+
The model was trained with the parameters:
|
| 47 |
+
|
| 48 |
+
**DataLoader**:
|
| 49 |
+
|
| 50 |
+
`torch.utils.data.dataloader.DataLoader` of length 874 with parameters:
|
| 51 |
+
```
|
| 52 |
+
{'batch_size': None, 'sampler': 'torch.utils.data.sampler.SequentialSampler', 'batch_sampler': 'utils.UniqueQuestionSampler'}
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
**Loss**:
|
| 56 |
+
|
| 57 |
+
`sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
|
| 58 |
+
```
|
| 59 |
+
{'scale': 100, 'similarity_fct': 'cos_sim'}
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
Parameters of the fit()-Method:
|
| 63 |
+
```
|
| 64 |
+
{
|
| 65 |
+
"epochs": 10,
|
| 66 |
+
"evaluation_steps": 0,
|
| 67 |
+
"evaluator": "utils.CustomInformationRetrievalEvaluator",
|
| 68 |
+
"max_grad_norm": 1,
|
| 69 |
+
"optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
|
| 70 |
+
"optimizer_params": {
|
| 71 |
+
"lr": 1e-05,
|
| 72 |
+
"weight_decay": 0.01
|
| 73 |
+
},
|
| 74 |
+
"scheduler": "WarmupLinear",
|
| 75 |
+
"steps_per_epoch": null,
|
| 76 |
+
"warmup_steps": 243,
|
| 77 |
+
"weight_decay": 0.01
|
| 78 |
+
}
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
## Full Model Architecture
|
| 83 |
+
```
|
| 84 |
+
SentenceTransformer(
|
| 85 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
|
| 86 |
+
(1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
| 87 |
+
(2): Normalize()
|
| 88 |
+
)
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## Citing & Authors
|
| 92 |
+
|
| 93 |
+
<!--- Describe where people can find more information -->
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 4096,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"max_position_embeddings": 514,
|
| 16 |
+
"model_type": "xlm-roberta",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 24,
|
| 19 |
+
"output_past": true,
|
| 20 |
+
"pad_token_id": 1,
|
| 21 |
+
"position_embedding_type": "absolute",
|
| 22 |
+
"torch_dtype": "bfloat16",
|
| 23 |
+
"transformers_version": "4.53.2",
|
| 24 |
+
"type_vocab_size": 1,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"vocab_size": 250002
|
| 27 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "5.1.2",
|
| 4 |
+
"transformers": "4.53.2",
|
| 5 |
+
"pytorch": "2.9.0+cu128"
|
| 6 |
+
},
|
| 7 |
+
"prompts": {
|
| 8 |
+
"query": "",
|
| 9 |
+
"document": ""
|
| 10 |
+
},
|
| 11 |
+
"default_prompt_name": null,
|
| 12 |
+
"model_type": "SentenceTransformer",
|
| 13 |
+
"similarity_fn_name": "cosine"
|
| 14 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c5ecf0a1c5a72fb78c91dfd5b0c7f16145ca5fe2e5c4d3e0870b73fef7f6e99
|
| 3 |
+
size 1119826072
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 512,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
|
| 3 |
+
size 17082987
|
victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer_config.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"max_length": 512,
|
| 51 |
+
"model_max_length": 512,
|
| 52 |
+
"pad_to_multiple_of": null,
|
| 53 |
+
"pad_token": "<pad>",
|
| 54 |
+
"pad_token_type_id": 0,
|
| 55 |
+
"padding_side": "right",
|
| 56 |
+
"sep_token": "</s>",
|
| 57 |
+
"stride": 0,
|
| 58 |
+
"tokenizer_class": "XLMRobertaTokenizerFast",
|
| 59 |
+
"truncation_side": "right",
|
| 60 |
+
"truncation_strategy": "longest_first",
|
| 61 |
+
"unk_token": "<unk>"
|
| 62 |
+
}
|
victord/sub19/models/bge-m3/1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": true,
|
| 4 |
+
"pooling_mode_mean_tokens": false,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
victord/sub19/models/bge-m3/README.md
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: sentence-similarity
|
| 3 |
+
tags:
|
| 4 |
+
- sentence-transformers
|
| 5 |
+
- feature-extraction
|
| 6 |
+
- sentence-similarity
|
| 7 |
+
license: mit
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
For more details please refer to our github repo: https://github.com/FlagOpen/FlagEmbedding
|
| 11 |
+
|
| 12 |
+
# BGE-M3 ([paper](https://arxiv.org/pdf/2402.03216.pdf), [code](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3))
|
| 13 |
+
|
| 14 |
+
In this project, we introduce BGE-M3, which is distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
|
| 15 |
+
- Multi-Functionality: It can simultaneously perform the three common retrieval functionalities of embedding model: dense retrieval, multi-vector retrieval, and sparse retrieval.
|
| 16 |
+
- Multi-Linguality: It can support more than 100 working languages.
|
| 17 |
+
- Multi-Granularity: It is able to process inputs of different granularities, spanning from short sentences to long documents of up to 8192 tokens.
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
**Some suggestions for retrieval pipeline in RAG**
|
| 22 |
+
|
| 23 |
+
We recommend to use the following pipeline: hybrid retrieval + re-ranking.
|
| 24 |
+
- Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities.
|
| 25 |
+
A classic example: using both embedding retrieval and the BM25 algorithm.
|
| 26 |
+
Now, you can try to use BGE-M3, which supports both embedding and sparse retrieval.
|
| 27 |
+
This allows you to obtain token weights (similar to the BM25) without any additional cost when generate dense embeddings.
|
| 28 |
+
To use hybrid retrieval, you can refer to [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb
|
| 29 |
+
) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
|
| 30 |
+
|
| 31 |
+
- As cross-encoder models, re-ranker demonstrates higher accuracy than bi-encoder embedding model.
|
| 32 |
+
Utilizing the re-ranking model (e.g., [bge-reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker), [bge-reranker-v2](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/llm_reranker)) after retrieval can further filter the selected text.
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
## News:
|
| 36 |
+
- 2024/7/1: **We update the MIRACL evaluation results of BGE-M3**. To reproduce the new results, you can refer to: [bge-m3_miracl_2cr](https://huggingface.co/datasets/hanhainebula/bge-m3_miracl_2cr). We have also updated our [paper](https://arxiv.org/pdf/2402.03216) on arXiv.
|
| 37 |
+
<details>
|
| 38 |
+
<summary> Details </summary>
|
| 39 |
+
|
| 40 |
+
The previous test results were lower because we mistakenly removed the passages that have the same id as the query from the search results. After correcting this mistake, the overall performance of BGE-M3 on MIRACL is higher than the previous results, but the experimental conclusion remains unchanged. The other results are not affected by this mistake. To reproduce the previous lower results, you need to add the `--remove-query` parameter when using `pyserini.search.faiss` or `pyserini.search.lucene` to search the passages.
|
| 41 |
+
|
| 42 |
+
</details>
|
| 43 |
+
- 2024/3/20: **Thanks Milvus team!** Now you can use hybrid retrieval of bge-m3 in Milvus: [pymilvus/examples
|
| 44 |
+
/hello_hybrid_sparse_dense.py](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
|
| 45 |
+
- 2024/3/8: **Thanks for the [experimental results](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) from @[Yannael](https://huggingface.co/Yannael). In this benchmark, BGE-M3 achieves top performance in both English and other languages, surpassing models such as OpenAI.**
|
| 46 |
+
- 2024/3/2: Release unified fine-tuning [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune) and [data](https://huggingface.co/datasets/Shitao/bge-m3-data)
|
| 47 |
+
- 2024/2/6: We release the [MLDR](https://huggingface.co/datasets/Shitao/MLDR) (a long document retrieval dataset covering 13 languages) and [evaluation pipeline](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR).
|
| 48 |
+
- 2024/2/1: **Thanks for the excellent tool from Vespa.** You can easily use multiple modes of BGE-M3 following this [notebook](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
## Specs
|
| 52 |
+
|
| 53 |
+
- Model
|
| 54 |
+
|
| 55 |
+
| Model Name | Dimension | Sequence Length | Introduction |
|
| 56 |
+
|:----:|:---:|:---:|:---:|
|
| 57 |
+
| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | 1024 | 8192 | multilingual; unified fine-tuning (dense, sparse, and colbert) from bge-m3-unsupervised|
|
| 58 |
+
| [BAAI/bge-m3-unsupervised](https://huggingface.co/BAAI/bge-m3-unsupervised) | 1024 | 8192 | multilingual; contrastive learning from bge-m3-retromae |
|
| 59 |
+
| [BAAI/bge-m3-retromae](https://huggingface.co/BAAI/bge-m3-retromae) | -- | 8192 | multilingual; extend the max_length of [xlm-roberta](https://huggingface.co/FacebookAI/xlm-roberta-large) to 8192 and further pretrained via [retromae](https://github.com/staoxiao/RetroMAE)|
|
| 60 |
+
| [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) | 1024 | 512 | English model |
|
| 61 |
+
| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 768 | 512 | English model |
|
| 62 |
+
| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | 384 | 512 | English model |
|
| 63 |
+
|
| 64 |
+
- Data
|
| 65 |
+
|
| 66 |
+
| Dataset | Introduction |
|
| 67 |
+
|:----------------------------------------------------------:|:-------------------------------------------------:|
|
| 68 |
+
| [MLDR](https://huggingface.co/datasets/Shitao/MLDR) | Docuemtn Retrieval Dataset, covering 13 languages |
|
| 69 |
+
| [bge-m3-data](https://huggingface.co/datasets/Shitao/bge-m3-data) | Fine-tuning data used by bge-m3 |
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
## FAQ
|
| 74 |
+
|
| 75 |
+
**1. Introduction for different retrieval methods**
|
| 76 |
+
|
| 77 |
+
- Dense retrieval: map the text into a single embedding, e.g., [DPR](https://arxiv.org/abs/2004.04906), [BGE-v1.5](https://github.com/FlagOpen/FlagEmbedding)
|
| 78 |
+
- Sparse retrieval (lexical matching): a vector of size equal to the vocabulary, with the majority of positions set to zero, calculating a weight only for tokens present in the text. e.g., BM25, [unicoil](https://arxiv.org/pdf/2106.14807.pdf), and [splade](https://arxiv.org/abs/2107.05720)
|
| 79 |
+
- Multi-vector retrieval: use multiple vectors to represent a text, e.g., [ColBERT](https://arxiv.org/abs/2004.12832).
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
**2. How to use BGE-M3 in other projects?**
|
| 83 |
+
|
| 84 |
+
For embedding retrieval, you can employ the BGE-M3 model using the same approach as BGE.
|
| 85 |
+
The only difference is that the BGE-M3 model no longer requires adding instructions to the queries.
|
| 86 |
+
|
| 87 |
+
For hybrid retrieval, you can use [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb
|
| 88 |
+
) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
**3. How to fine-tune bge-M3 model?**
|
| 92 |
+
|
| 93 |
+
You can follow the common in this [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune)
|
| 94 |
+
to fine-tune the dense embedding.
|
| 95 |
+
|
| 96 |
+
If you want to fine-tune all embedding function of m3 (dense, sparse and colbert), you can refer to the [unified_fine-tuning example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
## Usage
|
| 104 |
+
|
| 105 |
+
Install:
|
| 106 |
+
```
|
| 107 |
+
git clone https://github.com/FlagOpen/FlagEmbedding.git
|
| 108 |
+
cd FlagEmbedding
|
| 109 |
+
pip install -e .
|
| 110 |
+
```
|
| 111 |
+
or:
|
| 112 |
+
```
|
| 113 |
+
pip install -U FlagEmbedding
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
### Generate Embedding for text
|
| 119 |
+
|
| 120 |
+
- Dense Embedding
|
| 121 |
+
```python
|
| 122 |
+
from FlagEmbedding import BGEM3FlagModel
|
| 123 |
+
|
| 124 |
+
model = BGEM3FlagModel('BAAI/bge-m3',
|
| 125 |
+
use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
|
| 126 |
+
|
| 127 |
+
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
|
| 128 |
+
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
|
| 129 |
+
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
|
| 130 |
+
|
| 131 |
+
embeddings_1 = model.encode(sentences_1,
|
| 132 |
+
batch_size=12,
|
| 133 |
+
max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
|
| 134 |
+
)['dense_vecs']
|
| 135 |
+
embeddings_2 = model.encode(sentences_2)['dense_vecs']
|
| 136 |
+
similarity = embeddings_1 @ embeddings_2.T
|
| 137 |
+
print(similarity)
|
| 138 |
+
# [[0.6265, 0.3477], [0.3499, 0.678 ]]
|
| 139 |
+
```
|
| 140 |
+
You also can use sentence-transformers and huggingface transformers to generate dense embeddings.
|
| 141 |
+
Refer to [baai_general_embedding](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/baai_general_embedding#usage) for details.
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
- Sparse Embedding (Lexical Weight)
|
| 145 |
+
```python
|
| 146 |
+
from FlagEmbedding import BGEM3FlagModel
|
| 147 |
+
|
| 148 |
+
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
|
| 149 |
+
|
| 150 |
+
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
|
| 151 |
+
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
|
| 152 |
+
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
|
| 153 |
+
|
| 154 |
+
output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=False)
|
| 155 |
+
output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=False)
|
| 156 |
+
|
| 157 |
+
# you can see the weight for each token:
|
| 158 |
+
print(model.convert_id_to_token(output_1['lexical_weights']))
|
| 159 |
+
# [{'What': 0.08356, 'is': 0.0814, 'B': 0.1296, 'GE': 0.252, 'M': 0.1702, '3': 0.2695, '?': 0.04092},
|
| 160 |
+
# {'De': 0.05005, 'fin': 0.1368, 'ation': 0.04498, 'of': 0.0633, 'BM': 0.2515, '25': 0.3335}]
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# compute the scores via lexical mathcing
|
| 164 |
+
lexical_scores = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])
|
| 165 |
+
print(lexical_scores)
|
| 166 |
+
# 0.19554901123046875
|
| 167 |
+
|
| 168 |
+
print(model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_1['lexical_weights'][1]))
|
| 169 |
+
# 0.0
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
- Multi-Vector (ColBERT)
|
| 173 |
+
```python
|
| 174 |
+
from FlagEmbedding import BGEM3FlagModel
|
| 175 |
+
|
| 176 |
+
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
|
| 177 |
+
|
| 178 |
+
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
|
| 179 |
+
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
|
| 180 |
+
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
|
| 181 |
+
|
| 182 |
+
output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)
|
| 183 |
+
output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)
|
| 184 |
+
|
| 185 |
+
print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]))
|
| 186 |
+
print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]))
|
| 187 |
+
# 0.7797
|
| 188 |
+
# 0.4620
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
### Compute score for text pairs
|
| 193 |
+
Input a list of text pairs, you can get the scores computed by different methods.
|
| 194 |
+
```python
|
| 195 |
+
from FlagEmbedding import BGEM3FlagModel
|
| 196 |
+
|
| 197 |
+
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
|
| 198 |
+
|
| 199 |
+
sentences_1 = ["What is BGE M3?", "Defination of BM25"]
|
| 200 |
+
sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
|
| 201 |
+
"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
|
| 202 |
+
|
| 203 |
+
sentence_pairs = [[i,j] for i in sentences_1 for j in sentences_2]
|
| 204 |
+
|
| 205 |
+
print(model.compute_score(sentence_pairs,
|
| 206 |
+
max_passage_length=128, # a smaller max length leads to a lower latency
|
| 207 |
+
weights_for_different_modes=[0.4, 0.2, 0.4])) # weights_for_different_modes(w) is used to do weighted sum: w[0]*dense_score + w[1]*sparse_score + w[2]*colbert_score
|
| 208 |
+
|
| 209 |
+
# {
|
| 210 |
+
# 'colbert': [0.7796499729156494, 0.4621465802192688, 0.4523794651031494, 0.7898575067520142],
|
| 211 |
+
# 'sparse': [0.195556640625, 0.00879669189453125, 0.0, 0.1802978515625],
|
| 212 |
+
# 'dense': [0.6259765625, 0.347412109375, 0.349853515625, 0.67822265625],
|
| 213 |
+
# 'sparse+dense': [0.482503205537796, 0.23454029858112335, 0.2332356721162796, 0.5122477412223816],
|
| 214 |
+
# 'colbert+sparse+dense': [0.6013619303703308, 0.3255828022956848, 0.32089319825172424, 0.6232916116714478]
|
| 215 |
+
# }
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
## Evaluation
|
| 222 |
+
|
| 223 |
+
We provide the evaluation script for [MKQA](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MKQA) and [MLDR](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR)
|
| 224 |
+
|
| 225 |
+
### Benchmarks from the open-source community
|
| 226 |
+

|
| 227 |
+
The BGE-M3 model emerged as the top performer on this benchmark (OAI is short for OpenAI).
|
| 228 |
+
For more details, please refer to the [article](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) and [Github Repo](https://github.com/Yannael/multilingual-embeddings)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
### Our results
|
| 232 |
+
- Multilingual (Miracl dataset)
|
| 233 |
+
|
| 234 |
+

|
| 235 |
+
|
| 236 |
+
- Cross-lingual (MKQA dataset)
|
| 237 |
+
|
| 238 |
+

|
| 239 |
+
|
| 240 |
+
- Long Document Retrieval
|
| 241 |
+
- MLDR:
|
| 242 |
+

|
| 243 |
+
Please note that [MLDR](https://huggingface.co/datasets/Shitao/MLDR) is a document retrieval dataset we constructed via LLM,
|
| 244 |
+
covering 13 languages, including test set, validation set, and training set.
|
| 245 |
+
We utilized the training set from MLDR to enhance the model's long document retrieval capabilities.
|
| 246 |
+
Therefore, comparing baselines with `Dense w.o.long`(fine-tuning without long document dataset) is more equitable.
|
| 247 |
+
Additionally, this long document retrieval dataset will be open-sourced to address the current lack of open-source multilingual long text retrieval datasets.
|
| 248 |
+
We believe that this data will be helpful for the open-source community in training document retrieval models.
|
| 249 |
+
|
| 250 |
+
- NarritiveQA:
|
| 251 |
+

|
| 252 |
+
|
| 253 |
+
- Comparison with BM25
|
| 254 |
+
|
| 255 |
+
We utilized Pyserini to implement BM25, and the test results can be reproduced by this [script](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR#bm25-baseline).
|
| 256 |
+
We tested BM25 using two different tokenizers:
|
| 257 |
+
one using Lucene Analyzer and the other using the same tokenizer as M3 (i.e., the tokenizer of xlm-roberta).
|
| 258 |
+
The results indicate that BM25 remains a competitive baseline,
|
| 259 |
+
especially in long document retrieval.
|
| 260 |
+
|
| 261 |
+

|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
## Training
|
| 266 |
+
- Self-knowledge Distillation: combining multiple outputs from different
|
| 267 |
+
retrieval modes as reward signal to enhance the performance of single mode(especially for sparse retrieval and multi-vec(colbert) retrival)
|
| 268 |
+
- Efficient Batching: Improve the efficiency when fine-tuning on long text.
|
| 269 |
+
The small-batch strategy is simple but effective, which also can used to fine-tune large embedding model.
|
| 270 |
+
- MCLS: A simple method to improve the performance on long text without fine-tuning.
|
| 271 |
+
If you have no enough resource to fine-tuning model with long text, the method is useful.
|
| 272 |
+
|
| 273 |
+
Refer to our [report](https://arxiv.org/pdf/2402.03216.pdf) for more details.
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
## Acknowledgement
|
| 281 |
+
|
| 282 |
+
Thanks to the authors of open-sourced datasets, including Miracl, MKQA, NarritiveQA, etc.
|
| 283 |
+
Thanks to the open-sourced libraries like [Tevatron](https://github.com/texttron/tevatron), [Pyserini](https://github.com/castorini/pyserini).
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
## Citation
|
| 288 |
+
|
| 289 |
+
If you find this repository useful, please consider giving a star :star: and citation
|
| 290 |
+
|
| 291 |
+
```
|
| 292 |
+
@misc{bge-m3,
|
| 293 |
+
title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation},
|
| 294 |
+
author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
|
| 295 |
+
year={2024},
|
| 296 |
+
eprint={2402.03216},
|
| 297 |
+
archivePrefix={arXiv},
|
| 298 |
+
primaryClass={cs.CL}
|
| 299 |
+
}
|
| 300 |
+
```
|
victord/sub19/models/bge-m3/config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 4096,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"max_position_embeddings": 8194,
|
| 16 |
+
"model_type": "xlm-roberta",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 24,
|
| 19 |
+
"output_past": true,
|
| 20 |
+
"pad_token_id": 1,
|
| 21 |
+
"position_embedding_type": "absolute",
|
| 22 |
+
"torch_dtype": "bfloat16",
|
| 23 |
+
"transformers_version": "4.53.2",
|
| 24 |
+
"type_vocab_size": 1,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"vocab_size": 250002
|
| 27 |
+
}
|
victord/sub19/models/bge-m3/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "5.1.1",
|
| 4 |
+
"transformers": "4.53.2",
|
| 5 |
+
"pytorch": "2.9.0+cu128"
|
| 6 |
+
},
|
| 7 |
+
"model_type": "SentenceTransformer",
|
| 8 |
+
"prompts": {
|
| 9 |
+
"query": "",
|
| 10 |
+
"document": ""
|
| 11 |
+
},
|
| 12 |
+
"default_prompt_name": null,
|
| 13 |
+
"similarity_fn_name": "cosine"
|
| 14 |
+
}
|
victord/sub19/models/bge-m3/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebd7cf4fcc1a794f73ea3213266a7bd44e995eac0751b6e98ec9bb91cfc57202
|
| 3 |
+
size 1135554736
|
victord/sub19/models/bge-m3/modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
victord/sub19/models/bge-m3/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 8192,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
victord/sub19/models/bge-m3/sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
victord/sub19/models/bge-m3/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
victord/sub19/models/bge-m3/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:249df0778f236f6ece390de0de746838ef25b9d6954b68c2ee71249e0a9d8fd4
|
| 3 |
+
size 17082799
|
victord/sub19/models/bge-m3/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"model_max_length": 8192,
|
| 51 |
+
"pad_token": "<pad>",
|
| 52 |
+
"sep_token": "</s>",
|
| 53 |
+
"sp_model_kwargs": {},
|
| 54 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 55 |
+
"unk_token": "<unk>"
|
| 56 |
+
}
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/README.md
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- sentence-transformers
|
| 4 |
+
- cross-encoder
|
| 5 |
+
- reranker
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- dataset_size:175622
|
| 8 |
+
- loss:BinaryCrossEntropyLoss
|
| 9 |
+
base_model: BAAI/bge-reranker-v2-m3
|
| 10 |
+
pipeline_tag: text-ranking
|
| 11 |
+
library_name: sentence-transformers
|
| 12 |
+
metrics:
|
| 13 |
+
- pearson
|
| 14 |
+
- spearman
|
| 15 |
+
model-index:
|
| 16 |
+
- name: CrossEncoder based on BAAI/bge-reranker-v2-m3
|
| 17 |
+
results:
|
| 18 |
+
- task:
|
| 19 |
+
type: cross-encoder-correlation
|
| 20 |
+
name: Cross Encoder Correlation
|
| 21 |
+
dataset:
|
| 22 |
+
name: sts dev
|
| 23 |
+
type: sts_dev
|
| 24 |
+
metrics:
|
| 25 |
+
- type: pearson
|
| 26 |
+
value: 0.5543320589612747
|
| 27 |
+
name: Pearson
|
| 28 |
+
- type: spearman
|
| 29 |
+
value: 0.5369237805280626
|
| 30 |
+
name: Spearman
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
# CrossEncoder based on BAAI/bge-reranker-v2-m3
|
| 34 |
+
|
| 35 |
+
This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) on the train_dataset, HebNLI, HebQA, RAGbot and ParaShoot datasets using the [sentence-transformers](https://www.SBERT.net) library. It computes scores for pairs of texts, which can be used for text reranking and semantic search.
|
| 36 |
+
|
| 37 |
+
## Model Details
|
| 38 |
+
|
| 39 |
+
### Model Description
|
| 40 |
+
- **Model Type:** Cross Encoder
|
| 41 |
+
- **Base model:** [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) <!-- at revision 953dc6f6f85a1b2dbfca4c34a2796e7dde08d41e -->
|
| 42 |
+
- **Maximum Sequence Length:** 2048 tokens
|
| 43 |
+
- **Number of Output Labels:** 1 label
|
| 44 |
+
- **Training Datasets:**
|
| 45 |
+
- train_dataset
|
| 46 |
+
- HebNLI
|
| 47 |
+
- HebQA
|
| 48 |
+
- RAGbot
|
| 49 |
+
- ParaShoot
|
| 50 |
+
<!-- - **Language:** Unknown -->
|
| 51 |
+
<!-- - **License:** Unknown -->
|
| 52 |
+
|
| 53 |
+
### Model Sources
|
| 54 |
+
|
| 55 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
| 56 |
+
- **Documentation:** [Cross Encoder Documentation](https://www.sbert.net/docs/cross_encoder/usage/usage.html)
|
| 57 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
|
| 58 |
+
- **Hugging Face:** [Cross Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=cross-encoder)
|
| 59 |
+
|
| 60 |
+
## Usage
|
| 61 |
+
|
| 62 |
+
### Direct Usage (Sentence Transformers)
|
| 63 |
+
|
| 64 |
+
First install the Sentence Transformers library:
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install -U sentence-transformers
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
Then you can load this model and run inference.
|
| 71 |
+
```python
|
| 72 |
+
from sentence_transformers import CrossEncoder
|
| 73 |
+
|
| 74 |
+
# Download from the 🤗 Hub
|
| 75 |
+
model = CrossEncoder("cross_encoder_model_id")
|
| 76 |
+
# Get scores for pairs of texts
|
| 77 |
+
pairs = [
|
| 78 |
+
['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'דרגות השתתפות במימון מעונות יום ומשפחתונים\n \nגובה שכר הלימוד\nסכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.\nהטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:\n\nגובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.\nהדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.\nככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.\nעבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).\nתינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.\nילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 2\nהדרגה הסופית\nהמותאמת \n\n: 14\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 3 ומעלה\nהדרגה הסופית\nהמותאמת \n\n: 15\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\n\n\nדרגה בהתאם לרמת הכנסה\n\nמספר הילדים\nהזכאים במשפחה \n\n\nהדרגה הסופית\nהמותאמת \n\n\n\n3\n2\n14\n\n\n3\n3 ומעלה\n15'],
|
| 79 |
+
['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים\n \nהורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)\n\n\n \nהורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים\nכדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.\n\nגובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.\nהורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.\nלפרטים ומידע כללי על הזכות, ראו השתתפות במימון מעונות יום ומשפחתונים.'],
|
| 80 |
+
['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'השתתפות במימון מעונות יום ומשפחתונים\n תהליך מימוש הזכות\nאיתור מסגרת מוכרת\nמערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.\nניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.\n\nהתשלום למעון/משפחתון\nעבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).\nהחזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.\nהשתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):\n\nהוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).\nהוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.\n\nהיעדרות של הילד\nהיעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.\nהיעדרות של 45 ימים ומעלה בשל אשפוז או מחלה, תיחשב כעזיבה החל מהיום ה-45. חזרה של ילד למעון לאחר מכן, תחייב את ההורה להגיש בקשה חדשה לקביעת דרגת ההשתתפות בצירוף מסמכים עדכניים.\n\nשינוי בתנאי הזכאות\nהזכאות נמשכת כל עוד מתקיימים התנאים המזכים.\nבמקרה שחל שינוי בתנאים - ההורים חייבים למסור על כך הודעה בכתב לאגף מעונות יום ומשפחתונים, לא יאוחר מ-30 יום ממועד השינוי.\n\nבמהלך שנת לימודים, יתכן שיידרשו מההורים נתוני הכנסה מעודכנים כתנאי להמשך תשלום התמיכה.\nגובה התמיכה יעודכן, כלפי מעלה או מטה, על בסיס הנתונים החדשים.\nאת העדכון יש לבצע באמצעות מערכת עדכונט באתר משרד העבודה. ניתן גם להיעזר במוקד מעונות יום, משפחתונים וצהרונים, בטלפון *2969.'],
|
| 81 |
+
['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'מדריך למי שפונו או התפנו מבתיהם עקב מלחמת חרבות ברזל\n Metadata\nמענק למעונות יום ומשפחתונים שלא גבו כסף מההורים או החזירו חלק מהתשלום להורים\nמעונות יום ומשפחתונים בעלי סמל, שבהתאם להנחיות פיקוד העורף, לא פעלו במהלך חודש אוקטובר או שפעלו באופן חלקי בלבד, זכאים למענק סיוע חד-פעמי בגין הימים שבהם לא פעלו.\nהמענק יינתן בתנאי שלא גבו מההורים תשלום עבור הימים שבהם לא היתה פעילות, ואם היתה פעילות חלקית - הם השיבו להורים את החלק היחסי מהתשלום.\nגובה המענק הוא 889 ₪ עבור כל ילד במעון.\nלא ניתן לקבל גם מענק ממשרד העבודה וגם פיצוי מרשות המסים. מעון או משפחתון שקיבל מענק ממשרד העבודה והגיש תביעה לפיצוי מרשות המסים, סכום המענק יופחת מגובה הפיצוי שיקבל מרשות המסים.\nלמידע נוסף ראו מענק למעונות ומשפחתונים מוכרים בתקופת מלחמת חרבות ברזל'],
|
| 82 |
+
['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'מדריך הורות משותפת\n מענקים וקצבאות\n\n\nמיקום הילד/ה במשפחה\nסכום הקצבה עבור הילד/ה (נכון ל-2024)\n\n\nילד ראשון\n169 ₪\n\n\nילד שני\n214 ₪\n\n\nילד שלישי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד רביעי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד חמישי ואילך\n169 ₪\n\nלשיעורי הקצבה בשנים הקודמות ראו באתר המוסד לביטוח לאומי.\nלמידע נוסף ראו קצבת ילדים.\n\nמעונות יום\nקדימות בקבלה\nהורים העונים על קריטריונים שונים, עשויים להיות זכאים לקדימות בקבלה למעונות היום שנמצאים באחריות משרד הרווחה.\nבין הקריטריונים נמצאים "הורים עצמאים", הכוללים הורה לא נשוי ושאין לו ידוע בציבור, למשל, אישה בהורות משותפת.\nמעונות יום\nהשתתפות במימון\nכדי לעודד את שילובם של הורים בשוק העבודה, המדינה מסייעת במימון מעונות יום ומשפחתונים מוכרים. גובה הסיוע ניתן בהתאם לרמת ההכנסה.\nלמידע נוסף ראו השתתפות במימון מעונות יום ומשפחתונים.'],
|
| 83 |
+
]
|
| 84 |
+
scores = model.predict(pairs)
|
| 85 |
+
print(scores.shape)
|
| 86 |
+
# (5,)
|
| 87 |
+
|
| 88 |
+
# Or rank different texts based on similarity to a single text
|
| 89 |
+
ranks = model.rank(
|
| 90 |
+
'מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?',
|
| 91 |
+
[
|
| 92 |
+
'דרגות השתתפות במימון מעונות יום ומשפחתונים\n \nגובה שכר הלימוד\nסכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.\nהטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:\n\nגובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.\nהדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.\nככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.\nעבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).\nתינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.\nילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 2\nהדרגה הסופית\nהמותאמת \n\n: 14\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 3 ומעלה\nהדרגה הסופית\nהמותאמת \n\n: 15\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\n\n\nדרגה בהתאם לרמת הכנסה\n\nמספר הילדים\nהזכאים במשפחה \n\n\nהדרגה הסופית\nהמותאמת \n\n\n\n3\n2\n14\n\n\n3\n3 ומעלה\n15',
|
| 93 |
+
'סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים\n \nהורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)\n\n\n \nהורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים\nכדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.\n\nגובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.\nהורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.\nלפרטים ומידע כללי על הזכות, ראו השתתפות במימון מעונות יום ומשפחתונים.',
|
| 94 |
+
'השתתפות במימון מעונות יום ומשפחתונים\n תהליך מימוש הזכות\nאיתור מסגרת מוכרת\nמערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.\nניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.\n\nהתשלום למעון/משפחתון\nעבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).\nהחזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.\nהשתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):\n\nהוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).\nהוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.\n\nהיעדרות של הילד\nהיעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.\nהיעדרות של 45 ימים ומעלה בשל אשפוז או מחלה, תיחשב כעזיבה החל מהיום ה-45. חזרה של ילד למעון לאחר מכן, תחייב את ההורה להגיש בקשה חדשה לקביעת דרגת ההשתתפות בצירוף מסמכים עדכניים.\n\nשינוי בתנאי הזכאות\nהזכאות נמשכת כל עוד מתקיימים התנאים המזכים.\nבמקרה שחל שינוי בתנאים - ההורים חייבים למסור על כך הודעה בכתב לאגף מעונות יום ומשפחתונים, לא יאוחר מ-30 יום ממועד השינוי.\n\nבמהלך שנת לימודים, יתכן שיידרשו מההורים נתוני הכנסה מעודכנים כתנאי להמשך תשלום התמיכה.\nגובה התמיכה יעודכן, כלפי מעלה או מטה, על בסיס הנתונים החדשים.\nאת העדכון יש לבצע באמצעות מערכת עדכונט באתר משרד העבודה. ניתן גם להיעזר במוקד מעונות יום, משפחתונים וצהרונים, בטלפון *2969.',
|
| 95 |
+
'מדריך למי שפונו או התפנו מבתיהם עקב מלחמת חרבות ברזל\n Metadata\nמענק למעונות יום ומשפחתונים שלא גבו כסף מההורים או החזירו חלק מהתשלום להורים\nמעונות יום ומשפחתונים בעלי סמל, שבהתאם להנחיות פיקוד העורף, לא פעלו במהלך חודש אוקטובר או שפעלו באופן חלקי בלבד, זכאים למענק סיוע חד-פעמי בגין הימים שבהם לא פעלו.\nהמענק יינתן בתנאי שלא גבו מההורים תשלום עבור הימים שבהם לא היתה פעילות, ואם היתה פעילות חלקית - הם השיבו להורים את החלק היחסי מהתשלום.\nגובה המענק הוא 889 ₪ עבור כל ילד במעון.\nלא ניתן לקבל גם מענק ממשרד העבודה וגם פיצוי מרשות המסים. מעון או משפחתון שקיבל מענק ממשרד העבודה והגיש תביעה לפיצוי מרשות המסים, סכום המענק יופחת מגובה הפיצוי שיקבל מרשות המסים.\nלמידע נוסף ראו מענק למעונות ומשפחתונים מוכרים בתקופת מלחמת חרבות ברזל',
|
| 96 |
+
'מדריך הורות משותפת\n מענקים וקצבאות\n\n\nמיקום הילד/ה במשפחה\nסכום הקצבה עבור הילד/ה (נכון ל-2024)\n\n\nילד ראשון\n169 ₪\n\n\nילד שני\n214 ₪\n\n\nילד שלישי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד רביעי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד חמישי ואילך\n169 ₪\n\nלשיעורי הקצבה בשנים הקודמות ראו באתר המוסד לביטוח לאומי.\nלמידע נוסף ראו קצבת ילדים.\n\nמעונות יום\nקדימות בקבלה\nהורים העונים על קריטריונים שונים, עשויים להיות זכאים לקדימות בקבלה למעונות היום שנמצאים באחריות משרד הרווחה.\nבין הקריטריונים נמצאים "הורים עצמאים", הכוללים הורה לא נשוי ושאין לו ידוע בציבור, למשל, אישה בהורות משותפת.\nמעונות יום\nהשתתפות במימון\nכדי לעודד את שילובם של הורים בשוק העבודה, המדינה מסייעת במימון מעונות יום ומשפחתונים מוכרים. גובה הסיוע ניתן בהתאם לרמת ההכנסה.\nלמידע נוסף ראו השתתפות במימון מעונות יום ומשפחתונים.',
|
| 97 |
+
]
|
| 98 |
+
)
|
| 99 |
+
# [{'corpus_id': ..., 'score': ...}, {'corpus_id': ..., 'score': ...}, ...]
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
<!--
|
| 103 |
+
### Direct Usage (Transformers)
|
| 104 |
+
|
| 105 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
| 106 |
+
|
| 107 |
+
</details>
|
| 108 |
+
-->
|
| 109 |
+
|
| 110 |
+
<!--
|
| 111 |
+
### Downstream Usage (Sentence Transformers)
|
| 112 |
+
|
| 113 |
+
You can finetune this model on your own dataset.
|
| 114 |
+
|
| 115 |
+
<details><summary>Click to expand</summary>
|
| 116 |
+
|
| 117 |
+
</details>
|
| 118 |
+
-->
|
| 119 |
+
|
| 120 |
+
<!--
|
| 121 |
+
### Out-of-Scope Use
|
| 122 |
+
|
| 123 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
| 124 |
+
-->
|
| 125 |
+
|
| 126 |
+
## Evaluation
|
| 127 |
+
|
| 128 |
+
### Metrics
|
| 129 |
+
|
| 130 |
+
#### Cross Encoder Correlation
|
| 131 |
+
|
| 132 |
+
* Dataset: `sts_dev`
|
| 133 |
+
* Evaluated with [<code>CrossEncoderCorrelationEvaluator</code>](https://sbert.net/docs/package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator)
|
| 134 |
+
|
| 135 |
+
| Metric | Value |
|
| 136 |
+
|:-------------|:-----------|
|
| 137 |
+
| pearson | 0.5543 |
|
| 138 |
+
| **spearman** | **0.5369** |
|
| 139 |
+
|
| 140 |
+
<!--
|
| 141 |
+
## Bias, Risks and Limitations
|
| 142 |
+
|
| 143 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
| 144 |
+
-->
|
| 145 |
+
|
| 146 |
+
<!--
|
| 147 |
+
### Recommendations
|
| 148 |
+
|
| 149 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 150 |
+
-->
|
| 151 |
+
|
| 152 |
+
## Training Details
|
| 153 |
+
|
| 154 |
+
### Training Datasets
|
| 155 |
+
<details><summary>train_dataset</summary>
|
| 156 |
+
|
| 157 |
+
#### train_dataset
|
| 158 |
+
|
| 159 |
+
* Dataset: train_dataset
|
| 160 |
+
* Size: 40,680 training samples
|
| 161 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 162 |
+
* Approximate statistics based on the first 1000 samples:
|
| 163 |
+
| | sentence1 | sentence2 | score |
|
| 164 |
+
|:--------|:------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:---------------------------------------------------------------|
|
| 165 |
+
| type | string | string | float |
|
| 166 |
+
| details | <ul><li>min: 14 characters</li><li>mean: 46.84 characters</li><li>max: 134 characters</li></ul> | <ul><li>min: 86 characters</li><li>mean: 1021.29 characters</li><li>max: 9471 characters</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.31</li><li>max: 1.0</li></ul> |
|
| 167 |
+
* Samples:
|
| 168 |
+
| sentence1 | sentence2 | score |
|
| 169 |
+
|:---------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
|
| 170 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>דרגות השתתפות במימון מעונות יום ומשפחתונים<br> <br>גובה שכר הלימוד<br>סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.<br>הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:<br><br>גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.<br>הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.<br>ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.<br>עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).<br>תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.<br>ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).<br><br>השפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפ...</code> | <code>0.8</code> |
|
| 171 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים<br> <br>הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)<br><br><br> <br>הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים<br>כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.<br><br>גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.<br>הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.<br>לפרטים ומידע כללי ע...</code> | <code>0.7</code> |
|
| 172 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>השתתפות במימון מעונות יום ומשפחתונים<br> תהליך מימוש הזכות<br>איתור מסגרת מוכרת<br>מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.<br>ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.<br><br>התשלום למעון/משפחתון<br>עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).<br>החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.<br>השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):<br><br>הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).<br>הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.<br><br>היעדרות של הילד<br>היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.<br>היעדרות של 45...</code> | <code>0.8</code> |
|
| 173 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 174 |
+
```json
|
| 175 |
+
{
|
| 176 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 177 |
+
"pos_weight": null
|
| 178 |
+
}
|
| 179 |
+
```
|
| 180 |
+
</details>
|
| 181 |
+
<details><summary>HebNLI</summary>
|
| 182 |
+
|
| 183 |
+
#### HebNLI
|
| 184 |
+
|
| 185 |
+
* Dataset: HebNLI
|
| 186 |
+
* Size: 60,792 training samples
|
| 187 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 188 |
+
* Approximate statistics based on the first 1000 samples:
|
| 189 |
+
| | sentence1 | sentence2 | score |
|
| 190 |
+
|:--------|:-----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------|
|
| 191 |
+
| type | string | string | float |
|
| 192 |
+
| details | <ul><li>min: 4 characters</li><li>mean: 86.89 characters</li><li>max: 400 characters</li></ul> | <ul><li>min: 8 characters</li><li>mean: 43.86 characters</li><li>max: 153 characters</li></ul> | <ul><li>min: 0.01</li><li>mean: 0.41</li><li>max: 0.99</li></ul> |
|
| 193 |
+
* Samples:
|
| 194 |
+
| sentence1 | sentence2 | score |
|
| 195 |
+
|:---------------------------------------------------------------------------------|:---------------------------------------------------------------------------------|:------------------|
|
| 196 |
+
| <code>הצלחנו, אמרה טופנס.</code> | <code>טופנס אמרה שהם ניצחו.</code> | <code>0.99</code> |
|
| 197 |
+
| <code>אחרי שלושה ימים של אימונים אינטנסיביים הם סיימו את הקורס בהצטיינות.</code> | <code>הם השלימו את המסלול עם שעות לאחר שלושה ימים של אימונים אינטנסיביים.</code> | <code>0.99</code> |
|
| 198 |
+
| <code>הייתי מציע לך לנסוע לרוסיה מיד.</code> | <code>הייתי מציע לך להכין את הדרך לרוסיה באופן מיידי.</code> | <code>0.99</code> |
|
| 199 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 200 |
+
```json
|
| 201 |
+
{
|
| 202 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 203 |
+
"pos_weight": null
|
| 204 |
+
}
|
| 205 |
+
```
|
| 206 |
+
</details>
|
| 207 |
+
<details><summary>HebQA</summary>
|
| 208 |
+
|
| 209 |
+
#### HebQA
|
| 210 |
+
|
| 211 |
+
* Dataset: HebQA
|
| 212 |
+
* Size: 60,294 training samples
|
| 213 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 214 |
+
* Approximate statistics based on the first 1000 samples:
|
| 215 |
+
| | sentence1 | sentence2 | score |
|
| 216 |
+
|:--------|:----------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------|
|
| 217 |
+
| type | string | string | float |
|
| 218 |
+
| details | <ul><li>min: 13 characters</li><li>mean: 37.7 characters</li><li>max: 91 characters</li></ul> | <ul><li>min: 504 characters</li><li>mean: 695.62 characters</li><li>max: 1311 characters</li></ul> | <ul><li>min: 0.01</li><li>mean: 0.75</li><li>max: 0.99</li></ul> |
|
| 219 |
+
* Samples:
|
| 220 |
+
| sentence1 | sentence2 | score |
|
| 221 |
+
|:------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------|
|
| 222 |
+
| <code>כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית?</code> | <code>לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה.</code> | <code>0.8592790964302004</code> |
|
| 223 |
+
| <code>כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית?</code> | <code>לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה.</code> | <code>0.99</code> |
|
| 224 |
+
| <code>באילו מילים התחילו כל סעיפי הטויוטה?</code> | <code>לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה.</code> | <code>0.25883327354329505</code> |
|
| 225 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 226 |
+
```json
|
| 227 |
+
{
|
| 228 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 229 |
+
"pos_weight": null
|
| 230 |
+
}
|
| 231 |
+
```
|
| 232 |
+
</details>
|
| 233 |
+
<details><summary>RAGbot</summary>
|
| 234 |
+
|
| 235 |
+
#### RAGbot
|
| 236 |
+
|
| 237 |
+
* Dataset: RAGbot
|
| 238 |
+
* Size: 7,780 training samples
|
| 239 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 240 |
+
* Approximate statistics based on the first 1000 samples:
|
| 241 |
+
| | sentence1 | sentence2 | score |
|
| 242 |
+
|:--------|:------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------|
|
| 243 |
+
| type | string | string | float |
|
| 244 |
+
| details | <ul><li>min: 10 characters</li><li>mean: 60.23 characters</li><li>max: 424 characters</li></ul> | <ul><li>min: 211 characters</li><li>mean: 1312.64 characters</li><li>max: 4772 characters</li></ul> | <ul><li>min: 0.01</li><li>mean: 0.71</li><li>max: 0.99</li></ul> |
|
| 245 |
+
* Samples:
|
| 246 |
+
| sentence1 | sentence2 | score |
|
| 247 |
+
|:--------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------|
|
| 248 |
+
| <code>לא ברור באתר <br>אם על חברות ביטוח פרטיות <br>חל חוק חופש המידע <br>אני מציע שזה יהיה כתוב מפורש</code> | <code>חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.<br>פרטים<br><br>שם החוק:חוק חופש המידע, התשנ"ח-1998<br>קישור:החוק באתר נבו<br>שר אחראי:שר המשפטים<br>החוק ב"ספר החוקים הפתוח"<br>נושאים וזכויות<br>חופש המידע<br>הזכות לפרטיות</code> | <code>0.02321811595881069</code> |
|
| 249 |
+
| <code>לא ברור באתר <br>אם על חברות ביטוח פרטיות <br>חל חוק חופש המידע <br>אני מציע שזה יהיה כתוב מפורש</code> | <code>חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.<br>פרטים<br><br>שם החוק:חוק חופש המידע, התשנ"ח-1998<br>קישור:החוק באתר נבו<br>שר אחראי:שר המשפטים<br>החוק ב"ספר החוקים הפתוח"<br>נושאים וזכויות<br>חופש המידע<br>הזכות לפרטיות</code> | <code>0.99</code> |
|
| 250 |
+
| <code>לא ברור באתר <br>אם על חברות ביטוח פרטיות <br>חל חוק חופש המידע <br>אני מציע שזה יהיה כתוב מפורש</code> | <code>כל אזרח או תושב ישראל זכאי לקבלת מידע מרשות ציבורית (https://www.gov.il/he/departments/general/list_of_authorities)<br><br><br> <br>לצורך קבלת המידע יש לפנות לממונה על יישום חוק חופש המידע ברשות הציבורית<br><br><br> <br>לפני הגשת בקשה לקבלת מידע, מומלץ לברר אם המידע כבר קיים במאגר התשובות (https://www.gov.il/he/departments/general/answer_reservoir) של היחידה הממשלתית לחופש המידע במשרד המשפטים<br><br><br> <br>הגשת הבקשה כרוכה בתשלום אגרה, אך במקרים מסוימים ניתן פטור מתשלום האגרה<br><br><br> <br>למידע נוסף ראו הגשת בקשת חופש המידע (https://www.gov.il/he/service/freedom_of_information_submission) באתר היחידה הממשלתית לחופש המידע במשרד המשפטים<br>חוק חופש המידע קובע כי כל אזרח או תושב ישראל זכאי לקבל מידע מרשות ציבורית.<br><br>לרשימה המלאה של הרשויות הציבוריות שעליהן חל החוק ראו כאן.<br><br>אוכלוסיית יעד ותנאים מקדימים<br>כל אזרח או תושב ישראל.<br><br>למי ואיך פונים<br>יש לבדוק איזו רשות אחראית על הנושא שלגביו מבוקש המידע.<br>יש לפנות לממונה על יישום חוק חופש המידע ברשות האחראית:<br><br>לרשימת הממונים על יישום חוק חופש המידע ברשויות השונות באתר היחידה הממשלתית לחופש המידע...</code> | <code>0.01</code> |
|
| 251 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 252 |
+
```json
|
| 253 |
+
{
|
| 254 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 255 |
+
"pos_weight": null
|
| 256 |
+
}
|
| 257 |
+
```
|
| 258 |
+
</details>
|
| 259 |
+
<details><summary>ParaShoot</summary>
|
| 260 |
+
|
| 261 |
+
#### ParaShoot
|
| 262 |
+
|
| 263 |
+
* Dataset: ParaShoot
|
| 264 |
+
* Size: 6,076 training samples
|
| 265 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 266 |
+
* Approximate statistics based on the first 1000 samples:
|
| 267 |
+
| | sentence1 | sentence2 | score |
|
| 268 |
+
|:--------|:-----------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------|
|
| 269 |
+
| type | string | string | float |
|
| 270 |
+
| details | <ul><li>min: 9 characters</li><li>mean: 37.13 characters</li><li>max: 119 characters</li></ul> | <ul><li>min: 500 characters</li><li>mean: 746.67 characters</li><li>max: 2042 characters</li></ul> | <ul><li>min: 0.01</li><li>mean: 0.91</li><li>max: 0.99</li></ul> |
|
| 271 |
+
* Samples:
|
| 272 |
+
| sentence1 | sentence2 | score |
|
| 273 |
+
|:----------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------|
|
| 274 |
+
| <code>במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד?</code> | <code>בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה.</code> | <code>0.9341320672743826</code> |
|
| 275 |
+
| <code>במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד?</code> | <code>בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה.</code> | <code>0.99</code> |
|
| 276 |
+
| <code>מה אירע בחיפה לראשונה בשנת 1923?</code> | <code>בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה.</code> | <code>0.4503647439465404</code> |
|
| 277 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 278 |
+
```json
|
| 279 |
+
{
|
| 280 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 281 |
+
"pos_weight": null
|
| 282 |
+
}
|
| 283 |
+
```
|
| 284 |
+
</details>
|
| 285 |
+
|
| 286 |
+
### Evaluation Dataset
|
| 287 |
+
|
| 288 |
+
#### Unnamed Dataset
|
| 289 |
+
|
| 290 |
+
* Size: 40,680 evaluation samples
|
| 291 |
+
* Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>score</code>
|
| 292 |
+
* Approximate statistics based on the first 1000 samples:
|
| 293 |
+
| | sentence1 | sentence2 | score |
|
| 294 |
+
|:--------|:------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:---------------------------------------------------------------|
|
| 295 |
+
| type | string | string | float |
|
| 296 |
+
| details | <ul><li>min: 14 characters</li><li>mean: 46.84 characters</li><li>max: 134 characters</li></ul> | <ul><li>min: 86 characters</li><li>mean: 1021.29 characters</li><li>max: 9471 characters</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.31</li><li>max: 1.0</li></ul> |
|
| 297 |
+
* Samples:
|
| 298 |
+
| sentence1 | sentence2 | score |
|
| 299 |
+
|:---------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
|
| 300 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>דרגות השתתפות במימון מעונות יום ומשפחתונים<br> <br>גובה שכר הלימוד<br>סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.<br>הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:<br><br>גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.<br>הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.<br>ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.<br>עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).<br>תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמב�� של שנת הלימודים אליה נרשמו.<br>ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).<br><br>השפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפ...</code> | <code>0.8</code> |
|
| 301 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים<br> <br>הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)<br><br><br> <br>הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים<br>כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.<br><br>גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.<br>הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.<br>לפרטים ומידע כללי ע...</code> | <code>0.7</code> |
|
| 302 |
+
| <code>מה התשלום עבור דרגה X במעון או משפחתון?<br> מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?</code> | <code>השתתפות במימון מעונות יום ומשפחתונים<br> תהליך מימוש הזכות<br>איתור מסגרת מוכרת<br>מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.<br>ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.<br><br>התשלום למעון/משפחתון<br>עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).<br>החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.<br>השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):<br><br>הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).<br>הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.<br><br>היעדרות של הילד<br>היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.<br>היעדרות של 45...</code> | <code>0.8</code> |
|
| 303 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 304 |
+
```json
|
| 305 |
+
{
|
| 306 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 307 |
+
"pos_weight": null
|
| 308 |
+
}
|
| 309 |
+
```
|
| 310 |
+
|
| 311 |
+
### Training Hyperparameters
|
| 312 |
+
#### Non-Default Hyperparameters
|
| 313 |
+
|
| 314 |
+
- `eval_strategy`: steps
|
| 315 |
+
- `per_device_train_batch_size`: 16
|
| 316 |
+
- `per_device_eval_batch_size`: 16
|
| 317 |
+
- `gradient_accumulation_steps`: 8
|
| 318 |
+
- `learning_rate`: 4e-05
|
| 319 |
+
- `weight_decay`: 0.01
|
| 320 |
+
- `max_grad_norm`: 2.0
|
| 321 |
+
- `num_train_epochs`: 2
|
| 322 |
+
- `max_steps`: 300
|
| 323 |
+
- `warmup_steps`: 300
|
| 324 |
+
- `bf16`: True
|
| 325 |
+
- `tf32`: True
|
| 326 |
+
- `torch_compile`: True
|
| 327 |
+
- `torch_compile_backend`: inductor
|
| 328 |
+
- `batch_sampler`: no_duplicates
|
| 329 |
+
|
| 330 |
+
#### All Hyperparameters
|
| 331 |
+
<details><summary>Click to expand</summary>
|
| 332 |
+
|
| 333 |
+
- `overwrite_output_dir`: False
|
| 334 |
+
- `do_predict`: False
|
| 335 |
+
- `eval_strategy`: steps
|
| 336 |
+
- `prediction_loss_only`: True
|
| 337 |
+
- `per_device_train_batch_size`: 16
|
| 338 |
+
- `per_device_eval_batch_size`: 16
|
| 339 |
+
- `per_gpu_train_batch_size`: None
|
| 340 |
+
- `per_gpu_eval_batch_size`: None
|
| 341 |
+
- `gradient_accumulation_steps`: 8
|
| 342 |
+
- `eval_accumulation_steps`: None
|
| 343 |
+
- `torch_empty_cache_steps`: None
|
| 344 |
+
- `learning_rate`: 4e-05
|
| 345 |
+
- `weight_decay`: 0.01
|
| 346 |
+
- `adam_beta1`: 0.9
|
| 347 |
+
- `adam_beta2`: 0.999
|
| 348 |
+
- `adam_epsilon`: 1e-08
|
| 349 |
+
- `max_grad_norm`: 2.0
|
| 350 |
+
- `num_train_epochs`: 2
|
| 351 |
+
- `max_steps`: 300
|
| 352 |
+
- `lr_scheduler_type`: linear
|
| 353 |
+
- `lr_scheduler_kwargs`: {}
|
| 354 |
+
- `warmup_ratio`: 0.0
|
| 355 |
+
- `warmup_steps`: 300
|
| 356 |
+
- `log_level`: passive
|
| 357 |
+
- `log_level_replica`: warning
|
| 358 |
+
- `log_on_each_node`: True
|
| 359 |
+
- `logging_nan_inf_filter`: True
|
| 360 |
+
- `save_safetensors`: True
|
| 361 |
+
- `save_on_each_node`: False
|
| 362 |
+
- `save_only_model`: False
|
| 363 |
+
- `restore_callback_states_from_checkpoint`: False
|
| 364 |
+
- `no_cuda`: False
|
| 365 |
+
- `use_cpu`: False
|
| 366 |
+
- `use_mps_device`: False
|
| 367 |
+
- `seed`: 42
|
| 368 |
+
- `data_seed`: None
|
| 369 |
+
- `jit_mode_eval`: False
|
| 370 |
+
- `use_ipex`: False
|
| 371 |
+
- `bf16`: True
|
| 372 |
+
- `fp16`: False
|
| 373 |
+
- `fp16_opt_level`: O1
|
| 374 |
+
- `half_precision_backend`: auto
|
| 375 |
+
- `bf16_full_eval`: False
|
| 376 |
+
- `fp16_full_eval`: False
|
| 377 |
+
- `tf32`: True
|
| 378 |
+
- `local_rank`: 0
|
| 379 |
+
- `ddp_backend`: None
|
| 380 |
+
- `tpu_num_cores`: None
|
| 381 |
+
- `tpu_metrics_debug`: False
|
| 382 |
+
- `debug`: []
|
| 383 |
+
- `dataloader_drop_last`: False
|
| 384 |
+
- `dataloader_num_workers`: 0
|
| 385 |
+
- `dataloader_prefetch_factor`: None
|
| 386 |
+
- `past_index`: -1
|
| 387 |
+
- `disable_tqdm`: False
|
| 388 |
+
- `remove_unused_columns`: True
|
| 389 |
+
- `label_names`: None
|
| 390 |
+
- `load_best_model_at_end`: False
|
| 391 |
+
- `ignore_data_skip`: False
|
| 392 |
+
- `fsdp`: []
|
| 393 |
+
- `fsdp_min_num_params`: 0
|
| 394 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
| 395 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
| 396 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
| 397 |
+
- `deepspeed`: None
|
| 398 |
+
- `label_smoothing_factor`: 0.0
|
| 399 |
+
- `optim`: adamw_torch
|
| 400 |
+
- `optim_args`: None
|
| 401 |
+
- `adafactor`: False
|
| 402 |
+
- `group_by_length`: False
|
| 403 |
+
- `length_column_name`: length
|
| 404 |
+
- `ddp_find_unused_parameters`: None
|
| 405 |
+
- `ddp_bucket_cap_mb`: None
|
| 406 |
+
- `ddp_broadcast_buffers`: False
|
| 407 |
+
- `dataloader_pin_memory`: True
|
| 408 |
+
- `dataloader_persistent_workers`: False
|
| 409 |
+
- `skip_memory_metrics`: True
|
| 410 |
+
- `use_legacy_prediction_loop`: False
|
| 411 |
+
- `push_to_hub`: False
|
| 412 |
+
- `resume_from_checkpoint`: None
|
| 413 |
+
- `hub_model_id`: None
|
| 414 |
+
- `hub_strategy`: every_save
|
| 415 |
+
- `hub_private_repo`: None
|
| 416 |
+
- `hub_always_push`: False
|
| 417 |
+
- `hub_revision`: None
|
| 418 |
+
- `gradient_checkpointing`: False
|
| 419 |
+
- `gradient_checkpointing_kwargs`: None
|
| 420 |
+
- `include_inputs_for_metrics`: False
|
| 421 |
+
- `include_for_metrics`: []
|
| 422 |
+
- `eval_do_concat_batches`: True
|
| 423 |
+
- `fp16_backend`: auto
|
| 424 |
+
- `push_to_hub_model_id`: None
|
| 425 |
+
- `push_to_hub_organization`: None
|
| 426 |
+
- `mp_parameters`:
|
| 427 |
+
- `auto_find_batch_size`: False
|
| 428 |
+
- `full_determinism`: False
|
| 429 |
+
- `torchdynamo`: None
|
| 430 |
+
- `ray_scope`: last
|
| 431 |
+
- `ddp_timeout`: 1800
|
| 432 |
+
- `torch_compile`: True
|
| 433 |
+
- `torch_compile_backend`: inductor
|
| 434 |
+
- `torch_compile_mode`: None
|
| 435 |
+
- `include_tokens_per_second`: False
|
| 436 |
+
- `include_num_input_tokens_seen`: False
|
| 437 |
+
- `neftune_noise_alpha`: None
|
| 438 |
+
- `optim_target_modules`: None
|
| 439 |
+
- `batch_eval_metrics`: False
|
| 440 |
+
- `eval_on_start`: False
|
| 441 |
+
- `use_liger_kernel`: False
|
| 442 |
+
- `liger_kernel_config`: None
|
| 443 |
+
- `eval_use_gather_object`: False
|
| 444 |
+
- `average_tokens_across_devices`: False
|
| 445 |
+
- `prompts`: None
|
| 446 |
+
- `batch_sampler`: no_duplicates
|
| 447 |
+
- `multi_dataset_batch_sampler`: proportional
|
| 448 |
+
- `router_mapping`: {}
|
| 449 |
+
- `learning_rate_mapping`: {}
|
| 450 |
+
|
| 451 |
+
</details>
|
| 452 |
+
|
| 453 |
+
### Training Logs
|
| 454 |
+
| Epoch | Step | Training Loss | Validation Loss | sts_dev_spearman |
|
| 455 |
+
|:------:|:----:|:-------------:|:---------------:|:----------------:|
|
| 456 |
+
| -1 | -1 | - | - | 0.4608 |
|
| 457 |
+
| 0.2186 | 300 | 0.4926 | 0.4868 | 0.5369 |
|
| 458 |
+
| -1 | -1 | - | - | 0.5369 |
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
### Framework Versions
|
| 462 |
+
- Python: 3.10.16
|
| 463 |
+
- Sentence Transformers: 5.1.2
|
| 464 |
+
- Transformers: 4.53.2
|
| 465 |
+
- PyTorch: 2.9.0+cu128
|
| 466 |
+
- Accelerate: 1.10.1
|
| 467 |
+
- Datasets: 4.2.0
|
| 468 |
+
- Tokenizers: 0.21.4
|
| 469 |
+
|
| 470 |
+
## Citation
|
| 471 |
+
|
| 472 |
+
### BibTeX
|
| 473 |
+
|
| 474 |
+
#### Sentence Transformers
|
| 475 |
+
```bibtex
|
| 476 |
+
@inproceedings{reimers-2019-sentence-bert,
|
| 477 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
| 478 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
| 479 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
| 480 |
+
month = "11",
|
| 481 |
+
year = "2019",
|
| 482 |
+
publisher = "Association for Computational Linguistics",
|
| 483 |
+
url = "https://arxiv.org/abs/1908.10084",
|
| 484 |
+
}
|
| 485 |
+
```
|
| 486 |
+
|
| 487 |
+
<!--
|
| 488 |
+
## Glossary
|
| 489 |
+
|
| 490 |
+
*Clearly define terms in order to be accessible across audiences.*
|
| 491 |
+
-->
|
| 492 |
+
|
| 493 |
+
<!--
|
| 494 |
+
## Model Card Authors
|
| 495 |
+
|
| 496 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
| 497 |
+
-->
|
| 498 |
+
|
| 499 |
+
<!--
|
| 500 |
+
## Model Card Contact
|
| 501 |
+
|
| 502 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
| 503 |
+
-->
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"id2label": {
|
| 13 |
+
"0": "LABEL_0"
|
| 14 |
+
},
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": 4096,
|
| 17 |
+
"label2id": {
|
| 18 |
+
"LABEL_0": 0
|
| 19 |
+
},
|
| 20 |
+
"layer_norm_eps": 1e-05,
|
| 21 |
+
"max_position_embeddings": 8194,
|
| 22 |
+
"model_type": "xlm-roberta",
|
| 23 |
+
"num_attention_heads": 16,
|
| 24 |
+
"num_hidden_layers": 24,
|
| 25 |
+
"output_past": true,
|
| 26 |
+
"pad_token_id": 1,
|
| 27 |
+
"position_embedding_type": "absolute",
|
| 28 |
+
"sentence_transformers": {
|
| 29 |
+
"activation_fn": "torch.nn.modules.activation.Sigmoid",
|
| 30 |
+
"version": "5.1.2"
|
| 31 |
+
},
|
| 32 |
+
"torch_dtype": "bfloat16",
|
| 33 |
+
"transformers_version": "4.53.2",
|
| 34 |
+
"type_vocab_size": 1,
|
| 35 |
+
"use_cache": true,
|
| 36 |
+
"vocab_size": 250002
|
| 37 |
+
}
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c4e2582945a051835252f00c9967c0083f87e9b56af3fbc482c5eeeb31e8b68
|
| 3 |
+
size 1135560090
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f346050eed4b822e076c8958539a82bd05b7828c9110f81e7ed7a6d1b853710b
|
| 3 |
+
size 17083154
|
victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer_config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"max_length": 2048,
|
| 51 |
+
"model_max_length": 2048,
|
| 52 |
+
"pad_token": "<pad>",
|
| 53 |
+
"sep_token": "</s>",
|
| 54 |
+
"sp_model_kwargs": {},
|
| 55 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 56 |
+
"unk_token": "<unk>"
|
| 57 |
+
}
|
victord/sub19/models/multilingual-e5-large-instruct/1_Pooling/config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"word_embedding_dimension": 1024,
|
| 3 |
+
"pooling_mode_cls_token": false,
|
| 4 |
+
"pooling_mode_mean_tokens": true,
|
| 5 |
+
"pooling_mode_max_tokens": false,
|
| 6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
| 7 |
+
"pooling_mode_weightedmean_tokens": false,
|
| 8 |
+
"pooling_mode_lasttoken": false,
|
| 9 |
+
"include_prompt": true
|
| 10 |
+
}
|
victord/sub19/models/multilingual-e5-large-instruct/README.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
victord/sub19/models/multilingual-e5-large-instruct/config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"XLMRobertaModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"bos_token_id": 0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 4096,
|
| 14 |
+
"layer_norm_eps": 1e-05,
|
| 15 |
+
"max_position_embeddings": 514,
|
| 16 |
+
"model_type": "xlm-roberta",
|
| 17 |
+
"num_attention_heads": 16,
|
| 18 |
+
"num_hidden_layers": 24,
|
| 19 |
+
"output_past": true,
|
| 20 |
+
"pad_token_id": 1,
|
| 21 |
+
"position_embedding_type": "absolute",
|
| 22 |
+
"torch_dtype": "bfloat16",
|
| 23 |
+
"transformers_version": "4.53.2",
|
| 24 |
+
"type_vocab_size": 1,
|
| 25 |
+
"use_cache": true,
|
| 26 |
+
"vocab_size": 250002
|
| 27 |
+
}
|
victord/sub19/models/multilingual-e5-large-instruct/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"sentence_transformers": "5.1.1",
|
| 4 |
+
"transformers": "4.53.2",
|
| 5 |
+
"pytorch": "2.9.0+cu128"
|
| 6 |
+
},
|
| 7 |
+
"model_type": "SentenceTransformer",
|
| 8 |
+
"prompts": {
|
| 9 |
+
"query": "",
|
| 10 |
+
"document": ""
|
| 11 |
+
},
|
| 12 |
+
"default_prompt_name": null,
|
| 13 |
+
"similarity_fn_name": "cosine"
|
| 14 |
+
}
|
victord/sub19/models/multilingual-e5-large-instruct/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23e75f34a6a87a71c6a6a4a9244c0520599b3593a9cb13275fa4a92a970fb85a
|
| 3 |
+
size 1119826072
|
victord/sub19/models/multilingual-e5-large-instruct/modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
victord/sub19/models/multilingual-e5-large-instruct/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 512,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
victord/sub19/models/multilingual-e5-large-instruct/sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
victord/sub19/models/multilingual-e5-large-instruct/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|