Update README.md
Browse files
README.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
---
|
| 3 |
|
|
@@ -11,4 +17,43 @@ Read more about it in [the article](https://qdrant.tech/articles/minicoil).
|
|
| 11 |
|
| 12 |
This model is designed to be used with the [FastEmbed](https://github.com/qdrant/fastembed) library.
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
pipeline_tag: sentence-similarity
|
| 6 |
+
---
|
| 7 |
---
|
| 8 |
---
|
| 9 |
|
|
|
|
| 17 |
|
| 18 |
This model is designed to be used with the [FastEmbed](https://github.com/qdrant/fastembed) library.
|
| 19 |
|
| 20 |
+
> Note:
|
| 21 |
+
This model is intended to be used with Qdrant. Vectors must be configured with [Modifier.IDF](https://qdrant.tech/documentation/concepts/indexing/?q=modifier#idf-modifier).
|
| 22 |
+
|
| 23 |
+
```py
|
| 24 |
+
from fastembed import SparseTextEmbedding
|
| 25 |
+
|
| 26 |
+
model = SparseTextEmbedding(model_name="Qdrant/minicoil-v1")
|
| 27 |
+
|
| 28 |
+
documents = [
|
| 29 |
+
"fruit bat",
|
| 30 |
+
"baseball bat",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
embeddings = list(model.embed(documents))
|
| 34 |
+
query_embedding = list(model.query_embed("bat in a cave"))
|
| 35 |
+
|
| 36 |
+
# embeddings[0] - "fruit bat"
|
| 37 |
+
# SparseEmbedding(values=array([-1.2509683 , -0.9510568 , -0.55398935, 0.188206 , 1.0497165 ,
|
| 38 |
+
# 0.31841373, -0.82047373, -0.9671025 ], dtype=float32), indices=array([ 8992, 8993, 8994, 8995, 18832, 18833, 18834, 18835],
|
| 39 |
+
# dtype=int32)) # 8992, 8993, 8994, 8995 - 4D "fruit" representation, 18832, 18833, 18834, 18835 - 4D "bat" representation
|
| 40 |
+
|
| 41 |
+
# embeddings[1] - "baseball bat"
|
| 42 |
+
#SparseEmbedding(values=array([ 1.1004512 , -0.5959816 , 0.23380531, -1.0912857 , 1.6768292 ],
|
| 43 |
+
# dtype=float32), indices=array([ 18832, 18833, 18834, 18835, 2068153269],
|
| 44 |
+
# dtype=int32)) # 18832, 18833, 18834, 18835 - 4D "bat" representation, 2068153269 - 1D "baseball" representation; "baseball" is not in the miniCOIL_v1 vocabulary, so we fall back to the Qdrant/bm25 1D score
|
| 45 |
+
|
| 46 |
+
# query_embedding - "bat in a cave"
|
| 47 |
+
#[SparseEmbedding(values=array([ 0.5656684 , 0.395691 , -0.48945513, -0.5328054 , -0.5889519 ,
|
| 48 |
+
# 0.55871224, 0.27323055, 0.5160634 ], dtype=float32), indices=array([18832, 18833, 18834, 18835, 18920, 18921, 18922, 18923],
|
| 49 |
+
# dtype=int32))] # 18832, 18833, 18834, 18835 - 4D "bat" representation, 18920, 18921, 18922, 18923 - 4D "cave" representation, "in"/"a" - removed stop words
|
| 50 |
+
|
| 51 |
+
bat_1 = embeddings[0].values[4:8]
|
| 52 |
+
bat_2 = embeddings[1].values[:4]
|
| 53 |
+
bat_query = query_embedding[0].values[:4]
|
| 54 |
+
|
| 55 |
+
dot_product_1 = (bat_1 * bat_query).sum() #np.float32(1.6366475) measuring dot product between matching indices of sparse vectors
|
| 56 |
+
dot_product_2 = (bat_2 * bat_query).sum() #np.float32(0.8536716) measuring dot product between matching indices of sparse vectors
|
| 57 |
+
|
| 58 |
+
# 1.6366475 > 0.8536716, as "bat" in "fruit bat" is more semantically similar to "bat" in "bat in a cave" than "bat" in "baseball bat" is
|
| 59 |
+
```
|