Commit 87e8c82
Parent(s): none
Initial commit

Files changed:
- .gitignore +10 -0
- .python-version +1 -0
- README.md +0 -0
- main.py +6 -0
- multilingual.py +22 -0
- potion.py +14 -0
- pyproject.toml +10 -0
- tomaarsen.py +15 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
.python-version
ADDED
@@ -0,0 +1 @@
+3.13
README.md
ADDED
File without changes
main.py
ADDED
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from sentence-embeddings!")
+
+
+if __name__ == "__main__":
+    main()
multilingual.py
ADDED
@@ -0,0 +1,22 @@
+from sentence_transformers import SentenceTransformer
+from torch.nn import EmbeddingBag
+import torch
+
+model = SentenceTransformer(
+    "sentence-transformers/static-similarity-mrl-multilingual-v1"
+)
+embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
+embeddings = torch.Tensor(embedding_bag.weight)
+
+print(embeddings.shape)
+assert embeddings.shape == torch.Size([105879, 1024])
+
+print("float32")
+print(f" 1024 dim - {embeddings.shape[0] * 1024 * 4 / 1024 / 1024:,.1f} MiB")
+print(f" 512 dim - {embeddings.shape[0] * 512 * 4 / 1024 / 1024:,.1f} MiB")
+print(f" 256 dim - {embeddings.shape[0] * 256 * 4 / 1024 / 1024:,.1f} MiB")
+
+print("float16")
+print(f" 1024 dim - {embeddings.shape[0] * 1024 * 2 / 1024 / 1024:,.1f} MiB")
+print(f" 512 dim - {embeddings.shape[0] * 512 * 2 / 1024 / 1024:,.1f} MiB")
+print(f" 256 dim - {embeddings.shape[0] * 256 * 2 / 1024 / 1024:,.1f} MiB")
potion.py
ADDED
@@ -0,0 +1,14 @@
+from model2vec import StaticModel
+from tokenizers import Tokenizer
+import torch
+
+model = StaticModel.from_pretrained("minishlab/potion-multilingual-128M")
+embeddings = torch.from_numpy(model.embedding)
+
+print("Embedding shape:", embeddings.shape)
+num_bytes = embeddings.shape[0] * embeddings.shape[1] * 4
+
+print("MiB:", num_bytes / 1024 / 1024)
+
+tokenizer: Tokenizer = model.tokenizer
+print(tokenizer.to_str())
pyproject.toml
ADDED
@@ -0,0 +1,10 @@
+[project]
+name = "sentence-embeddings"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "model2vec>=0.6.0",
+    "sentence-transformers>=5.1.0",
+]
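Assuming the project is managed with uv (the committed uv.lock suggests so), the scripts above can be run against these pinned dependencies with, for example:

uv sync
uv run multilingual.py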
tomaarsen.py
ADDED
@@ -0,0 +1,15 @@
+from sentence_transformers import SentenceTransformer
+from torch.nn import EmbeddingBag
+import torch
+
+model = SentenceTransformer("tomaarsen/static-retrieval-mrl-en-v1")
+embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
+embeddings = torch.Tensor(embedding_bag.weight)
+
+assert embeddings.shape == torch.Size([30522, 1024])
+
+print(f"1024 dim - {embeddings.shape[0] * 1024 * 4 / 1024 / 1024:,.1f} MiB")
+print(f"512 dim - {embeddings.shape[0] * 512 * 4 / 1024 / 1024:,.1f} MiB")
+print(f"256 dim - {embeddings.shape[0] * 256 * 4 / 1024 / 1024:,.1f} MiB")
+
+print("Embeddings[0]", embeddings[0])
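Another quick check (my arithmetic, not from the commit): 30,522 rows × 1,024 dims × 4 bytes = 125,018,112 bytes ≈ 119.2 MiB, with the 512- and 256-dim truncations at roughly 59.6 and 29.8 MiB; the 30,522 rows match the bert-base-uncased vocabulary size. Since this is a Matryoshka ("mrl") model, truncation just keeps the leading dimensions, as in this sketch (variable name is mine):

truncated = embeddings[:, :256].to(torch.float16)
# 30,522 x 256 x 2 bytes ≈ 14.9 MiB
print(f"256 dim float16 - {truncated.nelement() * truncated.element_size() / 1024 / 1024:,.1f} MiB")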
uv.lock
ADDED
The diff for this file is too large to render; see the raw diff.