gregtatum committed
Commit 87e8c82 · 0 Parent(s)

Initial commit
Files changed (9)
  1. .gitignore +10 -0
  2. .python-version +1 -0
  3. README.md +0 -0
  4. main.py +6 -0
  5. multilingual.py +22 -0
  6. potion.py +14 -0
  7. pyproject.toml +10 -0
  8. tomaarsen.py +15 -0
  9. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
README.md ADDED
File without changes
main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from sentence-embeddings!")
+
+
+ if __name__ == "__main__":
+     main()
multilingual.py ADDED
@@ -0,0 +1,22 @@
+ from sentence_transformers import SentenceTransformer
+ from torch.nn import EmbeddingBag
+ import torch
+
+ model = SentenceTransformer(
+     "sentence-transformers/static-similarity-mrl-multilingual-v1"
+ )
+ embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
+ embeddings = torch.Tensor(embedding_bag.weight)
+
+ print(embeddings.shape)
+ assert embeddings.shape == torch.Size([105879, 1024])
+
+ print("float32")
+ print(f" 1024 dim - {embeddings.shape[0] * 1024 * 4 / 1024 / 1024:,.1f} MiB")
+ print(f" 512 dim - {embeddings.shape[0] * 512 * 4 / 1024 / 1024:,.1f} MiB")
+ print(f" 256 dim - {embeddings.shape[0] * 256 * 4 / 1024 / 1024:,.1f} MiB")
+
+ print("float16")
+ print(f" 1024 dim - {embeddings.shape[0] * 1024 * 2 / 1024 / 1024:,.1f} MiB")
+ print(f" 512 dim - {embeddings.shape[0] * 512 * 2 / 1024 / 1024:,.1f} MiB")
+ print(f" 256 dim - {embeddings.shape[0] * 256 * 2 / 1024 / 1024:,.1f} MiB")
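For reference (not part of the commit), the sizes this script prints follow from rows × dimensions × bytes per value for the 105,879-row embedding table: float32 comes to roughly 413.6 MiB at 1024 dimensions, 206.8 MiB at 512, and 103.4 MiB at 256, while float16 halves each of those to about 206.8, 103.4, and 51.7 MiB (e.g. 105,879 × 1024 × 4 bytes = 433,680,384 bytes ≈ 413.6 MiB).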
potion.py ADDED
@@ -0,0 +1,14 @@
+ from model2vec import StaticModel
+ from tokenizers import Tokenizer
+ import torch
+
+ model = StaticModel.from_pretrained("minishlab/potion-multilingual-128M")
+ embeddings = torch.from_numpy(model.embedding)
+
+ print("Embedding shape:", embeddings.shape)
+ bytes = embeddings.shape[0] * embeddings.shape[1] * 4
+
+ print("MiB:", bytes / 1024 / 1024)
+
+ tokenizer: Tokenizer = model.tokenizer
+ print(tokenizer.to_str())
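For context (not part of the commit), the StaticModel inspected here is normally used through model2vec's encode API, which averages the static token embeddings into one vector per sentence; a minimal usage sketch:

    from model2vec import StaticModel

    # Load the same static model as potion.py and embed a couple of sentences.
    model = StaticModel.from_pretrained("minishlab/potion-multilingual-128M")
    sentence_embeddings = model.encode(["Hello, world", "Bonjour tout le monde"])

    # One row per input sentence, one column per embedding dimension.
    print(sentence_embeddings.shape)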
pyproject.toml ADDED
@@ -0,0 +1,10 @@
+ [project]
+ name = "sentence-embeddings"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.13"
+ dependencies = [
+     "model2vec>=0.6.0",
+     "sentence-transformers>=5.1.0",
+ ]
tomaarsen.py ADDED
@@ -0,0 +1,15 @@
+ from sentence_transformers import SentenceTransformer
+ from torch.nn import EmbeddingBag
+ import torch
+
+ model = SentenceTransformer("tomaarsen/static-retrieval-mrl-en-v1")
+ embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
+ embeddings = torch.Tensor(embedding_bag.weight)
+
+ assert embeddings.shape == torch.Size([30522, 1024])
+
+ print(f"1024 dim - {embeddings.shape[0] * 1024 * 4 / 1024 / 1024:,.1f} MiB:")
+ print(f"512 dim - {embeddings.shape[0] * 512 * 4 / 1024 / 1024:,.1f} MiB:")
+ print(f"256 dim - {embeddings.shape[0] * 256 * 4 / 1024 / 1024:,.1f} MiB:")
+
+ print("Embeddings[0]", embeddings[0])
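The 512- and 256-dimension rows above correspond to Matryoshka (MRL) truncation of the same embedding table. As a rough sketch of how that truncation is applied at inference time (not part of the commit, and assuming sentence-transformers' truncate_dim constructor argument):

    from sentence_transformers import SentenceTransformer

    # Keep only the first 256 dimensions of each embedding.
    model = SentenceTransformer("tomaarsen/static-retrieval-mrl-en-v1", truncate_dim=256)
    embeddings = model.encode(["static embeddings are fast to compute"])

    # Expected shape: (1, 256)
    print(embeddings.shape)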
uv.lock ADDED
The diff for this file is too large to render. See raw diff