gregtatum committed on
Commit
1ffeda6
·
1 Parent(s): 6af7a68

Add the model files

Browse files
.gitattributes ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings filter=lfs diff=lfs merge=lfs -text
2
+ model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
3
+ model/static-embeddings.128.fp32.npy.zst filter=lfs diff=lfs merge=lfs -text
4
+ model/static-embeddings.512.fp16.npy.zst filter=lfs diff=lfs merge=lfs -text
5
+ model/tokenizer.json.zst filter=lfs diff=lfs merge=lfs -text
6
+ model/static-embeddings.1024.fp16.npy.zst filter=lfs diff=lfs merge=lfs -text
7
+ model/static-embeddings.256.fp16.npy.zst filter=lfs diff=lfs merge=lfs -text
8
+ model/static-embeddings.384.fp16.npy.zst filter=lfs diff=lfs merge=lfs -text
9
+ model/static-embeddings.512.fp32.npy.zst filter=lfs diff=lfs merge=lfs -text
10
+ model/static-embeddings.1024.fp32.npy.zst filter=lfs diff=lfs merge=lfs -text
11
+ model/static-embeddings.128.fp16.npy.zst filter=lfs diff=lfs merge=lfs -text
12
+ model/static-embeddings.128.int8.npy.zst filter=lfs diff=lfs merge=lfs -text
13
+ model/static-embeddings.384.fp32.npy.zst filter=lfs diff=lfs merge=lfs -text
14
+ model/static-embeddings.1024.int8.npy.zst filter=lfs diff=lfs merge=lfs -text
15
+ model/static-embeddings.256.fp32.npy.zst filter=lfs diff=lfs merge=lfs -text
16
+ model/static-embeddings.256.int8.npy.zst filter=lfs diff=lfs merge=lfs -text
17
+ model/static-embeddings.384.int8.npy.zst filter=lfs diff=lfs merge=lfs -text
18
+ model/static-embeddings.512.int8.npy.zst filter=lfs diff=lfs merge=lfs -text
js/example.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { pipeline, AutoTokenizer, AutoModel, TokenizerModel } from '@huggingface/transformers';
2
  import fs from 'node:fs/promises';
3
  import { constants } from 'node:fs';
4
  import path from 'path';
@@ -12,9 +12,7 @@ async function main() {
12
  const url = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json"
13
 
14
  const config = await ensureTokenizerJson(url)
15
- // const tokenizer = TokenizerModel.fromConfig(config.model)
16
-
17
- const tokenizer = await AutoTokenizer.from_pretrained("./")
18
 
19
  const examples = [
20
  "This is an example of encoding",
 
1
+ import { pipeline, AutoTokenizer, AutoModel, TokenizerModel, PreTrainedTokenizer } from '@huggingface/transformers';
2
  import fs from 'node:fs/promises';
3
  import { constants } from 'node:fs';
4
  import path from 'path';
 
12
  const url = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json"
13
 
14
  const config = await ensureTokenizerJson(url)
15
+ const tokenizer = new PreTrainedTokenizer(config, {})
 
 
16
 
17
  const examples = [
18
  "This is an example of encoding",
js/model.json DELETED
The diff for this file is too large to render. See raw diff
 
js/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
main.py DELETED
@@ -1,6 +0,0 @@
1
- def main():
2
- print("Hello from sentence-embeddings!")
3
-
4
-
5
- if __name__ == "__main__":
6
- main()
 
 
 
 
 
 
 
model/static-embeddings.1024.fp16.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b67bc3307eeb0387866325ff33a41f8bdd673e309c18224e1415f65b9873eba
3
+ size 200900906
model/static-embeddings.1024.fp32.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef55fc97d834f27413b63d9bba3b4832b8c66b2a27bfaba7afc8d2db4c236e68
3
+ size 402617664
model/static-embeddings.1024.int8.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5337ce683b6db454feeb8842deb46807e3e6d989cb46b1b660fbfa0d4506c7
3
+ size 74935075
model/static-embeddings.128.fp16.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b67bc3307eeb0387866325ff33a41f8bdd673e309c18224e1415f65b9873eba
3
+ size 200900906
model/static-embeddings.128.fp32.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef55fc97d834f27413b63d9bba3b4832b8c66b2a27bfaba7afc8d2db4c236e68
3
+ size 402617664
model/static-embeddings.128.int8.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5337ce683b6db454feeb8842deb46807e3e6d989cb46b1b660fbfa0d4506c7
3
+ size 74935075
model/static-embeddings.256.fp16.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b67bc3307eeb0387866325ff33a41f8bdd673e309c18224e1415f65b9873eba
3
+ size 200900906
model/static-embeddings.256.fp32.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef55fc97d834f27413b63d9bba3b4832b8c66b2a27bfaba7afc8d2db4c236e68
3
+ size 402617664
model/static-embeddings.256.int8.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5337ce683b6db454feeb8842deb46807e3e6d989cb46b1b660fbfa0d4506c7
3
+ size 74935075
model/static-embeddings.384.fp16.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b67bc3307eeb0387866325ff33a41f8bdd673e309c18224e1415f65b9873eba
3
+ size 200900906
model/static-embeddings.384.fp32.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef55fc97d834f27413b63d9bba3b4832b8c66b2a27bfaba7afc8d2db4c236e68
3
+ size 402617664
model/static-embeddings.384.int8.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5337ce683b6db454feeb8842deb46807e3e6d989cb46b1b660fbfa0d4506c7
3
+ size 74935075
model/static-embeddings.512.fp16.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b67bc3307eeb0387866325ff33a41f8bdd673e309c18224e1415f65b9873eba
3
+ size 200900906
model/static-embeddings.512.fp32.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef55fc97d834f27413b63d9bba3b4832b8c66b2a27bfaba7afc8d2db4c236e68
3
+ size 402617664
model/static-embeddings.512.int8.npy.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5337ce683b6db454feeb8842deb46807e3e6d989cb46b1b660fbfa0d4506c7
3
+ size 74935075
model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11aaf894a4ccf3d95e8830e27c0f8152791fbbff2b988e29a265580b86edd216
3
+ size 2563370
model/tokenizer.json.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e18e87ed3f9053c05694c40fa520cd13ca7a7c9003009d890a14a5ab0aafd9d6
3
+ size 829644
build_models.py → scripts/build_models.py RENAMED
@@ -11,7 +11,7 @@ def save_data(path: Path, tensor: torch.Tensor):
11
  """Writes out the static embeddings to a .npy.zst file"""
12
  assert str(path).endswith(".npy.zst")
13
  buffer = io.BytesIO()
14
- np.save(buffer, tensor.numpy())
15
 
16
  with (
17
  open(path, "wb") as outfile,
@@ -20,15 +20,14 @@ def save_data(path: Path, tensor: torch.Tensor):
20
  writer.write(buffer.getvalue())
21
 
22
 
23
- data_path = Path("embeddings")
24
-
25
  model_name = "sentence-transformers/static-similarity-mrl-multilingual-v1"
26
  vocab_size = 105_879
27
  dimensions = 1024
28
 
29
 
30
  def load_embeddings():
31
- model = SentenceTransformer(model_name)
32
  embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
33
  embeddings = torch.Tensor(embedding_bag.weight)
34
 
@@ -49,13 +48,13 @@ def load_embeddings():
49
  truncated = embeddings[:, :dim]
50
  assert truncated.shape == torch.Size([vocab_size, dim])
51
 
52
- save_data(data_path / f"static-embeddings.{dim}.fp32.npy.zst", embeddings)
53
  save_data(
54
- data_path / f"static-embeddings.{dim}.fp16.npy.zst",
55
  embeddings.to(dtype=torch.float16),
56
  )
57
  save_data(
58
- data_path / f"static-embeddings.{dim}.int8.npy.zst",
59
  embeddings.to(dtype=torch.int8),
60
  )
61
 
 
11
  """Writes out the static embeddings to a .npy.zst file"""
12
  assert str(path).endswith(".npy.zst")
13
  buffer = io.BytesIO()
14
+ np.save(buffer, tensor.detach().numpy())
15
 
16
  with (
17
  open(path, "wb") as outfile,
 
20
  writer.write(buffer.getvalue())
21
 
22
 
23
+ model_path = Path("model")
 
24
  model_name = "sentence-transformers/static-similarity-mrl-multilingual-v1"
25
  vocab_size = 105_879
26
  dimensions = 1024
27
 
28
 
29
  def load_embeddings():
30
+ model = SentenceTransformer(model_name, device="cpu")
31
  embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
32
  embeddings = torch.Tensor(embedding_bag.weight)
33
 
 
48
  truncated = embeddings[:, :dim]
49
  assert truncated.shape == torch.Size([vocab_size, dim])
50
 
51
+ save_data(model_path / f"static-embeddings.{dim}.fp32.npy.zst", embeddings)
52
  save_data(
53
+ model_path / f"static-embeddings.{dim}.fp16.npy.zst",
54
  embeddings.to(dtype=torch.float16),
55
  )
56
  save_data(
57
+ model_path / f"static-embeddings.{dim}.int8.npy.zst",
58
  embeddings.to(dtype=torch.int8),
59
  )
60
 
multilingual.py → scripts/experiments/multilingual.py RENAMED
@@ -3,46 +3,44 @@ from tokenizers import Encoding, Tokenizer
3
  from torch.nn import EmbeddingBag
4
  import torch
5
 
6
- examples = [
7
- "This is an example of encoding",
8
- "The quick brown fox jumps over the lazy dog.",
9
- "Curaçao, naïve fiancé, jalapeño, déjà vu.",
10
- "Привет, как дела?",
11
- "Бързата кафява лисица прескача мързеливото куче.",
12
- "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
13
- "اللغة العربية جميلة وغنية بالتاريخ.",
14
- "مرحبا بالعالم!",
15
- "Simplified: 快速的棕色狐狸跳过懒狗。",
16
- "Traditional: 快速的棕色狐狸跳過懶狗。",
17
- "素早い茶色の狐が怠け者の犬を飛び越える。",
18
- "コンピュータープログラミング",
19
- "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
20
- "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
21
- "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
22
- "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
23
- "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
24
- "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
25
- "Hello 世界 مرحبا 🌍",
26
- "123, αβγ, абв, العربية, 中文, हिन्दी.",
27
- ]
28
-
29
- tokenizer: Tokenizer = Tokenizer.from_file("js/tokenizer.json")
30
-
31
- for example in examples:
32
- encoding: Encoding = tokenizer.encode(example)
33
- tokenizer.decode
34
- print(example)
35
- print(encoding.tokens)
36
- print()
37
-
38
-
39
- print("!!! tokenizer", tokenizer)
40
- assert False
41
  # https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1
42
  model = SentenceTransformer(
43
  "sentence-transformers/static-similarity-mrl-multilingual-v1", device="cpu"
44
  )
45
- embeddings = model.encode(examples)
46
 
47
  embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
48
  embeddings = torch.Tensor(embedding_bag.weight)
 
3
  from torch.nn import EmbeddingBag
4
  import torch
5
 
6
+
7
+ def test_tokenizer():
8
+ examples = [
9
+ "This is an example of encoding",
10
+ "The quick brown fox jumps over the lazy dog.",
11
+ "Curaçao, naïve fiancé, jalapeño, déjà vu.",
12
+ "Привет, как дела?",
13
+ "Бързата кафява лисица прескача мързеливото куче.",
14
+ "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
15
+ "اللغة العربية جميلة وغنية بالتاريخ.",
16
+ "مرحبا بالعالم!",
17
+ "Simplified: 快速的棕色狐狸跳过懒狗。",
18
+ "Traditional: 快速的棕色狐狸跳過懶狗。",
19
+ "素早い茶色の狐が怠け者の犬を飛び越える。",
20
+ "コンピュータープログラミング",
21
+ "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
22
+ "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
23
+ "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
24
+ "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
25
+ "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
26
+ "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
27
+ "Hello 世界 مرحبا 🌍",
28
+ "123, αβγ, абв, العربية, 中文, हिन्दी.",
29
+ ]
30
+
31
+ tokenizer: Tokenizer = Tokenizer.from_file("js/tokenizer.json")
32
+
33
+ for example in examples:
34
+ encoding: Encoding = tokenizer.encode(example)
35
+ print(example)
36
+ print(encoding.tokens)
37
+ print()
38
+
39
+
 
40
  # https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1
41
  model = SentenceTransformer(
42
  "sentence-transformers/static-similarity-mrl-multilingual-v1", device="cpu"
43
  )
 
44
 
45
  embedding_bag: EmbeddingBag = model[0].embedding # type: ignore
46
  embeddings = torch.Tensor(embedding_bag.weight)
potion.py → scripts/experiments/potion.py RENAMED
File without changes
tomaarsen.py → scripts/experiments/tomaarsen.py RENAMED
File without changes