hash-map committed on
Commit
e98404d
·
verified ·
1 Parent(s): e39c24b

Upload 3 files

Browse files
Files changed (3) hide show
  1. icefire_spm.model +3 -0
  2. icefire_spm.vocab +0 -0
  3. usage.py +27 -0
icefire_spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:618c81434ac7381e299479573396f1020c352d6723b961e07596c0a442110f75
3
+ size 743008
icefire_spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
usage.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-test the trained ``icefire_spm`` SentencePiece model.

Loads the model from the working directory, round-trips one sample
sentence (encode -> decode), and reports how many pieces each probed
entity name tokenizes into — a single piece suggests the entity was
learned as one vocabulary unit.
"""
import sentencepiece as spm

MODEL_PREFIX = "icefire_spm"

# Expects `icefire_spm.model` next to this script (uploaded alongside it).
sp = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

sample = "Daenerys Targaryen rides Drogon to Winterfell."
ids = sp.encode(sample)                      # token ids (list[int])
pieces = sp.encode(sample, out_type=str)     # subword piece strings
print("\nTest encode:")
print(f" Text : {sample}")
print(f" IDs : {ids}")
print(f" Pieces: {pieces}")
print(f" Decode: {sp.decode(ids)}")          # should round-trip to `sample`


# check an entity
# Deduplicated: the original repeated this encode/print pair per entity.
for entity in ("Winterfell", "Long Claw"):
    e_pieces = sp.encode(entity, out_type=str)
    print(f"\nEntity '{entity}' → {len(e_pieces)} piece(s): {e_pieces}")