File size: 767 Bytes
d4c5dff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

import os
import json
import sentencepiece as spm
from tqdm import tqdm
# File-name prefix of the SentencePiece model; the model is read from
# "<MODEL_PREFIX>.model" (a relative path).
MODEL_PREFIX = "icefire_spm"

# Load the tokenizer once; everything below reuses this processor.
sp = spm.SentencePieceProcessor(model_file=MODEL_PREFIX + ".model")

# Smoke test: encode one sentence twice (default output type, then as string
# pieces) and decode the first result back to text.
sample = "Daenerys Targaryen rides Drogon to Winterfell."
ids = sp.encode(sample)
pieces = sp.encode(sample, out_type=str)

# One print call emits the header and the three encode lines, one per line.
print(
    "\nTest encode:",
    f"   Text : {sample}",
    f"   IDs  : {ids}",
    f"   Pieces: {pieces}",
    sep="\n",
)
# Decoding is kept in its own print so it still runs after the lines above
# have been written.
print(f"   Decode: {sp.decode(ids)}")



# Check how individual entity names are segmented: for each name, encode it
# to string pieces and print the piece count followed by the pieces.
# The names are handled in a single loop instead of repeating the same
# statements per entity; add further names to the tuple to check more.
for entity in ("Winterfell", "Long Claw"):
    e_pieces = sp.encode(entity, out_type=str)
    print(f"\nEntity '{entity}' → {len(e_pieces)} piece(s): {e_pieces}")