# Snapshot d4c5dff (file size: 767 bytes)
import os
import json
import sentencepiece as spm
from tqdm import tqdm
# Smoke test for a trained SentencePiece model: load it, round-trip one sample
# sentence, and show how a few entity names are split into pieces.

# The model is loaded from the relative path "<MODEL_PREFIX>.model".
MODEL_PREFIX = "icefire_spm"
sp = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

# Encode the sample both as integer ids and as string pieces, then decode the
# ids again so the round trip can be compared with the input by eye.
sample = "Daenerys Targaryen rides Drogon to Winterfell."
ids = sp.encode(sample)
pieces = sp.encode(sample, out_type=str)
print("\nTest encode:")
print(f" Text : {sample}")
print(f" IDs : {ids}")
print(f" Pieces: {pieces}")
print(f" Decode: {sp.decode(ids)}")

# Check how individual entity names are segmented; the piece count shows
# whether a name is kept as one piece or split into several.
for entity in ("Winterfell", "Long Claw"):
    e_pieces = sp.encode(entity, out_type=str)
    print(f"\nEntity '{entity}' → {len(e_pieces)} piece(s): {e_pieces}")