# Smoke test for a trained SentencePiece model: encode a sample sentence,
# round-trip it through decode, and show how individual entity names are
# split into pieces.
import json
import os

import sentencepiece as spm
from tqdm import tqdm

# Filename prefix of the model; the file loaded is "<prefix>.model".
MODEL_PREFIX = "icefire_spm"

sp = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

# Encode one sentence twice: once as integer ids, once as string pieces.
sample = "Daenerys Targaryen rides Drogon to Winterfell."
ids = sp.encode(sample)
pieces = sp.encode(sample, out_type=str)

print("\nTest encode:")
print(f" Text : {sample}")
print(f" IDs : {ids}")
print(f" Pieces: {pieces}")
# Decoding the ids shows whether the round trip reproduces the input text.
print(f" Decode: {sp.decode(ids)}")

# Report how many pieces each entity name is split into.
for entity in ("Winterfell", "Long Claw"):
    e_pieces = sp.encode(entity, out_type=str)
    print(f"\nEntity '{entity}' → {len(e_pieces)} piece(s): {e_pieces}")