lexiform-13m / tokenizer /tokenizer.py
Raj0pro's picture
Upload tokenizer/tokenizer.py with huggingface_hub
3136b27 verified
Raw
History Blame Contribute Delete
1.19 kB
from pathlib import Path
import sentencepiece as spm
class Tokenizer:
def __init__(self, model_path: str | Path):
self.sp = spm.SentencePieceProcessor()
self.sp.load(str(model_path))
@property
def vocab_size(self) -> int:
return self.sp.get_piece_size()
@property
def pad_id(self) -> int: return self.sp.pad_id()
@property
def bos_id(self) -> int: return self.sp.bos_id()
@property
def eos_id(self) -> int: return self.sp.eos_id()
@property
def unk_id(self) -> int: return self.sp.unk_id()
def encode(self, text: str, max_length: int | None = None) -> list[int]:
ids = self.sp.encode(text, out_type=int)
if max_length:
ids = ids[:max_length]
return ids
def decode(self, ids: list[int]) -> str:
return self.sp.decode(ids)
def batch_decode(self, batch: list[list[int]], skip_special_tokens: bool = True) -> list[str]:
results = []
for ids in batch:
if skip_special_tokens:
ids = [i for i in ids if i not in (self.pad_id, self.bos_id, self.eos_id)]
results.append(self.decode(ids))
return results