Tessar-largest / tessar_tokenizer_example.py
SVECTOR-OFFICIAL's picture
Update tessar_tokenizer_example.py
d2bf2f6 verified
raw
history blame
652 Bytes
# Example script: loading and using TessarTokenizer.
#
# NOTE(review): this snippet does not import `TessarTokenizer` — presumably
# `from transformers import TessarTokenizer` (or the model repo's local
# module) is required before running; confirm against the model card.
# Loading from the Hub also requires network access on first call.

# Standard usage with default settings
tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
# Tokenize a single piece of text
text = "Hello, how are you doing today?"
# return_tensors="pt" yields PyTorch tensors (input_ids, attention_mask, ...)
encoded = tokenizer(text, return_tensors="pt")
# Batch tokenization of multiple texts
texts = [
"Hello, world!",
"This is a test sentence.",
"Tokenization is an important NLP task."
]
# padding=True pads the batch to the longest sequence; truncation=True caps
# sequences at the tokenizer's maximum length so the batch tensors are rectangular.
batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# Custom tokenizer with specific settings (constructed directly rather than
# loaded from the Hub; vocabulary arguments are omitted in this example).
custom_tokenizer = TessarTokenizer(
do_lower_case=True,
max_cell_length=20,
unk_token="[UNK]",
pad_token="[PAD]"
)