# Example usage of TessarTokenizer.
# NOTE(review): TessarTokenizer is not imported in this snippet — it must be
# brought into scope from the project's tokenizer module before running.
# (The import line is not visible in this excerpt; confirm the module path.)

# Standard usage with default settings
tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")

# Tokenize a single piece of text; return_tensors="pt" yields PyTorch tensors.
text = "Hello, how are you doing today?"
encoded = tokenizer(text, return_tensors="pt")

# Batch tokenization of multiple texts. padding=True pads to the longest
# sequence in the batch; truncation=True truncates to the model's max length.
texts = [
    "Hello, world!",
    "This is a test sentence.",
    "Tokenization is an important NLP task.",
]
batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Custom tokenizer constructed directly with specific settings
# (rather than loading a pretrained configuration).
custom_tokenizer = TessarTokenizer(
    do_lower_case=True,
    max_cell_length=20,
    unk_token="[UNK]",
    pad_token="[PAD]",
)