Delete tessar_tokenizer_example.py
Browse files · tessar_tokenizer_example.py (+0 −22)
tessar_tokenizer_example.py
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
# Usage examples for TessarTokenizer.
# NOTE(review): this snippet assumes TessarTokenizer is already in scope —
# the import path is not shown here; confirm it before running.

# Load the pretrained tokenizer with its default configuration.
tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")

# Encode a single sentence as PyTorch tensors.
text = "Hello, how are you doing today?"
encoded = tokenizer(text, return_tensors="pt")

# Encode several sentences in one call, padding and truncating them
# to a common length so they stack into a single batch tensor.
texts = ["Hello, world!", "This is a test sentence.", "Tokenization is an important NLP task."]
batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Construct a tokenizer directly, overriding selected settings.
custom_tokenizer = TessarTokenizer(
    do_lower_case=True,
    max_cell_length=20,
    unk_token="[UNK]",
    pad_token="[PAD]",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|