SVECTOR-CORPORATION
/

Tessar-largest

Table Question Answering

neural-sql-executor

Model card Files Files and versions

SVECTOR-OFFICIAL committed on Mar 26, 2025

Commit

d2bf2f6

·

verified ·

1 Parent(s): 1ed1133

Update tessar_tokenizer_example.py

Files changed (1) hide show

tessar_tokenizer_example.py +5 -21

tessar_tokenizer_example.py CHANGED Viewed

@@ -1,38 +1,22 @@
-from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer
-# Example 1: Initialize a new Tessar Tokenizer
 tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
-# Example 2: Tokenize a simple text
 text = "Hello, how are you doing today?"
 encoded = tokenizer(text, return_tensors="pt")
-print("Encoded Input:", encoded)
-# Example 3: Batch tokenization
 texts = [
     "Hello, world!",
     "This is a test sentence.",
     "Tokenization is an important NLP task."
 ]
 batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-print("Batch Encoded Inputs:", batch_encoded)
-# Example 4: Save and reload tokenizer
-save_directory = "./tessar_tokenizer"
-tokenizer.save_pretrained(save_directory)
-# Reload the saved tokenizer
-reloaded_tokenizer = load_tessar_tokenizer(save_directory)
-# Example 5: Custom tokenization with specific parameters
 custom_tokenizer = TessarTokenizer(
     do_lower_case=True,
     max_cell_length=20,
     unk_token="[UNK]",
     pad_token="[PAD]"
-)
-# Tokenize with custom settings
-custom_text = "A custom tokenization example"
-custom_encoded = custom_tokenizer(custom_text, return_tensors="pt")
-print("Custom Tokenizer Encoded:", custom_encoded)

+# Standard usage with default settings
 tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
+# Tokenize a single piece of text
 text = "Hello, how are you doing today?"
 encoded = tokenizer(text, return_tensors="pt")
+# Batch tokenization of multiple texts
 texts = [
     "Hello, world!",
     "This is a test sentence.",
     "Tokenization is an important NLP task."
 ]
 batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+# Custom tokenizer with specific settings
 custom_tokenizer = TessarTokenizer(
     do_lower_case=True,
     max_cell_length=20,
     unk_token="[UNK]",
     pad_token="[PAD]"
+)