SVECTOR-OFFICIAL committed on
Commit
d2bf2f6
·
verified ·
1 Parent(s): 1ed1133

Update tessar_tokenizer_example.py

Browse files
Files changed (1) hide show
  1. tessar_tokenizer_example.py +5 -21
tessar_tokenizer_example.py CHANGED
@@ -1,38 +1,22 @@
1
- from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer
2
-
3
- # Example 1: Initialize a new Tessar Tokenizer
4
  tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
5
 
6
- # Example 2: Tokenize a simple text
7
  text = "Hello, how are you doing today?"
8
  encoded = tokenizer(text, return_tensors="pt")
9
- print("Encoded Input:", encoded)
10
 
11
- # Example 3: Batch tokenization
12
  texts = [
13
  "Hello, world!",
14
  "This is a test sentence.",
15
  "Tokenization is an important NLP task."
16
  ]
17
  batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
18
- print("Batch Encoded Inputs:", batch_encoded)
19
-
20
- # Example 4: Save and reload tokenizer
21
- save_directory = "./tessar_tokenizer"
22
- tokenizer.save_pretrained(save_directory)
23
 
24
- # Reload the saved tokenizer
25
- reloaded_tokenizer = load_tessar_tokenizer(save_directory)
26
-
27
- # Example 5: Custom tokenization with specific parameters
28
  custom_tokenizer = TessarTokenizer(
29
  do_lower_case=True,
30
  max_cell_length=20,
31
  unk_token="[UNK]",
32
  pad_token="[PAD]"
33
- )
34
-
35
- # Tokenize with custom settings
36
- custom_text = "A custom tokenization example"
37
- custom_encoded = custom_tokenizer(custom_text, return_tensors="pt")
38
- print("Custom Tokenizer Encoded:", custom_encoded)
 
1
+ # Standard usage with default settings
 
 
2
  tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
3
 
4
+ # Tokenize a single piece of text
5
  text = "Hello, how are you doing today?"
6
  encoded = tokenizer(text, return_tensors="pt")
 
7
 
8
+ # Batch tokenization of multiple texts
9
  texts = [
10
  "Hello, world!",
11
  "This is a test sentence.",
12
  "Tokenization is an important NLP task."
13
  ]
14
  batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
 
 
 
 
 
15
 
16
+ # Custom tokenizer with specific settings
 
 
 
17
  custom_tokenizer = TessarTokenizer(
18
  do_lower_case=True,
19
  max_cell_length=20,
20
  unk_token="[UNK]",
21
  pad_token="[PAD]"
22
+ )