SVECTOR-OFFICIAL committed
Commit a0c5230 · verified · 1 Parent(s): 3e49727

Update tessar_tokenizer.py

Files changed (1)
  1. tessar_tokenizer.py +22 -75
tessar_tokenizer.py CHANGED
@@ -1,19 +1,15 @@
+# tessar_tokenizer.py
 import json
 import os
 from typing import List, Optional, Union
 
-from transformers import PreTrainedTokenizerFast
-
+from transformers import PreTrainedTokenizerFast, AutoTokenizer
 
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
-    Tessar Tokenizer implementation for Hugging Face Transformers.
-
-    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
-    configurations and methods for the Tessar model ecosystem.
+    Tessar Tokenizer implementation for Hugging Face Transformers
     """
 
-    # Define the input names expected by the model
     model_input_names = ['input_ids', 'attention_mask']
 
     def __init__(
@@ -32,24 +28,9 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         **kwargs
     ):
         """
-        Initialize the Tessar Tokenizer with customizable token configurations.
-
-        Args:
-            vocab_file (str, optional): Path to the vocabulary file.
-            tokenizer_file (str, optional): Path to the pre-trained tokenizer file.
-            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
-            max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
-
-        Additional token parameters allow for custom special token definitions:
-            unk_token (str): Unknown token
-            sep_token (str): Separator token
-            pad_token (str): Padding token
-            cls_token (str): Classification token
-            mask_token (str): Mask token
-            bos_token (str): Beginning of sequence token
-            eos_token (str): End of sequence token
+        Initialize the Tessar Tokenizer with specific token configurations
         """
-        # Prepare special tokens dictionary
+        # Prepare special tokens
        special_tokens = {
             "unk_token": unk_token,
             "sep_token": sep_token,
@@ -60,7 +41,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "eos_token": eos_token,
         }
 
-        # Remove None values from special tokens
+        # Remove None values
         special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
 
         # Call parent constructor
@@ -72,28 +53,21 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             **kwargs
         )
 
-        # Store Tessar-specific attributes
+        # Custom Tessar-specific attributes
         self.do_lower_case = do_lower_case
         self.max_cell_length = max_cell_length
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
         """
-        Save the tokenizer vocabulary and special tokens configuration.
-
-        Args:
-            save_directory (str): Directory to save the vocabulary files.
-            filename_prefix (str, optional): Prefix for the saved files.
-
-        Returns:
-            tuple: Paths to the saved vocabulary and special tokens files.
+        Save the tokenizer vocabulary and special tokens file
         """
-        # Prepare vocabulary file path
+        # Prepare file paths
         vocab_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
         )
 
-        # Prepare special tokens file path
+        # Save special tokens configuration
         special_tokens_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
@@ -103,7 +77,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         with open(vocab_file, 'w', encoding='utf-8') as f:
             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
 
-        # Prepare special tokens configuration
+        # Save special tokens configuration
         special_tokens_config = {
             "unk_token": self.unk_token,
             "sep_token": self.sep_token,
@@ -116,7 +90,6 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "max_cell_length": self.max_cell_length
         }
 
-        # Save special tokens configuration
         with open(special_tokens_file, 'w', encoding='utf-8') as f:
             json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
 
@@ -124,13 +97,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
 
     def _tokenize(self, text: str) -> List[str]:
         """
-        Custom tokenization method with optional preprocessing.
-
-        Args:
-            text (str): Input text to tokenize.
-
-        Returns:
-            List[str]: List of tokens after preprocessing.
+        Custom tokenization method
         """
         # Apply lowercase if required
         if self.do_lower_case:
@@ -139,39 +106,19 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         # Use the parent tokenizer's tokenization method
         tokens = super()._tokenize(text)
 
-        # Truncate tokens to maximum cell length
+        # Optional: Add custom cell-length truncation
         tokens = tokens[:self.max_cell_length]
 
         return tokens
-
-    def prepare_for_model(
-        self,
-        ids: List[int],
-        pair_ids: Optional[List[int]] = None,
-        **kwargs
-    ) -> dict:
-        """
-        Prepare tokenized inputs for the model with optional custom logic.
-
-        Args:
-            ids (List[int]): List of input token ids.
-            pair_ids (Optional[List[int]], optional): List of pair token ids.
-
-        Returns:
-            dict: Prepared model inputs.
-        """
-        # Call parent method with any additional custom preprocessing
-        return super().prepare_for_model(ids, pair_ids, **kwargs)
-
 
-def load_tessar_tokenizer(pretrained_model_name_or_path: str) -> TessarTokenizer:
+def load_tessar_tokenizer(pretrained_model_name_or_path: str):
     """
-    Load a pretrained Tessar tokenizer.
-
-    Args:
-        pretrained_model_name_or_path (str): Path to the pretrained model.
-
-    Returns:
-        TessarTokenizer: Initialized tokenizer.
+    Load a pretrained Tessar tokenizer
     """
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+
+# Explicitly register the tokenizer with AutoTokenizer
+AutoTokenizer.register(
+    "TessarTokenizer",
+    TessarTokenizer
+)
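
A note on the new registration block: in the transformers releases I'm aware of, AutoTokenizer.register expects a configuration class as its first argument rather than a string, and a fast tokenizer is passed through the fast_tokenizer_class keyword. Below is a minimal sketch of that pattern; TessarConfig and the "tessar" model type are assumptions for illustration, since no config class appears in this file:

    from transformers import AutoConfig, AutoTokenizer, PretrainedConfig

    class TessarConfig(PretrainedConfig):
        # Hypothetical config class, assumed here only to illustrate registration
        model_type = "tessar"

    # Map the model type string to the config, then the config to the fast tokenizer
    AutoConfig.register("tessar", TessarConfig)
    AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)

Registered along these lines, AutoTokenizer.from_pretrained can resolve the custom class from a checkpoint's config; the module-level helper also loads it directly, e.g.:

    tokenizer = load_tessar_tokenizer("SVECTOR-OFFICIAL/tessar")  # hypothetical repo id
    encoded = tokenizer("sample cell text")
    print(encoded["input_ids"], encoded["attention_mask"])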