SVECTOR-OFFICIAL committed
Commit 3e49727 · verified · 1 parent: d2bf2f6

Update tessar_tokenizer.py

Files changed (1)
  1. tessar_tokenizer.py +42 -29
tessar_tokenizer.py CHANGED
@@ -7,9 +7,13 @@ from transformers import PreTrainedTokenizerFast
 
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
-    Tessar Tokenizer implementation for Hugging Face Transformers
+    Tessar Tokenizer implementation for Hugging Face Transformers.
+
+    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
+    configurations and methods for the Tessar model ecosystem.
     """
 
+    # Define the input names expected by the model
     model_input_names = ['input_ids', 'attention_mask']
 
     def __init__(
@@ -28,15 +32,24 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         **kwargs
     ):
         """
-        Initialize the Tessar Tokenizer with specific token configurations
+        Initialize the Tessar Tokenizer with customizable token configurations.
 
         Args:
-            vocab_file (str, optional): Path to the vocabulary file
-            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
+            vocab_file (str, optional): Path to the vocabulary file.
+            tokenizer_file (str, optional): Path to the pre-trained tokenizer file.
             do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
             max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
+
+        Additional token parameters allow for custom special token definitions:
+            unk_token (str): Unknown token
+            sep_token (str): Separator token
+            pad_token (str): Padding token
+            cls_token (str): Classification token
+            mask_token (str): Mask token
+            bos_token (str): Beginning of sequence token
+            eos_token (str): End of sequence token
         """
-        # Prepare special tokens
+        # Prepare special tokens dictionary
         special_tokens = {
             "unk_token": unk_token,
             "sep_token": sep_token,
@@ -47,7 +60,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "eos_token": eos_token,
         }
 
-        # Remove None values
+        # Remove None values from special tokens
         special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
 
         # Call parent constructor
@@ -59,28 +72,28 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             **kwargs
         )
 
-        # Custom Tessar-specific attributes
+        # Store Tessar-specific attributes
         self.do_lower_case = do_lower_case
         self.max_cell_length = max_cell_length
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
         """
-        Save the tokenizer vocabulary and special tokens file
+        Save the tokenizer vocabulary and special tokens configuration.
 
         Args:
-            save_directory (str): Directory to save the vocabulary
-            filename_prefix (str, optional): Prefix for the saved files
+            save_directory (str): Directory to save the vocabulary files.
+            filename_prefix (str, optional): Prefix for the saved files.
 
         Returns:
-            tuple: Paths to the saved files
+            tuple: Paths to the saved vocabulary and special tokens files.
         """
-        # Prepare file paths
+        # Prepare vocabulary file path
         vocab_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
         )
 
-        # Save special tokens configuration
+        # Prepare special tokens file path
        special_tokens_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
@@ -90,7 +103,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         with open(vocab_file, 'w', encoding='utf-8') as f:
             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
 
-        # Save special tokens configuration
+        # Prepare special tokens configuration
         special_tokens_config = {
             "unk_token": self.unk_token,
             "sep_token": self.sep_token,
@@ -103,6 +116,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "max_cell_length": self.max_cell_length
         }
 
+        # Save special tokens configuration
         with open(special_tokens_file, 'w', encoding='utf-8') as f:
             json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
 
@@ -110,13 +124,13 @@ class TessarTokenizer(PreTrainedTokenizerFast):
 
     def _tokenize(self, text: str) -> List[str]:
         """
-        Custom tokenization method
+        Custom tokenization method with optional preprocessing.
 
         Args:
-            text (str): Input text to tokenize
+            text (str): Input text to tokenize.
 
         Returns:
-            List[str]: List of tokens
+            List[str]: List of tokens after preprocessing.
         """
         # Apply lowercase if required
         if self.do_lower_case:
@@ -125,7 +139,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         # Use the parent tokenizer's tokenization method
         tokens = super()._tokenize(text)
 
-        # Optional: Add custom cell-length truncation
+        # Truncate tokens to maximum cell length
         tokens = tokens[:self.max_cell_length]
 
         return tokens
@@ -137,28 +151,27 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         **kwargs
     ) -> dict:
         """
-        Prepare tokenized inputs for the model
+        Prepare tokenized inputs for the model with optional custom logic.
 
         Args:
-            ids (List[int]): List of input token ids
-            pair_ids (Optional[List[int]], optional): List of pair token ids
+            ids (List[int]): List of input token ids.
+            pair_ids (Optional[List[int]], optional): List of pair token ids.
 
         Returns:
-            dict: Prepared model inputs
+            dict: Prepared model inputs.
         """
-        # Implement any Tessar-specific model preparation logic
-        # This method can be extended to add Tessar-specific preprocessing
+        # Call parent method with any additional custom preprocessing
         return super().prepare_for_model(ids, pair_ids, **kwargs)
 
-# Example usage and initialization
-def load_tessar_tokenizer(pretrained_model_name_or_path: str):
+
+def load_tessar_tokenizer(pretrained_model_name_or_path: str) -> TessarTokenizer:
     """
-    Load a pretrained Tessar tokenizer
+    Load a pretrained Tessar tokenizer.
 
     Args:
-        pretrained_model_name_or_path (str): Path to the pretrained model
+        pretrained_model_name_or_path (str): Path to the pretrained model.
 
     Returns:
-        TessarTokenizer: Initialized tokenizer
+        TessarTokenizer: Initialized tokenizer.
     """
     return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
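
For reference, a minimal usage sketch of the tokenizer as committed. The checkpoint path and sample text are hypothetical stand-ins, and the tuple unpacking of save_vocabulary follows its docstring; this sketch is an illustration, not part of the commit.

# Minimal usage sketch; "./tessar-checkpoint" and the sample text below
# are hypothetical stand-ins for a real pretrained Tessar checkpoint.
from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer

# Load from a local directory (or Hub repo id) via the new module-level helper
tokenizer = load_tessar_tokenizer("./tessar-checkpoint")

# Encode one table cell; the result carries the model_input_names
# declared on the class: input_ids and attention_mask
encoding = tokenizer("Quarterly Revenue (USD)")
print(encoding["input_ids"], encoding["attention_mask"])

# Persist vocab.json and special_tokens.json; per the docstring this
# returns the paths to the two saved files
vocab_path, special_tokens_path = tokenizer.save_vocabulary("./tessar-checkpoint")
print(vocab_path, special_tokens_path)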