Update tessar_tokenizer.py
tessar_tokenizer.py  +42 -29
@@ -7,9 +7,13 @@ from transformers import PreTrainedTokenizerFast
 
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
-    Tessar Tokenizer implementation for Hugging Face Transformers
+    Tessar Tokenizer implementation for Hugging Face Transformers.
+
+    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
+    configurations and methods for the Tessar model ecosystem.
     """
 
+    # Define the input names expected by the model
     model_input_names = ['input_ids', 'attention_mask']
 
     def __init__(
@@ -28,15 +32,24 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         **kwargs
     ):
         """
-        Initialize the Tessar Tokenizer with
+        Initialize the Tessar Tokenizer with customizable token configurations.
 
         Args:
-            vocab_file (str, optional): Path to the vocabulary file
-            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
+            vocab_file (str, optional): Path to the vocabulary file.
+            tokenizer_file (str, optional): Path to the pre-trained tokenizer file.
             do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
             max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
+
+        Additional token parameters allow for custom special token definitions:
+            unk_token (str): Unknown token
+            sep_token (str): Separator token
+            pad_token (str): Padding token
+            cls_token (str): Classification token
+            mask_token (str): Mask token
+            bos_token (str): Beginning of sequence token
+            eos_token (str): End of sequence token
         """
-        # Prepare special tokens
+        # Prepare special tokens dictionary
         special_tokens = {
             "unk_token": unk_token,
             "sep_token": sep_token,
@@ -47,7 +60,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "eos_token": eos_token,
         }
 
-        # Remove None values
+        # Remove None values from special tokens
         special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
 
         # Call parent constructor
@@ -59,28 +72,28 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             **kwargs
         )
 
-        #
+        # Store Tessar-specific attributes
         self.do_lower_case = do_lower_case
         self.max_cell_length = max_cell_length
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
         """
-        Save the tokenizer vocabulary and special tokens
+        Save the tokenizer vocabulary and special tokens configuration.
 
         Args:
-            save_directory (str): Directory to save the vocabulary
-            filename_prefix (str, optional): Prefix for the saved files
+            save_directory (str): Directory to save the vocabulary files.
+            filename_prefix (str, optional): Prefix for the saved files.
 
         Returns:
-            tuple: Paths to the saved files
+            tuple: Paths to the saved vocabulary and special tokens files.
         """
-        # Prepare file
+        # Prepare vocabulary file path
         vocab_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
         )
 
-        #
+        # Prepare special tokens file path
         special_tokens_file = os.path.join(
             save_directory,
             f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
@@ -90,7 +103,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         with open(vocab_file, 'w', encoding='utf-8') as f:
             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
 
-        #
+        # Prepare special tokens configuration
         special_tokens_config = {
             "unk_token": self.unk_token,
             "sep_token": self.sep_token,
@@ -103,6 +116,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "max_cell_length": self.max_cell_length
         }
 
+        # Save special tokens configuration
         with open(special_tokens_file, 'w', encoding='utf-8') as f:
             json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
 
@@ -110,13 +124,13 @@ class TessarTokenizer(PreTrainedTokenizerFast):
 
     def _tokenize(self, text: str) -> List[str]:
         """
-        Custom tokenization method
+        Custom tokenization method with optional preprocessing.
 
         Args:
-            text (str): Input text to tokenize
+            text (str): Input text to tokenize.
 
         Returns:
-            List[str]: List of tokens
+            List[str]: List of tokens after preprocessing.
         """
         # Apply lowercase if required
         if self.do_lower_case:
@@ -125,7 +139,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         # Use the parent tokenizer's tokenization method
         tokens = super()._tokenize(text)
 
-        #
+        # Truncate tokens to maximum cell length
        tokens = tokens[:self.max_cell_length]
 
         return tokens
@@ -137,28 +151,27 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         **kwargs
     ) -> dict:
         """
-        Prepare tokenized inputs for the model
+        Prepare tokenized inputs for the model with optional custom logic.
 
         Args:
-            ids (List[int]): List of input token ids
-            pair_ids (Optional[List[int]], optional): List of pair token ids
+            ids (List[int]): List of input token ids.
+            pair_ids (Optional[List[int]], optional): List of pair token ids.
 
         Returns:
-            dict: Prepared model inputs
+            dict: Prepared model inputs.
         """
-        #
-        # This method can be extended to add Tessar-specific preprocessing
+        # Call parent method with any additional custom preprocessing
         return super().prepare_for_model(ids, pair_ids, **kwargs)
 
-
-def load_tessar_tokenizer(pretrained_model_name_or_path: str):
+
+def load_tessar_tokenizer(pretrained_model_name_or_path: str) -> TessarTokenizer:
     """
-    Load a pretrained Tessar tokenizer
+    Load a pretrained Tessar tokenizer.
 
     Args:
-        pretrained_model_name_or_path (str): Path to the pretrained model
+        pretrained_model_name_or_path (str): Path to the pretrained model.
 
     Returns:
-        TessarTokenizer: Initialized tokenizer
+        TessarTokenizer: Initialized tokenizer.
     """
     return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
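
For reference, a minimal usage sketch of the API touched by this change. The checkpoint directory ./tessar-base and the export directory ./tessar-export are placeholder paths, not part of the repository; any directory containing a compatible tokenizer.json/vocab file should work the same way.

from tessar_tokenizer import load_tessar_tokenizer
import os

# Load the tokenizer from a hypothetical local checkpoint directory (placeholder path).
tokenizer = load_tessar_tokenizer("./tessar-base")

# Standard PreTrainedTokenizerFast call: returns the inputs named in
# model_input_names ('input_ids' and 'attention_mask').
encoded = tokenizer("example table cell", truncation=True, max_length=32)
print(encoded["input_ids"], encoded["attention_mask"])

# Persist the vocabulary and the special-token configuration written by
# save_vocabulary(); the target directory must already exist.
os.makedirs("./tessar-export", exist_ok=True)
print(tokenizer.save_vocabulary("./tessar-export", filename_prefix="tessar"))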
|