Update tessar_tokenizer.py

tessar_tokenizer.py (+175 -16)
Removed lines (previous version):

@@ -1,9 +1,12 @@
-from typing import List, Optional, Union
@@ -14,6 +17,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
@@ -40,7 +44,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-        special_tokens = {
@@ -50,15 +54,20 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-            **special_tokens,
@@ -66,7 +75,26 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
@@ -86,15 +114,28 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-            json.dump(
@@ -109,10 +150,15 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-        return (vocab_file, special_tokens_file)
@@ -132,7 +178,8 @@ class TessarTokenizer(PreTrainedTokenizerFast):
@@ -140,8 +187,22 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-    ) ->
@@ -153,33 +214,131 @@ class TessarTokenizer(PreTrainedTokenizerFast):
-def load_tessar_tokenizer(pretrained_model_name_or_path: str):
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)

The updated file:

import json
import os
from typing import List, Optional, Union, Dict, Any, Tuple

from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from transformers.utils import logging

logger = logging.get_logger(__name__)


class TessarTokenizer(PreTrainedTokenizerFast):
    """
    [class docstring unchanged; omitted in the diff view]
    """

    model_input_names = ['input_ids', 'attention_mask']
    vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}

    def __init__(
        self,
        # ... [signature and docstring unchanged; omitted in the diff view] ...
        max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
        """
        # Prepare special tokens
        special_tokens_dict = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            # ... [remaining entries unchanged; omitted in the diff view] ...
            "eos_token": eos_token,
        }

        # Convert plain strings to AddedToken objects if they're not already
        for token_name, token_value in special_tokens_dict.items():
            if isinstance(token_value, str):
                special_tokens_dict[token_name] = AddedToken(
                    token_value, lstrip=False, rstrip=False, normalized=True, special=True
                )
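
        # Note (descriptive, not behavior-changing): AddedToken(..., special=True)
        # keeps the token atomic in the `tokenizers` backend, lstrip/rstrip control
        # whether neighbouring whitespace is absorbed, and normalized=True means
        # the token is matched against the normalized input text.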

        # Call parent constructor
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **special_tokens_dict,
            **kwargs
        )

        # [unchanged line omitted in the diff view]
        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length
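
    # Illustrative usage sketch (local file names are assumptions):
    #
    #   tok = TessarTokenizer(tokenizer_file="tokenizer.json", do_lower_case=True)
    #   tok.max_cell_length   # 15 by default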

    @property
    def vocab_size(self) -> int:
        """
        Return the size of the vocabulary.

        Returns:
            int: The vocabulary size
        """
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        """
        Return the vocabulary mapping.

        Returns:
            Dict[str, int]: The vocabulary mapping
        """
        return dict(self.vocab)
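
    # Sketch: on PreTrainedTokenizerFast, `self.vocab` includes added tokens,
    # so vocab_size as defined above counts them too:
    #
    #   tok.add_tokens(["[ROW_END]"])
    #   tok.vocab_size  # increases by 1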

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """
        Save the tokenizer vocabulary and special tokens file.
        [remaining docstring unchanged; omitted in the diff view]
        """
        # [unchanged setup lines omitted in the diff view]
        vocab_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        )

        # Save tokenizer file
        tokenizer_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}tokenizer.json"
        )

        # Save special tokens configuration
        special_tokens_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
        )

        # Get vocabulary from tokenizer
        vocab_dict = self.get_vocab()

        # Save vocabulary
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(vocab_dict, f, ensure_ascii=False, indent=2)

        # Save the underlying tokenizer file if the backend exposes one
        if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
            self.backend_tokenizer.save(tokenizer_file)

        # Save special tokens configuration
        special_tokens_config = {
            # ... [token entries unchanged; omitted in the diff view] ...
            "max_cell_length": self.max_cell_length
        }

        # Convert AddedToken objects to plain strings for JSON serialization
        for key, token in special_tokens_config.items():
            if hasattr(token, "content"):
                special_tokens_config[key] = token.content

        with open(special_tokens_file, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)

        return (vocab_file, tokenizer_file, special_tokens_file)
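
    # Illustrative sketch ("./tessar-out" is an assumed path): saving writes
    # three files and returns their paths:
    #
    #   files = tok.save_vocabulary("./tessar-out", filename_prefix="tessar")
    #   # -> ("./tessar-out/tessar-vocab.json",
    #   #     "./tessar-out/tessar-tokenizer.json",
    #   #     "./tessar-out/tessar-special_tokens.json")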

    def _tokenize(self, text: str) -> List[str]:
        """
        [docstring unchanged; omitted in the diff view]
        """
        # PreTrainedTokenizerFast does not define _tokenize, so delegate to the
        # fast tokenizer's public tokenize() instead of super()._tokenize().
        tokens = super().tokenize(text)

        # Optional: apply custom cell-length truncation
        if self.max_cell_length > 0:
            tokens = tokens[:self.max_cell_length]

        return tokens

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Prepare tokenized inputs for the model.
        [Args unchanged; omitted in the diff view]

        Returns:
            dict: Prepared model inputs
        """
        # Implement any Tessar-specific model preparation logic here; for
        # example, table data might need special handling before delegating.
        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs
        )
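
    # Illustrative sketch (the ids are assumed to be valid vocabulary ids);
    # because the override simply delegates, this behaves like the base method:
    #
    #   tok.prepare_for_model([101, 2054, 102], max_length=8, padding="max_length")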

    def batch_encode_tables(
        self,
        tables: List[List[List[str]]],
        max_length: Optional[int] = None,
        padding: Union[bool, str] = True,
        truncation: Union[bool, str] = True,
        return_tensors: Optional[str] = "pt",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Encode a batch of tables for table question answering.

        Args:
            tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
                and each row is a list of cell values
            max_length (Optional[int], optional): Maximum sequence length
            padding (Union[bool, str], optional): Padding strategy
            truncation (Union[bool, str], optional): Truncation strategy
            return_tensors (Optional[str], optional): Type of tensors to return

        Returns:
            Dict[str, Any]: Encoded table batch
        """
        # Flatten tables into text sequences with an appropriate format
        flattened_inputs = []

        for table in tables:
            # Convert the table to a flattened text representation.
            # This is a simplified scheme; a real implementation would depend
            # on the specific serialization format the model was trained with.
            table_text = ""

            for row_idx, row in enumerate(table):
                for col_idx, cell in enumerate(row):
                    # Apply cell-level processing
                    if self.do_lower_case:
                        cell = cell.lower()

                    # Add the cell with position information
                    table_text += f"[CELL_{row_idx}_{col_idx}] {cell} "

                # Add a row separator
                table_text += "[ROW_END] "

            flattened_inputs.append(table_text.strip())

        # Encode the flattened text inputs
        return self(
            flattened_inputs,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            return_tensors=return_tensors,
            **kwargs
        )
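
    # Illustrative sketch of the flattening scheme above: the 2x2 table
    # [["h1", "h2"], ["v1", "v2"]] is serialized, before tokenization, as
    #   "[CELL_0_0] h1 [CELL_0_1] h2 [ROW_END] [CELL_1_0] v1 [CELL_1_1] v2 [ROW_END]"
    # The [CELL_r_c] and [ROW_END] markers are assumed to have been added to the
    # vocabulary as special tokens; otherwise they are split like ordinary text.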


def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
    """
    Load a pretrained Tessar tokenizer.

    Args:
        pretrained_model_name_or_path (str): Path to the pretrained model
        **kwargs: Additional arguments to pass to from_pretrained

    Returns:
        TessarTokenizer: Initialized tokenizer
    """
    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)


# Register the tokenizer with the Transformers library.
# Note: AutoTokenizer.register expects the model's *config class*, not a repo
# id. TessarConfig is assumed to be importable from the model package.
from transformers import AutoTokenizer
AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)
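
# Alternative without registration: tokenizers shipped as custom code on the
# Hugging Face Hub can be loaded with
#   AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest",
#                                 trust_remote_code=True)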


# Example usage
if __name__ == "__main__":
    # Example of loading a pretrained tokenizer
    try:
        # Method 1: Direct loading with the class
        tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
        print("Tokenizer loaded successfully!")

        # Method 2: Loading through AutoTokenizer
        # (works once the registration above has succeeded)
        auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
        print("AutoTokenizer loaded successfully!")

        # Basic tokenization example
        text = "Hello, how are you doing today?"
        encoded = tokenizer(text, return_tensors="pt")
        print("Encoded Input:", encoded)

        # Example with table data
        table = [
            ["Header1", "Header2", "Header3"],
            ["Value1", "Value2", "Value3"],
            ["Value4", "Value5", "Value6"]
        ]

        # Example of batch encoding tables
        encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt")
        print("Encoded Table:", encoded_table)

    except Exception as e:
        print(f"Error loading tokenizer: {e}")
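
# Round-trip sketch (illustrative; "./tessar-tokenizer" is an assumed path):
#
#   files = tokenizer.save_vocabulary("./tessar-tokenizer")
#   reloaded = TessarTokenizer.from_pretrained("./tessar-tokenizer")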