Upload 11 files

Browse files

Files changed (11) hide show

config.json +37 -0
generation_config.json +11 -0
merges.txt +0 -0
model.safetensors +3 -0
pytorch_model.bin +3 -0
setup.py +52 -0
special_tokens_map.json +51 -0
tessar_tokenizer.py +164 -0
tessar_tokenizer_example.py +38 -0
tokenizer_config.json +67 -0
vocab.json +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "_name_or_path": "SVECTOR-CORPORATION/Tessar-largest",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+      "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 1024,
+  "max_position_embeddings": 1024,
+  "model_type": "bart",
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50265
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "eos_token_id": 2,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "max_length": 1024,
+  "pad_token_id": 1,
+  "transformers_version": "4.27.0.dev0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3fd086d5435c71f07dbe525e859840b1e218490bfb974d5d5cdf91506f967ee
+size 1625426996

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9dd92d3ee268740d9790bac260f0fd2fd6f7ad783b0d87769a11e7534c7cb3
+size 1625481368

setup.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""Packaging script for the Tessar tokenizer module."""
+import os
+from setuptools import find_packages, setup
+# Short summary, reused as the long description when README.md is unavailable.
+DESCRIPTION = "Advanced Tokenizer for Table-based Transformations by SVECTOR"
+# Resolve README.md relative to this script rather than the current working
+# directory, and do not abort the build when the file is missing.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+try:
+    with open(os.path.join(_HERE, "README.md"), "r", encoding="utf-8") as fh:
+        long_description = fh.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+setup(
+    name="tessar_tokenizer",
+    version="0.1.0",
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="SVECTOR",
+    author_email="team@svector.co.in",
+    url="https://www.svector.co.in",
+    packages=find_packages(),
+    # tessar_tokenizer.py is a single top-level module; find_packages() only
+    # discovers package directories, so the module must be listed explicitly
+    # or nothing importable gets installed.
+    py_modules=["tessar_tokenizer"],
+    package_data={
+        'tessar_tokenizer': ['*.json'],
+    },
+    install_requires=[
+        "transformers>=4.27.0",
+        "torch>=1.10.0",
+        "numpy>=1.19.0"
+    ],
+    extras_require={
+        'dev': [
+            'pytest',
+            'black',
+            'mypy',
+            'isort'
+        ]
+    },
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+    ],
+    keywords="nlp tokenizer machine-learning table-transformations",
+    python_requires=">=3.7",
+    # NOTE(review): this console script points at tessar_tokenizer.cli:main,
+    # but no cli module is visible alongside tessar_tokenizer.py -- confirm it
+    # exists, otherwise the installed command fails on import.
+    entry_points={
+        'console_scripts': [
+            'tessar-tokenizer=tessar_tokenizer.cli:main',
+        ],
+    },
+)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+    "bos_token": {
+        "content": "<s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "eos_token": {
+        "content": "</s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "unk_token": {
+        "content": "<unk>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "sep_token": {
+        "content": "</s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "pad_token": {
+        "content": "<pad>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "cls_token": {
+        "content": "<s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true
+    },
+    "mask_token": {
+        "content": "<mask>",
+        "single_word": false,
+        "lstrip": true,
+        "rstrip": false,
+        "normalized": true
+    }
+}

tessar_tokenizer.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import json
+import os
+from typing import List, Optional, Union
+from transformers import PreTrainedTokenizerFast
+class TessarTokenizer(PreTrainedTokenizerFast):
+    """
+    Tessar Tokenizer implementation for Hugging Face Transformers.
+    Extends ``PreTrainedTokenizerFast`` with two Tessar-specific settings,
+    ``do_lower_case`` and ``max_cell_length``, which are stored on the
+    instance and applied by ``_tokenize``.
+    """
+    model_input_names = ['input_ids', 'attention_mask']
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="<unk>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        cls_token="<s>",
+        mask_token="<mask>",
+        bos_token="<s>",
+        eos_token="</s>",
+        max_cell_length=15,
+        **kwargs
+    ):
+        """
+        Initialize the Tessar Tokenizer with specific token configurations
+        Args:
+            vocab_file (str, optional): Path to the vocabulary file
+            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
+            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
+            unk_token, sep_token, pad_token, cls_token, mask_token, bos_token,
+                eos_token (str, optional): Special tokens; pass None to leave
+                one unset instead of forwarding it to the base class.
+            max_cell_length (int, optional): Maximum number of tokens kept by
+                ``_tokenize``. Defaults to 15.
+            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast``.
+        """
+        # Collect the special tokens and drop the ones explicitly disabled
+        # with None so they are not forwarded to the base class
+        special_tokens = {
+            "unk_token": unk_token,
+            "sep_token": sep_token,
+            "pad_token": pad_token,
+            "cls_token": cls_token,
+            "mask_token": mask_token,
+            "bos_token": bos_token,
+            "eos_token": eos_token,
+        }
+        special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
+        # Forward max_cell_length alongside do_lower_case so that both
+        # Tessar-specific settings are recorded with the constructor
+        # arguments the base class keeps, and therefore survive a
+        # save_pretrained() / from_pretrained() round trip
+        super().__init__(
+            vocab_file=vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            max_cell_length=max_cell_length,
+            **special_tokens,
+            **kwargs
+        )
+        # Custom Tessar-specific attributes
+        self.do_lower_case = do_lower_case
+        self.max_cell_length = max_cell_length
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+        """
+        Save the tokenizer vocabulary and special tokens file
+        Args:
+            save_directory (str): Directory to save the vocabulary; created if missing
+            filename_prefix (str, optional): Prefix for the saved files
+        Returns:
+            tuple: Paths to the saved files, as (vocab_file, special_tokens_file)
+        """
+        # Writing into a directory that does not exist yet would raise
+        # FileNotFoundError, so create it up front
+        os.makedirs(save_directory, exist_ok=True)
+        # Both file names share the same optional "<prefix>-" lead-in
+        prefix = f"{filename_prefix}-" if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, f"{prefix}vocab.json")
+        special_tokens_file = os.path.join(save_directory, f"{prefix}special_tokens.json")
+        # Save vocabulary (token -> id mapping)
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            json.dump(self.get_vocab(), f, ensure_ascii=False, indent=2)
+        # Save special tokens together with the Tessar-specific settings
+        special_tokens_config = {
+            "unk_token": self.unk_token,
+            "sep_token": self.sep_token,
+            "pad_token": self.pad_token,
+            "cls_token": self.cls_token,
+            "mask_token": self.mask_token,
+            "bos_token": self.bos_token,
+            "eos_token": self.eos_token,
+            "do_lower_case": self.do_lower_case,
+            "max_cell_length": self.max_cell_length
+        }
+        with open(special_tokens_file, 'w', encoding='utf-8') as f:
+            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
+        return (vocab_file, special_tokens_file)
+    def _tokenize(self, text: str) -> List[str]:
+        """
+        Custom tokenization method
+        Lowercases the text when ``do_lower_case`` is set, tokenizes it, and
+        keeps at most ``max_cell_length`` tokens.
+        Args:
+            text (str): Input text to tokenize
+        Returns:
+            List[str]: List of tokens
+        """
+        # NOTE(review): the fast encoding path (__call__ / encode) runs the
+        # backend tokenizer directly and does not appear to call this hook;
+        # confirm whether lowercasing and truncation should apply there too.
+        # Apply lowercase if required
+        if self.do_lower_case:
+            text = text.lower()
+        # PreTrainedTokenizerFast defines no _tokenize() hook (that belongs to
+        # the slow tokenizer base class), so delegate to its public
+        # tokenize(), which runs the backend tokenizer
+        tokens = super().tokenize(text)
+        # Keep only the first max_cell_length tokens
+        tokens = tokens[:self.max_cell_length]
+        return tokens
+    def prepare_for_model(
+        self,
+        ids: List[int],
+        pair_ids: Optional[List[int]] = None,
+        **kwargs
+    ) -> dict:
+        """
+        Prepare tokenized inputs for the model
+        Currently a plain pass-through to the base class; kept as an
+        extension point for Tessar-specific preprocessing.
+        Args:
+            ids (List[int]): List of input token ids
+            pair_ids (Optional[List[int]], optional): List of pair token ids
+            **kwargs: Forwarded unchanged to the base implementation
+        Returns:
+            dict: Prepared model inputs
+        """
+        return super().prepare_for_model(ids, pair_ids, **kwargs)
+# Example usage and initialization
+def load_tessar_tokenizer(pretrained_model_name_or_path: str):
+    """
+    Convenience wrapper that builds a TessarTokenizer from pretrained files
+    Args:
+        pretrained_model_name_or_path (str): Value forwarded unchanged to
+            ``TessarTokenizer.from_pretrained``
+    Returns:
+        TessarTokenizer: The tokenizer returned by ``from_pretrained``
+    """
+    tokenizer = TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return tokenizer

tessar_tokenizer_example.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""Usage examples for the Tessar tokenizer."""
+from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer
+# Example 1: Load the pretrained Tessar Tokenizer
+tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
+# Example 2: Tokenize a simple text
+text = "Hello, how are you doing today?"
+encoded = tokenizer(text, return_tensors="pt")
+print("Encoded Input:", encoded)
+# Example 3: Batch tokenization
+texts = [
+    "Hello, world!",
+    "This is a test sentence.",
+    "Tokenization is an important NLP task."
+]
+batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+print("Batch Encoded Inputs:", batch_encoded)
+# Example 4: Save and reload tokenizer
+save_directory = "./tessar_tokenizer"
+tokenizer.save_pretrained(save_directory)
+# Reload the saved tokenizer
+reloaded_tokenizer = load_tessar_tokenizer(save_directory)
+# Example 5: Custom tokenization with specific parameters
+# A fast tokenizer cannot be constructed without vocabulary/tokenizer files,
+# so start from the pretrained files and override only the Tessar-specific
+# settings. The special tokens are left at the checkpoint's own values:
+# strings such as "[UNK]" / "[PAD]" are not among the special tokens this
+# checkpoint declares.
+custom_tokenizer = TessarTokenizer.from_pretrained(
+    "SVECTOR-CORPORATION/Tessar-largest",
+    do_lower_case=True,
+    max_cell_length=20,
+)
+# Tokenize with custom settings
+custom_text = "A custom tokenization example"
+custom_encoded = custom_tokenizer(custom_text, return_tensors="pt")
+print("Custom Tokenizer Encoded:", custom_encoded)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,67 @@

+{
+    "do_lower_case": true,
+    "errors": "replace",
+    "bos_token": {
+        "content": "<s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "eos_token": {
+        "content": "</s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "unk_token": {
+        "content": "<unk>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "sep_token": {
+        "content": "</s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "cls_token": {
+        "content": "<s>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "pad_token": {
+        "content": "<pad>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "mask_token": {
+        "content": "<mask>",
+        "single_word": false,
+        "lstrip": true,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "add_prefix_space": true,
+    "max_cell_length": 15,
+    "model_max_length": 1024,
+    "special_tokens_map_file": null,
+    "name_or_path": "SVECTOR-CORPORATION/Tessar-largest",
+    "use_fast": true,
+    "tokenizer_class": "TessarTokenizer"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff