mjschock committed
Commit f2f5917 · verified · 1 Parent(s): 181a8a4

Upload tokenizer

merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
tokenization_mamba.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Dict, Optional, Tuple
+ from transformers import AutoTokenizer, PreTrainedTokenizer
+
+ class MambaTokenizer(PreTrainedTokenizer):
+     def __init__(
+         self,
+         **kwargs,
+     ):
+         self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
+         super().__init__(**kwargs)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """
+         Returns the vocabulary as a dictionary of token to index.
+
+         `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
+         vocab.
+
+         Returns:
+             `Dict[str, int]`: The vocabulary.
+         """
+         return self.tokenizer.get_vocab()
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (vocabulary + added tokens).
+
+         This method won't save the configuration and special token mappings of the tokenizer. Use
+         [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+             filename_prefix (`str`, *optional*):
+                 An optional prefix to add to the names of the saved files.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         return self.tokenizer.save_vocabulary(
+             save_directory=save_directory,
+             filename_prefix=filename_prefix,
+         )
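
For orientation, a minimal sketch of how a tokenizer published this way is typically loaded. The repo id is a placeholder, not part of this commit, and `trust_remote_code=True` is required because `MambaTokenizer` ships as repo code rather than inside transformers:

```python
# Minimal loading sketch. "user/repo" is a placeholder for wherever this
# commit's files live on the Hub; trust_remote_code=True lets transformers
# import tokenization_mamba.MambaTokenizer through the auto_map entry in
# tokenizer_config.json below.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/repo", trust_remote_code=True)
vocab = tokenizer.get_vocab()  # delegates to the wrapped GPT-NeoX-20B tokenizer
```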
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "added_tokens_decoder": {},
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_mamba.MambaTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "MambaTokenizer"
+ }
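
Two notes on this config: the two-element `auto_map` list maps `AutoTokenizer` to `[slow_tokenizer_class, fast_tokenizer_class]`, so the `null` means no fast (Rust-backed) variant is registered; and the `model_max_length` value is `int(1e30)`, the transformers sentinel (`VERY_LARGE_INTEGER`) for "no maximum length configured". Below is a sketch of exercising the class directly; that the file is importable and that the GPT-NeoX tokenizer can be fetched are assumptions:

```python
# Sketch of using the class directly, assuming tokenization_mamba.py from this
# commit is on the import path and EleutherAI/gpt-neox-20b is downloadable or
# cached. Whether the base PreTrainedTokenizer.__init__ is fully satisfied by
# the two overrides above depends on the installed transformers version.
from tokenization_mamba import MambaTokenizer

tok = MambaTokenizer()
print(len(tok.get_vocab()))  # size of the wrapped GPT-NeoX-20B vocabulary
# save_vocabulary delegates to the wrapped fast tokenizer, writing vocab.json
# and merges.txt; presumably how those files in this commit were produced.
print(tok.save_vocabulary("."))
```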
vocab.json ADDED
The diff for this file is too large to render. See raw diff