mjschock committed
Commit f2f5917 · verified · 1 Parent(s): 181a8a4

Upload tokenizer

merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
tokenization_mamba.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Dict, Optional, Tuple
+ from transformers import AutoTokenizer, PreTrainedTokenizer
+
+ class MambaTokenizer(PreTrainedTokenizer):
+     def __init__(
+         self,
+         **kwargs,
+     ):
+         self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
+         super().__init__(**kwargs)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """
+         Returns the vocabulary as a dictionary of token to index.
+
+         `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
+         vocab.
+
+         Returns:
+             `Dict[str, int]`: The vocabulary.
+         """
+         return self.tokenizer.get_vocab()
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Save only the vocabulary of the tokenizer (vocabulary + added tokens).
+
+         This method won't save the configuration and special token mappings of the tokenizer. Use
+         [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+             filename_prefix (`str`, *optional*):
+                 An optional prefix to add to the names of the saved files.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         return self.tokenizer.save_vocabulary(
+             save_directory=save_directory,
+             filename_prefix=filename_prefix,
+         )
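
For orientation, a minimal sketch of how a tokenizer published this way is typically loaded. The repo id is a placeholder, not part of this commit, and `trust_remote_code=True` is required because `MambaTokenizer` ships as repo code rather than inside transformers:

```python
# Minimal loading sketch. "user/repo" is a placeholder for wherever this
# commit's files live on the Hub; trust_remote_code=True lets transformers
# import tokenization_mamba.MambaTokenizer through the auto_map entry in
# tokenizer_config.json below.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/repo", trust_remote_code=True)
vocab = tokenizer.get_vocab()  # delegates to the wrapped GPT-NeoX-20B tokenizer
```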
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "added_tokens_decoder": {},
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_mamba.MambaTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "MambaTokenizer"
+ }
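
Two notes on this config: the two-element `auto_map` list maps `AutoTokenizer` to `[slow_tokenizer_class, fast_tokenizer_class]`, so the `null` means no fast (Rust-backed) variant is registered; and the `model_max_length` value is `int(1e30)`, the transformers sentinel (`VERY_LARGE_INTEGER`) for "no maximum length configured". Below is a sketch of exercising the class directly; that the file is importable and that the GPT-NeoX tokenizer can be fetched are assumptions:

```python
# Sketch of using the class directly, assuming tokenization_mamba.py from this
# commit is on the import path and EleutherAI/gpt-neox-20b is downloadable or
# cached. Whether the base PreTrainedTokenizer.__init__ is fully satisfied by
# the two overrides above depends on the installed transformers version.
from tokenization_mamba import MambaTokenizer

tok = MambaTokenizer()
print(len(tok.get_vocab()))  # size of the wrapped GPT-NeoX-20B vocabulary
# save_vocabulary delegates to the wrapped fast tokenizer, writing vocab.json
# and merges.txt; presumably how those files in this commit were produced.
print(tok.save_vocabulary("."))
```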
vocab.json ADDED
The diff for this file is too large to render. See raw diff