Upload tokenizer
Browse files - tokenizer_script.py +18 -20
tokenizer_script.py
CHANGED
|
@@ -12,6 +12,7 @@ class CharacterTokenizer(PreTrainedTokenizer):
|
|
| 12 |
|
| 13 |
def __init__(
|
| 14 |
self,
|
|
|
|
| 15 |
unk_token="[UNK]",
|
| 16 |
pad_token="[PAD]",
|
| 17 |
bos_token="[BOS]",
|
|
@@ -19,19 +20,19 @@ class CharacterTokenizer(PreTrainedTokenizer):
|
|
| 19 |
sep_token="[SEP]",
|
| 20 |
**kwargs
|
| 21 |
):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
self.vocab = vocab
|
| 36 |
self.inv_vocab = {v: k for k, v in self.vocab.items()}
|
| 37 |
|
|
@@ -113,16 +114,13 @@ class CharacterTokenizer(PreTrainedTokenizer):
|
|
| 113 |
|
| 114 |
@classmethod
|
| 115 |
def from_json(cls, vocab_file, **kwargs):
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
return cls(vocab=vocab, **kwargs)
|
| 121 |
|
| 122 |
@classmethod
|
| 123 |
def from_vocab(cls, vocab, **kwargs):
|
| 124 |
-
|
| 125 |
-
return cls(vocab=vocab, **kwargs)
|
| 126 |
|
| 127 |
@classmethod
|
| 128 |
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
|
|
|
| 12 |
|
| 13 |
def __init__(
|
| 14 |
self,
|
| 15 |
+
vocab=None,
|
| 16 |
unk_token="[UNK]",
|
| 17 |
pad_token="[PAD]",
|
| 18 |
bos_token="[BOS]",
|
|
|
|
| 20 |
sep_token="[SEP]",
|
| 21 |
**kwargs
|
| 22 |
):
|
| 23 |
+
if vocab is None:
|
| 24 |
+
vocab = {}
|
| 25 |
+
# Add special tokens
|
| 26 |
+
special_tokens = [
|
| 27 |
+
unk_token,
|
| 28 |
+
pad_token,
|
| 29 |
+
bos_token,
|
| 30 |
+
eos_token,
|
| 31 |
+
sep_token,
|
| 32 |
+
]
|
| 33 |
+
for token in special_tokens:
|
| 34 |
+
if token not in vocab:
|
| 35 |
+
vocab[token] = len(vocab)
|
| 36 |
self.vocab = vocab
|
| 37 |
self.inv_vocab = {v: k for k, v in self.vocab.items()}
|
| 38 |
|
|
|
|
| 114 |
|
| 115 |
@classmethod
|
| 116 |
def from_json(cls, vocab_file, **kwargs):
|
| 117 |
+
with open(vocab_file, 'r', encoding='utf-8') as f:
|
| 118 |
+
vocab = json.load(f)
|
| 119 |
+
return cls(vocab=vocab, **kwargs)
|
|
|
|
|
|
|
| 120 |
|
| 121 |
@classmethod
|
| 122 |
def from_vocab(cls, vocab, **kwargs):
|
| 123 |
+
return cls(vocab=vocab, **kwargs)
|
|
|
|
| 124 |
|
| 125 |
@classmethod
|
| 126 |
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|