Update char_tokenizer.py
Browse files
- char_tokenizer.py +2 -12
char_tokenizer.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Copypasted from
|
| 3 |
https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/char_tokenizer.py
|
| 4 |
with Apache 2.0 license
|
| 5 |
"""
|
|
@@ -39,17 +39,7 @@ class CharTokenizer(PreTrainedTokenizer):
|
|
| 39 |
do_lower_case=False,
|
| 40 |
*args,
|
| 41 |
**kwargs
|
| 42 |
-
):
|
| 43 |
-
self.do_lower_case = do_lower_case
|
| 44 |
-
self.space_token = space_token
|
| 45 |
-
|
| 46 |
-
if not vocab_file or not os.path.isfile(vocab_file):
|
| 47 |
-
self.vocab = OrderedDict()
|
| 48 |
-
self.ids_to_tokens = OrderedDict()
|
| 49 |
-
else:
|
| 50 |
-
self.vocab = load_vocab(vocab_file)
|
| 51 |
-
self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 52 |
-
|
| 53 |
super().__init__(
|
| 54 |
pad_token=pad_token,
|
| 55 |
unk_token=unk_token,
|
|
|
|
| 1 |
"""
|
| 2 |
+
Copypasted and updated from
|
| 3 |
https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/char_tokenizer.py
|
| 4 |
with Apache 2.0 license
|
| 5 |
"""
|
|
|
|
| 39 |
do_lower_case=False,
|
| 40 |
*args,
|
| 41 |
**kwargs
|
| 42 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
super().__init__(
|
| 44 |
pad_token=pad_token,
|
| 45 |
unk_token=unk_token,
|