fix: make `eos_token`/`pad_token` overridable
Browse files
- tokenization_arcade100k.py +6 -2
- tokenizer_config.json +3 -1
tokenization_arcade100k.py
CHANGED
|
@@ -124,8 +124,12 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
|
|
| 124 |
|
| 125 |
self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
|
| 126 |
self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
|
| 127 |
-
|
| 128 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# Expose for convenience
|
| 130 |
self.mergeable_ranks = self.tokenizer._mergeable_ranks
|
| 131 |
self.special_tokens = self.tokenizer._special_tokens
|
|
|
|
| 124 |
|
| 125 |
self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
|
| 126 |
self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
|
| 127 |
+
# Provide default `eos_token` and `pad_token`
|
| 128 |
+
if self.eos_token is None:
|
| 129 |
+
self.eos_token = self.decoder[self.tokenizer.eot_token]
|
| 130 |
+
if self.pad_token is None:
|
| 131 |
+
self.pad_token = self.decoder[self.tokenizer.pad_token]
|
| 132 |
+
|
| 133 |
# Expose for convenience
|
| 134 |
self.mergeable_ranks = self.tokenizer._mergeable_ranks
|
| 135 |
self.special_tokens = self.tokenizer._special_tokens
|
tokenizer_config.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tokenization_arcade100k.Arcade100kTokenizer",
|
| 6 |
null
|
| 7 |
]
|
| 8 |
-
}
|
|
|
|
|
|
|
| 9 |
}
|
|
|
|
| 5 |
"tokenization_arcade100k.Arcade100kTokenizer",
|
| 6 |
null
|
| 7 |
]
|
| 8 |
+
},
|
| 9 |
+
"eos_token": "<|endoftext|>",
|
| 10 |
+
"pad_token": "<|endoftext|>"
|
| 11 |
}
|