ds
Browse files - tokenizeConfig.py +5 -1
tokenizeConfig.py
CHANGED
|
@@ -121,6 +121,10 @@ class OBITokenizer(PreTrainedTokenizer):
|
|
| 121 |
)
|
| 122 |
|
| 123 |
# Save the BPE vocab
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
return (out_vocab_file,)
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
# Save the BPE vocab
|
| 124 |
+
# Training: Fit the tokenizer on your text data
|
| 125 |
+
trainer = trainers.BpeTrainer(special_tokens=["<unk>", "<s>", "</s>","[PAD]"])
|
| 126 |
+
self.tokenizer.train(trainer=trainer, files=[out_vocab_file])
|
| 127 |
+
# Save the trained tokenizer to a file
|
| 128 |
+
self.tokenizer.save(out_vocab_file)
|
| 129 |
|
| 130 |
return (out_vocab_file,)
|