Create tokenization_octagon.py
Browse files- tokenization_octagon.py +54 -0
tokenization_octagon.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PreTrainedTokenizerFast
|
| 2 |
+
from tokenizers import Tokenizer, normalizers, pre_tokenizers, trainers, models
|
| 3 |
+
from tokenizers.normalizers import Lowercase, NFD, StripAccents
|
| 4 |
+
from tokenizers.pre_tokenizers import Whitespace
|
| 5 |
+
from typing import Optional, List, Union
|
| 6 |
+
|
| 7 |
+
class OctagonTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for Octagon models, backed by a byte-pair-encoding
    (BPE) `tokenizers.Tokenizer`.

    Wraps :class:`~transformers.PreTrainedTokenizerFast` with BERT-style
    special tokens and provides :meth:`train_tokenizer` to build a
    vocabulary from raw text.
    """

    # Single source of truth for the special tokens: used both as the
    # `__init__` defaults below and by the BPE trainer in `train_tokenizer`.
    # Order matters — it fixes the token IDs assigned during training.
    SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        """Instantiate the tokenizer.

        Args:
            vocab_file: Accepted for signature compatibility.
            merges_file: Accepted for signature compatibility.
            tokenizer_file: Path to a serialized `tokenizers` JSON file.
            unk_token / sep_token / pad_token / cls_token / mask_token:
                Special-token strings registered with the parent class.
            **kwargs: Forwarded to ``PreTrainedTokenizerFast.__init__``
                (e.g. ``tokenizer_object=...``).

        NOTE(review): ``vocab_file`` and ``merges_file`` are accepted but
        never forwarded to the parent class — only ``tokenizer_file`` is
        used. Confirm whether slow-tokenizer loading was intended.
        """
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @classmethod
    def train_tokenizer(cls, texts: List[str], vocab_size: int = 30522, save_path: Optional[str] = None):
        """Train a BPE tokenizer on raw texts and return an `OctagonTokenizer`.

        Args:
            texts: Iterable of training strings.
            vocab_size: Target vocabulary size (default matches BERT's 30522).
            save_path: If given, the trained tokenizer JSON is written here
                and the returned instance is loaded from that file.

        Returns:
            A ready-to-use ``cls`` instance wrapping the trained tokenizer.
        """
        # Fix: set unk_token on the model itself — without it, the BPE model
        # never emits [UNK] for out-of-vocabulary characters even though the
        # trainer registers it as a special token.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Unicode-decompose, lowercase, then strip accents before tokenizing.
        tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=cls.SPECIAL_TOKENS  # kept in sync with __init__ defaults
        )
        tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            tokenizer.save(save_path)
            # Reload from disk so the instance is tied to the saved file.
            return cls(tokenizer_file=save_path)
        return cls(tokenizer_object=tokenizer)