from transformers import BertTokenizer, WordpieceTokenizer
# NOTE: recent transformers releases may not re-export WordpieceTokenizer at the
# top level; in that case, import it from transformers.models.bert.tokenization_bert.
from unicodedata import normalize

def whitespace_tokenize(text):
    """Run basic whitespace splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

class KorWordpieceTokenizer(WordpieceTokenizer):
    """WordPiece tokenizer for the KorBERT vocabulary.

    Unlike the stock BERT WordpieceTokenizer, continuation pieces are not
    prefixed with '##'; the vocabulary instead marks word boundaries with a
    trailing '_' (appended in KorBertTokenizer._tokenize below).
    """

    def tokenize(self, text):
        output_tokens = []
        for token in whitespace_tokenize(text):
            # Normalise to NFC so composed Hangul syllables match the vocabulary.
            chars = list(normalize('NFC', token))
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                # Greedy longest match: shrink the window from the right until
                # the substring is found in the vocabulary.
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

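
# A minimal sketch of the greedy longest match, assuming a hypothetical toy
# vocabulary {"한국": 0, "어_": 1, "[UNK]": 2} (the real vocabulary ships with
# the KorBERT checkpoint):
#
#   KorWordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]").tokenize("한국어_")
#   # -> ['한국', '어_']   ("한국어_" and "한국어" miss, then "한국" and "어_" match)
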
class KorBertTokenizer(BertTokenizer):

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 tokenize_chinese_chars=True,
                 strip_accents=None,
                 **kwargs):
        # Forward the actual arguments rather than hard-coded defaults so that
        # options such as do_lower_case and never_split are respected.
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         do_basic_tokenize=do_basic_tokenize,
                         never_split=never_split,
                         unk_token=unk_token,
                         sep_token=sep_token,
                         pad_token=pad_token,
                         cls_token=cls_token,
                         mask_token=mask_token,
                         tokenize_chinese_chars=tokenize_chinese_chars,
                         strip_accents=strip_accents,
                         **kwargs)
        # Swap in the KorBERT-style WordPiece tokenizer built from the loaded vocab.
        self.wordpiece_tokenizer = KorWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def _tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                if token in self.basic_tokenizer.never_split:
                    # Keep special tokens such as [CLS] and [SEP] intact.
                    split_tokens.append(token)
                else:
                    # The KorBERT vocabulary marks the end of a word with a
                    # trailing '_', so append it before the WordPiece lookup.
                    token += '_'
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens
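
# The usage sketch below is illustrative only: the toy vocabulary entries and the
# vocab-file path are assumptions for demonstration, not part of the KorBERT release.
if __name__ == "__main__":
    toy_vocab = {"한국": 0, "어_": 1, "사랑": 2, "해_": 3, "[UNK]": 4}
    wp = KorWordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
    print(wp.tokenize("한국어_ 사랑해_"))  # ['한국', '어_', '사랑', '해_']
    print(wp.tokenize("영어_"))            # ['[UNK]'] -- no matching pieces

    # Loading the full tokenizer needs the KorBERT vocabulary file; the path below
    # is a placeholder.
    # tokenizer = KorBertTokenizer("vocab.korean.rawtext.list")
    # print(tokenizer.tokenize("한국어 모델"))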