| from typing import List |
|
|
| import jieba |
| from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer |
| from tokenizers.decoders import Decoder |
| from tokenizers.models import BPE |
| from tokenizers.normalizers import Normalizer |
| from tokenizers.pre_tokenizers import PreTokenizer |
|
|
|
|
class JiebaPreTokenizer:
    """Custom pre-tokenizer: segment text with jieba, then further split
    each piece in front of every odd decimal digit.

    Intended to be wrapped with ``PreTokenizer.custom`` so the tokenizers
    pipeline calls :meth:`pre_tokenize`.
    """

    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split *normalized_string* at the word boundaries found by jieba.

        Slices are taken from the original NormalizedString so offset and
        alignment information is preserved.
        """
        # jieba.tokenize yields (word, start, stop) triples over the raw text.
        return [
            normalized_string[start:stop]
            for _word, start, stop in jieba.tokenize(str(normalized_string))
        ]

    def odd_number_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split *normalized_string* immediately before every odd digit.

        E.g. "112233" -> ["", "1", "122", "3", "3"]: each odd digit starts
        the next piece; the leading piece may be empty.
        """
        splits = []
        last = 0
        # `idx`, not `i`: the original loop variable shadowed the parameter.
        for idx, char in enumerate(str(normalized_string)):
            # isdecimal() (not isnumeric()) guarantees int(char) cannot raise:
            # numeric-but-not-decimal chars such as '½' are not valid for int().
            if char.isdecimal() and int(char) % 2 == 1:
                splits.append(normalized_string[last:idx])
                last = idx
        # Trailing piece (the whole string when no odd digit was found).
        splits.append(normalized_string[last:])
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Entry point called by the tokenizers library.

        Splits are applied in sequence: jieba word segmentation first,
        then the odd-digit split on each resulting piece.
        """
        pretok.split(self.jieba_split)
        pretok.split(self.odd_number_split)
|
|
|
|
class CustomDecoder:
    """Custom decoder: glue decoded tokens back together with no separator."""

    def decode(self, tokens: List[str]) -> str:
        """Concatenate *tokens* into a single string."""
        merged = "".join(tokens)
        return merged
|
|
|
|
class CustomNormalizer:
    """Custom normalizer: NFKC-fold, drop digits, collapse whitespace, lowercase."""

    def normalize(self, normalized: NormalizedString):
        """Normalize *normalized* in place.

        Order matters:
        1. NFKC compatibility folding (e.g. styled letters -> plain ASCII).
        2. Remove every numeric character.
        3. Collapse whitespace runs to a single space.
        4. Lowercase what remains.
        """
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        # Raw string: "\s" is an invalid escape sequence in a normal literal
        # (DeprecationWarning today, a SyntaxError in future Python versions).
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()
|
|
|
|
| |
# Assemble a BPE tokenizer that uses the custom Python components above.
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

# Chinese text: jieba segments it into words.
# NOTE: named `text` (not `input`) so the builtin is not shadowed.
text = "永和服装饰品有限公司"
print("PreTokenize:", text)
print(tok.pre_tokenizer.pre_tokenize_str(text))

# Digits only: exercises the odd-number split.
text = "112233"
print("PreTokenize:", text)
print(tok.pre_tokenizer.pre_tokenize_str(text))

# Mathematical-alphanumeric glyphs: NFKC folds them to plain ASCII, the
# digits are removed, whitespace is collapsed, and the result lowercased.
text = "1234 ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", text)
print(tok.normalizer.normalize_str(text))
| |
|
|