from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    Regex,
)
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizerBase
from tqdm import tqdm
# The corpus is expected as locally downloaded parquet files under
# Mxode/IndustryCorpus-Subset-zh-en (a bilingual zh/en pretraining subset).
dataset = load_dataset(
    "parquet", data_dir="Mxode/IndustryCorpus-Subset-zh-en", split="train")
dataset = dataset.shuffle(seed=3407)

# Slicing a Dataset returns a plain dict of column lists: the first 1M texts
# are used for training, the last 10k for evaluation.
ds = dataset[:1000000]
ds_val = dataset[-10000:]

# Total character count of the validation texts; used as the compression baseline.
char_len = sum(len(x) for x in ds_val['text'])


def get_training_corpus():
    # Stream the training texts in batches of 1000. ds is a dict, so the row
    # count must be taken from the "text" column, not from the dict itself.
    for i in range(0, len(ds["text"]), 1000):
        yield ds["text"][i: i + 1000]
def train():
    tokenizer = Tokenizer(models.BPE())
    # Unicode NFC normalization only; no lowercasing or accent stripping.
    tokenizer.normalizer = normalizers.NFC()
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        # GPT-4-style split pattern: English contractions, letter runs,
        # single digits, punctuation runs, and whitespace are isolated
        # before any BPE merges are learned.
        pre_tokenizers.Split(
            pattern=Regex(
                "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"),
            behavior="isolated",
            invert=False,
        ),
        # ByteLevel runs after the custom split above, so its built-in GPT-2
        # regex is disabled (use_regex=False) to avoid splitting twice.
        pre_tokenizers.ByteLevel(
            add_prefix_space=False,
            use_regex=False,
            trim_offsets=False
        )
    ])
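    # A hedged illustration (assumed mapping, not output captured from this
    # run): the Split rule isolates pieces first, then ByteLevel rewrites
    # each piece into printable byte symbols, e.g.
    #   "Hello 世界" -> ["Hello", " 世界"] -> ["Hello", "Ġä¸ĸçķĮ"]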
    trainer = trainers.BpeTrainer(
        vocab_size=16000,
        special_tokens=["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    )
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # Mirror the pre-tokenizer's byte-level settings in the post-processor
    # and decoder so encoding and decoding stay exact inverses.
    tokenizer.post_processor = processors.ByteLevel(
        add_prefix_space=False,
        use_regex=False,
        trim_offsets=False
    )
    tokenizer.decoder = decoders.ByteLevel(
        add_prefix_space=False,
        use_regex=False,
        trim_offsets=False
    )
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<|endoftext|>",
        eos_token="<|im_end|>",
        pad_token="<|endoftext|>",
        model_max_length=4096,
        clean_up_tokenization_spaces=False,
        errors="replace",
        split_special_tokens=False,
    )
    # ChatML-style template: every turn is wrapped in <|im_start|>/<|im_end|>,
    # and a default system prompt is injected when the conversation does not
    # begin with a system message.
    wrapped_tokenizer.chat_template = """{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"""
    wrapped_tokenizer.save_pretrained(
        'Mxode/Bilingual-Tokenizer/BilingualTokenizer-16K')
    return wrapped_tokenizer
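

def sanity_check(tokenizer: PreTrainedTokenizerBase):
    # Hedged sketch (not part of the original recipe): byte-level BPE should
    # round-trip arbitrary mixed-language text losslessly, so an exact decode
    # match is a cheap correctness check before the full evaluation.
    sample = "Hello, world! 你好，世界。"
    ids = tokenizer(sample, return_tensors=None)['input_ids']
    decoded = tokenizer.decode(ids)
    assert decoded == sample, f'round-trip mismatch: {decoded!r}'
    print(f'{len(sample)} chars -> {len(ids)} tokens')
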
def evaluate(tokenizer: PreTrainedTokenizerBase):
    # Tokens per character over the validation set, as a percentage; a lower
    # rate means the tokenizer compresses the bilingual text better.
    def get_compress_len(tokenizer):
        return sum(len(tokenizer(x, return_tensors=None)['input_ids'])
                   for x in tqdm(ds_val['text']))

    compress_len = get_compress_len(tokenizer)
    compression_rate = compress_len / char_len * 100
    print(f'vocab size: {len(tokenizer):<8} compression rate: {compression_rate:.2f}%')


if __name__ == "__main__":
    tokenizer = train()
    evaluate(tokenizer)
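    # Hedged additions (not in the original script): run the round-trip check
    # and render a sample conversation with the ChatML-style template.
    sanity_check(tokenizer)
    messages = [{"role": "user", "content": "Hello, 你好"}]
    print(tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True))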