Byte-lingua-code / superbpe /tokenizers_superbpe /bindings /python /examples /train_bert_wordpiece.py
| import argparse | |
| import glob | |
| from tokenizers import BertWordPieceTokenizer | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument( | |
| "--files", | |
| default=None, | |
| metavar="path", | |
| type=str, | |
| required=True, | |
| help="The files to use as training; accept '**/*.txt' type of patterns \ | |
| if enclosed in quotes", | |
| ) | |
| parser.add_argument( | |
| "--out", | |
| default="./", | |
| type=str, | |
| help="Path to the output directory, where the files will be saved", | |
| ) | |
| parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files") | |
| args = parser.parse_args() | |
| files = glob.glob(args.files) | |
| if not files: | |
| print(f"File does not exist: {args.files}") | |
| exit(1) | |
| # Initialize an empty tokenizer | |
| tokenizer = BertWordPieceTokenizer( | |
| clean_text=True, | |
| handle_chinese_chars=True, | |
| strip_accents=True, | |
| lowercase=True, | |
| ) | |
| # And then train | |
| tokenizer.train( | |
| files, | |
| vocab_size=10000, | |
| min_frequency=2, | |
| show_progress=True, | |
| special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], | |
| limit_alphabet=1000, | |
| wordpieces_prefix="##", | |
| ) | |
| # Save the files | |
| tokenizer.save_model(args.out, args.name) | |