| import argparse |
| import glob |
| import os |
| from tokenizers import ByteLevelBPETokenizer |
| from transformers import GPT2TokenizerFast |
|
|
|
|
def _parse_args(argv=None):
    """Build the CLI parser and parse *argv*.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        A ``(parser, namespace)`` pair. The parser is returned as well so
        the caller can report later validation failures via
        ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Train a byte-level BPE tokenizer on a set of text files."
    )
    parser.add_argument(
        "--data-files",
        type=str,
        required=True,
        help="glob pattern selecting the training files",
    )
    parser.add_argument(
        "--vocab-size",
        type=int,
        required=True,
        help="vocabulary size passed to the trainer",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="directory the tokenizer files are written to (created if missing)",
    )
    parser.add_argument(
        "--output-file-name",
        type=str,
        required=True,
        help="name passed to save_model alongside the output directory",
    )
    return parser, parser.parse_args(argv)


def main(argv=None):
    """Train a ByteLevelBPETokenizer on the matched files and save it.

    Writes ``tokenizer.json`` into the output directory and additionally
    calls ``save_model`` with the output directory and ``--output-file-name``.

    Args:
        argv: Optional argument list; ``None`` uses the process arguments.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) when the
            ``--data-files`` pattern matches no files, or when argument
            parsing itself fails.
    """
    parser, args = _parse_args(argv)

    # Resolve the inputs first so a bad pattern fails fast, before a
    # tokenizer is built or an output directory is created. Sorting keeps
    # the file list (and the preview printed below) in a stable order.
    files = sorted(glob.glob(args.data_files))
    if not files:
        parser.error(
            f"no files matched --data-files pattern: {args.data_files!r}"
        )

    # Preview at most the first ten matches.
    print(files[:10])

    gpt2_tok = ByteLevelBPETokenizer(add_prefix_space=True)
    gpt2_tok.train(
        files=files,
        vocab_size=args.vocab_size,
        show_progress=True,
        special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
    )

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.output_dir, exist_ok=True)

    gpt2_tok.save(os.path.join(args.output_dir, "tokenizer.json"), pretty=True)
    # NOTE(review): the second argument is presumably used by the library as
    # a file-name prefix for the vocab/merges files -- confirm against the
    # tokenizers documentation.
    gpt2_tok.save_model(args.output_dir, args.output_file_name)


if __name__ == "__main__":
    main()
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|