#!/usr/bin/env bash
#
# Run preprocess_data.py over a raw JSON corpus using the lower-cased BERT
# WordPiece tokenizer, with sentence splitting enabled.
#
# Usage:
#   <this-script> [INPUT_JSON]
#
# Arguments:
#   INPUT_JSON  Optional path to the JSON file to preprocess. Defaults to the
#               validation split under /mnt/nvme0 (see INPUT below).
#
# Requirements (resolved relative to the current working directory):
#   preprocess_data.py  the preprocessing entry point
#   bert-vocab.txt      the vocabulary file passed via --vocab
#
# Output is written using the prefix "my-bert" (see --output-prefix).
#
# NOTE(review): the option spellings (--vocab, --worker) are kept exactly as
# they were; confirm they match the argument parser of the preprocess_data.py
# version in use.

# Abort on the first failing command and on use of an unset variable.
set -eu

# Inputs that were used with this script previously (kept for reference):
#   roberta_train_data_raw/valid.json
#   roberta_train_data_raw/train_1g.json
#   /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json
INPUT="${1:-/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json}"

# Fail early with a clear message instead of deep inside the Python tool.
if [ ! -r "${INPUT}" ]; then
  printf 'error: input file is not readable: %s\n' "${INPUT}" >&2
  exit 1
fi

# INPUT is quoted so paths containing spaces or glob characters stay one word.
python preprocess_data.py \
  --input "${INPUT}" \
  --output-prefix my-bert \
  --vocab bert-vocab.txt \
  --dataset-impl mmap \
  --worker 1 \
  --chunk-size 1 \
  --tokenizer-type BertWordPieceLowerCase \
  --split-sentences