#!/usr/bin/env bash
# Tokenize a JSONL corpus into Megatron-LM binary format using a HuggingFace
# tokenizer (Phi-2 paths by default).
#
# Usage: script.sh [INPUT_JSONL] [OUTPUT_PREFIX] [TOKENIZER_PATH]
#   $1  input JSONL corpus file
#   $2  output file prefix for the .bin/.idx pair
#   $3  HuggingFace tokenizer directory
set -euxo pipefail   # -e exit on error, -u unset vars, -x trace, pipefail for pipelines

readonly LLM_RECIPES_DIR=/project

# Positional arguments with the original defaults.
INPUT_JSONL_FILE_PATH=${1:-/share/yans/datasets/jsonl/llm-jp-corpus-v1/ja/ja_wiki/train_0.jsonl}
OUTPUT_FILE_PREFIX=${2:-/work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data}
TOKENIZER_PATH=${3:-/share/pretrained_lm/Phi/Phi-2}

# Make sure the output directory exists before the preprocessor writes to it.
mkdir -p -- "$(dirname -- "$OUTPUT_FILE_PREFIX")"

python "$LLM_RECIPES_DIR/megatron_lm/tools/preprocess_data.py" \
  --input "$INPUT_JSONL_FILE_PATH" \
  --output-prefix "$OUTPUT_FILE_PREFIX" \
  --tokenizer-type HFPreTrainedTokenizer \
  --tokenizer-model "$TOKENIZER_PATH" \
  --workers 32 \
  --append-eod \
  --log-interval 1000