```bash
# Pretrain on 4 GPUs. As written this launches a single process that uses
# all visible GPUs; a one-process-per-GPU variant is sketched after the block.
export KMER=6
export TRAIN_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_train.txt
export TEST_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_val.txt
export SOURCE=PATH_TO_DNABERT_REPO
export OUTPUT_PATH=output$KMER
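
# Optional sanity check (a sketch, not part of the original recipe):
# --line_by_line expects one pre-tokenized sequence per line, i.e.
# space-separated k-mers. Peek at the file and verify token lengths on a
# small sample before committing to a long run.
head -n 2 $TRAIN_FILE
awk -v k=$KMER 'NR > 1000 { exit } { for (i = 1; i <= NF; i++) if (length($i) != k) { print "unexpected token on line " NR ": " $i; exit 1 } }' $TRAIN_FILE \
    && echo "first 1000 lines look like valid $KMER-mers"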

# gradient_accumulation_steps below is adjusted for 4 GPUs:
# effective batch = 10 per GPU x 7 accumulation steps x 4 GPUs = 280.
# max_steps 10000 is a recommended starting point for a custom dataset. Note
# that warmup_steps is also 10000, so the learning rate warms up for the
# whole run; scale it down (e.g. to a few percent of max_steps) when
# training this briefly. (Comments cannot follow a line-continuation
# backslash in bash, so these notes sit above the command.)
python run_pretrain.py \
    --output_dir $OUTPUT_PATH \
    --model_type=dna \
    --tokenizer_name=dna$KMER \
    --config_name=$SOURCE/src/transformers/dnabert-config/bert-config-$KMER/config.json \
    --do_train \
    --train_data_file=$TRAIN_FILE \
    --do_eval \
    --eval_data_file=$TEST_FILE \
    --mlm \
    --gradient_accumulation_steps 7 \
    --per_gpu_train_batch_size 10 \
    --per_gpu_eval_batch_size 6 \
    --save_steps 500 \
    --save_total_limit 20 \
    --max_steps 10000 \
    --evaluate_during_training \
    --logging_steps 500 \
    --line_by_line \
    --learning_rate 4e-4 \
    --block_size 512 \
    --adam_epsilon 1e-6 \
    --weight_decay 0.01 \
    --beta1 0.9 \
    --beta2 0.98 \
    --mlm_probability 0.025 \
    --warmup_steps 10000 \
    --overwrite_output_dir \
    --n_process 24
```
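
For one process per GPU instead of a single multi-GPU process, the same command can be run under PyTorch's distributed launcher. This is a sketch, assuming run_pretrain.py accepts the standard `--local_rank` argument that the upstream Hugging Face example scripts expose; per-GPU batch size and gradient accumulation then apply per process, so the effective batch stays 10 x 7 x 4 = 280.

```bash
# One process per GPU (sketch; assumes run_pretrain.py handles --local_rank).
# "..." stands for the same flag list as the single-process command above.
python -m torch.distributed.launch --nproc_per_node=4 run_pretrain.py ...
```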