```bash
# Launch with 4 processes (one for each GPU)
export KMER=6
export TRAIN_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_train.txt
export TEST_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_val.txt
export SOURCE=PATH_TO_DNABERT_REPO
export OUTPUT_PATH=output$KMER

# --gradient_accumulation_steps is adjusted for 4 GPUs:
#   effective batch size = 10 (per-GPU batch) * 7 (accumulation) * 4 (GPUs) = 280
# --max_steps 10000 is a recommended starting point for a custom dataset
python run_pretrain.py \
    --output_dir $OUTPUT_PATH \
    --model_type=dna \
    --tokenizer_name=dna$KMER \
    --config_name=$SOURCE/src/transformers/dnabert-config/bert-config-$KMER/config.json \
    --do_train \
    --train_data_file=$TRAIN_FILE \
    --do_eval \
    --eval_data_file=$TEST_FILE \
    --mlm \
    --gradient_accumulation_steps 7 \
    --per_gpu_train_batch_size 10 \
    --per_gpu_eval_batch_size 6 \
    --save_steps 500 \
    --save_total_limit 20 \
    --max_steps 10000 \
    --evaluate_during_training \
    --logging_steps 500 \
    --line_by_line \
    --learning_rate 4e-4 \
    --block_size 512 \
    --adam_epsilon 1e-6 \
    --weight_decay 0.01 \
    --beta1 0.9 \
    --beta2 0.98 \
    --mlm_probability 0.025 \
    --warmup_steps 10000 \
    --overwrite_output_dir \
    --n_process 24
```

The inline notes have been moved above the command: bash does not allow `#` comments between backslash-continued lines, so as originally written everything after the first inline comment would have been silently dropped from the command.
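One caution on the schedule: `--warmup_steps 10000` equals `--max_steps 10000`. Assuming `run_pretrain.py` keeps the linear warmup-then-decay learning-rate schedule of the HuggingFace language-modeling example it derives from, the learning rate would ramp up for the entire run and never enter its decay phase. If you shorten training, consider scaling `--warmup_steps` down as well (e.g. to 5–10% of `--max_steps`).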
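Also note that the leading comment mentions one process per GPU, but a bare `python run_pretrain.py` invocation normally starts a single process that spreads batches over all visible GPUs via `torch.nn.DataParallel`. If you actually want one worker process per GPU, a `torch.distributed.launch` invocation along the following lines is the usual pattern. Treat it as a sketch, not a drop-in command: it assumes `run_pretrain.py` accepts the `--local_rank` argument that the launcher injects into each worker (the HuggingFace example scripts DNABERT builds on do).

```bash
# Sketch only: one worker process per GPU via torch.distributed.launch.
# Assumes run_pretrain.py handles the --local_rank argument injected by the launcher;
# all other flags stay exactly as in the command above.
export CUDA_VISIBLE_DEVICES=0,1,2,3   # pin training to the four target GPUs

python -m torch.distributed.launch --nproc_per_node=4 run_pretrain.py \
    --output_dir $OUTPUT_PATH \
    --model_type=dna \
    --tokenizer_name=dna$KMER \
    --do_train --mlm   # ...plus the remaining flags from the command above
```

With one process per GPU, each worker sees a single device, so the effective batch size is still 10 (per-GPU batch) * 7 (accumulation) * 4 (workers) = 280, matching the note in the original command.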