DNABERT_save/examples/run_pretrain.sh.save
nancyH's picture
Upload folder using huggingface_hub
ab6c03c verified
# Launch DNABERT masked-LM pre-training on 6-mer tokenized DNA sequences.
# NOTE(review): the original header said "4 processes (one for each GPU)", but
# this invokes plain `python` — multi-GPU would need a distributed launcher
# (e.g. torch.distributed.launch); confirm intended launch mode.
export KMER=6
export TRAIN_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_train.txt
export TEST_FILE=/home/n5huang/dna_token/output_tokens/all_tokenized_val.txt
# TODO: replace placeholder with the actual path to the DNABERT checkout.
export SOURCE=PATH_TO_DNABERT_REPO
export OUTPUT_PATH=output$KMER

# BUG FIX: the original had inline comments after the trailing backslash
# ("... 7 \ # ADJUSTED ..."). A backslash followed by a space escapes the
# space, NOT the newline, so the continuation was broken and the command was
# silently truncated at that flag. All comments now live on their own lines.
#
# --gradient_accumulation_steps 7: adjusted for 4 GPUs (10 * 7 * 4 = 280
#   effective batch size).
# --max_steps 10000: recommended starting point for a custom dataset.
python run_pretrain.py \
    --output_dir "$OUTPUT_PATH" \
    --model_type=dna \
    --tokenizer_name="dna$KMER" \
    --config_name="$SOURCE/src/transformers/dnabert-config/bert-config-$KMER/config.json" \
    --do_train \
    --train_data_file="$TRAIN_FILE" \
    --do_eval \
    --eval_data_file="$TEST_FILE" \
    --mlm \
    --gradient_accumulation_steps 7 \
    --per_gpu_train_batch_size 10 \
    --per_gpu_eval_batch_size 6 \
    --save_steps 500 \
    --save_total_limit 20 \
    --max_steps 10000 \
    --evaluate_during_training \
    --logging_steps 500 \
    --line_by_line \
    --learning_rate 4e-4 \
    --block_size 512 \
    --adam_epsilon 1e-6 \
    --weight_decay 0.01 \
    --beta1 0.9 \
    --beta2 0.98 \
    --mlm_probability 0.025 \
    --warmup_steps 10000 \
    --overwrite_output_dir \
    --n_process 24