base_5120 / 04.pretrain.sh
nancyH's picture
Upload folder using huggingface_hub
21666b1 verified
#!/usr/bin/env bash
# Pretraining launcher for run `base_5120`.
#
# Setup stage: activates the `bpe_v2` conda environment, derives the data,
# cache and output paths, creates the output directory, and stores a copy of
# this script inside it so the launch settings are kept with the run's output.
# The training command itself follows further down in this file.
set -e

# --- initialize conda ---
source /root/miniconda3/etc/profile.d/conda.sh
# --- activate env ---
conda activate bpe_v2

# Stricter checks are enabled only after activation: the conda hooks are
# external code and may read unset variables (assumption -- confirm before
# moving this line above the activation).
set -uo pipefail

# --- paths ---
work_dir=/root/NaN/dna-tokenizer
data_dir="${work_dir}/dna-tokenizer-data"
cache_dir="${data_dir}/cache"

# --- run identity ---
# runnum=len2_2048
runnum=base_5120

# Define the output path once and reuse it, so the directory that is created
# is always the directory that is passed on.
output_dir="${work_dir}/pretrain/models/model_${runnum}"
mkdir -p -- "${output_dir}"

# --- select GPU (optional manual overrides, currently disabled) ---
# export CUDA_VISIBLE_DEVICES=0
# export WORLD_SIZE=1
# export RANK=0
# export LOCAL_RANK=0
# export MASTER_ADDR=127.0.0.1
# export MASTER_PORT=29500

# Snapshot the script that is actually running rather than a file that happens
# to be called 04.pretrain.sh in the current directory. The literal name is
# kept as a fallback for the case where bash does not know the script path
# (e.g. the script is fed through stdin).
script_path="${BASH_SOURCE[0]:-04.pretrain.sh}"
cp -- "${script_path}" "${output_dir}/"
# Launch pretraining: torchrun is asked for 8 worker processes on this node
# (--nproc_per_node=8), each executing run_mlm.py with the arguments below.
# run_mlm.py is resolved relative to the current working directory.
#
# The arguments are collected in an array so that every path expansion is
# quoted and comments can sit beside the flags; the order of the arguments is
# the same as it would be on a single command line.
# Uses work_dir, data_dir, cache_dir and output_dir defined earlier in this file.
train_args=(
  # --- model, tokenizer, run name ---
  --output_dir "${output_dir}"
  --model_type bert
  --tokenizer_name "${work_dir}/baseline_bpe/vocab_5120/5120_tokenizer.json"
  --config_name "${work_dir}/BPE_merge/config_5120.json"
  --project_name token_eval_base_5120
  --do_train True

  # --- sequence handling ---
  --model_max_length 512
  --max_seq_length 512
  --line_by_line True
  --pad_to_max_length True

  # --- datasets and cache ---
  --train_file "${data_dir}/baseline_bpe_5120_allchr_all_tokenized_train_chrOnly.tsv"
  --validation_file "${data_dir}/baseline_bpe_5120_allchr_all_tokenized_val_chrOnly.tsv"
  --cache_dir "${cache_dir}"
  --use_fast_tokenizer True
  --do_eval True

  # --- batching ---
  --gradient_accumulation_steps 1
  --per_device_train_batch_size 96
  --per_device_eval_batch_size 96

  # --- checkpointing, run length, logging ---
  --save_steps 1000
  --save_total_limit 10
  --max_steps 100000
  --logging_steps 1000

  # --- optimizer and masking ---
  --learning_rate 4e-5
  --adam_epsilon 1e-6
  --weight_decay 0.01
  --adam_beta1 0.9
  --adam_beta2 0.98
  --mlm_probability 0.15
  --warmup_steps 10000

  # --- reproducibility and preprocessing ---
  --seed 42
  --preprocessing_num_workers 8
  --overwrite_output_dir True

  # Disabled resume flag, kept for reference.
  # NOTE(review): the path below points into model_base_2048, while every
  # other name in this command is base_5120 -- update it before enabling.
  # --resume_from_checkpoint /root/NaN/dna-tokenizer/pretrain/models/model_base_2048/checkpoint-37000
)

# Last command of the script: its exit status becomes the script's exit status.
torchrun --nproc_per_node=8 run_mlm.py "${train_args[@]}"