#!/usr/bin/env bash
#
# Launch masked-language-model pretraining: runs run_mlm.py through torchrun
# with 8 processes per node, writing checkpoints to
# ${work_dir}/pretrain/models/model_${runnum}.
#
# Expected to be started from the directory that contains both
# 04.pretrain.sh and run_mlm.py (both are referenced by relative path).

set -e
set -o pipefail

# --- initialize conda ---
source /root/miniconda3/etc/profile.d/conda.sh

# --- activate env ---
conda activate bpe_v2

# nounset is switched on only after activation: conda's activation hooks may
# read variables that are not set, which would abort the script under `set -u`.
set -u

# --- paths ---
work_dir=/root/NaN/dna-tokenizer
data_dir="${work_dir}/dna-tokenizer-data"
cache_dir="${data_dir}/cache"

# --- run identifier (selects the output directory) ---
# runnum=len2_2048
runnum=base_5120

output_dir="${work_dir}/pretrain/models/model_${runnum}"
mkdir -p -- "${output_dir}"

# --- select GPU / manual distributed settings (disabled; torchrun sets these) ---
# export CUDA_VISIBLE_DEVICES=0
# export WORLD_SIZE=1
# export RANK=0
# export LOCAL_RANK=0
# export MASTER_ADDR=127.0.0.1
# export MASTER_PORT=29500

# Keep a copy of the launch script next to the checkpoints so the exact
# settings used for this run can be recovered later.
cp -- 04.pretrain.sh "${output_dir}"

# Arguments for run_mlm.py, kept in an array so every value is passed as
# exactly one word regardless of its content.
train_args=(
  --output_dir "${output_dir}"
  --model_type bert
  --tokenizer_name "${work_dir}/baseline_bpe/vocab_5120/5120_tokenizer.json"
  --config_name "${work_dir}/BPE_merge/config_5120.json"
  --project_name token_eval_base_5120
  --do_train True
  --model_max_length 512
  --max_seq_length 512
  --line_by_line True
  --pad_to_max_length True
  --train_file "${data_dir}/baseline_bpe_5120_allchr_all_tokenized_train_chrOnly.tsv"
  --validation_file "${data_dir}/baseline_bpe_5120_allchr_all_tokenized_val_chrOnly.tsv"
  --cache_dir "${cache_dir}"
  --use_fast_tokenizer True
  --do_eval True
  --gradient_accumulation_steps 1
  --per_device_train_batch_size 96
  --per_device_eval_batch_size 96
  --save_steps 1000
  --save_total_limit 10
  --max_steps 100000
  --logging_steps 1000
  --learning_rate 4e-5
  --adam_epsilon 1e-6
  --weight_decay 0.01
  --adam_beta1 0.9
  --adam_beta2 0.98
  --mlm_probability 0.15
  --warmup_steps 10000
  --seed 42
  --preprocessing_num_workers 8
  --overwrite_output_dir True
  # To continue an earlier run, add for example:
  # --resume_from_checkpoint /root/NaN/dna-tokenizer/pretrain/models/model_base_2048/checkpoint-37000
)

torchrun --nproc_per_node=8 run_mlm.py "${train_args[@]}"