#!/usr/bin/env bash
#
# Launches run_mlm.py under torchrun (8 processes per node) with
# --model_type bert, using the baseline BPE tokenizer and model config
# selected by ${vocab_size}.
#
# Usage: run from the directory that contains both 04.pretrain.sh and
# run_mlm.py; the two files are referenced by relative path below.
#
# Side effects:
#   - creates ${work_dir}/pretrain/models/model_${runnum}
#   - copies 04.pretrain.sh into that directory
#   - run_mlm.py is invoked with --overwrite_output_dir True

set -eo pipefail

# Make the `conda` shell function available in this non-interactive shell,
# then switch to the training environment.
source /root/miniconda3/etc/profile.d/conda.sh
conda activate bpe_v2

# nounset is switched on only after activation, in case the environment's
# activation hooks read variables that are not set.
set -u

# Vocabulary size shared by the tokenizer, model config and dataset file names.
vocab_size=5120

work_dir=/root/NaN/dna-tokenizer
data_dir="${work_dir}/dna-tokenizer-data"
cache_dir="${data_dir}/cache"

# Run identifier; used in the output directory name and the project name.
runnum="base_${vocab_size}"

output_dir="${work_dir}/pretrain/models/model_${runnum}"
mkdir -p -- "${output_dir}"

# Keep a copy of the launch script next to the model outputs.
cp -- 04.pretrain.sh "${output_dir}"

# Arguments for run_mlm.py, held in an array so every value reaches the
# program as exactly one word regardless of its content.
train_args=(
  --output_dir "${output_dir}"
  --model_type bert
  --tokenizer_name "${work_dir}/baseline_bpe/vocab_${vocab_size}/${vocab_size}_tokenizer.json"
  --config_name "${work_dir}/BPE_merge/config_${vocab_size}.json"
  --project_name "token_eval_${runnum}"
  --do_train True
  --model_max_length 512
  --max_seq_length 512
  --line_by_line True
  --pad_to_max_length True
  --train_file "${data_dir}/baseline_bpe_${vocab_size}_allchr_all_tokenized_train_chrOnly.tsv"
  --validation_file "${data_dir}/baseline_bpe_${vocab_size}_allchr_all_tokenized_val_chrOnly.tsv"
  --cache_dir "${cache_dir}"
  --use_fast_tokenizer True
  --do_eval True
  --gradient_accumulation_steps 1
  --per_device_train_batch_size 96
  --per_device_eval_batch_size 96
  --save_steps 1000
  --save_total_limit 10
  --max_steps 100000
  --logging_steps 1000
  --learning_rate 4e-5
  --adam_epsilon 1e-6
  --weight_decay 0.01
  --adam_beta1 0.9
  --adam_beta2 0.98
  --mlm_probability 0.15
  --warmup_steps 10000
  --seed 42
  --preprocessing_num_workers 8
  --overwrite_output_dir True
)

torchrun --nproc_per_node=8 run_mlm.py "${train_args[@]}"