File size: 3,948 Bytes
0dbbebb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | #!/bin/bash
# Fine-tuning sweep: runs train.py once for every (seed, dataset) pair, passing
# $model as --model_name_or_path and $tokenizer as --tokenizer_path.
#
# Usage: <script> DATA_PATH LR OUTPUT_PATH PROJECT_NAME
#   DATA_PATH     directory with one sub-directory per dataset; each run is
#                 given DATA_PATH/<dataset>/split as --data_path
#   LR            forwarded as --learning_rate and embedded in --run_name
#   OUTPUT_PATH   forwarded as --output_dir (the same value for every run)
#   PROJECT_NAME  forwarded as --project_name
#
# Exit status: 2 on a usage error, 1 if at least one train.py run failed,
# 0 otherwise.
#
# `set -e` is intentionally not enabled: a failing run must not prevent the
# remaining datasets from being trained. Failures are collected and reported
# once the sweep has finished.

usage() {
  printf 'Usage: %s DATA_PATH LR OUTPUT_PATH PROJECT_NAME\n' "${0##*/}" >&2
  exit 2
}

# Every positional argument becomes the value of a train.py option, so all
# four must be present and non-empty.
[[ -n "${1:-}" && -n "${2:-}" && -n "${3:-}" && -n "${4:-}" ]] || usage

data_path=$1
lr=$2
output_path=$3
project_name=$4

readonly model=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/pretrain/models/model_2/checkpoint-50000
readonly tokenizer=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/hg38/tokenizer.json

printf 'The provided data_path is %s\n' "$data_path"

# Seeds to sweep over; one complete pass over all datasets is made per seed.
seeds=(42)

# Labels ("<dataset>_seed<seed>") of runs whose train.py exited non-zero.
failed_runs=()

#######################################
# Launch a single train.py run.
# Globals:   model, tokenizer, data_path, lr, output_path, project_name (read)
# Arguments: $1 - seed
#            $2 - value for --model_max_length
#            $3 - dataset name (sub-directory of data_path)
# Returns:   exit status of train.py
#######################################
train_one() {
  local seed=$1
  local max_length=$2
  local data=$3

  # NOTE(review): every run receives the same --output_dir together with
  # --overwrite_output_dir True; confirm that train.py keeps the results of
  # successive runs apart (e.g. by run name).
  python train.py \
    --model_name_or_path "${model}" \
    --tokenizer_path "${tokenizer}" \
    --trust_remote_code True \
    --data_path "${data_path}/${data}/split" \
    --kmer -1 \
    --run_name "hg38_BPE_${lr}_${data}_seed${seed}" \
    --model_max_length "${max_length}" \
    --per_device_train_batch_size 128 \
    --per_device_eval_batch_size 128 \
    --gradient_accumulation_steps 1 \
    --learning_rate "${lr}" \
    --num_train_epochs 3 \
    --fp16 \
    --save_steps 200 \
    --output_dir "${output_path}" \
    --evaluation_strategy steps \
    --eval_steps 200 \
    --warmup_steps 30 \
    --logging_steps 100000 \
    --overwrite_output_dir True \
    --log_level info \
    --seed "${seed}" \
    --find_unused_parameters False \
    --project_name "${project_name}"
}

#######################################
# Train every dataset of one group with a shared --model_max_length.
# Globals:   failed_runs (appended to when a run fails)
# Arguments: $1 - seed
#            $2 - value for --model_max_length
#            $3... - dataset names
# Returns:   0; failures are recorded in failed_runs instead of aborting
#######################################
train_group() {
  local seed=$1
  local max_length=$2
  shift 2

  local data
  for data in "$@"; do
    if ! train_one "$seed" "$max_length" "$data"; then
      printf 'warning: train.py failed for %s (seed %s)\n' "$data" "$seed" >&2
      failed_runs+=("${data}_seed${seed}")
    fi
  done
}

for seed in "${seeds[@]}"; do
  # length all 200, 251
  train_group "$seed" 100 \
    demo_human_or_worm \
    demo_coding_vs_intergenomic_seqs \
    human_nontata_promoters

  # length mostly 2000, 3000~4000
  train_group "$seed" 512 \
    drosophila_enhancers_stark \
    dummy_mouse_enhancers_ensembl

  # length usually 200~700
  train_group "$seed" 250 \
    human_enhancers_ensembl \
    human_enhancers_cohn \
    human_ensembl_regulatory \
    human_ocr_ensembl
done

# Surface failures that would otherwise be hidden behind a later successful run.
if (( ${#failed_runs[@]} > 0 )); then
  printf 'error: %d run(s) failed: %s\n' \
    "${#failed_runs[@]}" "${failed_runs[*]}" >&2
  exit 1
fi
|