File size: 3,948 Bytes
0dbbebb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/bin/bash

# ---------------------------------------------------------------------------
# Run configuration.
#
# Positional arguments:
#   $1  data_path     root directory that holds one sub-directory per dataset
#   $2  lr            learning rate handed to train.py
#   $3  output_path   directory handed to train.py as its output dir
#   $4  project_name  project name handed to train.py
# ---------------------------------------------------------------------------
data_path="$1"
lr="$2"
output_path="$3"
project_name="$4"

# NOTE(review): vocab is not read anywhere in the visible part of this script.
vocab="117M"

# Checkpoint to fine-tune from and the tokenizer definition to pair with it.
tokenizer="/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/hg38/tokenizer.json"
model="/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/pretrain/models/model_2/checkpoint-50000"

printf 'The provided data_path is %s\n' "$data_path"

# Full list of dataset names. The training loops further down spell out their
# own subsets rather than iterating over this array, so it currently serves as
# a reference list only.
datasets=()
datasets+=(demo_human_or_worm dummy_mouse_enhancers_ensembl human_enhancers_ensembl)
datasets+=(human_nontata_promoters demo_coding_vs_intergenomic_seqs drosophila_enhancers_stark)
datasets+=(human_enhancers_cohn human_ensembl_regulatory human_ocr_ensembl)


#######################################
# Launch one fine-tuning run of train.py for a single dataset.
# Every expansion is quoted so paths containing spaces or glob characters
# reach train.py as a single argument.
# Globals (read): model, tokenizer, data_path, lr, output_path, project_name
# Arguments:
#   $1 - dataset name; data is read from <data_path>/<dataset>/split
#   $2 - value passed as --model_max_length
#   $3 - random seed (also embedded in the run name)
# Returns: the exit status of the python invocation
#######################################
run_finetune() {
    local dataset=$1
    local max_length=$2
    local run_seed=$3

    # NOTE(review): every run is given the same --output_dir together with
    # --overwrite_output_dir True. Confirm that train.py separates runs
    # (e.g. by run name); otherwise later runs may overwrite earlier outputs.
    python train.py \
        --model_name_or_path "${model}" \
        --tokenizer_path "${tokenizer}" \
        --trust_remote_code True \
        --data_path "${data_path}/${dataset}/split" \
        --kmer -1 \
        --run_name "hg38_BPE_${lr}_${dataset}_seed${run_seed}" \
        --model_max_length "${max_length}" \
        --per_device_train_batch_size 128 \
        --per_device_eval_batch_size 128 \
        --gradient_accumulation_steps 1 \
        --learning_rate "${lr}" \
        --num_train_epochs 3 \
        --fp16 \
        --save_steps 200 \
        --output_dir "${output_path}" \
        --evaluation_strategy steps \
        --eval_steps 200 \
        --warmup_steps 30 \
        --logging_steps 100000 \
        --overwrite_output_dir True \
        --log_level info \
        --seed "${run_seed}" \
        --find_unused_parameters False \
        --project_name "${project_name}"
}

#######################################
# Fine-tune on several datasets that share one --model_max_length.
# A failing run is reported on stderr and counted, but the remaining
# datasets are still attempted.
# Globals (written): failed_runs
# Arguments:
#   $1     - value passed as --model_max_length
#   $2     - random seed
#   $3...  - dataset names
#######################################
finetune_group() {
    local max_length=$1
    local run_seed=$2
    shift 2

    local dataset
    local status
    for dataset in "$@"; do
        status=0
        run_finetune "${dataset}" "${max_length}" "${run_seed}" || status=$?
        if (( status != 0 )); then
            printf 'warning: train.py exited with status %d for dataset %s (seed %s)\n' \
                "${status}" "${dataset}" "${run_seed}" >&2
            failed_runs=$((failed_runs + 1))
        fi
    done
}

failed_runs=0

for seed in 42; do
    # length all 200, 251
    finetune_group 100 "${seed}" \
        demo_human_or_worm \
        demo_coding_vs_intergenomic_seqs \
        human_nontata_promoters

    # length mostly 2000, 3000~4000
    finetune_group 512 "${seed}" \
        drosophila_enhancers_stark \
        dummy_mouse_enhancers_ensembl

    # length usually 200~700
    finetune_group 250 "${seed}" \
        human_enhancers_ensembl \
        human_enhancers_cohn \
        human_ensembl_regulatory \
        human_ocr_ensembl
done

if (( failed_runs > 0 )); then
    printf 'error: %d fine-tuning run(s) failed\n' "${failed_runs}" >&2
fi
# Leave a non-zero status behind when any run failed, so that a caller
# (e.g. a job scheduler) can detect a partially failed sweep.
(( failed_runs == 0 ))