#!/bin/bash
#
# Launches train.py once per (seed, dataset) pair, starting every run from the
# checkpoint in $model and the tokenizer file in $tokenizer.
#
# Usage: <script> DATA_PATH LR OUTPUT_PATH PROJECT_NAME
#   DATA_PATH     directory holding one <dataset>/split subdirectory per dataset
#   LR            value passed to --learning_rate; also embedded in the run name
#   OUTPUT_PATH   value passed to --output_dir (shared by every run)
#   PROJECT_NAME  value passed to --project_name
#
# Exit status:
#   0  every training run succeeded
#   1  at least one training run failed (remaining runs are still attempted)
#   2  too few arguments
#
# `set -e` is deliberately not used: a failing dataset must not abort the rest
# of the sweep. Failures are collected and reported at the end instead.

set -u

readonly model=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/pretrain/models/model_2/checkpoint-50000
readonly tokenizer=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/hg38/tokenizer.json

# Seeds to sweep over; each seed runs every dataset group below.
seeds=(42)

# Datasets grouped by the --model_max_length they are trained with.
datasets_len_100=(
  demo_human_or_worm
  demo_coding_vs_intergenomic_seqs
  human_nontata_promoters
)
datasets_len_512=(
  drosophila_enhancers_stark
  dummy_mouse_enhancers_ensembl
)
datasets_len_250=(
  human_enhancers_ensembl
  human_enhancers_cohn
  human_ensembl_regulatory
  human_ocr_ensembl
)

# Names ("<dataset>_seed<seed>") of runs whose train.py exited non-zero.
failed_runs=()

#######################################
# Print the command-line synopsis to stderr.
#######################################
usage() {
  printf 'Usage: %s DATA_PATH LR OUTPUT_PATH PROJECT_NAME\n' "${0##*/}" >&2
}

#######################################
# Run train.py for a single dataset.
# Globals:   model, tokenizer, data_path, lr, output_path, project_name (read)
# Arguments: $1 - dataset name (subdirectory of data_path)
#            $2 - value for --model_max_length
#            $3 - random seed
# Returns:   exit status of train.py
#######################################
run_finetune() {
  local data=$1
  local max_length=$2
  local seed=$3

  # NOTE(review): every run shares the same --output_dir and passes
  # --overwrite_output_dir True; confirm train.py keeps per-run results apart.
  python train.py \
    --model_name_or_path "${model}" \
    --tokenizer_path "${tokenizer}" \
    --trust_remote_code True \
    --data_path "${data_path}/${data}/split" \
    --kmer -1 \
    --run_name "hg38_BPE_${lr}_${data}_seed${seed}" \
    --model_max_length "${max_length}" \
    --per_device_train_batch_size 128 \
    --per_device_eval_batch_size 128 \
    --gradient_accumulation_steps 1 \
    --learning_rate "${lr}" \
    --num_train_epochs 3 \
    --fp16 \
    --save_steps 200 \
    --output_dir "${output_path}" \
    --evaluation_strategy steps \
    --eval_steps 200 \
    --warmup_steps 30 \
    --logging_steps 100000 \
    --overwrite_output_dir True \
    --log_level info \
    --seed "${seed}" \
    --find_unused_parameters False \
    --project_name "${project_name}"
}

#######################################
# Run train.py for each dataset of a group, recording failures.
# Globals:   failed_runs (appended to)
# Arguments: $1 - random seed
#            $2 - value for --model_max_length
#            $3… - dataset names
# Returns:   0 always; failures are reported through failed_runs
#######################################
run_group() {
  local seed=$1
  local max_length=$2
  shift 2

  local data
  for data in "$@"; do
    if ! run_finetune "${data}" "${max_length}" "${seed}"; then
      printf 'error: training failed for %s (seed %s)\n' "${data}" "${seed}" >&2
      failed_runs+=("${data}_seed${seed}")
    fi
  done
}

#######################################
# Entry point: validate arguments, then sweep all seeds and dataset groups.
# Globals:   data_path, lr, output_path, project_name (written)
#            failed_runs (reset and read)
# Arguments: DATA_PATH LR OUTPUT_PATH PROJECT_NAME (extra arguments ignored)
# Returns:   0 on success, 1 if any run failed, 2 on bad usage
#######################################
main() {
  if (( $# < 4 )); then
    usage
    return 2
  fi

  data_path=$1
  lr=$2
  output_path=$3
  project_name=$4

  echo "The provided data_path is $data_path"

  failed_runs=()
  local seed
  for seed in "${seeds[@]}"; do
    run_group "${seed}" 100 "${datasets_len_100[@]}"
    run_group "${seed}" 512 "${datasets_len_512[@]}"
    run_group "${seed}" 250 "${datasets_len_250[@]}"
  done

  if (( ${#failed_runs[@]} > 0 )); then
    printf 'error: %d run(s) failed: %s\n' \
      "${#failed_runs[@]}" "${failed_runs[*]}" >&2
    return 1
  fi
  return 0
}

# Last command on purpose: the script's exit status is main's return status.
main "$@"