#!/bin/bash
#
# Runs train.py once per (seed, dataset) pair, using the checkpoint and
# tokenizer configured below. Datasets are grouped by the
# --model_max_length value used for them.
#
# Usage: <script> DATA_PATH LR OUTPUT_PATH PROJECT_NAME
#   DATA_PATH     each run reads DATA_PATH/<dataset>/split
#   LR            passed to --learning_rate and embedded in the run name
#   OUTPUT_PATH   passed to --output_dir for every run
#   PROJECT_NAME  passed to --project_name for every run
#
# Exit status: 0 if every run succeeded, 1 if at least one run failed,
# 2 on a usage error.
#
# NOTE(review): all runs share OUTPUT_PATH and pass
# --overwrite_output_dir True, so later runs presumably overwrite the
# checkpoints of earlier ones -- confirm this is intended.

set -u

readonly MODEL=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/pretrain/models/model_2/checkpoint-50000
readonly TOKENIZER=/storage2/fs1/btc/Active/yeli/xiaoxiao.zhou/tokenize/tokenizers/DNAbert2/hg38/tokenizer.json
readonly -a SEEDS=(42)

#######################################
# Print a usage message to stderr.
#######################################
usage() {
  printf 'Usage: %s DATA_PATH LR OUTPUT_PATH PROJECT_NAME\n' "${0##*/}" >&2
}

#######################################
# Run train.py for each given dataset with one seed and one max length.
# A failing run is reported on stderr and counted; the remaining datasets
# are still processed.
# Globals:
#   MODEL, TOKENIZER (read)
#   data_path, lr, output_path, project_name (read; set by main)
#   failed_count (incremented once per failed run)
# Arguments:
#   $1  - seed, passed to --seed and embedded in the run name
#   $2  - value for --model_max_length
#   $3+ - dataset names
#######################################
run_group() {
  local -r seed=$1 max_length=$2
  shift 2

  local data run_name
  local -a train_args
  for data in "$@"; do
    run_name="hg38_BPE_${lr}_${data}_seed${seed}"
    train_args=(
      --model_name_or_path "${MODEL}"
      --tokenizer_path "${TOKENIZER}"
      --trust_remote_code True
      --data_path "${data_path}/${data}/split"
      --kmer -1
      --run_name "${run_name}"
      --model_max_length "${max_length}"
      --per_device_train_batch_size 128
      --per_device_eval_batch_size 128
      --gradient_accumulation_steps 1
      --learning_rate "${lr}"
      --num_train_epochs 3
      --fp16
      --save_steps 200
      --output_dir "${output_path}"
      --evaluation_strategy steps
      --eval_steps 200
      --warmup_steps 30
      --logging_steps 100000
      --overwrite_output_dir True
      --log_level info
      --seed "${seed}"
      --find_unused_parameters False
      --project_name "${project_name}"
    )
    if ! python train.py "${train_args[@]}"; then
      printf 'Training run failed: %s\n' "${run_name}" >&2
      failed_count=$((failed_count + 1))
    fi
  done
}

#######################################
# Validate the command line and launch every training run.
# Globals:
#   SEEDS (read)
#   data_path, lr, output_path, project_name, failed_count (written)
# Arguments:
#   $1 - DATA_PATH, $2 - LR, $3 - OUTPUT_PATH, $4 - PROJECT_NAME
#   (any further arguments are ignored)
# Returns:
#   0 if all runs succeeded, 1 if any run failed, 2 on a usage error
#######################################
main() {
  if (($# < 4)); then
    usage
    return 2
  fi

  data_path=$1
  lr=$2
  output_path=$3
  project_name=$4
  failed_count=0

  printf '%s\n' "The provided data_path is $data_path"

  local seed
  for seed in "${SEEDS[@]}"; do
    # length all 200, 251
    run_group "${seed}" 100 \
      demo_human_or_worm \
      demo_coding_vs_intergenomic_seqs \
      human_nontata_promoters

    # length mostly 2000, 3000~4000
    run_group "${seed}" 512 \
      drosophila_enhancers_stark \
      dummy_mouse_enhancers_ensembl

    # length usually 200~700
    run_group "${seed}" 250 \
      human_enhancers_ensembl \
      human_enhancers_cohn \
      human_ensembl_regulatory \
      human_ocr_ensembl
  done

  if ((failed_count > 0)); then
    printf '%d training run(s) failed\n' "${failed_count}" >&2
    return 1
  fi
  return 0
}

main "$@"