#!/usr/bin/env bash
#
# Sweep driver: for every (model, tokenizer, name) triple and every seed,
# launch train.py through torchrun on a list of datasets.
#
# Usage:
#   <script> DATA_PATH LR OUTPUT_ROOT PROJECT_NAME
#
#   DATA_PATH     directory that contains <dataset>/split for each dataset
#   LR            value forwarded to --learning_rate (also part of the run name)
#   OUTPUT_ROOT   runs are written to OUTPUT_ROOT/<dataset>/<model_name>
#   PROJECT_NAME  value forwarded to --project_name
#
# Environment:
#   MASTER_PORT   port passed to torchrun --master_port (default: 29500)
#
# Exit status:
#   0  every launched run succeeded
#   1  configuration error, environment setup failure, or at least one run failed
#   2  wrong command-line usage
#
# NOTE: `set -u` is intentionally not enabled; the conda scripts sourced below
# are third-party code that this script does not control.

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

usage() {
  printf 'Usage: %s DATA_PATH LR OUTPUT_ROOT PROJECT_NAME\n' "${0##*/}" >&2
  exit 2
}

# --- arguments ---
(( $# >= 4 )) || usage
data_path=$1
lr=$2
output_root=$3
project_name=$4

# An empty OUTPUT_ROOT would make mkdir create directories under "/", and an
# empty LR / PROJECT_NAME would silently drop a flag value, so reject them.
for required in data_path lr output_root project_name; do
  [[ -n "${!required}" ]] || die "Argument '${required}' must not be empty"
done

# --- initialize conda ---
# shellcheck disable=SC1091 -- file lives outside the repository
source /root/miniconda3/etc/profile.d/conda.sh \
  || die "Cannot source /root/miniconda3/etc/profile.d/conda.sh"

# --- activate env ---
conda activate bpe_v2 || die "Cannot activate conda environment bpe_v2"

# Pin the job to a single GPU (hard-coded device index).
export CUDA_VISIBLE_DEVICES=6

# Model / tokenizer pairs to sweep.
# MODELS, TOKENIZERS and MODEL_NAMES are parallel arrays: entry i of each
# array describes the same configuration.
MODELS=(
  # "/home/n5huang/dna_token/pretrain/models/model_cpu_test_1/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_reg/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_2/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_tfidf/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/len2_2048/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/len2_3072/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_3072/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000"
)

TOKENIZERS=(
  # "/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/tokenizer.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len2.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_tf_idf.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_2048/2048_tokenizer.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  "/root/NaN/dna-tokenizer/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_3072/3072_tokenizer.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_4096/4096_tokenizer.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_4096/merge_tokenizer_unigram_len2.json"
)

# MODEL_NAMES=("len2_2048")
MODEL_NAMES=("len2_3072")
# MODEL_NAMES=("base_2048" "len2_2048" "len2_3072" "base_3072" "base_4096" "len2_4096")

SEEDS=(42)

if (( ${#MODELS[@]} != ${#TOKENIZERS[@]} || ${#MODELS[@]} != ${#MODEL_NAMES[@]} )); then
  die "MODELS, TOKENIZERS, MODEL_NAMES must have the same length"
fi

echo "The provided data_path is $data_path"
echo "Output root: $output_root"

# Labels of runs whose torchrun invocation returned non-zero.
failed_runs=()

#######################################
# Launch one train.py run through torchrun (one process per node).
# Globals:
#   data_path, lr, output_root, project_name (read)
#   MASTER_PORT (read, optional)
# Arguments:
#   $1 - model checkpoint path
#   $2 - tokenizer path
#   $3 - short model name (used in the output dir and the run name)
#   $4 - dataset name
#   $5 - seed
#   $6 - value for --model_max_length
#   $7 - value for --num_train_epochs
# Returns:
#   status of mkdir if the output directory cannot be created,
#   otherwise the status of torchrun
#######################################
run_training() {
  local model=$1
  local tokenizer=$2
  local model_name=$3
  local data=$4
  local seed=$5
  local max_length=$6
  local epochs=$7
  local run_output_dir="${output_root}/${data}/${model_name}"

  mkdir -p -- "${run_output_dir}" || return

  echo "Running ${model_name} on ${data}, seed ${seed}, lr ${lr}, output ${run_output_dir}"
  torchrun --nproc_per_node=1 \
    --master_port="${MASTER_PORT:-29500}" \
    train.py \
    --model_name_or_path "${model}" \
    --tokenizer_path "${tokenizer}" \
    --trust_remote_code True \
    --data_path "${data_path}/${data}/split" \
    --kmer -1 \
    --run_name "${model_name}_hg38_BPE_${lr}_${data}_seed${seed}" \
    --model_max_length "${max_length}" \
    --per_device_train_batch_size 128 \
    --per_device_eval_batch_size 128 \
    --gradient_accumulation_steps 1 \
    --learning_rate "${lr}" \
    --num_train_epochs "${epochs}" \
    --fp16 \
    --save_steps 200 \
    --output_dir "${run_output_dir}" \
    --evaluation_strategy steps \
    --eval_steps 200 \
    --warmup_steps 30 \
    --logging_steps 100000 \
    --overwrite_output_dir True \
    --log_level info \
    --seed "${seed}" \
    --find_unused_parameters False \
    --project_name "${project_name}"
}

#######################################
# Run the current model/seed on several datasets that share the same
# --model_max_length and --num_train_epochs. A failing run is recorded and
# the sweep continues with the next dataset.
# Globals:
#   model, tokenizer, model_name, seed (read; set by the sweep loop below)
#   failed_runs (appended to)
# Arguments:
#   $1   - value for --model_max_length
#   $2   - value for --num_train_epochs
#   $3.. - dataset names
#######################################
sweep_datasets() {
  local max_length=$1
  local epochs=$2
  shift 2

  local data
  for data in "$@"; do
    if ! run_training "${model}" "${tokenizer}" "${model_name}" \
      "${data}" "${seed}" "${max_length}" "${epochs}"; then
      echo "Run failed: ${model_name} on ${data}, seed ${seed}" >&2
      failed_runs+=("${model_name}/${data}/seed${seed}")
    fi
  done
}

for seed in "${SEEDS[@]}"; do
  for idx in "${!MODELS[@]}"; do
    model=${MODELS[idx]}
    tokenizer=${TOKENIZERS[idx]}
    model_name=${MODEL_NAMES[idx]}

    # Disabled group -- length ~200: max length 100, 3 epochs.
    # sweep_datasets 100 3 \
    #   demo_coding_vs_intergenomic_seqs human_nontata_promoters \
    #   human_enhancers_cohn human_ocr_ensembl

    # Active group: max length 512, 5 epochs.
    sweep_datasets 512 5 \
      drosophila_enhancers_stark dummy_mouse_enhancers_ensembl \
      human_enhancers_ensembl

    # Disabled group -- length mostly 2000+: max length 512, 5 epochs.
    # sweep_datasets 512 5 \
    #   demo_human_or_worm drosophila_enhancers_stark \
    #   dummy_mouse_enhancers_ensembl human_enhancers_ensembl

    # Disabled group -- length ~200-700: max length 250, 8 epochs.
    # sweep_datasets 250 8 human_ensembl_regulatory
  done
done

# Surface failures in the exit status instead of reporting only the last run.
if (( ${#failed_runs[@]} > 0 )); then
  printf 'Failed runs (%d):\n' "${#failed_runs[@]}" >&2
  printf '  %s\n' "${failed_runs[@]}" >&2
  exit 1
fi