#!/bin/bash
#
# Sweep runner: launches /root/NaN/dna-tokenizer/SFT/train.py through torchrun
# once per (model checkpoint, tokenizer) pair listed below, for each seed.
#
# Usage:
#   <script> DATA_PATH LR OUTPUT_ROOT PROJECT_NAME
#
# Arguments:
#   DATA_PATH     forwarded as --data_path; its basename is embedded in --run_name
#   LR            forwarded as --learning_rate
#   OUTPUT_ROOT   each run uses OUTPUT_ROOT/<model name> as --output_dir
#   PROJECT_NAME  forwarded as --project_name
#
# Environment:
#   MASTER_PORT   value for torchrun --master_port (default: 29500)
#
# Exit status:
#   0 if every run succeeded, 1 if setup or any run failed, 2 on usage error.

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

if (( $# < 4 )); then
  printf 'Usage: %s DATA_PATH LR OUTPUT_ROOT PROJECT_NAME\n' "${0##*/}" >&2
  exit 2
fi

# --- initialize conda ---
# shellcheck disable=SC1091 -- conda.sh lives outside the repository
source /root/miniconda3/etc/profile.d/conda.sh \
  || die "cannot source /root/miniconda3/etc/profile.d/conda.sh"

# --- activate env ---
conda activate bpe_v2 || die "cannot activate conda env bpe_v2"

# Enabled only after activation: conda's activation scripts may reference
# unset variables, which would abort under `set -u`.
set -u

export CUDA_VISIBLE_DEVICES=2

data_path=$1
lr=$2
output_root=$3
project_name=$4
#vocab=117M

readonly TRAIN_SCRIPT="/root/NaN/dna-tokenizer/SFT/train.py"

# Model / tokenizer pairs to sweep.
# MODELS, TOKENIZERS and MODEL_NAMES are parallel arrays: index i of each
# describes one run, so they must stay the same length.
MODELS=(
  "/root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/len2_2048/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/len2_3072/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_3072/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000"
)

TOKENIZERS=(
  "/root/NaN/dna-tokenizer/baseline_bpe/vocab_2048/2048_tokenizer.json"
  "/root/NaN/dna-tokenizer/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  "/root/NaN/dna-tokenizer/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_3072/3072_tokenizer.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_4096/4096_tokenizer.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_4096/merge_tokenizer_unigram_len2.json"
)

MODEL_NAMES=("base_2048" "len2_2048" "len2_3072")
# MODEL_NAMES=("base_3072" "base_4096" "len2_4096")

# Guard against editing one array and forgetting the others.
if (( ${#MODELS[@]} != ${#MODEL_NAMES[@]} \
      || ${#TOKENIZERS[@]} != ${#MODEL_NAMES[@]} )); then
  die "MODELS (${#MODELS[@]}), TOKENIZERS (${#TOKENIZERS[@]}) and MODEL_NAMES (${#MODEL_NAMES[@]}) must have the same length"
fi

echo "The provided data_path is $data_path"

# Loop-invariant: only depends on the data_path argument.
dataset_name=$(basename -- "${data_path}")

# Run names of every run that did not complete successfully.
failed_runs=()

for seed in 42; do
  for i in "${!MODEL_NAMES[@]}"; do
    model=${MODELS[$i]}
    tokenizer=${TOKENIZERS[$i]}
    name=${MODEL_NAMES[$i]}
    run_name="hg38_${name}_${lr}_${dataset_name}_seed${seed}"

    run_output_dir="${output_root}/${name}"
    if ! mkdir -p -- "${run_output_dir}"; then
      printf 'error: cannot create %s; skipping %s\n' \
        "${run_output_dir}" "${run_name}" >&2
      failed_runs+=("${run_name}")
      continue
    fi

    # Built as an array so every argument reaches torchrun as exactly one
    # word, regardless of spaces or glob characters in the values.
    train_cmd=(
      torchrun --nproc_per_node=1
      --master_port="${MASTER_PORT:-29500}"
      "${TRAIN_SCRIPT}"
      --model_name_or_path "${model}"
      --tokenizer_path "${tokenizer}"
      --trust_remote_code True
      --data_path "${data_path}"
      --kmer -1
      --run_name "${run_name}"
      --model_max_length 200
      --per_device_train_batch_size 128
      --per_device_eval_batch_size 128
      --gradient_accumulation_steps 1
      --learning_rate "${lr}"
      --num_train_epochs 3
      --fp16
      --save_steps 2000
      --output_dir "${run_output_dir}"
      --evaluation_strategy steps
      --eval_steps 2000
      --warmup_steps 30
      --logging_steps 100000
      --overwrite_output_dir True
      --log_level info
      --seed "${seed}"
      --find_unused_parameters False
      --project_name "${project_name}"
    )

    # Keep sweeping after a failed run, but remember it so the script's exit
    # status reflects the whole sweep rather than only the last run.
    if ! "${train_cmd[@]}"; then
      printf 'error: run %s failed\n' "${run_name}" >&2
      failed_runs+=("${run_name}")
    fi
  done
done

if (( ${#failed_runs[@]} > 0 )); then
  printf 'error: %d run(s) failed:\n' "${#failed_runs[@]}" >&2
  printf '  %s\n' "${failed_runs[@]}" >&2
  exit 1
fi