#!/bin/bash
#
# Fine-tuning sweep: for every dataset directory under DATA_ROOT, every
# model/tokenizer pair listed below, and every seed, launch one run of
# Finetune-screen/train.py.
#
# Usage:
#   <this-script> DATA_ROOT LR OUTPUT_ROOT WANDB_PROJECT [SEEDS] [NUM_TRAIN_EPOCHS]
#
# Positional args:
#   1) base directory containing per-class folders with train.csv/dev.csv/test.csv
#   2) learning rate
#   3) base output directory
#   4) wandb project name
#   5) optional comma-separated seeds (default: 42)
#   6) optional num_train_epochs (default: 8, the value this script previously
#      hard-coded)
#
# Exit status: 0 if every launched run succeeded, 1 if at least one run failed,
# 2 on usage error.
#
# `set -e` is deliberately not used: a failing training run must not abort the
# remaining runs of the sweep. Critical setup steps are checked explicitly.

# Print an error message to stderr and exit with the given status (default 1).
die() {
  local status=1
  if [[ "$1" =~ ^[0-9]+$ ]]; then
    status=$1
    shift
  fi
  printf 'error: %s\n' "$*" >&2
  exit "${status}"
}

usage() {
  printf 'Usage: %s DATA_ROOT LR OUTPUT_ROOT WANDB_PROJECT [SEEDS] [NUM_TRAIN_EPOCHS]\n' \
    "${0##*/}" >&2
  exit 2
}

(( $# >= 4 )) || usage

data_root=$1
lr=$2
output_root=$3
project_name=$4
seeds=${5:-42}
num_train_epochs=${6:-8}
#vocab=117M

[[ -d "${data_root}" ]] || die 2 "data root is not a directory: ${data_root}"

# shellcheck source=/dev/null
source ~/miniconda3/etc/profile.d/conda.sh \
  || die "cannot source ~/miniconda3/etc/profile.d/conda.sh"
conda activate bpe || die "cannot activate conda environment 'bpe'"

# Limit visible GPUs
export CUDA_VISIBLE_DEVICES=6 #7

readonly TRAIN_SCRIPT=/home/n5huang/dna_token/Finetune-screen/train.py

# Model / tokenizer pairs to sweep.
# The three arrays are index-aligned: MODELS[i] is trained with TOKENIZERS[i]
# and labelled MODEL_NAMES[i]. Keep them the same length when (un)commenting
# entries.
MODELS=(
  # "/home/n5huang/dna_token/pretrain/models/model_cpu_test_1/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_reg/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_2/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_tfidf/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_base_2048/baseline_2048/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/len2_2048/checkpoint-100000"
  "/home/n5huang/dna_token/pretrain/models/len2_3072/checkpoint-100000"
)
TOKENIZERS=(
  # "/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/tokenizer.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len2.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_tf_idf.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_2048/2048_tokenizer.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
)
# MODEL_NAMES=("base" "len_reg" "len2" "tfidf")
# MODEL_NAMES=("base" "len2")
MODEL_NAMES=("len2_3072")

# A length mismatch would silently pair a model with an empty tokenizer/name.
if (( ${#MODELS[@]} != ${#TOKENIZERS[@]} || ${#MODELS[@]} != ${#MODEL_NAMES[@]} )); then
  die "MODELS (${#MODELS[@]}), TOKENIZERS (${#TOKENIZERS[@]}) and MODEL_NAMES" \
    "(${#MODEL_NAMES[@]}) must have the same number of entries"
fi

# Split the comma-separated seed list; IFS is changed for this `read` only.
IFS=',' read -r -a SEED_LIST <<< "${seeds}"

failed_runs=0

for dataset_path in "${data_root}"/*; do
  # Only sub-directories are datasets; this also skips the unexpanded glob
  # pattern when DATA_ROOT is empty.
  [[ -d "${dataset_path}" ]] || continue
  dataset_name=$(basename "${dataset_path}")
  printf '%s\n' "Running fine-tune for ${dataset_name} from ${dataset_path}"

  for idx in "${!MODELS[@]}"; do
    model=${MODELS[$idx]}
    tokenizer=${TOKENIZERS[$idx]}
    model_name=${MODEL_NAMES[$idx]}

    for seed in "${SEED_LIST[@]}"; do
      # Tolerate "1, 2" and "1,,2": strip whitespace, skip empty entries.
      seed=${seed//[[:space:]]/}
      [[ -n "${seed}" ]] || continue

      run_name="hg38_${model_name}_binary_${dataset_name}_${lr}_seed${seed}"

      # NOTE(review): the output directory does not include the seed and
      # --overwrite_output_dir is True, so runs that differ only by seed write
      # to the same directory. Left unchanged to keep the existing layout;
      # confirm this is intended.
      train_args=(
        --model_name_or_path "${model}"
        --tokenizer_path "${tokenizer}"
        --trust_remote_code True
        --data_path "${dataset_path}"
        --kmer -1
        --run_name "${run_name}"
        --model_max_length 200
        --per_device_train_batch_size 128
        --per_device_eval_batch_size 128
        --gradient_accumulation_steps 1
        --learning_rate "${lr}"
        --num_train_epochs "${num_train_epochs}"
        --fp16
        --save_steps 2000
        --output_dir "${output_root}/${dataset_name}/${model_name}/${lr}"
        --evaluation_strategy steps
        --eval_steps 2000
        --warmup_steps 30
        --logging_steps 100000
        --overwrite_output_dir True
        --log_level info
        --seed "${seed}"
        --find_unused_parameters False
        --project_name "${project_name}"
      )

      # Keep sweeping after a failed run, but record it for the exit status.
      if ! python "${TRAIN_SCRIPT}" "${train_args[@]}"; then
        printf 'warning: run failed: %s\n' "${run_name}" >&2
        failed_runs=$(( failed_runs + 1 ))
      fi
    done
  done
done

if (( failed_runs > 0 )); then
  die "${failed_runs} run(s) failed"
fi