# Fine-tuning sweep driver (requires bash: arrays, here-strings).
#
# For every sub-directory of a data root, runs train.py once per
# (model, tokenizer) pair and once per seed.
#
# Positional args:
#   1) base directory containing per-class folders with train.csv/dev.csv/test.csv
#   2) learning rate
#   3) base output directory
#   4) wandb project name
#   5) optional comma-separated seeds (default: 42)
#   6) optional num_train_epochs (default: 8, the value this script previously
#      hard-coded)
#
# Exit status: 0 when every run succeeded, 1 when the inputs are unusable or at
# least one run failed, 2 on a usage error.

# Training entry point invoked for each (dataset, model, seed) combination.
TRAIN_SCRIPT="/home/n5huang/dna_token/Finetune-screen/train.py"

#vocab=117M

# Model / tokenizer pairs to sweep.
# MODELS, TOKENIZERS and MODEL_NAMES are index-aligned: entry i of each array
# describes the same configuration. When toggling an entry, toggle the
# matching entry in all three arrays.
MODELS=(
  # "/home/n5huang/dna_token/pretrain/models/model_cpu_test_1/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_reg/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_len_2/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_tfidf/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/model_base_2048/baseline_2048/checkpoint-100000"
  # "/home/n5huang/dna_token/pretrain/models/len2_2048/checkpoint-100000"
  "/home/n5huang/dna_token/pretrain/models/len2_3072/checkpoint-100000"
)
TOKENIZERS=(
  # "/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/tokenizer.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_len2.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/merge_tokenizer_unigram_tf_idf.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_2048/2048_tokenizer.json"
  # "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  "/home/n5huang/dna_token/tokenizer_evaluation/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
)
# MODEL_NAMES=("base" "len_reg" "len2" "tfidf")
# MODEL_NAMES=("base" "len2")
MODEL_NAMES=("len2_3072")

#######################################
# Print a one-line usage summary.
# Outputs: writes the usage line to stderr
#######################################
usage() {
  printf 'Usage: %s <data_root> <learning_rate> <output_root> <wandb_project> [seeds_csv] [num_train_epochs]\n' \
    "${0##*/}" >&2
}

#######################################
# Activate the conda environment and pin the visible GPU.
# Globals: CUDA_VISIBLE_DEVICES (written, exported)
#######################################
setup_environment() {
  # shellcheck disable=SC1090,SC1091
  source ~/miniconda3/etc/profile.d/conda.sh
  conda activate bpe
  # Limit visible GPUs.
  # NOTE(review): the original carried a stray "#7" next to this line,
  # presumably an alternate device id - confirm before switching.
  export CUDA_VISIBLE_DEVICES=6
}

#######################################
# Run the full sweep.
# Globals:   MODELS, TOKENIZERS, MODEL_NAMES, TRAIN_SCRIPT (read)
# Arguments: see the positional-args list at the top of this file
# Outputs:   one progress line per dataset on stdout; diagnostics on stderr
# Returns:   0 if all runs succeeded, 1 on bad input or any failed run,
#            2 on usage error
#######################################
main() {
  if (( $# < 4 )); then
    usage
    return 2
  fi

  local data_root=$1
  local lr=$2
  local output_root=$3
  local project_name=$4
  local seeds=${5:-42}
  local num_train_epochs=${6:-8}

  if [[ ! -d "${data_root}" ]]; then
    printf 'error: data root is not a directory: %s\n' "${data_root}" >&2
    return 1
  fi

  # The three arrays are indexed in lock-step below; a length mismatch would
  # silently pair a model with an empty tokenizer path or run name.
  if (( ${#MODELS[@]} != ${#TOKENIZERS[@]} || ${#MODELS[@]} != ${#MODEL_NAMES[@]} )); then
    printf 'error: MODELS (%d), TOKENIZERS (%d) and MODEL_NAMES (%d) must have the same length\n' \
      "${#MODELS[@]}" "${#TOKENIZERS[@]}" "${#MODEL_NAMES[@]}" >&2
    return 1
  fi

  setup_environment

  local -a seed_list=()
  IFS=',' read -ra seed_list <<< "${seeds}"

  local dataset_path dataset_name idx model tokenizer model_name
  local raw_seed seed run_name
  local -a train_args
  local datasets=0
  local failures=0

  for dataset_path in "${data_root}"/*; do
    # Skip plain files and the literal pattern left behind by an empty glob.
    [[ -d "${dataset_path}" ]] || continue
    datasets=$((datasets + 1))
    dataset_name=${dataset_path##*/}
    printf '%s\n' "Running fine-tune for ${dataset_name} from ${dataset_path}"

    for idx in "${!MODELS[@]}"; do
      model=${MODELS[$idx]}
      tokenizer=${TOKENIZERS[$idx]}
      model_name=${MODEL_NAMES[$idx]}

      for raw_seed in "${seed_list[@]}"; do
        # Tolerate "1, 2" and "1,,2": strip whitespace, drop empty entries.
        seed=${raw_seed//[[:space:]]/}
        [[ -n "${seed}" ]] || continue

        run_name="hg38_${model_name}_binary_${dataset_name}_${lr}_seed${seed}"

        # NOTE(review): the output directory has no seed component and
        # --overwrite_output_dir is True, so runs that differ only by seed
        # write to the same directory. Left unchanged to keep the layout
        # existing consumers expect - confirm whether this is intended.
        train_args=(
          --model_name_or_path "${model}"
          --tokenizer_path "${tokenizer}"
          --trust_remote_code True
          --data_path "${dataset_path}"
          --kmer -1
          --run_name "${run_name}"
          --model_max_length 200
          --per_device_train_batch_size 128
          --per_device_eval_batch_size 128
          --gradient_accumulation_steps 1
          --learning_rate "${lr}"
          --num_train_epochs "${num_train_epochs}"
          --fp16
          --save_steps 2000
          --output_dir "${output_root}/${dataset_name}/${model_name}/${lr}"
          --evaluation_strategy steps
          --eval_steps 2000
          --warmup_steps 30
          --logging_steps 100000
          --overwrite_output_dir True
          --log_level info
          --seed "${seed}"
          --find_unused_parameters False
          --project_name "${project_name}"
        )

        # Keep sweeping after a failed run, but remember it for the exit code.
        if ! python "${TRAIN_SCRIPT}" "${train_args[@]}"; then
          printf 'warning: run failed: %s\n' "${run_name}" >&2
          failures=$((failures + 1))
        fi
      done
    done
  done

  if (( datasets == 0 )); then
    printf 'error: no dataset directories found under %s\n' "${data_root}" >&2
    return 1
  fi

  if (( failures > 0 )); then
    printf 'error: %d run(s) failed\n' "${failures}" >&2
    return 1
  fi
  return 0
}

main "$@"