#!/bin/bash
#
# Launch supervised fine-tuning (SFT) runs for a set of pretrained DNA LMs,
# one run per (model, tokenizer) pair, for each seed in the seed list.
#
# Usage:
#   sft_launch.sh <data_path> <lr> <output_root> <project_name>
#
# Environment:
#   MASTER_PORT  optional torchrun rendezvous port (default: 29500)
#
# NOTE: -u is deliberately omitted — conda's activation hooks reference
# unset variables and would abort under `set -u`.
set -eo pipefail

source /root/miniconda3/etc/profile.d/conda.sh
conda activate bpe_v2
export CUDA_VISIBLE_DEVICES=2

# Required positional arguments — abort with a usage hint if any is missing.
data_path=${1:?usage: $0 <data_path> <lr> <output_root> <project_name>}
lr=${2:?missing <lr>}
output_root=${3:?missing <output_root>}
project_name=${4:?missing <project_name>}

# Parallel arrays: MODELS[i] is fine-tuned with TOKENIZERS[i] and the run is
# labeled MODEL_NAMES[i]. Keep all three in the same order.
MODELS=(
  "/root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/len2_2048/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/len2_3072/checkpoint-100000"
)
TOKENIZERS=(
  "/root/NaN/dna-tokenizer/baseline_bpe/vocab_2048/2048_tokenizer.json"
  "/root/NaN/dna-tokenizer/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  "/root/NaN/dna-tokenizer/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
)
MODEL_NAMES=("base_2048" "len2_2048" "len2_3072")

# Guard against a silent parallel-array mismatch (an out-of-range index
# would expand to an empty tokenizer/model path).
if (( ${#MODELS[@]} != ${#MODEL_NAMES[@]} || ${#TOKENIZERS[@]} != ${#MODEL_NAMES[@]} )); then
  echo "error: MODELS/TOKENIZERS/MODEL_NAMES lengths differ" >&2
  exit 1
fi

echo "The provided data_path is $data_path"

# Loop-invariant: the dataset label depends only on data_path.
dataset_name=$(basename "${data_path}")

for seed in 42; do
  for i in "${!MODEL_NAMES[@]}"; do
    model=${MODELS[$i]}
    tokenizer=${TOKENIZERS[$i]}
    name=${MODEL_NAMES[$i]}

    run_output_dir="${output_root}/${name}"
    mkdir -p "${run_output_dir}"

    torchrun --nproc_per_node=1 \
        --master_port="${MASTER_PORT:-29500}" \
        /root/NaN/dna-tokenizer/SFT/train.py \
        --model_name_or_path "${model}" \
        --tokenizer_path "${tokenizer}" \
        --trust_remote_code True \
        --data_path "${data_path}" \
        --kmer -1 \
        --run_name "hg38_${name}_${lr}_${dataset_name}_seed${seed}" \
        --model_max_length 200 \
        --per_device_train_batch_size 128 \
        --per_device_eval_batch_size 128 \
        --gradient_accumulation_steps 1 \
        --learning_rate "${lr}" \
        --num_train_epochs 3 \
        --fp16 \
        --save_steps 2000 \
        --output_dir "${run_output_dir}" \
        --evaluation_strategy steps \
        --eval_steps 2000 \
        --warmup_steps 30 \
        --logging_steps 100000 \
        --overwrite_output_dir True \
        --log_level info \
        --seed "${seed}" \
        --find_unused_parameters False \
        --project_name "${project_name}"
  done
done