File size: 2,758 Bytes
ef7d2db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# Launches a fine-tuning sweep over several pretrained model/tokenizer pairs.
#
# Usage: <script> DATA_PATH LR OUTPUT_ROOT PROJECT_NAME
#   DATA_PATH     dataset location, forwarded to train.py as --data_path
#   LR            learning rate, forwarded as --learning_rate
#   OUTPUT_ROOT   parent directory; each model writes to OUTPUT_ROOT/<model name>
#   PROJECT_NAME  forwarded to train.py as --project_name
#
# Optional environment: MASTER_PORT (torchrun rendezvous port, default 29500).

# Fail fast on a missing argument instead of launching runs with empty values.
if (( $# < 4 )); then
    echo "Usage: ${0##*/} DATA_PATH LR OUTPUT_ROOT PROJECT_NAME" >&2
    exit 2
fi

# --- initialize conda ---
# shellcheck source=/dev/null
source /root/miniconda3/etc/profile.d/conda.sh || {
    echo "error: could not source /root/miniconda3/etc/profile.d/conda.sh" >&2
    exit 1
}

# --- activate env ---
# Abort if the environment is unavailable; otherwise training would silently
# run with whatever interpreter happens to be on PATH.
conda activate bpe_v2 || {
    echo "error: could not activate conda env 'bpe_v2'" >&2
    exit 1
}
# Expose only GPU index 2 to the processes launched below.
export CUDA_VISIBLE_DEVICES=2

data_path=$1
lr=$2
output_root=$3
project_name=$4
#vocab=117M


# Model / tokenizer pairs to sweep.
# The three arrays below are parallel: entry i of MODELS, TOKENIZERS and
# MODEL_NAMES describe the same run, and the sweep loop indexes all of them
# with the indices of MODEL_NAMES. When (un)commenting an entry, change the
# matching entry in all three lists.
MODELS=(
    "/root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000"
    "/root/NaN/dna-tokenizer/pretrain/models/len2_2048/checkpoint-100000"
    "/root/NaN/dna-tokenizer/pretrain/models/len2_3072/checkpoint-100000"
    # "/root/NaN/dna-tokenizer/pretrain/models/base_3072/checkpoint-100000"
    # "/root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000"
    # "/root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000"
)
TOKENIZERS=(
    "/root/NaN/dna-tokenizer/baseline_bpe/vocab_2048/2048_tokenizer.json"
    "/root/NaN/dna-tokenizer/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
    "/root/NaN/dna-tokenizer/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
    # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_3072/3072_tokenizer.json"
    # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_4096/4096_tokenizer.json"
    # "/root/NaN/dna-tokenizer/merge_bpe/vocab_4096/merge_tokenizer_unigram_len2.json"
)

# Short labels used for the per-run output directory and the run name.
MODEL_NAMES=("base_2048" "len2_2048" "len2_3072")
# MODEL_NAMES=("base_3072" "base_4096" "len2_4096")

# Guard against the parallel lists drifting out of sync, which would otherwise
# silently pair a model with the wrong (or an empty) tokenizer.
if (( ${#MODELS[@]} != ${#MODEL_NAMES[@]} || ${#TOKENIZERS[@]} != ${#MODEL_NAMES[@]} )); then
    echo "error: MODELS (${#MODELS[@]}), TOKENIZERS (${#TOKENIZERS[@]}) and MODEL_NAMES (${#MODEL_NAMES[@]}) must have the same number of entries" >&2
    exit 1
fi

echo "The provided data_path is $data_path"

# Sweep: for every seed and every model/tokenizer pair, launch one
# single-process torchrun job of SFT/train.py on the dataset at data_path.
# Each run writes to "${output_root}/<model name>". A failing run is reported
# on stderr and recorded, the sweep continues with the next pair, and the
# script exits non-zero at the end if any run failed.

# Last path component of data_path; only used to label the run name.
dataset_name=$(basename "${data_path}")
failed_runs=()

for seed in 42
do
    for i in "${!MODEL_NAMES[@]}"
    do
        # MODELS, TOKENIZERS and MODEL_NAMES are parallel arrays.
        model=${MODELS[$i]}
        tokenizer=${TOKENIZERS[$i]}
        name=${MODEL_NAMES[$i]}

        run_name="hg38_${name}_${lr}_${dataset_name}_seed${seed}"
        run_output_dir="${output_root}/${name}"

        if ! mkdir -p "${run_output_dir}"; then
            echo "warning: cannot create ${run_output_dir}; skipping ${run_name}" >&2
            failed_runs+=("${run_name}")
            continue
        fi

        # Every expansion is quoted so paths containing spaces or glob
        # characters reach train.py as a single argument, and an empty value
        # cannot shift the following flag into its place.
        # NOTE(review): --logging_steps (100000) is far larger than
        # --eval_steps/--save_steps (2000) — presumably intentional to keep
        # step logging quiet; confirm.
        if ! torchrun --nproc_per_node=1 \
            --master_port="${MASTER_PORT:-29500}" \
            /root/NaN/dna-tokenizer/SFT/train.py \
            --model_name_or_path "${model}" \
            --tokenizer_path "${tokenizer}" \
            --trust_remote_code True \
            --data_path "${data_path}" \
            --kmer -1 \
            --run_name "${run_name}" \
            --model_max_length 200 \
            --per_device_train_batch_size 128 \
            --per_device_eval_batch_size 128 \
            --gradient_accumulation_steps 1 \
            --learning_rate "${lr}" \
            --num_train_epochs 3 \
            --fp16 \
            --save_steps 2000 \
            --output_dir "${run_output_dir}" \
            --evaluation_strategy steps \
            --eval_steps 2000 \
            --warmup_steps 30 \
            --logging_steps 100000 \
            --overwrite_output_dir True \
            --log_level info \
            --seed "${seed}" \
            --find_unused_parameters False \
            --project_name "${project_name}"
        then
            echo "warning: run ${run_name} failed" >&2
            failed_runs+=("${run_name}")
        fi
    done
done

# Surface failures to the caller without having aborted the remaining runs.
if (( ${#failed_runs[@]} > 0 )); then
    echo "error: ${#failed_runs[@]} run(s) failed: ${failed_runs[*]}" >&2
    exit 1
fi