#!/bin/bash -ex
source ./scripts/env.sh
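
# scripts/env.sh (not shown) is expected to define everything read from the
# environment below: $base_seed, $CHECKPOINTS_DIR, $max_seq_length, $model,
# the associative arrays model2checkpoint, dataset2testsize and dataset2nseeds,
# and the FACTORS and DATASETS arrays.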
# 1: model
# 2: dataset
# 3: size
# 4: loss
# 5: num_seed
# 6: val_check_interval
# 7: train_dir
# 8: early_stopping
# 9: train_list
# 10: val_list
# 11: test_list
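# Example invocation (hypothetical model/size/test-list values; the real ones
# are driven by env.sh):
#   run_lora llama-3-8b sst2 512 ans 0 16 \
#       outputs/finetune_lora/llama-3-8b/sst2/size=512/seed=0/lora_ans/0.0-0.7/0.7-1.0 \
#       true 0.0-0.7 0.7-1.0 test_400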
run_lora() {
    local model=$1
    local dataset=$2
    local size=$3
    local loss=$4
    local num_seed=$5
    local seed=$((base_seed + num_seed))
    local val_check_interval=$6
    local train_dir=$7
    local early_stopping=$8
    local train_list=$9
    local val_list=${10}
    local test_list=${11}
    local lora_args="--lora_r=8 --lora_alpha=16 --lora_dropout=0.05 --lora_query --lora_key --lora_value --lora_projection --lora_mlp --lora_head"
    local global_batch_size=8
    local micro_batch_size=1
    local learning_rate=0.0001
    local optimizer="adamw"
    local weight_decay=0.0
    local patience=10
    local precision="bf16-true"
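    # Budget matching (assumed intent): when early stopping is on and training
    # uses 100% of the data, cap max_steps at the step count of the best
    # checkpoint from the matching early-stopped 70% run
    # ($train_dir/../../0.0-0.7/0.7-1.0/best.ckpt); otherwise -1 presumably
    # means "no fixed step limit".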
    if [ "$early_stopping" = true ] && [ "$train_list" = "0.0-1.0" ]; then
        # Declare and assign separately so a failing python call is not masked
        # from `set -e` by the exit status of `local`.
        local max_steps
        max_steps=$(python -c "import torch; print(torch.load('$train_dir/../../0.0-0.7/0.7-1.0/best.ckpt', weights_only=False)['step_count'], end='')")
    else
        local max_steps=-1
    fi
    # TRAIN
    local model_dir="$CHECKPOINTS_DIR/${model2checkpoint[$model]}"
    local log_dir="$train_dir/logs"
    local output_checkpoint_dir="$train_dir/checkpoint"
    if [ ! -f "$train_dir/train_args.yaml" ]; then
        mkdir -p "$train_dir" "$log_dir" "$output_checkpoint_dir"
        for file in config.json generation_config.json model_config.yaml tokenizer.json tokenizer.model tokenizer_config.json; do
            if [ -f "$model_dir/$file" ]; then
                cp "$model_dir/$file" "$output_checkpoint_dir"
            fi
        done
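        # Link the base weights instead of copying them (readlink -f resolves
        # the real path so the symlink is absolute), avoiding a duplicate of
        # the large checkpoint file per run.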
        ln -sf "$(readlink -f "$model_dir/lit_model.pth")" "$output_checkpoint_dir/lit_model.pth"
        # NB: $lora_args is left unquoted on purpose so it word-splits into
        # separate --lora_* flags.
        python -m llmcal.scripts.train_lora \
            --base_checkpoint_dir "$model_dir" \
            --data_paths "outputs/prompts/$model/$dataset/all.jsonl" \
            --train_lists "lists/$dataset/size=$size/seed=$num_seed/$train_list.txt" \
            --val_lists "lists/$dataset/size=$size/seed=$num_seed/$val_list.txt" \
            --output_dir "$train_dir" \
            --output_checkpoint_dir "$output_checkpoint_dir" \
            --log_dir "$log_dir" \
            --precision "$precision" \
            --devices 1 \
            --num_nodes 1 \
            --global_batch_size "$global_batch_size" \
            --micro_batch_size "$micro_batch_size" \
            --val_check_interval "$val_check_interval" \
            --learning_rate "$learning_rate" \
            --optimizer "$optimizer" \
            --weight_decay "$weight_decay" \
            --loss "$loss" \
            --patience "$patience" \
            --max_steps "$max_steps" \
            --seed "$seed" \
            $lora_args
    fi
    # PREDICT ON TEST
    local output_dir="$train_dir/test=$dataset/list=$test_list"
    if [ ! -f "$output_dir/logits.csv" ]; then
        mkdir -p "$output_dir"
        python -m llmcal.scripts.run_posteriors \
            --base_checkpoint_dir "$model_dir" \
            --checkpoint_dir "$output_checkpoint_dir" \
            --peft "lora" \
            --data_path "outputs/prompts/$model/$dataset/all.jsonl" \
            --output_dir "$output_dir" \
            --prediction_lists "lists/$dataset/$test_list.txt" \
            --precision "$precision" \
            --devices 1 \
            --num_nodes 1 \
            --batch_size 1 \
            --max_seq_length "$max_seq_length" \
            $lora_args
    fi
}
# 1: model
# 2: val_check_interval
# (sizes are not an argument: they are taken from the FACTORS array)
run_lora_vs_samples() {
    local model=$1
    local val_check_interval=$2
    # Declare loop scratch variables local so they do not leak into the
    # caller's environment.
    local train_list val_list train_dir
    for size in "${FACTORS[@]}"; do
        for dataset in "${DATASETS[@]}"; do
            local test_list="test_${dataset2testsize[$dataset]}"
            local num_seeds=${dataset2nseeds[$dataset]}
            for num_seed in $(seq 0 $((num_seeds - 1))); do
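                # Lists like 0.0-0.7 name fractional slices of the training
                # pool (0.0-0.7 = the first 70% of examples, per the comments
                # below); the matching index files live in
                # lists/$dataset/size=$size/seed=$num_seed/.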
                # Train lora-ans without early stopping on 70% of the data
                train_list="0.0-0.7"
                val_list="0.0-0.3"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans_no_es/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" false "$train_list" "$val_list" "$test_list"
                # Train lora-ans without early stopping on 100% of the data
                train_list="0.0-1.0"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans_no_es/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" false "$train_list" "$val_list" "$test_list"
                # Train lora-ans with early stopping on 70% of the data
                train_list="0.0-0.7"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" true "$train_list" "$val_list" "$test_list"
                # Train lora-ans with early stopping on 100% of the data
                train_list="0.0-1.0"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" true "$train_list" "$val_list" "$test_list"
            done
        done
    done
}
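
# Entry point: $model is assumed to be set by scripts/env.sh; 16 is the
# --val_check_interval passed through to llmcal.scripts.train_lora.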
run_lora_vs_samples "$model" 16