#!/bin/bash -ex

source ./scripts/env.sh
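
# Note: this script relies on variables that are not defined here (model,
# base_seed, max_seq_length, CHECKPOINTS_DIR, FACTORS, DATASETS, and the
# associative arrays model2checkpoint, dataset2testsize and dataset2nseeds).
# They are assumed to be provided by scripts/env.sh.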
# Fine-tune a LoRA adapter and compute test posteriors for one
# (model, dataset, train-set size, seed, split) configuration.
run_lora() {
    local model=$1
    local dataset=$2
    local size=$3
    local loss=$4
    local num_seed=$5
    local seed=$((base_seed + num_seed))
    local val_check_interval=$6
    local train_dir=$7
    local early_stopping=$8
    local train_list=$9
    local val_list=${10}
    local test_list=${11}

    # LoRA hyperparameters, applied to the attention, projection, MLP and head
    # layers. Kept as a single string and left unquoted at the call sites below
    # so it expands into separate flags.
    local lora_args="--lora_r=8 --lora_alpha=16 --lora_dropout=0.05 --lora_query --lora_key --lora_value --lora_projection --lora_mlp --lora_head"

    # Training hyperparameters
    local global_batch_size=8
    local micro_batch_size=1
    local learning_rate=0.0001
    local optimizer="adamw"
    local weight_decay=0.0
    local patience=10
    local precision="bf16-true"

    # When training on the full split (0.0-1.0) with early_stopping=true, reuse
    # the step count of the best checkpoint from the matching early-stopped run
    # (trained on 0.0-0.7, validated on 0.7-1.0) as a fixed step budget;
    # otherwise no fixed step budget is imposed.
    if [ "$early_stopping" = true ] && [ "$train_list" = "0.0-1.0" ]; then
        local max_steps=$(python -c "import torch; print(torch.load('$train_dir/../../0.0-0.7/0.7-1.0/best.ckpt', weights_only=False)['step_count'], end='')")
    else
        local max_steps=-1
    fi
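
    # Both stages below are idempotent: training is skipped if train_args.yaml
    # already exists in $train_dir, and prediction is skipped if logits.csv
    # already exists in the test output directory.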
    local model_dir="$CHECKPOINTS_DIR/${model2checkpoint[$model]}"
    local log_dir="$train_dir/logs"
    local output_checkpoint_dir="$train_dir/checkpoint"

    # Stage 1: train the LoRA adapter.
    if [ ! -f "$train_dir/train_args.yaml" ]; then
        mkdir -p "$train_dir" "$log_dir" "$output_checkpoint_dir"

        # Copy the base model's config and tokenizer files next to the adapter
        # and symlink the base weights, so the output checkpoint dir is self-contained.
        for file in config.json generation_config.json model_config.yaml tokenizer.json tokenizer.model tokenizer_config.json; do
            if [ -f "$model_dir/$file" ]; then
                cp "$model_dir/$file" "$output_checkpoint_dir"
            fi
        done
        ln -sf "$(readlink -f "$model_dir/lit_model.pth")" "$output_checkpoint_dir/lit_model.pth"

        python -m llmcal.scripts.train_lora \
            --base_checkpoint_dir "$model_dir" \
            --data_paths "outputs/prompts/$model/$dataset/all.jsonl" \
            --train_lists "lists/$dataset/size=$size/seed=$num_seed/$train_list.txt" \
            --val_lists "lists/$dataset/size=$size/seed=$num_seed/$val_list.txt" \
            --output_dir "$train_dir" \
            --output_checkpoint_dir "$output_checkpoint_dir" \
            --log_dir "$log_dir" \
            --precision "$precision" \
            --devices 1 \
            --num_nodes 1 \
            --global_batch_size "$global_batch_size" \
            --micro_batch_size "$micro_batch_size" \
            --val_check_interval "$val_check_interval" \
            --learning_rate "$learning_rate" \
            --optimizer "$optimizer" \
            --weight_decay "$weight_decay" \
            --loss "$loss" \
            --patience "$patience" \
            --max_steps "$max_steps" \
            --seed "$seed" \
            $lora_args
    fi

    # Stage 2: compute posteriors on the test list with the trained adapter.
    local output_dir="$train_dir/test=$dataset/list=$test_list"
    if [ ! -f "$output_dir/logits.csv" ]; then
        mkdir -p "$output_dir"
        python -m llmcal.scripts.run_posteriors \
            --base_checkpoint_dir "$model_dir" \
            --checkpoint_dir "$output_checkpoint_dir" \
            --peft "lora" \
            --data_path "outputs/prompts/$model/$dataset/all.jsonl" \
            --output_dir "$output_dir" \
            --prediction_lists "lists/$dataset/$test_list.txt" \
            --precision "$precision" \
            --devices 1 \
            --num_nodes 1 \
            --batch_size 1 \
            --max_seq_length "$max_seq_length" \
            $lora_args
    fi
}
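
# For each training-set size, dataset, and seed, run_lora_vs_samples trains and
# evaluates four LoRA configurations: two without early stopping (train on
# 0.0-0.7 with validation on 0.0-0.3, and train on 0.0-1.0 with validation on
# 0.7-1.0) and two with early stopping (train on 0.0-0.7 with validation on the
# held-out 0.7-1.0, and train on 0.0-1.0 reusing that run's best step count).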
run_lora_vs_samples() {
    local model=$1
    local val_check_interval=$2
    for size in "${FACTORS[@]}"; do
        for dataset in "${DATASETS[@]}"; do
            local test_list="test_${dataset2testsize[$dataset]}"
            local num_seeds=${dataset2nseeds[$dataset]}
            for num_seed in $(seq 0 $((num_seeds - 1))); do

                # No early stopping: train on the first 70%, validate on a subset of it.
                train_list="0.0-0.7"
                val_list="0.0-0.3"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans_no_es/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" false "$train_list" "$val_list" "$test_list"

                # No early stopping: train on all of the data, validate on its last 30%.
                train_list="0.0-1.0"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans_no_es/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" false "$train_list" "$val_list" "$test_list"

                # Early stopping: train on the first 70%, validate on the held-out last 30%.
                train_list="0.0-0.7"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" true "$train_list" "$val_list" "$test_list"

                # Full split with early_stopping=true: train on all of the data for the
                # fixed step budget taken from the previous run's best checkpoint.
                train_list="0.0-1.0"
                val_list="0.7-1.0"
                train_dir="outputs/finetune_lora/$model/$dataset/size=$size/seed=$num_seed/lora_ans/$train_list/$val_list"
                mkdir -p "$train_dir"
                run_lora "$model" "$dataset" "$size" ans "$num_seed" "$val_check_interval" "$train_dir" true "$train_list" "$val_list" "$test_list"
            done
        done
    done
}
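
# Launch the sweep with a val_check_interval of 16. Note that $model is not
# defined in this script; it is assumed to come from scripts/env.sh.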
run_lora_vs_samples "$model" 16