nancyH's picture
Upload 2 files
887ef68 verified
#!/bin/bash
#
# Runs train.py once per task, using the hyperparameters stored for that task
# in the NT rows of BEST_PARAMS_CSV.
#
# Usage:
#   nohup bash run_hg38_1024_multi_nt.sh \
#     ft_data \
#     full_output_multi_tune_hg38_1024 \
#     genomic_bench_tune_hg38_1024 \
#     0 > full_multi_tune_hg38_1024_3e-5.log 2>&1 &
#
# Args:
#   1) data_path    (e.g., ft_data); used as given if it is a directory,
#                   otherwise looked up next to this script
#   2) output_path
#   3) project_name
#   4) gpu_id       (optional, default: 0); exported as CUDA_VISIBLE_DEVICES
set -euo pipefail

# conda's shell hook and package activate.d scripts are not guaranteed to be
# nounset-clean; an unset variable inside them would abort the whole run before
# any work starts. Relax -u only for the environment bootstrap.
set +u
source ~/miniconda3/etc/profile.d/conda.sh
conda activate bpe
set -u

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

data_path=${1:?"Missing data_path"}
output_path=${2:?"Missing output_path"}
project_name=${3:?"Missing project_name"}
gpu_id=${4:-0}
export CUDA_VISIBLE_DEVICES="${gpu_id}"

# Fixed inputs for this run: hyperparameter table, checkpoint and tokenizer.
BEST_PARAMS_CSV="/home/n5huang/dna_token/best_params_len2_5120_by_task.csv"
MODEL="/home/n5huang/dna_token/pretrain/models/base_5120/checkpoint-100000"
TOKENIZER="/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_5120/5120_tokenizer.json"
MODEL_NAME="base_5120"

# Resolve data_path: keep it when it already names a directory, otherwise fall
# back to the same relative path under the script's own directory.
if [[ ! -d "${data_path}" ]]; then
  if [[ -d "${SCRIPT_DIR}/${data_path}" ]]; then
    data_path="${SCRIPT_DIR}/${data_path}"
  else
    echo "data_path does not exist: ${data_path}" >&2
    exit 1
  fi
fi
# Per-task hyperparameter lookup tables, keyed by task name.
declare -A TASK_LR
declare -A TASK_WD
declare -A TASK_WR
declare -A TASK_EP
declare -A TASK_SEED

#######################################
# Fill the TASK_* lookup tables from a best-params CSV.
# Only rows whose first column is exactly "NT" are used; the header row and
# rows of other benchmarks are ignored. When a task appears more than once,
# the last row wins.
# Globals:
#   TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (written)
# Arguments:
#   $1 - path to the CSV; columns, in order: benchmark, task, metric,
#        best_score, lr, weight_decay, warmup_ratio, num_train_epochs,
#        selected_epoch, seed, run_name
# Returns:
#   0 on success, 1 if the CSV cannot be read
#######################################
load_best_params() {
  local csv_path="$1"
  # Several columns are read only to keep the positional split aligned.
  local benchmark task metric best_score lr weight_decay warmup_ratio
  local num_train_epochs selected_epoch seed run_name

  if [[ ! -r "${csv_path}" ]]; then
    echo "best params CSV is not readable: ${csv_path}" >&2
    return 1
  fi

  # `read` returns non-zero on a final line that lacks a trailing newline even
  # though it has filled the variables; the `|| [[ -n ... ]]` keeps that row.
  while IFS=, read -r benchmark task metric best_score lr weight_decay \
      warmup_ratio num_train_epochs selected_epoch seed run_name \
      || [[ -n "${benchmark}" ]]; do
    [[ "${benchmark}" == "NT" ]] || continue
    # An empty key is not a valid associative-array subscript.
    [[ -n "${task}" ]] || continue
    TASK_LR["${task}"]="${lr}"
    TASK_WD["${task}"]="${weight_decay}"
    TASK_WR["${task}"]="${warmup_ratio}"
    # The epoch count stored per task is the selected_epoch column.
    TASK_EP["${task}"]="${selected_epoch}"
    TASK_SEED["${task}"]="${seed}"
  done < "${csv_path}"
  return 0
}

load_best_params "${BEST_PARAMS_CSV}"
#######################################
# Fine-tune one task with the hyperparameters stored for it in the TASK_*
# lookup tables, unless the run can be skipped.
# A run is skipped (with a log line, status 0) when the task has no
# split/train.csv, when no hyperparameters are known for it, or when its
# eval_results.json already exists.
# Globals (read):
#   data_path, output_path, project_name, MODEL, TOKENIZER, MODEL_NAME,
#   TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED
# Arguments:
#   $1 - task name: subdirectory of data_path and key into the TASK_* tables
#   $2 - value passed to train.py as --model_max_length
# Returns:
#   0 when skipped, otherwise the exit status of train.py
#######################################
run_task() {
  local task="$1"
  local model_max_length="$2"
  local split_dir="${data_path}/${task}/split"
  local train_csv="${split_dir}/train.csv"

  if [[ ! -f "${train_csv}" ]]; then
    echo "[WARN] Missing ${train_csv}, skip ${task}"
    return 0
  fi

  # The ":-" defaults matter: with `set -u`, expanding a key that is absent
  # from an associative array aborts the shell, which would kill the whole
  # sweep instead of reaching the "skip" branch below.
  local best_lr="${TASK_LR[$task]:-}"
  local best_wd="${TASK_WD[$task]:-}"
  local best_wr="${TASK_WR[$task]:-}"
  local best_ep="${TASK_EP[$task]:-}"
  local best_seed="${TASK_SEED[$task]:-}"

  if [[ -z "${best_lr}" ]]; then
    echo "[WARN] No best params found in CSV for task ${task}, skip"
    return 0
  fi

  # Everything below is function-local so repeated calls do not leak or
  # overwrite same-named variables in the caller's scope.
  local hp_tag="lr${best_lr}_wd${best_wd}_wr${best_wr}_ep${best_ep}_seed${best_seed}"
  local run_name="base5120_${task}_${hp_tag}"
  local run_output_dir="${output_path}/${task}/${MODEL_NAME}/${hp_tag}"
  local result_json="${run_output_dir}/results/${run_name}/eval_results.json"

  # An existing result file marks the run as already done.
  if [[ -f "${result_json}" ]]; then
    echo "[SKIP] ${run_name}"
    return 0
  fi

  mkdir -p "${run_output_dir}"
  echo "[RUN ] ${run_name}"

  # Built as an array so every argument stays a single word when executed.
  local -a cmd=(
    python /home/n5huang/dna_token/mario/Finetune-NucleotideTransformerBenchmarks/train.py
    --model_name_or_path "${MODEL}"
    --tokenizer_path "${TOKENIZER}"
    --trust_remote_code True
    --data_path "${split_dir}"
    --kmer -1
    --run_name "${run_name}"
    --model_max_length "${model_max_length}"
    --per_device_train_batch_size 128
    --per_device_eval_batch_size 128
    --gradient_accumulation_steps 1
    --learning_rate "${best_lr}"
    --weight_decay "${best_wd}"
    --num_train_epochs "${best_ep}"
    --lr_scheduler_type linear
    --warmup_steps 0
    --warmup_ratio "${best_wr}"
    --fp16
    --output_dir "${run_output_dir}"
    --evaluation_strategy epoch
    --save_strategy epoch
    --load_best_model_at_end True
    --metric_for_best_model eval_f1
    --greater_is_better True
    --save_total_limit 1
    --save_model True
    --logging_steps 100
    --overwrite_output_dir True
    --log_level info
    --seed "${best_seed}"
    --find_unused_parameters False
    --project_name "${project_name}"
  )
  "${cmd[@]}"
}
# Task schedule: each entry is "<model_max_length>:<space-separated task names>".
# Entries, and the tasks inside an entry, are processed in the order listed.
task_groups=(
  "100:enhancers enhancers_types"
  "80:promoter_all promoter_no_tata promoter_tata"
  "140:splice_sites_acceptors splice_sites_all splice_sites_donors"
  "220:H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me2 H3K4me3 H3K9ac H3K9me3 H4K20me1"
)

for group in "${task_groups[@]}"; do
  group_max_length="${group%%:*}"
  # Split the name list on whitespace into an array.
  read -r -a group_tasks <<< "${group#*:}"
  for task in "${group_tasks[@]}"; do
    run_task "${task}" "${group_max_length}"
  done
done