#!/bin/bash
set -euo pipefail

# Fine-tune the base_5120 checkpoint on a fixed list of tasks, using the
# per-task hyperparameters stored in BEST_PARAMS_CSV (rows whose first
# column is "NT").
#
# Usage:
#   nohup bash run_hg38_1024_multi_nt.sh \
#     ft_data \
#     full_output_multi_tune_hg38_1024 \
#     genomic_bench_tune_hg38_1024 \
#     0 > full_multi_tune_hg38_1024_3e-5.log 2>&1 &
#
# Args:
#   1) data_path     (e.g., ft_data); resolved relative to this script's
#                    directory when it is not a directory as given
#   2) output_path
#   3) project_name
#   4) gpu_id        (optional, default: 0)

# conda's shell hooks may reference unset variables, which would abort the
# script under `set -u`; relax nounset only while they run.
set +u
source ~/miniconda3/etc/profile.d/conda.sh
conda activate bpe
set -u

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

data_path=${1:?"Missing data_path"}
output_path=${2:?"Missing output_path"}
project_name=${3:?"Missing project_name"}
gpu_id=${4:-0}

export CUDA_VISIBLE_DEVICES="${gpu_id}"

readonly BEST_PARAMS_CSV="/home/n5huang/dna_token/best_params_len2_5120_by_task.csv"
readonly MODEL="/home/n5huang/dna_token/pretrain/models/base_5120/checkpoint-100000"
readonly TOKENIZER="/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_5120/5120_tokenizer.json"
readonly MODEL_NAME="base_5120"

# Fall back to a path relative to the script directory when the given
# data_path is not a directory as given.
if [[ ! -d "${data_path}" && -d "${SCRIPT_DIR}/${data_path}" ]]; then
  data_path="${SCRIPT_DIR}/${data_path}"
fi

if [[ ! -d "${data_path}" ]]; then
  echo "data_path does not exist: ${data_path}" >&2
  exit 1
fi

if [[ ! -f "${BEST_PARAMS_CSV}" ]]; then
  echo "best params CSV does not exist: ${BEST_PARAMS_CSV}" >&2
  exit 1
fi

# Per-task hyperparameters, keyed by task name.
declare -A TASK_LR=()
declare -A TASK_WD=()
declare -A TASK_WR=()
declare -A TASK_EP=()
declare -A TASK_SEED=()

# Column order follows the read statement below; columns prefixed with "_"
# are parsed but not used. The `|| [[ -n ... ]]` keeps the final row when the
# file lacks a trailing newline (read returns non-zero but still fills vars).
while IFS=, read -r benchmark task _metric _best_score lr weight_decay \
  warmup_ratio _num_train_epochs selected_epoch seed _run_name \
  || [[ -n "${benchmark}" ]]; do
  # Skip the header row and rows belonging to other benchmarks.
  [[ "${benchmark}" == "benchmark" ]] && continue
  [[ "${benchmark}" != "NT" ]] && continue
  TASK_LR["${task}"]="${lr}"
  TASK_WD["${task}"]="${weight_decay}"
  TASK_WR["${task}"]="${warmup_ratio}"
  # NOTE: the epoch count passed to training is the CSV's selected_epoch
  # column, not num_train_epochs.
  TASK_EP["${task}"]="${selected_epoch}"
  TASK_SEED["${task}"]="${seed}"
done < "${BEST_PARAMS_CSV}"

#######################################
# Run one fine-tuning job for a task with its best hyperparameters.
# Skips (returning 0) when the task's train.csv is missing, when the CSV had
# no row for the task, or when eval_results.json for the run already exists.
# Globals:
#   data_path, output_path, project_name, MODEL, TOKENIZER, MODEL_NAME (read)
#   TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (read)
# Arguments:
#   $1 - task name (sub-directory of data_path, key into the TASK_* maps)
#   $2 - value passed to --model_max_length
# Outputs:
#   Progress / warning lines on stdout, plus whatever train.py prints.
# Returns:
#   0 on skip; otherwise the exit status of train.py.
#######################################
run_task() {
  local task="$1"
  local model_max_length="$2"
  local split_dir="${data_path}/${task}/split"
  local train_csv="${split_dir}/train.csv"

  if [[ ! -f "${train_csv}" ]]; then
    echo "[WARN] Missing ${train_csv}, skip ${task}"
    return 0
  fi

  # Default to empty so a task absent from the CSV reaches the warning below
  # instead of tripping `set -u` with an "unbound variable" abort.
  local best_lr="${TASK_LR[$task]:-}"
  local best_wd="${TASK_WD[$task]:-}"
  local best_wr="${TASK_WR[$task]:-}"
  local best_ep="${TASK_EP[$task]:-}"
  local best_seed="${TASK_SEED[$task]:-}"

  if [[ -z "${best_lr}" ]]; then
    echo "[WARN] No best params found in CSV for task ${task}, skip"
    return 0
  fi

  local hp_tag="lr${best_lr}_wd${best_wd}_wr${best_wr}_ep${best_ep}_seed${best_seed}"
  local run_name="base5120_${task}_${hp_tag}"
  local run_output_dir="${output_path}/${task}/${MODEL_NAME}/${hp_tag}"
  local result_json="${run_output_dir}/results/${run_name}/eval_results.json"

  # An existing results file marks the run as already completed.
  if [[ -f "${result_json}" ]]; then
    echo "[SKIP] ${run_name}"
    return 0
  fi

  mkdir -p "${run_output_dir}"
  echo "[RUN ] ${run_name}"

  local -a cmd=(
    python /home/n5huang/dna_token/mario/Finetune-NucleotideTransformerBenchmarks/train.py
    --model_name_or_path "${MODEL}"
    --tokenizer_path "${TOKENIZER}"
    --trust_remote_code True
    --data_path "${split_dir}"
    --kmer -1
    --run_name "${run_name}"
    --model_max_length "${model_max_length}"
    --per_device_train_batch_size 128
    --per_device_eval_batch_size 128
    --gradient_accumulation_steps 1
    --learning_rate "${best_lr}"
    --weight_decay "${best_wd}"
    --num_train_epochs "${best_ep}"
    --lr_scheduler_type linear
    --warmup_steps 0
    --warmup_ratio "${best_wr}"
    --fp16
    --output_dir "${run_output_dir}"
    --evaluation_strategy epoch
    --save_strategy epoch
    --load_best_model_at_end True
    --metric_for_best_model eval_f1
    --greater_is_better True
    --save_total_limit 1
    --save_model True
    --logging_steps 100
    --overwrite_output_dir True
    --log_level info
    --seed "${best_seed}"
    --find_unused_parameters False
    --project_name "${project_name}"
  )

  "${cmd[@]}"
}

# Task groups, each run with its own --model_max_length.
for task in enhancers enhancers_types; do
  run_task "${task}" 100
done

for task in promoter_all promoter_no_tata promoter_tata; do
  run_task "${task}" 80
done

for task in splice_sites_acceptors splice_sites_all splice_sites_donors; do
  run_task "${task}" 140
done

for task in H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me2 H3K4me3 H3K9ac H3K9me3 H4K20me1; do
  run_task "${task}" 220
done