#!/bin/bash
set -euo pipefail

# Fine-tune a pretrained checkpoint on a fixed list of benchmark tasks, using
# the per-task hyperparameters recorded in a best-params CSV, by invoking
# train.py once per task.
#
# Usage: <script> <data_path> <output_path> <project_name> [gpu_id]
#   data_path     directory containing <task>/split/train.csv for each task;
#                 if it is not a directory as given, it is also tried relative
#                 to the directory of this script
#   output_path   root directory under which per-task run outputs are written
#   project_name  forwarded to train.py as --project_name
#   gpu_id        exported as CUDA_VISIBLE_DEVICES (default: 0)

# conda's shell hook and the activate.d scripts of installed packages may
# expand unset variables, which is fatal under `set -u`. Relax nounset only
# for the activation and restore it immediately afterwards.
set +u
# shellcheck source=/dev/null
source ~/miniconda3/etc/profile.d/conda.sh
conda activate bpe
set -u
|
|
# Absolute directory of this script, used as a fallback base for data_path.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"

# Fixed inputs: hyperparameter table, pretrained checkpoint and tokenizer.
BEST_PARAMS_CSV="/home/n5huang/dna_token/best_params_len2_5120_by_task.csv"
MODEL="/home/n5huang/dna_token/pretrain/models/base_5120/checkpoint-100000"
TOKENIZER="/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_5120/5120_tokenizer.json"
MODEL_NAME="base_5120"

# Positional arguments: the first three are mandatory, gpu_id defaults to 0.
data_path=${1:?"Missing data_path"}
output_path=${2:?"Missing output_path"}
project_name=${3:?"Missing project_name"}
gpu_id=${4:-0}

# Restrict the processes started below to the requested GPU.
CUDA_VISIBLE_DEVICES="${gpu_id}"
export CUDA_VISIBLE_DEVICES

# Resolve data_path: accept it as given, otherwise try it relative to the
# script directory, otherwise report the path as given and stop.
if [[ ! -d "${data_path}" ]]; then
  if [[ -d "${SCRIPT_DIR}/${data_path}" ]]; then
    data_path="${SCRIPT_DIR}/${data_path}"
  else
    echo "data_path does not exist: ${data_path}" >&2
    exit 1
  fi
fi
|
|
# Per-task hyperparameters, keyed by task name; filled by load_best_params.
declare -A TASK_LR
declare -A TASK_WD
declare -A TASK_WR
declare -A TASK_EP
declare -A TASK_SEED

#######################################
# Load the hyperparameters of every NT row of a best-params CSV.
# Columns are read positionally in this order:
#   benchmark,task,metric,best_score,lr,weight_decay,warmup_ratio,
#   num_train_epochs,selected_epoch,seed,run_name
# The header row and rows whose benchmark is not "NT" are ignored.
# Note that TASK_EP stores selected_epoch, not num_train_epochs.
# Globals:   TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (written)
# Arguments: $1 - path to the CSV file
# Returns:   0 on success, 1 if the CSV file does not exist
#######################################
load_best_params() {
  local csv_path="$1"
  # Keep the read targets local so they do not leak into the rest of the script.
  local benchmark task metric best_score lr weight_decay warmup_ratio
  local num_train_epochs selected_epoch seed run_name

  if [[ ! -f "${csv_path}" ]]; then
    echo "best-params CSV does not exist: ${csv_path}" >&2
    return 1
  fi

  # `read` returns non-zero on a final line that lacks a trailing newline even
  # though it has filled the variables; the `|| [[ -n ... ]]` keeps that row.
  while IFS=, read -r benchmark task metric best_score lr weight_decay \
      warmup_ratio num_train_epochs selected_epoch seed run_name \
      || [[ -n "${benchmark}" ]]; do
    [[ "${benchmark}" == "benchmark" ]] && continue
    [[ "${benchmark}" != "NT" ]] && continue
    # An empty key is not a valid associative-array subscript.
    [[ -n "${task}" ]] || continue

    TASK_LR["${task}"]="${lr}"
    TASK_WD["${task}"]="${weight_decay}"
    TASK_WR["${task}"]="${warmup_ratio}"
    TASK_EP["${task}"]="${selected_epoch}"
    TASK_SEED["${task}"]="${seed}"
  done < "${csv_path}"

  return 0
}

load_best_params "${BEST_PARAMS_CSV}"
|
|
#######################################
# Fine-tune and evaluate the configured model on a single task by running
# train.py with the hyperparameters stored for that task.
# The task is skipped (status 0, message on stdout) when its train.csv is
# missing, when no hyperparameters are recorded for it, or when an
# eval_results.json for the same run already exists.
# Globals:   data_path, output_path, project_name, MODEL, TOKENIZER,
#            MODEL_NAME, TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (read)
# Arguments: $1 - task name: sub-directory of data_path and key into TASK_*
#            $2 - value forwarded to train.py as --model_max_length
# Returns:   0 when skipped, otherwise the exit status of train.py
#######################################
run_task() {
  local task="$1"
  local model_max_length="$2"

  local split_dir="${data_path}/${task}/split"
  local train_csv="${split_dir}/train.csv"

  if [[ ! -f "${train_csv}" ]]; then
    echo "[WARN] Missing ${train_csv}, skip ${task}"
    return
  fi

  # The `:-` defaults are required: under `set -u`, expanding a missing
  # associative-array key aborts the shell with "unbound variable" before the
  # emptiness check below could ever run.
  local best_lr="${TASK_LR[$task]:-}"
  local best_wd="${TASK_WD[$task]:-}"
  local best_wr="${TASK_WR[$task]:-}"
  local best_ep="${TASK_EP[$task]:-}"
  local best_seed="${TASK_SEED[$task]:-}"

  if [[ -z "${best_lr}" ]]; then
    echo "[WARN] No best params found in CSV for task ${task}, skip"
    return
  fi

  # The hyperparameter tag makes both the run name and the output directory
  # unique per configuration.
  local hp_tag="lr${best_lr}_wd${best_wd}_wr${best_wr}_ep${best_ep}_seed${best_seed}"
  local run_name="base5120_${task}_${hp_tag}"
  local run_output_dir="${output_path}/${task}/${MODEL_NAME}/${hp_tag}"
  local result_json="${run_output_dir}/results/${run_name}/eval_results.json"

  # An existing result file marks the run as done, so re-running the script
  # only executes the runs that are still missing.
  if [[ -f "${result_json}" ]]; then
    echo "[SKIP] ${run_name}"
    return
  fi

  mkdir -p "${run_output_dir}"
  echo "[RUN ] ${run_name}"

  # Built as an array so every argument reaches train.py as a single word.
  local -a cmd=(
    python /home/n5huang/dna_token/mario/Finetune-NucleotideTransformerBenchmarks/train.py
    --model_name_or_path "${MODEL}"
    --tokenizer_path "${TOKENIZER}"
    --trust_remote_code True
    --data_path "${split_dir}"
    --kmer -1
    --run_name "${run_name}"
    --model_max_length "${model_max_length}"
    --per_device_train_batch_size 128
    --per_device_eval_batch_size 128
    --gradient_accumulation_steps 1
    --learning_rate "${best_lr}"
    --weight_decay "${best_wd}"
    --num_train_epochs "${best_ep}"
    --lr_scheduler_type linear
    --warmup_steps 0
    --warmup_ratio "${best_wr}"
    --fp16
    --output_dir "${run_output_dir}"
    --evaluation_strategy epoch
    --save_strategy epoch
    --load_best_model_at_end True
    --metric_for_best_model eval_f1
    --greater_is_better True
    --save_total_limit 1
    --save_model True
    --logging_steps 100
    --overwrite_output_dir True
    --log_level info
    --seed "${best_seed}"
    --find_unused_parameters False
    --project_name "${project_name}"
  )
  "${cmd[@]}"
}
|
|
# Task groups, processed in order. Each entry is
# "<model_max_length> <task> [<task> ...]"; the first field is passed to
# run_task as its second argument for every task in the group.
task_groups=(
  "100 enhancers enhancers_types"
  "80 promoter_all promoter_no_tata promoter_tata"
  "140 splice_sites_acceptors splice_sites_all splice_sites_donors"
  "220 H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me2 H3K4me3 H3K9ac H3K9me3 H4K20me1"
)

for group_spec in "${task_groups[@]}"; do
  # Split the entry on whitespace: field 0 is the length, the rest are tasks.
  read -r -a group_fields <<< "${group_spec}"
  group_max_length="${group_fields[0]}"
  for task in "${group_fields[@]:1}"; do
    run_task "${task}" "${group_max_length}"
  done
done
|
|