#!/bin/bash
set -euo pipefail

# Usage:
# nohup bash run_hg38_1024_multi_nt.sh \
#   ft_data \
#   full_output_multi_tune_hg38_1024 \
#   genomic_bench_tune_hg38_1024 \
#   0 > full_multi_tune_hg38_1024_3e-5.log 2>&1 &
#
# Args:
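#   1) data_path     (e.g., ft_data; if it is not a directory as given, <script_dir>/<data_path> is tried)
#   2) output_path   (root for run outputs: <output_path>/<task>/<MODEL_NAME>/<hp_tag>)
#   3) project_name  (forwarded to train.py as --project_name)
#   4) gpu_id        (optional, default: 0; exported as CUDA_VISIBLE_DEVICES)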

# Directory containing this script; used as a fallback base for a relative
# data_path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Parse and validate the positional arguments before the (slow) conda
# activation so that a bad invocation fails immediately.
data_path=${1:?"Missing data_path"}
output_path=${2:?"Missing output_path"}
project_name=${3:?"Missing project_name"}
gpu_id=${4:-0}

# If data_path is not a directory as given, try it relative to the script.
if [[ ! -d "${data_path}" && -d "${SCRIPT_DIR}/${data_path}" ]]; then
    data_path="${SCRIPT_DIR}/${data_path}"
fi

if [[ ! -d "${data_path}" ]]; then
    echo "data_path does not exist: ${data_path}" >&2
    exit 1
fi

# conda's shell hook and per-package activate.d scripts may reference unset
# variables, which would abort the script under `set -u`. Suspend nounset only
# for the activation and restore it right after.
set +u
source ~/miniconda3/etc/profile.d/conda.sh
conda activate bpe
set -u

# Restrict the training process to the requested GPU.
export CUDA_VISIBLE_DEVICES="${gpu_id}"

# CSV holding the selected hyperparameters per task.
BEST_PARAMS_CSV="/home/n5huang/dna_token/best_params_len2_5120_by_task.csv"

# Checkpoint and tokenizer passed to train.py; MODEL_NAME is used as a path
# component of each run's output directory.
MODEL="/home/n5huang/dna_token/pretrain/models/base_5120/checkpoint-100000"
TOKENIZER="/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_5120/5120_tokenizer.json"
MODEL_NAME="base_5120"

# Per-task hyperparameters, keyed by task name. Filled by load_best_params.
declare -A TASK_LR=()
declare -A TASK_WD=()
declare -A TASK_WR=()
declare -A TASK_EP=()
declare -A TASK_SEED=()

# Populate the TASK_* maps from a best-params CSV.
#
# Arguments:
#   $1 - path to the CSV. Expected column order:
#        benchmark,task,metric,best_score,lr,weight_decay,warmup_ratio,
#        num_train_epochs,selected_epoch,seed,run_name
# Globals written: TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED
# Returns: 0 on success, non-zero if the CSV is missing or unreadable.
#
# Only rows whose benchmark column is exactly "NT" are kept, which also skips
# the header row. TASK_EP is taken from selected_epoch, not num_train_epochs.
# Fields are split on every comma; quoted fields containing commas are not
# supported.
load_best_params() {
    local csv_path="$1"
    local benchmark task lr weight_decay warmup_ratio selected_epoch seed
    # Columns that must be consumed to keep positions aligned but are unused.
    local _metric _best_score _num_train_epochs _run_name

    if [[ ! -f "${csv_path}" || ! -r "${csv_path}" ]]; then
        echo "best-params CSV is missing or unreadable: ${csv_path}" >&2
        return 1
    fi

    # `|| [[ -n ... ]]` keeps the final row when the file has no trailing
    # newline (read returns non-zero at EOF but still fills the variables).
    while IFS=, read -r benchmark task _metric _best_score lr weight_decay \
            warmup_ratio _num_train_epochs selected_epoch seed _run_name \
            || [[ -n "${benchmark}" ]]; do
        [[ "${benchmark}" == "NT" ]] || continue
        # An empty key is an invalid associative-array subscript in bash.
        [[ -n "${task}" ]] || continue

        TASK_LR["${task}"]="${lr}"
        TASK_WD["${task}"]="${weight_decay}"
        TASK_WR["${task}"]="${warmup_ratio}"
        TASK_EP["${task}"]="${selected_epoch}"
        TASK_SEED["${task}"]="${seed}"
    done < "${csv_path}"
}

# With errexit enabled at the top of the script, a failure here aborts the run.
load_best_params "${BEST_PARAMS_CSV}"

# Fine-tune the model on a single task using that task's stored
# hyperparameters.
#
# Arguments:
#   $1 - task name (sub-directory of data_path and key into the TASK_* maps)
#   $2 - value forwarded to train.py as --model_max_length
# Globals read: data_path, output_path, project_name, MODEL, TOKENIZER,
#               MODEL_NAME, TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED
# Returns: 0 when the task is skipped (no train.csv, no hyperparameters, or
#          eval_results.json already present); otherwise train.py's status.
run_task() {
    local task="$1"
    local model_max_length="$2"

    local split_dir="${data_path}/${task}/split"
    local train_csv="${split_dir}/train.csv"

    if [[ ! -f "${train_csv}" ]]; then
        echo "[WARN] Missing ${train_csv}, skip ${task}" >&2
        return 0
    fi

    # The `:-` defaults matter: under `set -u`, reading a missing key of an
    # associative array is an "unbound variable" error that would abort the
    # whole script before the check below could skip the task.
    local best_lr="${TASK_LR[$task]:-}"
    local best_wd="${TASK_WD[$task]:-}"
    local best_wr="${TASK_WR[$task]:-}"
    local best_ep="${TASK_EP[$task]:-}"
    local best_seed="${TASK_SEED[$task]:-}"

    if [[ -z "${best_lr}" ]]; then
        echo "[WARN] No best params found in CSV for task ${task}, skip" >&2
        return 0
    fi

    local hp_tag="lr${best_lr}_wd${best_wd}_wr${best_wr}_ep${best_ep}_seed${best_seed}"
    local run_name="base5120_${task}_${hp_tag}"
    local run_output_dir="${output_path}/${task}/${MODEL_NAME}/${hp_tag}"
    local result_json="${run_output_dir}/results/${run_name}/eval_results.json"

    # An existing eval_results.json marks the run as already done, which makes
    # the script safe to re-launch.
    if [[ -f "${result_json}" ]]; then
        echo "[SKIP] ${run_name}"
        return 0
    fi

    mkdir -p "${run_output_dir}"
    echo "[RUN ] ${run_name}"

    # Built as an array so every argument reaches train.py as a single word.
    local -a cmd=(
        python /home/n5huang/dna_token/mario/Finetune-NucleotideTransformerBenchmarks/train.py
        --model_name_or_path "${MODEL}"
        --tokenizer_path "${TOKENIZER}"
        --trust_remote_code True
        --data_path "${split_dir}"
        --kmer -1
        --run_name "${run_name}"
        --model_max_length "${model_max_length}"
        --per_device_train_batch_size 128
        --per_device_eval_batch_size 128
        --gradient_accumulation_steps 1
        --learning_rate "${best_lr}"
        --weight_decay "${best_wd}"
        --num_train_epochs "${best_ep}"
        --lr_scheduler_type linear
        --warmup_steps 0
        --warmup_ratio "${best_wr}"
        --fp16
        --output_dir "${run_output_dir}"
        --evaluation_strategy epoch
        --save_strategy epoch
        --load_best_model_at_end True
        --metric_for_best_model eval_f1
        --greater_is_better True
        --save_total_limit 1
        --save_model True
        --logging_steps 100
        --overwrite_output_dir True
        --log_level info
        --seed "${best_seed}"
        --find_unused_parameters False
        --project_name "${project_name}"
    )
    "${cmd[@]}"
}

# Run every listed task with one shared model_max_length.
#   $1     - model_max_length forwarded to run_task
#   $2...  - task names, processed in the order given
run_task_group() {
    local max_len="$1"
    shift

    local name
    for name in "$@"; do
        run_task "${name}" "${max_len}"
    done
}

# One line per task family: <model_max_length> <task> [<task> ...]
run_task_group 100 enhancers enhancers_types
run_task_group 80 promoter_all promoter_no_tata promoter_tata
run_task_group 140 splice_sites_acceptors splice_sites_all splice_sites_donors
run_task_group 220 \
    H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 \
    H3K4me2 H3K4me3 H3K9ac H3K9me3 H4K20me1