Upload 2 files

887ef68 verified 2 months ago

4.33 kB

	#!/bin/bash
	# Strict mode: abort on command failure, on unset variables, and on a
	# failure anywhere inside a pipeline.
	set -o errexit -o nounset -o pipefail

	# Usage:
	#   nohup bash run_hg38_1024_multi_nt.sh \
	#     ft_data \
	#     full_output_multi_tune_hg38_1024 \
	#     genomic_bench_tune_hg38_1024 \
	#     0 > full_multi_tune_hg38_1024_3e-5.log 2>&1 &
	#
	# Positional arguments:
	#   $1  data_path     directory with the task data (e.g., ft_data); if it is
	#                     not a directory as given, it is also tried relative to
	#                     the directory containing this script
	#   $2  output_path   root directory for run outputs
	#   $3  project_name  forwarded to the training script
	#   $4  gpu_id        optional, default: 0; exported as CUDA_VISIBLE_DEVICES

	# Make the `conda` shell function available, then switch to the "bpe" env.
	. ~/miniconda3/etc/profile.d/conda.sh
	conda activate bpe

	# Absolute path of the directory this script lives in.
	SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

	# The first three arguments are mandatory; ${N:?msg} aborts with msg if absent.
	data_path=${1:?"Missing data_path"}
	output_path=${2:?"Missing output_path"}
	project_name=${3:?"Missing project_name"}
	gpu_id=${4:-0}

	CUDA_VISIBLE_DEVICES="${gpu_id}"
	export CUDA_VISIBLE_DEVICES

	# Fixed inputs: tuned hyperparameters, model checkpoint, and tokenizer.
	BEST_PARAMS_CSV="/home/n5huang/dna_token/best_params_len2_5120_by_task.csv"

	MODEL="/home/n5huang/dna_token/pretrain/models/base_5120/checkpoint-100000"
	TOKENIZER="/home/n5huang/dna_token/tokenizer_evaluation/baseline_bpe/vocab_5120/5120_tokenizer.json"
	MODEL_NAME="base_5120"

	# Resolve data_path: accept it as given when it is a directory, otherwise
	# fall back to the script-relative location, otherwise give up.
	if [[ ! -d "${data_path}" ]]; then
	  if [[ -d "${SCRIPT_DIR}/${data_path}" ]]; then
	    data_path="${SCRIPT_DIR}/${data_path}"
	  else
	    echo "data_path does not exist: ${data_path}" >&2
	    exit 1
	  fi
	fi

	# Per-task hyperparameters, keyed by task name. Initialised empty so that
	# lookups and ${#...[@]} behave predictably under `set -u`.
	declare -A TASK_LR=()
	declare -A TASK_WD=()
	declare -A TASK_WR=()
	declare -A TASK_EP=()
	declare -A TASK_SEED=()

	#######################################
	# Load the "NT" benchmark rows of a best-params CSV into the TASK_* maps.
	# Expected column order:
	#   benchmark,task,metric,best_score,lr,weight_decay,warmup_ratio,
	#   num_train_epochs,selected_epoch,seed,run_name
	# Fields are split on plain commas; quoted fields are not supported.
	# Globals:   TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (written)
	# Arguments: $1 - path to the CSV file
	# Outputs:   an error message on stderr if the file cannot be read
	# Returns:   0 on success, 1 if the CSV is missing or unreadable
	#######################################
	load_best_params() {
	  local csv_path="$1"
	  local line benchmark task lr weight_decay warmup_ratio selected_epoch seed

	  if [[ ! -f "${csv_path}" || ! -r "${csv_path}" ]]; then
	    echo "best params CSV is missing or unreadable: ${csv_path}" >&2
	    return 1
	  fi

	  # `|| [[ -n line ]]` keeps the final row when the file has no trailing
	  # newline (read returns non-zero there but still fills the variable).
	  while IFS= read -r line || [[ -n "${line}" ]]; do
	    # Tolerate CRLF files: drop the carriage return before splitting.
	    line="${line%$'\r'}"

	    # `_` swallows the columns this script does not use.
	    IFS=, read -r benchmark task _ _ lr weight_decay warmup_ratio _ selected_epoch seed _ <<< "${line}"

	    # Skips the header row, other benchmarks, blank lines, and rows without
	    # a task name (an empty associative-array subscript is an error).
	    [[ "${benchmark}" == "NT" && -n "${task}" ]] || continue

	    TASK_LR["${task}"]="${lr}"
	    TASK_WD["${task}"]="${weight_decay}"
	    TASK_WR["${task}"]="${warmup_ratio}"
	    # The epoch count is taken from selected_epoch, not num_train_epochs.
	    TASK_EP["${task}"]="${selected_epoch}"
	    TASK_SEED["${task}"]="${seed}"
	  done < "${csv_path}"
	}

	# A non-zero return here aborts the script through `set -e`.
	load_best_params "${BEST_PARAMS_CSV}"

	#######################################
	# Fine-tune one task with its tuned hyperparameters, unless it was already
	# run or its inputs are missing.
	# Globals:   data_path, output_path, project_name, MODEL, TOKENIZER,
	#            MODEL_NAME, TASK_LR, TASK_WD, TASK_WR, TASK_EP, TASK_SEED (read)
	# Arguments: $1 - task name (sub-directory of data_path, key of TASK_*)
	#            $2 - value passed to the trainer as --model_max_length
	# Outputs:   one [WARN]/[SKIP]/[RUN ] status line on stdout, plus whatever
	#            the training script prints
	# Returns:   0 when the task is skipped; otherwise the training command's
	#            exit status (a failure aborts the script when `set -e` is on)
	#######################################
	run_task() {
	  local task="$1"
	  local model_max_length="$2"

	  local split_dir="${data_path}/${task}/split"
	  local train_csv="${split_dir}/train.csv"

	  if [[ ! -f "${train_csv}" ]]; then
	    echo "[WARN] Missing ${train_csv}, skip ${task}"
	    return
	  fi

	  # The `:-` defaults matter: under `set -u`, expanding a missing key of an
	  # associative array is a fatal "unbound variable" error, which would kill
	  # the whole sweep before the warning below could ever be printed.
	  local best_lr="${TASK_LR[$task]:-}"
	  local best_wd="${TASK_WD[$task]:-}"
	  local best_wr="${TASK_WR[$task]:-}"
	  local best_ep="${TASK_EP[$task]:-}"
	  local best_seed="${TASK_SEED[$task]:-}"

	  if [[ -z "${best_lr}" ]]; then
	    echo "[WARN] No best params found in CSV for task ${task}, skip"
	    return
	  fi

	  # Everything below is function-local so nothing leaks between tasks.
	  local hp_tag="lr${best_lr}_wd${best_wd}_wr${best_wr}_ep${best_ep}_seed${best_seed}"
	  local run_name="base5120_${task}_${hp_tag}"
	  local run_output_dir="${output_path}/${task}/${MODEL_NAME}/${hp_tag}"
	  local result_json="${run_output_dir}/results/${run_name}/eval_results.json"

	  # An existing eval_results.json marks the run as done, so re-invoking the
	  # script resumes the sweep where it stopped.
	  if [[ -f "${result_json}" ]]; then
	    echo "[SKIP] ${run_name}"
	    return
	  fi

	  mkdir -p "${run_output_dir}"
	  echo "[RUN ] ${run_name}"

	  # Built as an array so every argument survives as exactly one word.
	  local -a cmd=(
	    python /home/n5huang/dna_token/mario/Finetune-NucleotideTransformerBenchmarks/train.py
	    --model_name_or_path "${MODEL}"
	    --tokenizer_path "${TOKENIZER}"
	    --trust_remote_code True
	    --data_path "${split_dir}"
	    --kmer -1
	    --run_name "${run_name}"
	    --model_max_length "${model_max_length}"
	    --per_device_train_batch_size 128
	    --per_device_eval_batch_size 128
	    --gradient_accumulation_steps 1
	    --learning_rate "${best_lr}"
	    --weight_decay "${best_wd}"
	    --num_train_epochs "${best_ep}"
	    --lr_scheduler_type linear
	    --warmup_steps 0
	    --warmup_ratio "${best_wr}"
	    --fp16
	    --output_dir "${run_output_dir}"
	    --evaluation_strategy epoch
	    --save_strategy epoch
	    --load_best_model_at_end True
	    --metric_for_best_model eval_f1
	    --greater_is_better True
	    --save_total_limit 1
	    --save_model True
	    --logging_steps 100
	    --overwrite_output_dir True
	    --log_level info
	    --seed "${best_seed}"
	    --find_unused_parameters False
	    --project_name "${project_name}"
	  )
	  "${cmd[@]}"
	}

	#######################################
	# Run every task of one group with the same --model_max_length.
	# Arguments: $1 - model_max_length shared by the group
	#            $2.. - task names, processed in the order given
	#######################################
	run_group() {
	  local max_len="$1"
	  shift

	  local name
	  for name in "$@"; do
	    run_task "${name}" "${max_len}"
	  done
	}

	# Task groups and the max length used for each, in execution order.
	run_group 100 enhancers enhancers_types
	run_group 80 promoter_all promoter_no_tata promoter_tata
	run_group 140 splice_sites_acceptors splice_sites_all splice_sites_donors
	run_group 220 \
	  H2AFZ H3K27ac H3K27me3 H3K36me3 H3K4me1 \
	  H3K4me2 H3K4me3 H3K9ac H3K9me3 H4K20me1