ITFormer / scripts /run_rats40k_2gpu.sh
a12354's picture
Add files using upload-large-folder tool
aabdb98 verified
Raw
History Blame Contribute Delete
13.3 kB
#!/usr/bin/env bash
# ============================================================
# ITFormer × RATs40K univariate — 2-GPU full pipeline
#
# Stage A: Pre-train TimeSeriesEncoder (MAE)
# Stage B: SFT TLM (TS Encoder + ITFormer + LLM)
# Stage C: Eval on TSAD_test
#
# Usage:
# bash scripts/run_rats40k_2gpu.sh
#
# Override examples:
# CUDA_VISIBLE_DEVICES=2,3 LLM_MODEL_PATH=/path/to/Qwen2.5-3B \
# IT_D_MODEL=2048 IT_N_HEADS=16 SFT_LR=1e-5 bash scripts/run_rats40k_2gpu.sh
# ============================================================
# Strict mode, spelled out with long option names:
#   errexit  - stop at the first unhandled failing command
#   errtrace - let functions and subshells inherit the ERR trap
#   nounset  - expanding an unset variable is an error
#   pipefail - a pipeline fails when any of its stages fails
set -o errexit -o errtrace -o nounset -o pipefail
# Work from the parent of this script's directory so that the relative
# paths used further down do not depend on where the script was called from.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
PROJECT_DIR="$(cd "${script_dir}/.." && pwd)"
cd "$PROJECT_DIR"
# Every tunable below follows the same rule: a non-empty value already
# present in the environment wins, otherwise the default shown is assigned
# (`: "${NAME:=default}"`).

# ── environment ──────────────────────────────────────────────────────────
: "${PYTHON_BIN:=/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# accelerate is always taken from the directory that holds PYTHON_BIN.
ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"
: "${ACCELERATE_CONFIG:=accelerate_config_2gpu.yaml}"
: "${CUDA_VISIBLE_DEVICES:=0,1}"
# Timestamp-based run id; it is embedded in output and log paths below.
if [[ -z "${RUN_ID:-}" ]]; then
  RUN_ID="$(date +%Y%m%d_%H%M%S)"
fi

# ── data ─────────────────────────────────────────────────────────────────
: "${DATA_PATH:=/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"

# ── TS encoder (PatchTST) ────────────────────────────────────────────────
# Trained from scratch on RATs40K in Stage A. With patch_len=16 and
# input_len=128 the encoder sees 8 patches, which suits the short RATs40K
# sequences (len 32-128) for the MAE objective.
# Inputs are per-instance z-scored in the dataset before the encoder.
: "${PATCH_LEN:=16}"
: "${STRIDE:=16}"
: "${INPUT_LEN:=128}"
: "${D_MODEL:=512}"
: "${N_HEADS:=8}"
: "${E_LAYERS:=4}"

# ── LLM ──────────────────────────────────────────────────────────────────
# Default: Qwen2.5-7B (text), matching Time-RA's 7B-scale RATs40K experiments.
: "${LLM_MODEL_PATH:=/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
# Smaller options (override LLM_MODEL_PATH + IT_D_MODEL + IT_N_HEADS together):
#   0.5B : LLM/Qwen2.5-0.5B-Instruct                    IT_D_MODEL=896  IT_N_HEADS=8
#   3B   : /mnt/share01/sqk/models/qwen2.5-3b-instruct  IT_D_MODEL=2048 IT_N_HEADS=16
# Flash Attention 2 on the LLM (forces bf16 weights).
# Set to "eager"/"sdpa"/"" to disable.
: "${LLM_ATTN_IMPLEMENTATION:=flash_attention_2}"

# ── ITFormer adapter ─────────────────────────────────────────────────────
# IT_D_MODEL must equal LLM hidden_size: 896 (0.5B) / 2048 (3B) / 3584 (7B)
: "${IT_D_MODEL:=3584}"
: "${IT_N_HEADS:=28}"
: "${IT_LAYERS:=2}"
: "${PREFIX_NUM:=8}"            # number of TS summary tokens injected into the LLM
: "${ADAPTER_TYPE:=itformer}"

# ── Stage A (pretrain) ────────────────────────────────────────────────────
# Default: pre-train PatchTST from scratch on RATs40K (MAE).
# auto = skip when the RATs40K-pretrained checkpoint already exists.
: "${RUN_STAGE_A:=auto}"        # auto | true | false
: "${PRETRAIN_OUTPUT_DIR:=save/pretrain_rats40k_${RUN_ID}}"
: "${PRETRAIN_FINAL_DIR:=save/pretrain_rats40k}"
# RATs40K-pretrained PatchTST encoder (patch_len=16, d_model=512, e_layers=4)
: "${TS_ENCODER_CKPT:=${PRETRAIN_FINAL_DIR}/model.safetensors}"
: "${PRETRAIN_BATCH:=256}"
: "${PRETRAIN_GRAD_ACCUM:=1}"
: "${PRETRAIN_LR:=1e-4}"
: "${PRETRAIN_EPOCHS:=20}"
: "${MIN_MASK_RATIO:=0.4}"
: "${MAX_MASK_RATIO:=0.6}"

# ── Stage B (SFT) ─────────────────────────────────────────────────────────
# Defaults tuned for 7B on 2 GPUs.
# effective batch = SFT_BATCH * SFT_GRAD_ACCUM * 2 = 32
: "${SFT_OUTPUT_DIR:=save/sft_rats40k_${RUN_ID}}"
: "${SFT_BATCH:=1}"
: "${SFT_GRAD_ACCUM:=16}"
: "${SFT_LR:=2e-5}"
: "${SFT_EPOCHS:=3}"
: "${SFT_SAVE_STEPS:=500}"
: "${SFT_LOGGING_STEPS:=20}"
: "${FREEZE_TS:=true}"          # freeze TS encoder during SFT
: "${BF16:=true}"

# ── Stage C (eval) ────────────────────────────────────────────────────────
: "${EVAL_BATCH:=4}"
: "${EVAL_MAX_NEW_TOKENS:=256}"
: "${EVAL_OUTPUT_DIR:=inference_results_rats40k_${RUN_ID}}"

# ── logging ───────────────────────────────────────────────────────────────
: "${LOG_DIR:=logs}"
: "${LOG_FILE:=${LOG_DIR}/rats40k_2gpu_${RUN_ID}.log}"
# The log directory must exist before output is redirected into LOG_FILE.
mkdir -p "$LOG_DIR"
# ── helpers ───────────────────────────────────────────────────────────────
#######################################
# Report a failed command on stderr and terminate with its exit status.
# Arguments:
#   $1 - exit status of the failed command (also used as the exit code)
#   $2 - line number where the failure happened
#   $3 - text of the command that failed
# Outputs:
#   A blank line, a FAILED banner, the current time and the failure details,
#   all written to stderr.
#######################################
on_error() {
  local status=$1
  local lineno=$2
  local failed_cmd=$3
  {
    printf '\n'
    printf '%s\n' "========== FAILED =========="
    printf 'Time: %s\n' "$(date)"
    printf 'Line: %s Exit: %s Cmd: %s\n' "$lineno" "$status" "$failed_cmd"
  } >&2
  exit "$status"
}
# Hand every failing command to on_error together with its exit status,
# line number and command text.
trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR
# From here on stdout and stderr both flow through tee, which appends
# everything to LOG_FILE while still passing it on to the original stdout.
exec > >(tee -a "$LOG_FILE") 2>&1
# Environment handed to the child processes launched by later stages.
export CUDA_VISIBLE_DEVICES \
  PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub" \
  TOKENIZERS_PARALLELISM=false \
  WANDB_MODE=offline \
  OMP_NUM_THREADS=4
# ── sanity checks ─────────────────────────────────────────────────────────
# Stop early, with a specific message, when a prerequisite is missing.

# The Python interpreter and the accelerate launcher must be executable.
for required_exe in "$PYTHON_BIN" "$ACCELERATE_BIN"; do
  if [[ ! -x "$required_exe" ]]; then
    echo "Not found: $required_exe"
    exit 1
  fi
done

if [[ ! -f "$ACCELERATE_CONFIG" ]]; then
  echo "Accelerate config missing: $ACCELERATE_CONFIG"
  exit 1
fi

if [[ ! -f "$DATA_PATH" ]]; then
  echo "RATs40K data not found: $DATA_PATH"
  exit 1
fi

# Python-side probe: prints the torch/accelerate versions, requires at least
# two visible CUDA devices and, when flash_attention_2 is requested, that the
# flash_attn module can be located. The attention setting is passed as argv[1].
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import sys, importlib.util
import torch, accelerate

print("torch:", torch.__version__)
print("accelerate:", accelerate.__version__)
print("cuda devices:", torch.cuda.device_count())
assert torch.cuda.device_count() >= 2, "Need at least 2 GPUs"

attn = sys.argv[1]
if attn == "flash_attention_2":
    ok = importlib.util.find_spec("flash_attn") is not None
    print("flash_attn installed:", ok)
    if not ok:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package.")
PY
#######################################
# Print the run banner: run id, visible GPUs, data path, LLM path and the
# main model hyper-parameters, framed by two rule lines.
# The title line previously carried a mis-encoded multiplication sign; it
# now prints "×".
# Globals (read):
#   RUN_ID, CUDA_VISIBLE_DEVICES, DATA_PATH, LLM_MODEL_PATH,
#   PATCH_LEN, INPUT_LEN, PREFIX_NUM, IT_D_MODEL, IT_N_HEADS
# Outputs:
#   Eight lines on stdout.
#######################################
print_run_banner() {
  local rule="================================================================"
  printf '%s\n' \
    "$rule" \
    "ITFormer × RATs40K | Run ID: ${RUN_ID}" \
    "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" \
    "Data: ${DATA_PATH}" \
    "LLM: ${LLM_MODEL_PATH}" \
    "patch_len=${PATCH_LEN}, input_len=${INPUT_LEN}, prefix_num=${PREFIX_NUM}" \
    "it_d_model=${IT_D_MODEL}, it_n_heads=${IT_N_HEADS}" \
    "$rule"
}
print_run_banner
# ── Stage A: pre-train TS encoder ─────────────────────────────────────────
# Reduce RUN_STAGE_A to a literal true/false:
#   auto       -> false when the encoder checkpoint file exists, else true
#   true/false -> used as given
#   other      -> rejected
if [[ "$RUN_STAGE_A" == "auto" ]]; then
  if [[ -f "$TS_ENCODER_CKPT" ]]; then
    RUN_STAGE_A=false
  else
    RUN_STAGE_A=true
  fi
elif [[ "$RUN_STAGE_A" != "true" && "$RUN_STAGE_A" != "false" ]]; then
  echo "Invalid RUN_STAGE_A=$RUN_STAGE_A"
  exit 1
fi
# Stage A: run the MAE pre-training job when requested, then publish the
# resulting weights to PRETRAIN_FINAL_DIR/model.safetensors.
if [[ "$RUN_STAGE_A" == "true" ]]; then
  echo ""
  echo "── Stage A: Pre-training TimeSeriesEncoder ──────────────────────"
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
    train_pretrain_rats40k.py \
    --data_path "$DATA_PATH" \
    --model TimeSeriesEncoder \
    --d_model "$D_MODEL" \
    --n_heads "$N_HEADS" \
    --e_layers "$E_LAYERS" \
    --patch_len "$PATCH_LEN" \
    --stride "$STRIDE" \
    --input_len "$INPUT_LEN" \
    --min_mask_ratio "$MIN_MASK_RATIO" \
    --max_mask_ratio "$MAX_MASK_RATIO" \
    --per_device_train_batch_size "$PRETRAIN_BATCH" \
    --gradient_accumulation_steps "$PRETRAIN_GRAD_ACCUM" \
    --learning_rate "$PRETRAIN_LR" \
    --num_train_epochs "$PRETRAIN_EPOCHS" \
    --output_dir "$PRETRAIN_OUTPUT_DIR" \
    --dataloader_num_workers 4 \
    --report_to none

  # Copy final weights to the canonical path. Prefer the file written at the
  # top of the output dir; otherwise fall back to the most recently modified
  # checkpoint-*/model.safetensors. The search uses a glob instead of parsing
  # `ls`, and cp errors are no longer hidden.
  mkdir -p "$PRETRAIN_FINAL_DIR"
  pretrain_weights="$PRETRAIN_OUTPUT_DIR/model.safetensors"
  if [[ ! -f "$pretrain_weights" ]]; then
    pretrain_weights=""
    for ckpt_weights in "$PRETRAIN_OUTPUT_DIR"/checkpoint-*/model.safetensors; do
      # An unmatched glob stays literal; skip anything that is not a file.
      [[ -f "$ckpt_weights" ]] || continue
      if [[ -z "$pretrain_weights" || "$ckpt_weights" -nt "$pretrain_weights" ]]; then
        pretrain_weights="$ckpt_weights"
      fi
    done
  fi
  if [[ -z "$pretrain_weights" ]]; then
    # Previously this case ended in an opaque "cp: cannot stat ''" failure.
    echo "No model.safetensors found under: $PRETRAIN_OUTPUT_DIR"
    exit 1
  fi
  cp -- "$pretrain_weights" "$PRETRAIN_FINAL_DIR/model.safetensors"
  echo "Stage A done. TS encoder: ${TS_ENCODER_CKPT}"
else
  echo "Stage A: skipped (TS encoder exists: ${TS_ENCODER_CKPT})"
fi

# Later stages load TS_ENCODER_CKPT, so it must exist whether or not Stage A
# ran (it can differ from the published path when overridden by the caller).
if [[ ! -f "$TS_ENCODER_CKPT" ]]; then
  echo "TS encoder checkpoint not found: $TS_ENCODER_CKPT"
  exit 1
fi
# ── Stage B: SFT ──────────────────────────────────────────────────────────
# Launch train_sft_rats40k.py through accelerate, loading the TS encoder
# weights from TS_ENCODER_CKPT and writing to SFT_OUTPUT_DIR.
echo ""
echo "── Stage B: SFT (ITFormer + LLM) ───────────────────────────────"

# flash_attention_2 needs bf16/fp16 weights -> force bf16 on.
if [[ "$LLM_ATTN_IMPLEMENTATION" == "flash_attention_2" && "$BF16" != "true" ]]; then
  echo "flash_attention_2 requires bf16; forcing BF16=true"
  BF16="true"
fi

# Optional flags are collected in arrays so that each value reaches the
# trainer as exactly one argument, with no word splitting or globbing.
sft_bf16_args=()
if [[ "$BF16" == "true" ]]; then
  sft_bf16_args=(--bf16)
fi

sft_attn_args=()
# ATTN_FLAG (string form) stays defined for any later code that still reads it.
ATTN_FLAG=""
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  sft_attn_args=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
  ATTN_FLAG="--llm_attn_implementation $LLM_ATTN_IMPLEMENTATION"
fi
echo "LLM attention: ${LLM_ATTN_IMPLEMENTATION:-default}"

# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a plain
# "${arr[@]}" on an empty array is an error under nounset before bash 4.4.
"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
  train_sft_rats40k.py \
  --data_path "$DATA_PATH" \
  --model TimeSeriesEncoder \
  --d_model "$D_MODEL" \
  --n_heads "$N_HEADS" \
  --e_layers "$E_LAYERS" \
  --patch_len "$PATCH_LEN" \
  --stride "$STRIDE" \
  --input_len "$INPUT_LEN" \
  --load_ts_encoder "$TS_ENCODER_CKPT" \
  --it_d_model "$IT_D_MODEL" \
  --it_n_heads "$IT_N_HEADS" \
  --it_layers "$IT_LAYERS" \
  --prefix_num "$PREFIX_NUM" \
  --adapter_type "$ADAPTER_TYPE" \
  --llm_model_path "$LLM_MODEL_PATH" \
  --freeze_ts_model "$FREEZE_TS" \
  ${sft_attn_args[@]+"${sft_attn_args[@]}"} \
  --per_device_train_batch_size "$SFT_BATCH" \
  --gradient_accumulation_steps "$SFT_GRAD_ACCUM" \
  --learning_rate "$SFT_LR" \
  --num_train_epochs "$SFT_EPOCHS" \
  --save_steps "$SFT_SAVE_STEPS" \
  --logging_steps "$SFT_LOGGING_STEPS" \
  --output_dir "$SFT_OUTPUT_DIR" \
  --dataloader_num_workers 4 \
  ${sft_bf16_args[@]+"${sft_bf16_args[@]}"} \
  --report_to none
echo "Stage B done. SFT checkpoint: ${SFT_OUTPUT_DIR}"
# ── Stage C: eval ─────────────────────────────────────────────────────────
#######################################
# Print the most recently modified checkpoint-* directory inside a run
# directory, or the run directory itself when it holds no such directory.
#
# A glob is used instead of `ls -td ... | head -1`: with errexit and pipefail
# active, a failing `ls` (no checkpoint-* match) made the assignment fail and
# aborted the script before the fallback could take effect.
# Arguments:
#   $1 - run directory to search
# Outputs:
#   The selected path on stdout.
#######################################
find_latest_checkpoint() {
  local run_dir=$1
  local candidate
  local newest=""
  for candidate in "$run_dir"/checkpoint-*; do
    # An unmatched glob stays literal; skip it and any non-directory match.
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s\n' "${newest:-$run_dir}"
}
# Find the latest checkpoint under SFT_OUTPUT_DIR
LATEST_CKPT="$(find_latest_checkpoint "$SFT_OUTPUT_DIR")"
# Evaluate the selected checkpoint on the TSAD_test split with
# inference_rats40k.py; results go to EVAL_OUTPUT_DIR.
echo ""
echo "── Stage C: Evaluation ──────────────────────────────────────────"
echo "Checkpoint: ${LATEST_CKPT}"

# The optional attention flag is built here as an array, so the value is
# passed as one argument and is never word-split or glob-expanded.
eval_attn_args=()
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  eval_attn_args=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi

# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a plain
# "${arr[@]}" on an empty array is an error under nounset before bash 4.4.
"$PYTHON_BIN" inference_rats40k.py \
  --checkpoint "$LATEST_CKPT" \
  --data_path "$DATA_PATH" \
  --eval_split TSAD_test \
  --llm_model_path "$LLM_MODEL_PATH" \
  --patch_len "$PATCH_LEN" \
  --stride "$STRIDE" \
  --input_len "$INPUT_LEN" \
  --d_model "$D_MODEL" \
  --n_heads "$N_HEADS" \
  --e_layers "$E_LAYERS" \
  --it_d_model "$IT_D_MODEL" \
  --it_n_heads "$IT_N_HEADS" \
  --it_layers "$IT_LAYERS" \
  --prefix_num "$PREFIX_NUM" \
  --adapter_type "$ADAPTER_TYPE" \
  ${eval_attn_args[@]+"${eval_attn_args[@]}"} \
  --batch_size "$EVAL_BATCH" \
  --max_new_tokens "$EVAL_MAX_NEW_TOKENS" \
  --output_dir "$EVAL_OUTPUT_DIR"
# Closing summary: where each stage's artefacts and the run log were written.
summary_rule="================================================================"
printf '%s\n' \
  "" \
  "$summary_rule" \
  "All stages completed successfully." \
  " TS encoder : ${TS_ENCODER_CKPT}" \
  " SFT ckpt : ${SFT_OUTPUT_DIR}" \
  " Eval output: ${EVAL_OUTPUT_DIR}" \
  " Log : ${LOG_FILE}" \
  "$summary_rule"