#!/usr/bin/env bash
# ============================================================
# ITFormer × RATs40K univariate — 2-GPU full pipeline
#
#   Stage A: Pre-train TimeSeriesEncoder (MAE)
#   Stage B: SFT TLM (TS Encoder + ITFormer + LLM)
#   Stage C: Eval on TSAD_test
#
# Usage:
#   bash scripts/run_rats40k_2gpu.sh
#
# Override examples (every setting below can be overridden via the environment):
#   CUDA_VISIBLE_DEVICES=2,3 LLM_MODEL_PATH=/path/to/Qwen2.5-3B \
#   IT_D_MODEL=2048 IT_N_HEADS=16 SFT_LR=1e-5 bash scripts/run_rats40k_2gpu.sh
# ============================================================
set -Eeuo pipefail

# The script lives in <project>/scripts; all relative paths below are resolved
# from the project root.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# ── environment ──────────────────────────────────────────────────────────
PYTHON_BIN="${PYTHON_BIN:-/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# accelerate is taken from the same environment as the Python interpreter.
ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-accelerate_config_2gpu.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"

# ── data ─────────────────────────────────────────────────────────────────
DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"

# ── TS encoder (PatchTST) ────────────────────────────────────────────────
# Trained from scratch on RATs40K (Stage A). patch_len=16 / input_len=128 -> 8 patches,
# better suited to RATs40K short sequences (len 32-128) for the MAE objective.
# Inputs are per-instance z-scored in the dataset before the encoder.
PATCH_LEN="${PATCH_LEN:-16}"
STRIDE="${STRIDE:-16}"
INPUT_LEN="${INPUT_LEN:-128}"
D_MODEL="${D_MODEL:-512}"
N_HEADS="${N_HEADS:-8}"
E_LAYERS="${E_LAYERS:-4}"

# ── LLM ──────────────────────────────────────────────────────────────────
# Default: Qwen2.5-7B (text), matching Time-RA's 7B-scale RATs40K experiments.
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
# Smaller options (override LLM_MODEL_PATH + IT_D_MODEL + IT_N_HEADS together):
#   0.5B : LLM/Qwen2.5-0.5B-Instruct                    IT_D_MODEL=896  IT_N_HEADS=8
#   3B   : /mnt/share01/sqk/models/qwen2.5-3b-instruct  IT_D_MODEL=2048 IT_N_HEADS=16
# Flash Attention 2 on the LLM (forces bf16 weights). Set to "eager"/"sdpa"/"" to disable.
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-flash_attention_2}"

# ── ITFormer adapter ─────────────────────────────────────────────────────
# IT_D_MODEL must equal LLM hidden_size: 896 (0.5B) / 2048 (3B) / 3584 (7B)
IT_D_MODEL="${IT_D_MODEL:-3584}"
IT_N_HEADS="${IT_N_HEADS:-28}"
IT_LAYERS="${IT_LAYERS:-2}"
PREFIX_NUM="${PREFIX_NUM:-8}"                 # number of TS summary tokens injected into the LLM
ADAPTER_TYPE="${ADAPTER_TYPE:-itformer}"

# ── Stage A (pretrain) ───────────────────────────────────────────────────
# DEFAULT: pre-train PatchTST from scratch on RATs40K (MAE).
# auto = skip if the RATs40K-pretrained ckpt already exists.
RUN_STAGE_A="${RUN_STAGE_A:-auto}"            # auto | true | false
PRETRAIN_OUTPUT_DIR="${PRETRAIN_OUTPUT_DIR:-save/pretrain_rats40k_${RUN_ID}}"
PRETRAIN_FINAL_DIR="${PRETRAIN_FINAL_DIR:-save/pretrain_rats40k}"
# RATs40K-pretrained PatchTST encoder (patch_len=16, d_model=512, e_layers=4)
TS_ENCODER_CKPT="${TS_ENCODER_CKPT:-${PRETRAIN_FINAL_DIR}/model.safetensors}"
PRETRAIN_BATCH="${PRETRAIN_BATCH:-256}"
PRETRAIN_GRAD_ACCUM="${PRETRAIN_GRAD_ACCUM:-1}"
PRETRAIN_LR="${PRETRAIN_LR:-1e-4}"
PRETRAIN_EPOCHS="${PRETRAIN_EPOCHS:-20}"
MIN_MASK_RATIO="${MIN_MASK_RATIO:-0.4}"
MAX_MASK_RATIO="${MAX_MASK_RATIO:-0.6}"

# ── Stage B (SFT) ────────────────────────────────────────────────────────
# Defaults tuned for 7B on 2 GPUs.
# effective batch = SFT_BATCH * SFT_GRAD_ACCUM * 2 = 32
SFT_OUTPUT_DIR="${SFT_OUTPUT_DIR:-save/sft_rats40k_${RUN_ID}}"
SFT_BATCH="${SFT_BATCH:-1}"
SFT_GRAD_ACCUM="${SFT_GRAD_ACCUM:-16}"
SFT_LR="${SFT_LR:-2e-5}"
SFT_EPOCHS="${SFT_EPOCHS:-3}"
SFT_SAVE_STEPS="${SFT_SAVE_STEPS:-500}"
SFT_LOGGING_STEPS="${SFT_LOGGING_STEPS:-20}"
FREEZE_TS="${FREEZE_TS:-true}"                # freeze TS encoder during SFT
BF16="${BF16:-true}"

# ── Stage C (eval) ───────────────────────────────────────────────────────
EVAL_BATCH="${EVAL_BATCH:-4}"
EVAL_MAX_NEW_TOKENS="${EVAL_MAX_NEW_TOKENS:-256}"
EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-inference_results_rats40k_${RUN_ID}}"

# ── logging ──────────────────────────────────────────────────────────────
LOG_DIR="${LOG_DIR:-logs}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/rats40k_2gpu_${RUN_ID}.log}"
# LOG_FILE may be overridden to live outside LOG_DIR, so create its parent too;
# otherwise the tee below would fail to open the log.
mkdir -p "$LOG_DIR" "$(dirname "$LOG_FILE")"

# ── helpers ──────────────────────────────────────────────────────────────

# Report a failed command and exit with its status. Installed as the ERR trap.
# Arguments: $1 - exit status, $2 - line number, $3 - the failing command text
on_error() {
  echo "" >&2
  echo "========== FAILED ==========" >&2
  echo "Time: $(date)" >&2
  echo "Line: $2 Exit: $1 Cmd: $3" >&2
  exit "$1"
}

# Print a message to stderr and exit with status 1 (does not fire the ERR trap).
# Arguments: $* - message
die() {
  echo "$*" >&2
  exit 1
}

# Print the most recently modified existing path among the arguments, or an
# empty line when none of them exists. Always returns 0, so it is safe to use
# in a command substitution under `set -e -o pipefail` (unlike `ls -t | head`,
# which fails the whole assignment when the glob matches nothing).
# Arguments: $@ - candidate paths (an unmatched, still-literal glob is skipped)
newest_existing() {
  local newest="" candidate
  for candidate in "$@"; do
    [[ -e "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest="$candidate"
    fi
  done
  printf '%s\n' "$newest"
}

trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR

# From here on, everything written to stdout/stderr is also appended to the log.
exec > >(tee -a "$LOG_FILE") 2>&1

export CUDA_VISIBLE_DEVICES
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline
export OMP_NUM_THREADS=4

# ── sanity checks ────────────────────────────────────────────────────────
for bin in "$PYTHON_BIN" "$ACCELERATE_BIN"; do
  [[ -x "$bin" ]] || die "Not found: $bin"
done
[[ -f "$ACCELERATE_CONFIG" ]] || die "Accelerate config missing: $ACCELERATE_CONFIG"
[[ -f "$DATA_PATH" ]] || die "RATs40K data not found: $DATA_PATH"

# Verify the Python environment: prints library versions, requires >= 2 visible
# GPUs, and requires flash-attn when flash_attention_2 is requested.
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import sys, importlib.util
import torch, accelerate
print("torch:", torch.__version__)
print("accelerate:", accelerate.__version__)
print("cuda devices:", torch.cuda.device_count())
assert torch.cuda.device_count() >= 2, "Need at least 2 GPUs"
attn = sys.argv[1]
if attn == "flash_attention_2":
    ok = importlib.util.find_spec("flash_attn") is not None
    print("flash_attn installed:", ok)
    if not ok:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package.")
PY

echo "================================================================"
echo "ITFormer × RATs40K | Run ID: ${RUN_ID}"
echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo "Data: ${DATA_PATH}"
echo "LLM: ${LLM_MODEL_PATH}"
echo "patch_len=${PATCH_LEN}, input_len=${INPUT_LEN}, prefix_num=${PREFIX_NUM}"
echo "it_d_model=${IT_D_MODEL}, it_n_heads=${IT_N_HEADS}"
echo "================================================================"

# ── Stage A: pre-train TS encoder ────────────────────────────────────────
# Resolve "auto" to true/false depending on whether the encoder ckpt exists.
case "$RUN_STAGE_A" in
  auto)
    if [[ -f "$TS_ENCODER_CKPT" ]]; then
      RUN_STAGE_A=false
    else
      RUN_STAGE_A=true
    fi
    ;;
  true|false) ;;
  *) die "Invalid RUN_STAGE_A=$RUN_STAGE_A" ;;
esac

if [[ "$RUN_STAGE_A" == true ]]; then
  echo ""
  echo "── Stage A: Pre-training TimeSeriesEncoder ──────────────────────"
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
    train_pretrain_rats40k.py \
    --data_path "$DATA_PATH" \
    --model TimeSeriesEncoder \
    --d_model "$D_MODEL" \
    --n_heads "$N_HEADS" \
    --e_layers "$E_LAYERS" \
    --patch_len "$PATCH_LEN" \
    --stride "$STRIDE" \
    --input_len "$INPUT_LEN" \
    --min_mask_ratio "$MIN_MASK_RATIO" \
    --max_mask_ratio "$MAX_MASK_RATIO" \
    --per_device_train_batch_size "$PRETRAIN_BATCH" \
    --gradient_accumulation_steps "$PRETRAIN_GRAD_ACCUM" \
    --learning_rate "$PRETRAIN_LR" \
    --num_train_epochs "$PRETRAIN_EPOCHS" \
    --output_dir "$PRETRAIN_OUTPUT_DIR" \
    --dataloader_num_workers 4 \
    --report_to none

  # Copy final weights to the canonical path: prefer the top-level file in the
  # output dir, otherwise fall back to the newest checkpoint-*/model.safetensors.
  pretrained_weights="$PRETRAIN_OUTPUT_DIR/model.safetensors"
  if [[ ! -f "$pretrained_weights" ]]; then
    pretrained_weights="$(newest_existing "$PRETRAIN_OUTPUT_DIR"/checkpoint-*/model.safetensors)"
  fi
  [[ -n "$pretrained_weights" ]] \
    || die "Stage A produced no model.safetensors under: $PRETRAIN_OUTPUT_DIR"
  mkdir -p "$PRETRAIN_FINAL_DIR"
  cp -- "$pretrained_weights" "$PRETRAIN_FINAL_DIR/model.safetensors"
  echo "Stage A done. TS encoder: ${TS_ENCODER_CKPT}"
else
  echo "Stage A: skipped (TS encoder exists: ${TS_ENCODER_CKPT})"
fi

# NOTE: Stage A writes to PRETRAIN_FINAL_DIR/model.safetensors; if TS_ENCODER_CKPT
# is overridden to a different path, it must already exist at this point.
[[ -f "$TS_ENCODER_CKPT" ]] || die "TS encoder checkpoint not found: $TS_ENCODER_CKPT"

# ── Stage B: SFT ─────────────────────────────────────────────────────────
echo ""
echo "── Stage B: SFT (ITFormer + LLM) ───────────────────────────────"

# flash_attention_2 needs bf16/fp16 weights -> force bf16 on.
if [[ "$LLM_ATTN_IMPLEMENTATION" == "flash_attention_2" && "$BF16" != "true" ]]; then
  echo "flash_attention_2 requires bf16; forcing BF16=true"
  BF16="true"
fi
echo "LLM attention: ${LLM_ATTN_IMPLEMENTATION:-default}"

# Build the argument list as an array so optional flags are added without
# relying on unquoted word-splitting; the flag order matches the CLI layout.
sft_args=(
  --data_path "$DATA_PATH"
  --model TimeSeriesEncoder
  --d_model "$D_MODEL"
  --n_heads "$N_HEADS"
  --e_layers "$E_LAYERS"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --input_len "$INPUT_LEN"
  --load_ts_encoder "$TS_ENCODER_CKPT"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --prefix_num "$PREFIX_NUM"
  --adapter_type "$ADAPTER_TYPE"
  --llm_model_path "$LLM_MODEL_PATH"
  --freeze_ts_model "$FREEZE_TS"
)
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  sft_args+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi
sft_args+=(
  --per_device_train_batch_size "$SFT_BATCH"
  --gradient_accumulation_steps "$SFT_GRAD_ACCUM"
  --learning_rate "$SFT_LR"
  --num_train_epochs "$SFT_EPOCHS"
  --save_steps "$SFT_SAVE_STEPS"
  --logging_steps "$SFT_LOGGING_STEPS"
  --output_dir "$SFT_OUTPUT_DIR"
  --dataloader_num_workers 4
)
if [[ "$BF16" == "true" ]]; then
  sft_args+=(--bf16)
fi
sft_args+=(--report_to none)

"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
  train_sft_rats40k.py "${sft_args[@]}"

echo "Stage B done. SFT checkpoint: ${SFT_OUTPUT_DIR}"

# ── Stage C: eval ────────────────────────────────────────────────────────
# Use the newest checkpoint-* under SFT_OUTPUT_DIR; when none was saved,
# fall back to SFT_OUTPUT_DIR itself.
LATEST_CKPT="$(newest_existing "${SFT_OUTPUT_DIR}"/checkpoint-*)"
if [[ -z "$LATEST_CKPT" ]]; then
  LATEST_CKPT="$SFT_OUTPUT_DIR"
fi

echo ""
echo "── Stage C: Evaluation ──────────────────────────────────────────"
echo "Checkpoint: ${LATEST_CKPT}"

eval_args=(
  --checkpoint "$LATEST_CKPT"
  --data_path "$DATA_PATH"
  --eval_split TSAD_test
  --llm_model_path "$LLM_MODEL_PATH"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --input_len "$INPUT_LEN"
  --d_model "$D_MODEL"
  --n_heads "$N_HEADS"
  --e_layers "$E_LAYERS"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --prefix_num "$PREFIX_NUM"
  --adapter_type "$ADAPTER_TYPE"
)
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  eval_args+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi
eval_args+=(
  --batch_size "$EVAL_BATCH"
  --max_new_tokens "$EVAL_MAX_NEW_TOKENS"
  --output_dir "$EVAL_OUTPUT_DIR"
)

"$PYTHON_BIN" inference_rats40k.py "${eval_args[@]}"

echo ""
echo "================================================================"
echo "All stages completed successfully."
echo " TS encoder : ${TS_ENCODER_CKPT}"
echo " SFT ckpt : ${SFT_OUTPUT_DIR}"
echo " Eval output: ${EVAL_OUTPUT_DIR}"
echo " Log : ${LOG_FILE}"
echo "================================================================"