ITFormer / scripts /run_full_train_4gpu.sh
a12354's picture
Add files using upload-large-folder tool
aabdb98 verified
Raw
History Blame Contribute Delete
7.57 kB
#!/bin/bash
set -Eeuo pipefail
# Full training pipeline for ITFormer on 4 GPUs:
#   Stage A: pre-train TimeSeriesEncoder
#   Stage B: supervised fine-tuning with Qwen2.5-3B-Instruct
#
# Every UPPER_CASE setting assigned with ${NAME:-default} below is only a
# default: a non-empty value already present in the environment wins.

# Work from the project root (the parent of this script's directory) so the
# relative default paths below resolve there.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# Tooling and devices.
PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-accelerate_config.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"

# RUN_ID tags the per-run output directories and the log file name.
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"

# Stage A (pre-training) locations and switch (auto | true | false).
PRETRAIN_WORK_DIR="${PRETRAIN_WORK_DIR:-save/pretrain_ts_small_${RUN_ID}}"
PRETRAIN_FINAL_DIR="${PRETRAIN_FINAL_DIR:-save/pretrain}"
TS_ENCODER_CHECKPOINT="${TS_ENCODER_CHECKPOINT:-${PRETRAIN_FINAL_DIR}/model.safetensors}"
RUN_STAGE_A="${RUN_STAGE_A:-auto}"

# Stage B (SFT) locations and model options.
SFT_OUTPUT_DIR="${SFT_OUTPUT_DIR:-save/sft_qwen2.5_3B_${RUN_ID}}"
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-3b-instruct}"
ADAPTER_TYPE="${ADAPTER_TYPE:-itformer}"
# Empty means "do not pass --llm_attn_implementation".
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-}"

# Hyper-parameters forwarded to the training scripts.
PRETRAIN_BATCH_SIZE="${PRETRAIN_BATCH_SIZE:-512}"
PRETRAIN_GRADIENT_ACCUMULATION_STEPS="${PRETRAIN_GRADIENT_ACCUMULATION_STEPS:-2}"
SFT_BATCH_SIZE="${SFT_BATCH_SIZE:-2}"
SFT_GRADIENT_ACCUMULATION_STEPS="${SFT_GRADIENT_ACCUMULATION_STEPS:-12}"
SFT_SAVE_TOTAL_LIMIT="${SFT_SAVE_TOTAL_LIMIT:-2}"
SFT_LOGGING_STEPS="${SFT_LOGGING_STEPS:-5}"
PRETRAIN_EPOCHS="${PRETRAIN_EPOCHS:-10}"
SFT_EPOCHS="${SFT_EPOCHS:-2}"
PRETRAIN_LR="${PRETRAIN_LR:-3e-4}"
SFT_LR="${SFT_LR:-2e-5}"

# Logging.
LOG_DIR="${LOG_DIR:-logs}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/full_train_4gpu_${RUN_ID}.log}"
# LOG_FILE can be overridden to a location outside LOG_DIR, so create its
# parent directory as well; otherwise touch fails on a missing directory.
mkdir -p "$LOG_DIR" "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"
# Print the failure banner to stderr and terminate the script.
# Globals:
#   LOG_FILE (read) - shown in the banner
# Arguments:
#   $1 - exit status to report and to exit with
#   $2 - line number to report
#   $3 - command text (or message) to report
# Returns:
#   never returns; exits with status $1
on_error() {
  local status="$1" where="$2" what="$3"

  # One printf call emits the whole banner, one argument per output line.
  printf '%s\n' \
    "" \
    "========== TRAINING FAILED ==========" \
    "Time: $(date '+%Y-%m-%d %H:%M:%S')" \
    "Exit code: $status" \
    "Line: $where" \
    "Command: $what" \
    "Log file: $LOG_FILE" \
    "=====================================" >&2

  exit "$status"
}
# Abort the script with a custom message.
# Arguments:
#   $1  - line number to report (callers pass "$LINENO")
#   $2+ - message words; joined into a single string
# Outputs:
#   the message on stderr, followed by whatever on_error prints
# Returns:
#   does not return on its own; hands over to on_error with status 1
fail() {
  local origin_line="$1"
  # Everything after the first argument, joined the same way "$*" would be.
  local text="${*:2}"

  echo "$text" >&2
  on_error 1 "$origin_line" "$text"
}
# Route failing commands through on_error, passing the failing status, the
# line number and the command text. The -E in the `set -Eeuo pipefail` line
# lets shell functions inherit this ERR trap.
trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR
# From here on, stdout and stderr both go through tee: everything is still
# printed and is also appended to $LOG_FILE.
exec > >(tee -a "$LOG_FILE") 2>&1
# Environment handed to the child processes started below.
export CUDA_VISIBLE_DEVICES
# Warning filter: ignore FutureWarning for module transformers.utils.hub.
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline
echo "Log file: $LOG_FILE"
# Preflight checks: stop early (through fail) when the Python interpreter, the
# accelerate config, the LLM weights, the dataset files, or the accelerate
# launcher cannot be found.
[[ -x "$PYTHON_BIN" ]] \
  || fail "$LINENO" "Python executable not found or not executable: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python and rerun."

[[ -f "$ACCELERATE_CONFIG" ]] \
  || fail "$LINENO" "Accelerate config not found: $ACCELERATE_CONFIG"

# Either model.safetensors or model.safetensors.index.json is accepted.
[[ -f "$LLM_MODEL_PATH/model.safetensors" || -f "$LLM_MODEL_PATH/model.safetensors.index.json" ]] \
  || fail "$LINENO" "LLM model not found under: $LLM_MODEL_PATH"

# All three dataset files are required; the first missing one aborts the run.
for required_file in \
  "dataset/datasets/time_series_data.h5" \
  "dataset/datasets/train_qa.jsonl" \
  "dataset/datasets/test_qa.jsonl"; do
  [[ -f "$required_file" ]] \
    || fail "$LINENO" "Dataset files are missing under dataset/datasets/"
done

# The accelerate launcher is taken from the same bin/ directory as PYTHON_BIN.
ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"
[[ -x "$ACCELERATE_BIN" ]] \
  || fail "$LINENO" "Accelerate executable not found or not executable: $ACCELERATE_BIN. Install accelerate in the selected Python environment or set PYTHON_BIN to the right environment."
# Record the effective configuration in the log before anything is launched.
printf '%s\n' \
  "Project dir: $PROJECT_DIR" \
  "Run id: $RUN_ID" \
  "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" \
  "Accelerate config: $ACCELERATE_CONFIG" \
  "Python: $PYTHON_BIN" \
  "Accelerate: $ACCELERATE_BIN" \
  "RUN_STAGE_A: $RUN_STAGE_A" \
  "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT" \
  "Adapter type: $ADAPTER_TYPE" \
  "LLM attention implementation: ${LLM_ATTN_IMPLEMENTATION:-default}"

# Sanity-check the Python environment: print library versions, require at
# least 4 visible CUDA devices, and require flash-attn when it was requested.
# The attention implementation is passed as argv[1]; the quoted 'PY' delimiter
# keeps the shell from expanding anything inside the Python source.
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import importlib.util
import sys

import torch
import transformers
import accelerate

requested_attn = sys.argv[1]

for label, module in (
    ("torch", torch),
    ("transformers", transformers),
    ("accelerate", accelerate),
):
    print(f"{label}:", module.__version__)

cuda_ok = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print("cuda available:", cuda_ok)
print("cuda device count:", gpu_count)
if not (cuda_ok and gpu_count >= 4):
    raise SystemExit("Expected at least 4 visible CUDA devices for this script.")

if requested_attn == "flash_attention_2":
    flash_attn_found = importlib.util.find_spec("flash_attn") is not None
    print("flash_attn installed:", flash_attn_found)
    if not flash_attn_found:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package in this Python environment.")
PY
# Move an existing file or directory out of the way by renaming it to a
# sibling named "<path>.bak_<RUN_ID>". Does nothing when <path> is absent.
# Globals:
#   RUN_ID (read) - suffix used for the backup name
# Arguments:
#   $1 - path to back up (a trailing slash is tolerated)
# Outputs:
#   one progress line on stdout when a backup is made
# Returns:
#   0 when nothing had to be done or the move succeeded; mv's status otherwise
backup_dir_if_exists() {
  local path="$1"

  # Drop trailing slashes: "dir/" would otherwise produce "dir/.bak_<id>",
  # a destination inside the directory being moved, which mv rejects.
  while [[ "$path" == */ && "$path" != "/" ]]; do
    path="${path%/}"
  done

  if [ -e "$path" ]; then
    local backup="${path}.bak_${RUN_ID}"
    # A reused RUN_ID can leave an earlier backup behind; mv would then place
    # $path *inside* that directory. Choose an unused name instead.
    local n=1
    while [ -e "$backup" ]; do
      backup="${path}.bak_${RUN_ID}.${n}"
      n=$((n + 1))
    done
    echo "Backing up existing $path -> $backup"
    mv -- "$path" "$backup"
  fi
}
# Resolve RUN_STAGE_A. "auto" becomes "true" only when the TS encoder
# checkpoint is missing (it stays "auto" when the checkpoint is reused);
# "true" and "false" are taken as given; anything else aborts the run.
if [[ "$RUN_STAGE_A" == auto ]]; then
  if [[ -f "$TS_ENCODER_CHECKPOINT" ]]; then
    echo "Stage A: skipped; using existing TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
  else
    RUN_STAGE_A=true
  fi
elif [[ "$RUN_STAGE_A" != true && "$RUN_STAGE_A" != false ]]; then
  fail "$LINENO" "Invalid RUN_STAGE_A=$RUN_STAGE_A. Use auto, true, or false."
fi
# Stage A: pre-train the time-series encoder when requested; otherwise insist
# that the checkpoint to be loaded in Stage B already exists.
# NOTE(review): training is pointed at PRETRAIN_WORK_DIR, yet the checkpoint is
# then expected under PRETRAIN_FINAL_DIR and nothing in this script copies
# between the two. Presumably train_pretrain.py writes the final model there
# itself; confirm, especially when PRETRAIN_FINAL_DIR is overridden.
if [[ "$RUN_STAGE_A" == true ]]; then
  # Keep any previous final checkpoint directory instead of overwriting it.
  backup_dir_if_exists "$PRETRAIN_FINAL_DIR"

  echo "Stage A: pre-training TimeSeriesEncoder"
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_pretrain.py \
    --model TimeSeriesEncoder \
    --d_model 512 \
    --n_heads 8 \
    --e_layers 4 \
    --patch_len 60 \
    --stride 60 \
    --input_len 600 \
    --output_dir "$PRETRAIN_WORK_DIR" \
    --per_device_train_batch_size "$PRETRAIN_BATCH_SIZE" \
    --gradient_accumulation_steps "$PRETRAIN_GRADIENT_ACCUMULATION_STEPS" \
    --learning_rate "$PRETRAIN_LR" \
    --num_train_epochs "$PRETRAIN_EPOCHS" \
    --dataloader_num_workers 8 \
    --report_to swanlab

  # After a fresh pre-training run, Stage B always loads the new checkpoint,
  # replacing any TS_ENCODER_CHECKPOINT value supplied by the caller.
  TS_ENCODER_CHECKPOINT="${PRETRAIN_FINAL_DIR}/model.safetensors"
  [[ -f "$TS_ENCODER_CHECKPOINT" ]] \
    || fail "$LINENO" "Pretrain finished, but $TS_ENCODER_CHECKPOINT was not created."
else
  [[ -f "$TS_ENCODER_CHECKPOINT" ]] \
    || fail "$LINENO" "RUN_STAGE_A=false but TS encoder checkpoint does not exist: $TS_ENCODER_CHECKPOINT"
fi
# Stage B: supervised fine-tuning on top of the TS encoder checkpoint.

# Optional flags: --llm_attn_implementation is passed only when a value was set.
SFT_EXTRA_ARGS=()
if [ -n "$LLM_ATTN_IMPLEMENTATION" ]; then
  SFT_EXTRA_ARGS+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi

echo "Stage B: supervised fine-tuning"
# SFT_EXTRA_ARGS is empty in the default configuration. Under `set -u`, Bash
# releases before 4.4 treat "${arr[@]}" on an empty array as an unbound
# variable and abort, so expand it through the ${arr[@]+...} guard, which
# yields no words at all when the array has no elements.
# NOTE(review): --it_d_model is fixed at 896 regardless of LLM_MODEL_PATH;
# confirm that this width is right for the selected LLM.
"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_sft.py \
  --model TimeSeriesEncoder \
  --d_model 512 \
  --n_heads 8 \
  --e_layers 4 \
  --patch_len 60 \
  --stride 60 \
  --input_len 600 \
  --it_d_model 896 \
  --it_n_heads 8 \
  --it_layers 2 \
  --prefix_num 25 \
  --adapter_type "$ADAPTER_TYPE" \
  ${SFT_EXTRA_ARGS[@]+"${SFT_EXTRA_ARGS[@]}"} \
  --llm_model_path "$LLM_MODEL_PATH" \
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT" \
  --output_dir "$SFT_OUTPUT_DIR" \
  --per_device_train_batch_size "$SFT_BATCH_SIZE" \
  --gradient_accumulation_steps "$SFT_GRADIENT_ACCUMULATION_STEPS" \
  --bf16 \
  --learning_rate "$SFT_LR" \
  --num_train_epochs "$SFT_EPOCHS" \
  --dataloader_num_workers 4 \
  --logging_steps "$SFT_LOGGING_STEPS" \
  --save_total_limit "$SFT_SAVE_TOTAL_LIMIT" \
  --report_to swanlab

echo "Done."
echo "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
echo "SFT output dir: $SFT_OUTPUT_DIR"