#!/bin/bash
# -E: the ERR trap is inherited by functions and subshells; -e: exit on error;
# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
set -Eeuo pipefail

# Full training pipeline for ITFormer on 4 GPUs:
#   Stage A: pre-train TimeSeriesEncoder
#   Stage B: supervised fine-tuning with Qwen2.5-3B-Instruct
#
# Every UPPER_CASE setting below may be overridden from the environment; the
# value after ":-" is the default used when the variable is unset or empty.
#
# RUN_STAGE_A accepts:
#   auto  - run Stage A only when $TS_ENCODER_CHECKPOINT does not exist
#   true  - always run Stage A
#   false - never run Stage A (the checkpoint must already exist)
#
# All output produced after the logging redirect is appended to $LOG_FILE.

# The project root is the parent of the directory that contains this script;
# every relative path below is resolved against it.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-accelerate_config.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"

RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
PRETRAIN_WORK_DIR="${PRETRAIN_WORK_DIR:-save/pretrain_ts_small_${RUN_ID}}"
PRETRAIN_FINAL_DIR="${PRETRAIN_FINAL_DIR:-save/pretrain}"
TS_ENCODER_CHECKPOINT="${TS_ENCODER_CHECKPOINT:-${PRETRAIN_FINAL_DIR}/model.safetensors}"
RUN_STAGE_A="${RUN_STAGE_A:-auto}"
SFT_OUTPUT_DIR="${SFT_OUTPUT_DIR:-save/sft_qwen2.5_3B_${RUN_ID}}"
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-3b-instruct}"
ADAPTER_TYPE="${ADAPTER_TYPE:-itformer}"
# Empty means "do not pass --llm_attn_implementation to train_sft.py".
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-}"

PRETRAIN_BATCH_SIZE="${PRETRAIN_BATCH_SIZE:-512}"
PRETRAIN_GRADIENT_ACCUMULATION_STEPS="${PRETRAIN_GRADIENT_ACCUMULATION_STEPS:-2}"
SFT_BATCH_SIZE="${SFT_BATCH_SIZE:-2}"
SFT_GRADIENT_ACCUMULATION_STEPS="${SFT_GRADIENT_ACCUMULATION_STEPS:-12}"
SFT_SAVE_TOTAL_LIMIT="${SFT_SAVE_TOTAL_LIMIT:-2}"
SFT_LOGGING_STEPS="${SFT_LOGGING_STEPS:-5}"

PRETRAIN_EPOCHS="${PRETRAIN_EPOCHS:-10}"
SFT_EPOCHS="${SFT_EPOCHS:-2}"
PRETRAIN_LR="${PRETRAIN_LR:-3e-4}"
SFT_LR="${SFT_LR:-2e-5}"

LOG_DIR="${LOG_DIR:-logs}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/full_train_4gpu_${RUN_ID}.log}"

# LOG_FILE can be overridden to a path outside LOG_DIR, so create its parent
# directory as well; otherwise the touch below fails.
mkdir -p -- "$LOG_DIR" "$(dirname -- "$LOG_FILE")"
touch "$LOG_FILE"

#######################################
# Print a failure report to stderr and terminate the script.
# Globals:   LOG_FILE (read)
# Arguments: $1 - exit code to terminate with
#            $2 - line number to report
#            $3 - command or message to report
# Outputs:   failure banner on stderr
# Returns:   does not return; exits with $1
#######################################
on_error() {
  local exit_code="$1"
  local line_no="$2"
  local failed_command="$3"

  {
    echo ""
    echo "========== TRAINING FAILED =========="
    echo "Time: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "Exit code: $exit_code"
    echo "Line: $line_no"
    echo "Command: $failed_command"
    echo "Log file: $LOG_FILE"
    echo "====================================="
  } >&2

  exit "$exit_code"
}

#######################################
# Report a validation failure detected by the script itself and exit 1.
# Arguments: $1  - line number of the failed check (callers pass $LINENO)
#            $2… - message words, joined into a single message
# Outputs:   the message, then the on_error banner, on stderr
# Returns:   does not return; exits with status 1 via on_error
#######################################
fail() {
  local line_no="$1"
  shift
  local message="$*"
  echo "$message" >&2
  on_error 1 "$line_no" "$message"
}

# Any command that fails outside a conditional context lands in on_error.
trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR

# From here on, stdout and stderr go both to the terminal and to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

export CUDA_VISIBLE_DEVICES
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline

echo "Log file: $LOG_FILE"

# ---- Pre-flight checks: fail early, before any GPU time is spent ----------

if [ ! -x "$PYTHON_BIN" ]; then
  fail "$LINENO" "Python executable not found or not executable: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python and rerun."
fi

if [ ! -f "$ACCELERATE_CONFIG" ]; then
  fail "$LINENO" "Accelerate config not found: $ACCELERATE_CONFIG"
fi

# Either a single weights file or a sharded-weights index is accepted.
if [ ! -f "$LLM_MODEL_PATH/model.safetensors" ] && \
   [ ! -f "$LLM_MODEL_PATH/model.safetensors.index.json" ]; then
  fail "$LINENO" "LLM model not found under: $LLM_MODEL_PATH"
fi

if [ ! -f "dataset/datasets/time_series_data.h5" ] || \
   [ ! -f "dataset/datasets/train_qa.jsonl" ] || \
   [ ! -f "dataset/datasets/test_qa.jsonl" ]; then
  fail "$LINENO" "Dataset files are missing under dataset/datasets/"
fi

# accelerate is taken from the same bin/ directory as the selected Python.
ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"
if [ ! -x "$ACCELERATE_BIN" ]; then
  fail "$LINENO" "Accelerate executable not found or not executable: $ACCELERATE_BIN. Install accelerate in the selected Python environment or set PYTHON_BIN to the right environment."
fi

echo "Project dir: $PROJECT_DIR"
echo "Run id: $RUN_ID"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "Accelerate config: $ACCELERATE_CONFIG"
echo "Python: $PYTHON_BIN"
echo "Accelerate: $ACCELERATE_BIN"
echo "RUN_STAGE_A: $RUN_STAGE_A"
echo "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
echo "Adapter type: $ADAPTER_TYPE"
echo "LLM attention implementation: ${LLM_ATTN_IMPLEMENTATION:-default}"

# Environment probe: prints library versions, requires at least 4 visible CUDA
# devices, and requires flash-attn when flash_attention_2 was requested.
# The quoted 'PY' delimiter keeps the shell from expanding anything inside.
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import importlib.util
import sys

import torch
import transformers
import accelerate

attn_implementation = sys.argv[1]

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())

if not torch.cuda.is_available() or torch.cuda.device_count() < 4:
    raise SystemExit("Expected at least 4 visible CUDA devices for this script.")

if attn_implementation == "flash_attention_2":
    print("flash_attn installed:", importlib.util.find_spec("flash_attn") is not None)
    if importlib.util.find_spec("flash_attn") is None:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package in this Python environment.")
PY

#######################################
# Move an existing path aside so a new run cannot overwrite it.
# The backup is named "<path>.bak_<RUN_ID>"; if that name is already taken
# (for example a rerun with the same RUN_ID) a numeric suffix is appended,
# because mv onto an existing directory would nest the source inside it.
# Globals:   RUN_ID (read)
# Arguments: $1 - path to back up; nothing happens if it does not exist
# Outputs:   one progress line on stdout when a backup is made
#######################################
backup_dir_if_exists() {
  local path="$1"
  if [ -e "$path" ]; then
    local backup="${path}.bak_${RUN_ID}"
    local suffix=1
    while [ -e "$backup" ]; do
      backup="${path}.bak_${RUN_ID}.${suffix}"
      suffix=$((suffix + 1))
    done
    echo "Backing up existing $path -> $backup"
    mv -- "$path" "$backup"
  fi
}

# Resolve "auto" and reject anything other than auto/true/false.
case "$RUN_STAGE_A" in
  auto)
    if [ -f "$TS_ENCODER_CHECKPOINT" ]; then
      echo "Stage A: skipped; using existing TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
    else
      RUN_STAGE_A=true
    fi
    ;;
  true|false)
    ;;
  *)
    fail "$LINENO" "Invalid RUN_STAGE_A=$RUN_STAGE_A. Use auto, true, or false."
    ;;
esac

if [ "$RUN_STAGE_A" = true ]; then
  backup_dir_if_exists "$PRETRAIN_FINAL_DIR"

  echo "Stage A: pre-training TimeSeriesEncoder"
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_pretrain.py \
    --model TimeSeriesEncoder \
    --d_model 512 \
    --n_heads 8 \
    --e_layers 4 \
    --patch_len 60 \
    --stride 60 \
    --input_len 600 \
    --output_dir "$PRETRAIN_WORK_DIR" \
    --per_device_train_batch_size "$PRETRAIN_BATCH_SIZE" \
    --gradient_accumulation_steps "$PRETRAIN_GRADIENT_ACCUMULATION_STEPS" \
    --learning_rate "$PRETRAIN_LR" \
    --num_train_epochs "$PRETRAIN_EPOCHS" \
    --dataloader_num_workers 8 \
    --report_to swanlab

  # After a fresh pre-train the checkpoint is always taken from
  # PRETRAIN_FINAL_DIR, regardless of any TS_ENCODER_CHECKPOINT override.
  # NOTE(review): training is launched with --output_dir "$PRETRAIN_WORK_DIR"
  # and nothing in this script copies that directory into PRETRAIN_FINAL_DIR;
  # presumably train_pretrain.py writes the final weights there itself.
  # Confirm, otherwise the check below always fails.
  TS_ENCODER_CHECKPOINT="${PRETRAIN_FINAL_DIR}/model.safetensors"
  if [ ! -f "$TS_ENCODER_CHECKPOINT" ]; then
    fail "$LINENO" "Pretrain finished, but $TS_ENCODER_CHECKPOINT was not created."
  fi
elif [ ! -f "$TS_ENCODER_CHECKPOINT" ]; then
  fail "$LINENO" "RUN_STAGE_A=false but TS encoder checkpoint does not exist: $TS_ENCODER_CHECKPOINT"
fi

# Optional Stage B arguments; stays empty unless an attention implementation
# was requested.
SFT_EXTRA_ARGS=()
if [ -n "$LLM_ATTN_IMPLEMENTATION" ]; then
  SFT_EXTRA_ARGS+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi

echo "Stage B: supervised fine-tuning"
# The ${arr[@]+"${arr[@]}"} form expands to nothing when the array is empty.
# A plain "${SFT_EXTRA_ARGS[@]}" aborts with "unbound variable" under `set -u`
# on Bash releases older than 4.4.
"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_sft.py \
  --model TimeSeriesEncoder \
  --d_model 512 \
  --n_heads 8 \
  --e_layers 4 \
  --patch_len 60 \
  --stride 60 \
  --input_len 600 \
  --it_d_model 896 \
  --it_n_heads 8 \
  --it_layers 2 \
  --prefix_num 25 \
  --adapter_type "$ADAPTER_TYPE" \
  ${SFT_EXTRA_ARGS[@]+"${SFT_EXTRA_ARGS[@]}"} \
  --llm_model_path "$LLM_MODEL_PATH" \
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT" \
  --output_dir "$SFT_OUTPUT_DIR" \
  --per_device_train_batch_size "$SFT_BATCH_SIZE" \
  --gradient_accumulation_steps "$SFT_GRADIENT_ACCUMULATION_STEPS" \
  --bf16 \
  --learning_rate "$SFT_LR" \
  --num_train_epochs "$SFT_EPOCHS" \
  --dataloader_num_workers 4 \
  --logging_steps "$SFT_LOGGING_STEPS" \
  --save_total_limit "$SFT_SAVE_TOTAL_LIMIT" \
  --report_to swanlab

echo "Done."
echo "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
echo "SFT output dir: $SFT_OUTPUT_DIR"