#!/usr/bin/env bash
#
# End-to-end RATs40K pipeline driver for a 2-GPU machine.
#
#   Stage A  pre-train the time-series encoder   (train_pretrain_rats40k.py)
#   Stage B  supervised fine-tuning              (train_sft_rats40k.py)
#   Stage C  evaluation                          (inference_rats40k.py)
#
# Most settings below are read as ${NAME:-default}, so they can be overridden
# from the environment, e.g.:
#
#   RUN_STAGE_A=false SFT_EPOCHS=1 bash <this script>
#
# Strict mode:
#   -E           the ERR trap is inherited by functions and subshells
#   -e           abort on an unhandled non-zero exit status
#   -u           expanding an unset variable is an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -Eeuo pipefail

# Run from the directory one level above this script so the relative paths used
# below (training scripts, save/, logs/) all resolve against the same root.
PROJECT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
cd -- "$PROJECT_DIR"
# ---- Runtime tools -----------------------------------------------------------
# Python interpreter of the training environment.
PYTHON_BIN="${PYTHON_BIN:-/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# accelerate is always taken from the same bin/ directory as PYTHON_BIN.
ACCELERATE_BIN="$(dirname -- "$PYTHON_BIN")/accelerate"
# Accelerate launch configuration; a relative path resolves against PROJECT_DIR.
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-accelerate_config_2gpu.yaml}"
# GPUs made visible to every stage.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
# Timestamp-based identifier embedded in the default output and log paths.
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
# ---- Data --------------------------------------------------------------------
# JSON dataset file passed as --data_path to all three stages.
DATA_PATH="${DATA_PATH:-/mnt/share01/sqk/datasets/RATs40K/RATs-Uni-TSImage_Reason.json}"
# ---- Time-series encoder hyper-parameters --------------------------------------
# The same values are passed to pre-training, SFT and inference so the encoder
# architecture is identical in every stage.
PATCH_LEN="${PATCH_LEN:-16}"
STRIDE="${STRIDE:-16}"
INPUT_LEN="${INPUT_LEN:-128}"
D_MODEL="${D_MODEL:-512}"
N_HEADS="${N_HEADS:-8}"
E_LAYERS="${E_LAYERS:-4}"
# ---- LLM ---------------------------------------------------------------------
# Local directory of the base LLM (passed as --llm_model_path).
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
# Attention backend forwarded as --llm_attn_implementation.
# The default applies only when the variable is UNSET ("-" rather than ":-"):
# exporting it as an empty string omits the flag, which is the case the
# later "-n" test and the "${...:-default}" log line are written for.
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION-flash_attention_2}"
# ---- ITFormer adapter ----------------------------------------------------------
# Passed to SFT and inference as --it_d_model / --it_n_heads / --it_layers /
# --prefix_num / --adapter_type.
# NOTE(review): 3584 / 28 presumably mirror the hidden size and head count of
# the default LLM — confirm before switching LLM_MODEL_PATH.
IT_D_MODEL="${IT_D_MODEL:-3584}"
IT_N_HEADS="${IT_N_HEADS:-28}"
IT_LAYERS="${IT_LAYERS:-2}"
PREFIX_NUM="${PREFIX_NUM:-8}"
ADAPTER_TYPE="${ADAPTER_TYPE:-itformer}"
# ---- Stage A (encoder pre-training) --------------------------------------------
# auto  : run Stage A only when TS_ENCODER_CKPT does not exist yet
# true  : always run Stage A
# false : never run Stage A (TS_ENCODER_CKPT must already exist)
RUN_STAGE_A="${RUN_STAGE_A:-auto}"
# Per-run training output directory.
PRETRAIN_OUTPUT_DIR="${PRETRAIN_OUTPUT_DIR:-save/pretrain_rats40k_${RUN_ID}}"
# Stable directory the final encoder weights are copied to after Stage A.
PRETRAIN_FINAL_DIR="${PRETRAIN_FINAL_DIR:-save/pretrain_rats40k}"
# Encoder weights loaded by Stage B; by default the copy made by Stage A.
TS_ENCODER_CKPT="${TS_ENCODER_CKPT:-${PRETRAIN_FINAL_DIR}/model.safetensors}"
PRETRAIN_BATCH="${PRETRAIN_BATCH:-256}"
PRETRAIN_GRAD_ACCUM="${PRETRAIN_GRAD_ACCUM:-1}"
PRETRAIN_LR="${PRETRAIN_LR:-1e-4}"
PRETRAIN_EPOCHS="${PRETRAIN_EPOCHS:-20}"
# Bounds forwarded as --min_mask_ratio / --max_mask_ratio.
MIN_MASK_RATIO="${MIN_MASK_RATIO:-0.4}"
MAX_MASK_RATIO="${MAX_MASK_RATIO:-0.6}"
# ---- Stage B (SFT) -------------------------------------------------------------
SFT_OUTPUT_DIR="${SFT_OUTPUT_DIR:-save/sft_rats40k_${RUN_ID}}"
SFT_BATCH="${SFT_BATCH:-1}"
SFT_GRAD_ACCUM="${SFT_GRAD_ACCUM:-16}"
SFT_LR="${SFT_LR:-2e-5}"
SFT_EPOCHS="${SFT_EPOCHS:-3}"
SFT_SAVE_STEPS="${SFT_SAVE_STEPS:-500}"
SFT_LOGGING_STEPS="${SFT_LOGGING_STEPS:-20}"
# Forwarded verbatim as the value of --freeze_ts_model.
FREEZE_TS="${FREEZE_TS:-true}"
# "true" adds --bf16 to the SFT command; any other value omits it.
BF16="${BF16:-true}"
# ---- Stage C (evaluation) ------------------------------------------------------
EVAL_BATCH="${EVAL_BATCH:-4}"
EVAL_MAX_NEW_TOKENS="${EVAL_MAX_NEW_TOKENS:-256}"
EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-inference_results_rats40k_${RUN_ID}}"
# ---- Logging -------------------------------------------------------------------
LOG_DIR="${LOG_DIR:-logs}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/rats40k_2gpu_${RUN_ID}.log}"
# Create the directory that will actually hold LOG_FILE as well as LOG_DIR:
# LOG_FILE can be overridden to point outside LOG_DIR, and tee cannot open a
# log file whose parent directory does not exist.
mkdir -p -- "$LOG_DIR" "$(dirname -- "$LOG_FILE")"
#######################################
# ERR-trap handler: print a short failure report and terminate the script.
# Arguments:
#   $1 - exit status of the failed command (re-used as the script exit status)
#   $2 - line number at which the failure happened
#   $3 - text of the command that failed
# Outputs:
#   The failure report, on stderr.
# Returns:
#   Never returns; exits with status $1.
#######################################
on_error() {
  local -r status="$1" line="$2" cmd="$3"
  {
    echo ""
    echo "========== FAILED =========="
    echo "Time: $(date)"
    echo "Line: ${line} Exit: ${status} Cmd: ${cmd}"
  } >&2
  exit "$status"
}
# Report any failing command through on_error. The trap string is single-quoted
# so $?, $LINENO and $BASH_COMMAND are expanded when the trap fires, not now.
trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR
# From here on, stdout and stderr are both shown on the console and appended
# to LOG_FILE.
exec > >(tee -a "$LOG_FILE") 2>&1
# ---- Environment for the child processes ---------------------------------------
export CUDA_VISIBLE_DEVICES
# Python warning filter: ignore FutureWarning raised from transformers.utils.hub.
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline
export OMP_NUM_THREADS=4
# ---- Pre-flight checks: fail fast before any GPU time is spent -----------------
# Both launchers must exist and be executable.
for bin in "$PYTHON_BIN" "$ACCELERATE_BIN"; do
  if [[ ! -x "$bin" ]]; then
    echo "Not found: $bin" >&2
    exit 1
  fi
done
if [[ ! -f "$ACCELERATE_CONFIG" ]]; then
  echo "Accelerate config missing: $ACCELERATE_CONFIG" >&2
  exit 1
fi
if [[ ! -f "$DATA_PATH" ]]; then
  echo "RATs40K data not found: $DATA_PATH" >&2
  exit 1
fi
# Verify the Python environment: torch and accelerate import, at least two CUDA
# devices are visible, and flash-attn is installed when it is the requested
# attention backend. The heredoc delimiter is quoted, so the shell expands
# nothing inside it; the backend name is passed as argv[1] instead.
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import sys, importlib.util
import torch, accelerate

print("torch:", torch.__version__)
print("accelerate:", accelerate.__version__)
print("cuda devices:", torch.cuda.device_count())
# Explicit check rather than `assert`, which is skipped when Python runs
# with optimisation enabled.
if torch.cuda.device_count() < 2:
    raise SystemExit("Need at least 2 GPUs")

attn = sys.argv[1]
if attn == "flash_attention_2":
    ok = importlib.util.find_spec("flash_attn") is not None
    print("flash_attn installed:", ok)
    if not ok:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package.")
PY
# ---- Run banner: record the effective configuration in the log -----------------
echo "================================================================"
echo "ITFormer × RATs40K | Run ID: ${RUN_ID}"
echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo "Data: ${DATA_PATH}"
echo "LLM: ${LLM_MODEL_PATH}"
echo "patch_len=${PATCH_LEN}, input_len=${INPUT_LEN}, prefix_num=${PREFIX_NUM}"
echo "it_d_model=${IT_D_MODEL}, it_n_heads=${IT_N_HEADS}"
echo "================================================================"
# ---- Decide whether Stage A runs -----------------------------------------------
# "auto" is resolved to true/false here; after this block RUN_STAGE_A is always
# exactly "true" or "false".
case "$RUN_STAGE_A" in
  auto)
    # Pre-train only when no encoder checkpoint is available yet.
    if [[ -f "$TS_ENCODER_CKPT" ]]; then
      RUN_STAGE_A=false
    else
      RUN_STAGE_A=true
    fi
    ;;
  true | false) ;;
  *)
    echo "Invalid RUN_STAGE_A=$RUN_STAGE_A" >&2
    exit 1
    ;;
esac
#######################################
# Copy pre-trained encoder weights to their stable location.
# Uses <src>/model.safetensors when present; otherwise the most recently
# modified <src>/checkpoint-*/model.safetensors.
# Arguments:
#   $1 - Stage A output directory to search
#   $2 - destination directory (created if missing)
# Outputs:
#   An error message on stderr when no weights file is found.
# Returns:
#   0 on success, 1 when nothing was found, otherwise the mkdir/cp status.
#######################################
publish_pretrained_encoder() {
  local -r src_dir="$1" dest_dir="$2"
  local weights="" candidate

  if [[ -f "${src_dir}/model.safetensors" ]]; then
    weights="${src_dir}/model.safetensors"
  else
    # Glob instead of parsing `ls`; an unmatched pattern stays literal and is
    # rejected by the -f test.
    for candidate in "${src_dir}"/checkpoint-*/model.safetensors; do
      [[ -f "$candidate" ]] || continue
      if [[ -z "$weights" || "$candidate" -nt "$weights" ]]; then
        weights="$candidate"
      fi
    done
  fi

  if [[ -z "$weights" ]]; then
    echo "No model.safetensors found under: ${src_dir}" >&2
    return 1
  fi

  mkdir -p -- "$dest_dir" || return
  cp -- "$weights" "${dest_dir}/model.safetensors"
}

# ---- Stage A: pre-train the time-series encoder --------------------------------
if [[ "$RUN_STAGE_A" == true ]]; then
  echo ""
  echo "── Stage A: Pre-training TimeSeriesEncoder ──────────────────────"
  pretrain_args=(
    --data_path "$DATA_PATH"
    --model TimeSeriesEncoder
    --d_model "$D_MODEL"
    --n_heads "$N_HEADS"
    --e_layers "$E_LAYERS"
    --patch_len "$PATCH_LEN"
    --stride "$STRIDE"
    --input_len "$INPUT_LEN"
    --min_mask_ratio "$MIN_MASK_RATIO"
    --max_mask_ratio "$MAX_MASK_RATIO"
    --per_device_train_batch_size "$PRETRAIN_BATCH"
    --gradient_accumulation_steps "$PRETRAIN_GRAD_ACCUM"
    --learning_rate "$PRETRAIN_LR"
    --num_train_epochs "$PRETRAIN_EPOCHS"
    --output_dir "$PRETRAIN_OUTPUT_DIR"
    --dataloader_num_workers 4
    --report_to none
  )
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
    train_pretrain_rats40k.py "${pretrain_args[@]}"

  # Publish the weights under PRETRAIN_FINAL_DIR so later runs can reuse them.
  publish_pretrained_encoder "$PRETRAIN_OUTPUT_DIR" "$PRETRAIN_FINAL_DIR"

  echo "Stage A done. TS encoder: ${TS_ENCODER_CKPT}"
else
  echo "Stage A: skipped (TS encoder exists: ${TS_ENCODER_CKPT})"
fi
# Stage B loads the encoder weights from TS_ENCODER_CKPT, so they must exist at
# this point whether Stage A ran or was skipped.
if [[ ! -f "$TS_ENCODER_CKPT" ]]; then
  echo "TS encoder checkpoint not found: $TS_ENCODER_CKPT" >&2
  exit 1
fi
# ---- Stage B: derive the optional command-line flags ---------------------------
echo ""
echo "── Stage B: SFT (ITFormer + LLM) ───────────────────────────────"

# The script refuses to combine flash_attention_2 with BF16 != true: it
# overrides BF16 and says so in the log.
if [[ "$LLM_ATTN_IMPLEMENTATION" == "flash_attention_2" && "$BF16" != "true" ]]; then
  echo "flash_attention_2 requires bf16; forcing BF16=true"
  BF16="true"
fi

# Empty string means "do not pass the flag".
BF16_FLAG=""
if [[ "$BF16" == "true" ]]; then
  BF16_FLAG="--bf16"
fi

ATTN_FLAG=""
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  ATTN_FLAG="--llm_attn_implementation $LLM_ATTN_IMPLEMENTATION"
fi

echo "LLM attention: ${LLM_ATTN_IMPLEMENTATION:-default}"
# ---- Stage B: launch SFT -------------------------------------------------------
# Arguments are collected in an array so optional flags are added as whole
# words (no reliance on word-splitting of an unquoted string) and an absent
# flag contributes nothing.
sft_args=(
  --data_path "$DATA_PATH"
  --model TimeSeriesEncoder
  --d_model "$D_MODEL"
  --n_heads "$N_HEADS"
  --e_layers "$E_LAYERS"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --input_len "$INPUT_LEN"
  --load_ts_encoder "$TS_ENCODER_CKPT"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --prefix_num "$PREFIX_NUM"
  --adapter_type "$ADAPTER_TYPE"
  --llm_model_path "$LLM_MODEL_PATH"
  --freeze_ts_model "$FREEZE_TS"
)
# Attention backend: only passed when one is configured.
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  sft_args+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi
sft_args+=(
  --per_device_train_batch_size "$SFT_BATCH"
  --gradient_accumulation_steps "$SFT_GRAD_ACCUM"
  --learning_rate "$SFT_LR"
  --num_train_epochs "$SFT_EPOCHS"
  --save_steps "$SFT_SAVE_STEPS"
  --logging_steps "$SFT_LOGGING_STEPS"
  --output_dir "$SFT_OUTPUT_DIR"
  --dataloader_num_workers 4
)
# bf16 training: only when BF16 is exactly "true".
if [[ "$BF16" == "true" ]]; then
  sft_args+=(--bf16)
fi
sft_args+=(--report_to none)

"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" \
  train_sft_rats40k.py "${sft_args[@]}"

echo "Stage B done. SFT checkpoint: ${SFT_OUTPUT_DIR}"
#######################################
# Print the most recently modified checkpoint-* directory inside a run
# directory, or the run directory itself when it contains none.
# Arguments:
#   $1 - run output directory
# Outputs:
#   The selected path on stdout.
# Returns:
#   Always 0. Unlike `ls ... | head -1`, a missing checkpoint is not a
#   failure, so the fallback stays reachable under `set -e -o pipefail`.
#######################################
latest_checkpoint_dir() {
  local -r run_dir="$1"
  local newest="" candidate
  # An unmatched glob stays literal and is rejected by the -d test.
  for candidate in "${run_dir}"/checkpoint-*; do
    [[ -d "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest="$candidate"
    fi
  done
  printf '%s\n' "${newest:-$run_dir}"
}

# ---- Stage C: pick the checkpoint to evaluate ----------------------------------
LATEST_CKPT="$(latest_checkpoint_dir "$SFT_OUTPUT_DIR")"

echo ""
echo "── Stage C: Evaluation ──────────────────────────────────────────"
echo "Checkpoint: ${LATEST_CKPT}"
# ---- Stage C: run inference on the TSAD_test split -----------------------------
# Arguments are collected in an array so the optional attention flag is added
# as whole words and contributes nothing when no backend is configured.
eval_args=(
  --checkpoint "$LATEST_CKPT"
  --data_path "$DATA_PATH"
  --eval_split TSAD_test
  --llm_model_path "$LLM_MODEL_PATH"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --input_len "$INPUT_LEN"
  --d_model "$D_MODEL"
  --n_heads "$N_HEADS"
  --e_layers "$E_LAYERS"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --prefix_num "$PREFIX_NUM"
  --adapter_type "$ADAPTER_TYPE"
)
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  eval_args+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi
eval_args+=(
  --batch_size "$EVAL_BATCH"
  --max_new_tokens "$EVAL_MAX_NEW_TOKENS"
  --output_dir "$EVAL_OUTPUT_DIR"
)

"$PYTHON_BIN" inference_rats40k.py "${eval_args[@]}"
# ---- Summary: where each stage left its artefacts ------------------------------
echo ""
echo "================================================================"
echo "All stages completed successfully."
echo " TS encoder : ${TS_ENCODER_CKPT}"
echo " SFT ckpt : ${SFT_OUTPUT_DIR}"
echo " Eval output: ${EVAL_OUTPUT_DIR}"
echo " Log : ${LOG_FILE}"
echo "================================================================"