#!/bin/bash
#
# Full training pipeline for a 4-GPU machine.
#   Stage A: pre-train the TimeSeriesEncoder with train_pretrain.py
#            (skipped when RUN_STAGE_A=auto and a checkpoint already exists).
#   Stage B: supervised fine-tuning with train_sft.py on top of that encoder.
#
# Every setting defined below can be overridden by exporting an environment
# variable of the same name before running the script. All output is mirrored
# into $LOG_FILE.
#
# -E: the ERR trap is inherited by functions and subshells
# -e: abort on the first failing command
# -u: expanding an unset variable is an error
# pipefail: a pipeline fails when any of its stages fails
set -Eeuo pipefail

# The project root is the parent of the directory holding this script; all
# relative paths used later (save/, logs/, dataset/) are resolved against it.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

# --- Runtime environment ------------------------------------------------------
# Each value keeps whatever the caller exported and otherwise falls back to the
# default shown here.
PYTHON_BIN="${PYTHON_BIN:-/dev/shm/suiqk/conda_envs/scalerag-ts-v4/bin/python}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-accelerate_config.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
# Timestamp used to make output directories, backups and the log file unique.
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"

# --- Paths and model selection ------------------------------------------------
# PRETRAIN_WORK_DIR is passed to train_pretrain.py as --output_dir.
PRETRAIN_WORK_DIR="${PRETRAIN_WORK_DIR:-save/pretrain_ts_small_${RUN_ID}}"
# PRETRAIN_FINAL_DIR is where the finished encoder checkpoint is looked up.
PRETRAIN_FINAL_DIR="${PRETRAIN_FINAL_DIR:-save/pretrain}"
TS_ENCODER_CHECKPOINT="${TS_ENCODER_CHECKPOINT:-${PRETRAIN_FINAL_DIR}/model.safetensors}"
# auto | true | false -- whether Stage A (pre-training) runs.
RUN_STAGE_A="${RUN_STAGE_A:-auto}"
SFT_OUTPUT_DIR="${SFT_OUTPUT_DIR:-save/sft_qwen2.5_3B_${RUN_ID}}"
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-3b-instruct}"
ADAPTER_TYPE="${ADAPTER_TYPE:-itformer}"
# Empty means "do not pass --llm_attn_implementation to train_sft.py".
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-}"

# --- Hyper-parameters ---------------------------------------------------------
PRETRAIN_BATCH_SIZE="${PRETRAIN_BATCH_SIZE:-512}"
PRETRAIN_GRADIENT_ACCUMULATION_STEPS="${PRETRAIN_GRADIENT_ACCUMULATION_STEPS:-2}"
SFT_BATCH_SIZE="${SFT_BATCH_SIZE:-2}"
SFT_GRADIENT_ACCUMULATION_STEPS="${SFT_GRADIENT_ACCUMULATION_STEPS:-12}"
SFT_SAVE_TOTAL_LIMIT="${SFT_SAVE_TOTAL_LIMIT:-2}"
SFT_LOGGING_STEPS="${SFT_LOGGING_STEPS:-5}"
PRETRAIN_EPOCHS="${PRETRAIN_EPOCHS:-10}"
SFT_EPOCHS="${SFT_EPOCHS:-2}"
PRETRAIN_LR="${PRETRAIN_LR:-3e-4}"
SFT_LR="${SFT_LR:-2e-5}"

# --- Logging ------------------------------------------------------------------
LOG_DIR="${LOG_DIR:-logs}"
LOG_FILE="${LOG_FILE:-${LOG_DIR}/full_train_4gpu_${RUN_ID}.log}"
# LOG_FILE can be overridden to a location outside LOG_DIR, so its parent
# directory is created as well; otherwise the touch below would fail.
mkdir -p "$LOG_DIR" "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"

#######################################
# Print a failure report and terminate the script.
# Globals:   LOG_FILE (read)
# Arguments: $1 - exit status to terminate with
#            $2 - line number where the failure happened
#            $3 - failing command (or an explanatory message)
# Outputs:   the report, written to stderr
# Returns:   never returns; exits with status $1
#######################################
on_error() {
  local exit_code="$1"
  local line_no="$2"
  local failed_command="$3"

  {
    echo ""
    echo "========== TRAINING FAILED =========="
    echo "Time: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "Exit code: $exit_code"
    echo "Line: $line_no"
    echo "Command: $failed_command"
    echo "Log file: $LOG_FILE"
    echo "====================================="
  } >&2

  exit "$exit_code"
}

#######################################
# Abort with a custom message for failures the script detects itself.
# Arguments: $1  - line number to report (callers pass "$LINENO")
#            $2+ - message words, joined with spaces
# Outputs:   the message to stderr, followed by on_error's report
# Returns:   never returns; on_error exits with status 1
#######################################
fail() {
  local line_no="$1"
  shift
  local message="$*"
  echo "$message" >&2
  on_error 1 "$line_no" "$message"
}

# Route every failing command through on_error. Single quotes delay expansion,
# so $?, $LINENO and $BASH_COMMAND are evaluated when the trap fires.
trap 'on_error "$?" "$LINENO" "$BASH_COMMAND"' ERR
# From here on, stdout and stderr both go to the terminal and are appended to
# the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# --- Environment exported to the training processes ---------------------------
export CUDA_VISIBLE_DEVICES
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline

echo "Log file: $LOG_FILE"

# --- Preflight checks ---------------------------------------------------------
# Each check stops the run with an actionable message before any GPU time is
# spent.

if [ ! -x "$PYTHON_BIN" ]; then
  fail "$LINENO" "Python executable not found or not executable: $PYTHON_BIN. Set PYTHON_BIN=/path/to/bin/python and rerun."
fi

if [ ! -f "$ACCELERATE_CONFIG" ]; then
  fail "$LINENO" "Accelerate config not found: $ACCELERATE_CONFIG"
fi

# The model directory must hold either a single-file checkpoint or a shard index.
if [ ! -f "$LLM_MODEL_PATH/model.safetensors" ] &&
  [ ! -f "$LLM_MODEL_PATH/model.safetensors.index.json" ]; then
  fail "$LINENO" "LLM model not found under: $LLM_MODEL_PATH"
fi

if [ ! -f "dataset/datasets/time_series_data.h5" ] ||
  [ ! -f "dataset/datasets/train_qa.jsonl" ] ||
  [ ! -f "dataset/datasets/test_qa.jsonl" ]; then
  fail "$LINENO" "Dataset files are missing under dataset/datasets/"
fi

# accelerate is taken from the same bin/ directory as the selected interpreter.
ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"

if [ ! -x "$ACCELERATE_BIN" ]; then
  fail "$LINENO" "Accelerate executable not found or not executable: $ACCELERATE_BIN. Install accelerate in the selected Python environment or set PYTHON_BIN to the right environment."
fi

# --- Run summary (also captured in the log file) ------------------------------
echo "Project dir: $PROJECT_DIR"
echo "Run id: $RUN_ID"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "Accelerate config: $ACCELERATE_CONFIG"
echo "Python: $PYTHON_BIN"
echo "Accelerate: $ACCELERATE_BIN"
echo "RUN_STAGE_A: $RUN_STAGE_A"
echo "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
echo "Adapter type: $ADAPTER_TYPE"
# An empty setting is displayed as "default".
echo "LLM attention implementation: ${LLM_ATTN_IMPLEMENTATION:-default}"

# --- Python environment check -------------------------------------------------
# Runs an inline Python program in the selected interpreter. The requested
# attention implementation is passed as argv[1]; the quoted 'PY' delimiter keeps
# the shell from expanding anything inside the program. A SystemExit with a
# message makes the interpreter exit non-zero, which aborts the script.
"$PYTHON_BIN" - "$LLM_ATTN_IMPLEMENTATION" <<'PY'
import importlib.util
import sys
import torch
import transformers
import accelerate

attn_implementation = sys.argv[1]

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())

if not torch.cuda.is_available() or torch.cuda.device_count() < 4:
    raise SystemExit("Expected at least 4 visible CUDA devices for this script.")

if attn_implementation == "flash_attention_2":
    # Look the package up once and reuse the result for the report and the check.
    flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
    print("flash_attn installed:", flash_attn_installed)
    if not flash_attn_installed:
        raise SystemExit("LLM_ATTN_IMPLEMENTATION=flash_attention_2 requires the flash-attn package in this Python environment.")
PY

#######################################
# Move an existing file or directory out of the way so a new run can recreate it.
# Globals:   RUN_ID (read) - used as the backup suffix
# Arguments: $1 - path to back up; nothing happens when it does not exist
# Outputs:   one line on stdout naming the source and the backup
# Returns:   0 when nothing had to be done, otherwise the status of mv
#######################################
backup_dir_if_exists() {
  local path="$1"

  # A trailing slash would turn "<path>.bak_<id>" into a name inside the
  # directory being moved, so strip it (but leave a bare "/" alone).
  while [[ "$path" == */ && "$path" != "/" ]]; do
    path="${path%/}"
  done

  if [ ! -e "$path" ]; then
    return 0
  fi

  local backup="${path}.bak_${RUN_ID}"
  # When RUN_ID is reused the backup name may already be taken; mv would then
  # move $path *into* that directory. Pick the first free numbered name instead.
  if [ -e "$backup" ]; then
    local n=1
    while [ -e "${backup}.${n}" ]; do
      n=$((n + 1))
    done
    backup="${backup}.${n}"
  fi

  echo "Backing up existing $path -> $backup"
  mv -- "$path" "$backup"
}

# --- Decide whether Stage A runs ----------------------------------------------
# auto  : run pre-training only when the encoder checkpoint is missing. When it
#         exists, RUN_STAGE_A stays "auto", which the check below treats as
#         "do not run".
# true  : always run pre-training.
# false : never run pre-training (the checkpoint must already exist).
case "$RUN_STAGE_A" in
  auto)
    if [ -f "$TS_ENCODER_CHECKPOINT" ]; then
      echo "Stage A: skipped; using existing TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
    else
      RUN_STAGE_A=true
    fi
    ;;
  true | false)
    ;;
  *)
    fail "$LINENO" "Invalid RUN_STAGE_A=$RUN_STAGE_A. Use auto, true, or false."
    ;;
esac

# --- Stage A: pre-train the time-series encoder -------------------------------
if [ "$RUN_STAGE_A" = true ]; then
  # Keep the previous final checkpoint directory instead of overwriting it.
  backup_dir_if_exists "$PRETRAIN_FINAL_DIR"

  echo "Stage A: pre-training TimeSeriesEncoder"
  "$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_pretrain.py \
    --model TimeSeriesEncoder \
    --d_model 512 \
    --n_heads 8 \
    --e_layers 4 \
    --patch_len 60 \
    --stride 60 \
    --input_len 600 \
    --output_dir "$PRETRAIN_WORK_DIR" \
    --per_device_train_batch_size "$PRETRAIN_BATCH_SIZE" \
    --gradient_accumulation_steps "$PRETRAIN_GRADIENT_ACCUMULATION_STEPS" \
    --learning_rate "$PRETRAIN_LR" \
    --num_train_epochs "$PRETRAIN_EPOCHS" \
    --dataloader_num_workers 8 \
    --report_to swanlab

  # After a fresh pre-training run the checkpoint is always taken from
  # PRETRAIN_FINAL_DIR, replacing any TS_ENCODER_CHECKPOINT set by the caller.
  # NOTE(review): training writes to --output_dir "$PRETRAIN_WORK_DIR" and this
  # script never copies anything into PRETRAIN_FINAL_DIR, so presumably
  # train_pretrain.py saves the final model there itself -- confirm.
  TS_ENCODER_CHECKPOINT="${PRETRAIN_FINAL_DIR}/model.safetensors"

  if [ ! -f "$TS_ENCODER_CHECKPOINT" ]; then
    fail "$LINENO" "Pretrain finished, but $TS_ENCODER_CHECKPOINT was not created."
  fi
elif [ ! -f "$TS_ENCODER_CHECKPOINT" ]; then
  # Stage A was not run, so Stage B depends on a checkpoint that already exists.
  fail "$LINENO" "RUN_STAGE_A=false but TS encoder checkpoint does not exist: $TS_ENCODER_CHECKPOINT"
fi

# --- Stage B: supervised fine-tuning ------------------------------------------
# Optional flags are collected in an array so that each one stays a separate
# word and nothing is passed when the option is not set.
SFT_EXTRA_ARGS=()
if [ -n "$LLM_ATTN_IMPLEMENTATION" ]; then
  SFT_EXTRA_ARGS+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi

echo "Stage B: supervised fine-tuning"
# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array. A plain
# "${SFT_EXTRA_ARGS[@]}" aborts with "unbound variable" under `set -u` on
# bash older than 4.4, and the array is empty by default.
# NOTE(review): the encoder/adapter dimensions below are hard-coded; confirm
# they match the checkpoint and the LLM at $LLM_MODEL_PATH.
"$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG" train_sft.py \
  --model TimeSeriesEncoder \
  --d_model 512 \
  --n_heads 8 \
  --e_layers 4 \
  --patch_len 60 \
  --stride 60 \
  --input_len 600 \
  --it_d_model 896 \
  --it_n_heads 8 \
  --it_layers 2 \
  --prefix_num 25 \
  --adapter_type "$ADAPTER_TYPE" \
  ${SFT_EXTRA_ARGS[@]+"${SFT_EXTRA_ARGS[@]}"} \
  --llm_model_path "$LLM_MODEL_PATH" \
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT" \
  --output_dir "$SFT_OUTPUT_DIR" \
  --per_device_train_batch_size "$SFT_BATCH_SIZE" \
  --gradient_accumulation_steps "$SFT_GRADIENT_ACCUMULATION_STEPS" \
  --bf16 \
  --learning_rate "$SFT_LR" \
  --num_train_epochs "$SFT_EPOCHS" \
  --dataloader_num_workers 4 \
  --logging_steps "$SFT_LOGGING_STEPS" \
  --save_total_limit "$SFT_SAVE_TOTAL_LIMIT" \
  --report_to swanlab

# --- Final summary ------------------------------------------------------------
# Reached only when both stages succeeded; any failure exits through on_error.
echo "Done."
echo "TS encoder checkpoint: $TS_ENCODER_CHECKPOINT"
echo "SFT output dir: $SFT_OUTPUT_DIR"