ITFormer / scripts /run_tsqa_2gpu.sh
a12354's picture
Add files using upload-large-folder tool
aabdb98 verified
Raw
History Blame Contribute Delete
10 kB
#!/usr/bin/env bash
#
# ITFormer on Time-MQA TSQA: data check, training and evaluation driver.
# Every setting below can be overridden from the environment; the value
# after `:=` is applied only when the variable is unset or empty.
set -Eeuo pipefail

# --- Repository layout ------------------------------------------------------
# ITFORMER_DIR is the parent of the directory that holds this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ITFORMER_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
: "${MQA_DIR:=/mnt/share01/sqk/MQA}"

# --- Dataset ----------------------------------------------------------------
: "${DATA_ROOT:=/mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp}"
: "${TRAIN_PATH:=${DATA_ROOT}/train.jsonl}"
: "${EVAL_PATH:=${DATA_ROOT}/eval.jsonl}"

# --- Tooling ----------------------------------------------------------------
: "${PYTHON_BIN:=/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# By default `accelerate` is taken from the same bin/ directory as Python.
if [[ -z "${ACCELERATE_BIN:-}" ]]; then
  ACCELERATE_BIN="$(dirname "$PYTHON_BIN")/accelerate"
fi
: "${ACCELERATE_CONFIG:=${ITFORMER_DIR}/accelerate_config_2gpu.yaml}"
: "${CUDA_VISIBLE_DEVICES:=0,1}"

# --- Model weights ----------------------------------------------------------
: "${LLM_MODEL_PATH:=/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
: "${TS_ENCODER_CHECKPOINT:=${ITFORMER_DIR}/save/pretrain/model.safetensors}"

# --- Run identity and behaviour switches ("1" enables a switch) --------------
# RUN_ID defaults to a timestamp taken when the script starts.
if [[ -z "${RUN_ID:-}" ]]; then
  RUN_ID="$(date +%Y%m%d_%H%M%S)"
fi
: "${SMOKE_TEST:=0}"
: "${DATA_CHECK_ONLY:=0}"
: "${DRY_RUN:=0}"
: "${FORCE_TRAIN:=0}"
: "${FORCE_EVAL:=0}"
# Mode-dependent defaults. SMOKE_TEST=1 selects a tiny end-to-end run; any
# other value selects the full run. Values already present in the
# environment always win over these presets.
case "$SMOKE_TEST" in
  1)
    : "${RUN_NAME:=itformer_tsqa_smoke_${RUN_ID}}"
    : "${TRAIN_MAX_SAMPLES:=8}"
    : "${EVAL_MAX_SAMPLES:=8}"
    : "${NUM_TRAIN_EPOCHS:=1}"
    : "${TRAIN_BATCH_SIZE:=1}"
    : "${GRADIENT_ACCUMULATION_STEPS:=1}"
    : "${SAVE_STEPS:=1}"
    : "${MAX_NEW_TOKENS:=64}"
    : "${DATALOADER_NUM_WORKERS:=0}"
    : "${EVAL_NUM_WORKERS:=0}"
    ;;
  *)
    : "${RUN_NAME:=itformer_tsqa_full_${RUN_ID}}"
    # A sample limit of 0 means "use all samples".
    : "${TRAIN_MAX_SAMPLES:=0}"
    : "${EVAL_MAX_SAMPLES:=0}"
    : "${NUM_TRAIN_EPOCHS:=2}"
    : "${TRAIN_BATCH_SIZE:=1}"
    : "${GRADIENT_ACCUMULATION_STEPS:=2}"
    : "${SAVE_STEPS:=500}"
    : "${MAX_NEW_TOKENS:=256}"
    : "${DATALOADER_NUM_WORKERS:=4}"
    : "${EVAL_NUM_WORKERS:=2}"
    ;;
esac
# --- Run directory layout (everything hangs off RUN_ROOT by default) --------
: "${RUN_ROOT:=${ITFORMER_DIR}/runs/${RUN_NAME}}"
: "${CHECKPOINT_DIR:=${RUN_ROOT}/checkpoint_final}"
: "${EVAL_OUTPUT_DIR:=${RUN_ROOT}/eval}"
: "${LOG_DIR:=${RUN_ROOT}/logs}"
: "${TRAIN_LOG:=${LOG_DIR}/train.log}"
: "${EVAL_LOG:=${LOG_DIR}/eval.log}"

# --- Launcher ports (training and evaluation use different ones) ------------
: "${TRAIN_PORT:=30610}"
: "${EVAL_PORT:=30611}"

# --- Reproducibility and time-series input settings -------------------------
: "${SEED:=42}"
: "${INPUT_LEN:=600}"
: "${PATCH_LEN:=60}"
: "${STRIDE:=60}"
: "${PREFIX_NUM:=25}"

# --- ITFormer hidden size / attention heads / layers ------------------------
: "${IT_D_MODEL:=896}"
: "${IT_N_HEADS:=16}"
: "${IT_LAYERS:=2}"

# --- Optimisation -----------------------------------------------------------
: "${LEARNING_RATE:=5e-5}"
: "${MAX_GRAD_NORM:=1.0}"
: "${WEIGHT_DECAY:=1e-6}"
: "${EVAL_BATCH_SIZE:=1}"
: "${MAX_SEQ_LENGTH:=4096}"

# --- LLM loading --------------------------------------------------------------
# An empty LLM_ATTN_IMPLEMENTATION means the option is not passed at all.
: "${LLM_ATTN_IMPLEMENTATION:=}"
: "${LLM_TORCH_DTYPE:=bfloat16}"

# --- LoRA and memory ----------------------------------------------------------
: "${USE_LORA:=true}"
: "${LORA_R:=16}"
: "${LORA_ALPHA:=32}"
: "${LORA_DROPOUT:=0.05}"
: "${LORA_TARGET_MODULES:=q_proj k_proj v_proj o_proj gate_proj up_proj down_proj}"
: "${GRADIENT_CHECKPOINTING:=true}"

# Split the whitespace-separated module list into an array so each module is
# later passed to the Python scripts as its own argument.
read -r -a LORA_TARGET_MODULE_ARRAY <<< "$LORA_TARGET_MODULES"
# --- Environment handed to the child processes -------------------------------
export ITFORMER_DIR MQA_DIR CUDA_VISIBLE_DEVICES

# Put both source trees first on the import path; an inherited PYTHONPATH, if
# any, is kept after them.
PYTHONPATH="${ITFORMER_DIR}:${MQA_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
export PYTHONPATH

# Fixed settings: these three are always overwritten, never inherited.
TOKENIZERS_PARALLELISM=false
WANDB_MODE=offline
PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export TOKENIZERS_PARALLELISM WANDB_MODE PYTHONWARNINGS

# Overridable settings: the allocator configuration and two cache locations.
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True}"
: "${MPLCONFIGDIR:=/dev/shm/itformer_tsqa_cache/matplotlib}"
: "${XDG_CACHE_HOME:=/dev/shm/itformer_tsqa_cache/xdg}"
export PYTORCH_CUDA_ALLOC_CONF MPLCONFIGDIR XDG_CACHE_HOME

# Both cache directories must exist before anything is launched.
mkdir -p "$MPLCONFIGDIR" "$XDG_CACHE_HOME"
#######################################
# Abort the script unless a regular file exists at the given path.
# Arguments:
#   $1 - path that must exist as a regular file
#   $2 - human-readable label used in the error message
# Outputs:
#   Writes "Missing <label>: <path>" to stderr when the file is absent.
# Returns:
#   0 when the file exists; otherwise exits the shell with status 1.
#######################################
require_file() {
  local path="$1" label="$2"

  # Happy path first: nothing to report when the file is there.
  [[ -f "$path" ]] && return 0

  printf '%s\n' "Missing ${label}: ${path}" >&2
  exit 1
}
# Pre-flight: every input the pipeline needs, as (path, label) pairs. The
# pairs are checked in order and the script stops at the first missing file.
preflight_files=(
  "$PYTHON_BIN"                          "Python executable"
  "$ACCELERATE_BIN"                      "Accelerate executable"
  "$ACCELERATE_CONFIG"                   "Accelerate config"
  "$TRAIN_PATH"                          "TSQA train split"
  "$EVAL_PATH"                           "TSQA eval split"
  "$LLM_MODEL_PATH/config.json"          "Qwen2.5-7B config"
  "$TS_ENCODER_CHECKPOINT"               "ITFormer pretrained time-series encoder"
  "$ITFORMER_DIR/dataset/tsqa_dataset.py" "TSQA data adapter"
  "$ITFORMER_DIR/train_sft_tsqa.py"      "ITFormer TSQA trainer"
  "$ITFORMER_DIR/inference_tsqa.py"      "ITFormer TSQA evaluator"
)
for ((pf_idx = 0; pf_idx < ${#preflight_files[@]}; pf_idx += 2)); do
  require_file "${preflight_files[pf_idx]}" "${preflight_files[pf_idx + 1]}"
done
# Run the TSQA dataset module as a script against both splits before any
# training is launched. The sample count for this check is fixed at 8.
echo "Running ITFormer TSQA data-contract check."
data_check_cmd=(
  "$PYTHON_BIN" "$ITFORMER_DIR/dataset/tsqa_dataset.py"
  --train_path "$TRAIN_PATH"
  --eval_path "$EVAL_PATH"
  --model_path "$LLM_MODEL_PATH"
  --prefix_num "$PREFIX_NUM"
  --input_len "$INPUT_LEN"
  --samples 8
)
"${data_check_cmd[@]}"

# DATA_CHECK_ONLY=1 ends the script successfully right after the check.
if [[ "$DATA_CHECK_ONLY" == "1" ]]; then
  echo "DATA_CHECK_ONLY=1: data adapter check passed; stopping before GPU work."
  exit 0
fi
# ---------------------------------------------------------------------------
# Command assembly. train_cmd and eval_cmd are kept as arrays so every
# argument survives as a single word. Argument runs that are identical in
# both commands are declared once and spliced in at the same position.
# ---------------------------------------------------------------------------

# `accelerate launch` prefix; each stage appends its own port and script.
launcher=("$ACCELERATE_BIN" launch --config_file "$ACCELERATE_CONFIG")

# Seed, time-series input settings, ITFormer size and LLM dtype.
shared_args=(
  --seed "$SEED"
  --input_len "$INPUT_LEN"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --prefix_num "$PREFIX_NUM"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --llm_torch_dtype "$LLM_TORCH_DTYPE"
)

# LoRA hyperparameters; each target module becomes its own argument.
lora_args=(
  --lora_r "$LORA_R"
  --lora_alpha "$LORA_ALPHA"
  --lora_dropout "$LORA_DROPOUT"
  --lora_target_modules "${LORA_TARGET_MODULE_ARRAY[@]}"
)

train_cmd=(
  "${launcher[@]}"
  --main_process_port "$TRAIN_PORT"
  "$ITFORMER_DIR/train_sft_tsqa.py"
  --train_path "$TRAIN_PATH"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT"
  --output_dir "$CHECKPOINT_DIR"
  --max_train_samples "$TRAIN_MAX_SAMPLES"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  --max_seq_length "$MAX_SEQ_LENGTH"
  "${shared_args[@]}"
  # The trainer receives --use_lora together with its value.
  --use_lora "$USE_LORA"
  "${lora_args[@]}"
  --gradient_checkpointing "$GRADIENT_CHECKPOINTING"
  --per_device_train_batch_size "$TRAIN_BATCH_SIZE"
  --per_device_eval_batch_size "$EVAL_BATCH_SIZE"
  --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS"
  --learning_rate "$LEARNING_RATE"
  --max_grad_norm "$MAX_GRAD_NORM"
  --weight_decay "$WEIGHT_DECAY"
  --num_train_epochs "$NUM_TRAIN_EPOCHS"
  --save_steps "$SAVE_STEPS"
  --logging_steps 1
  --dataloader_num_workers "$DATALOADER_NUM_WORKERS"
  --report_to none
  --bf16
)

eval_cmd=(
  "${launcher[@]}"
  --main_process_port "$EVAL_PORT"
  "$ITFORMER_DIR/inference_tsqa.py"
  --checkpoint "$CHECKPOINT_DIR"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --output_dir "$EVAL_OUTPUT_DIR"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  "${shared_args[@]}"
  "${lora_args[@]}"
  --batch_size "$EVAL_BATCH_SIZE"
  --num_workers "$EVAL_NUM_WORKERS"
  --max_new_tokens "$MAX_NEW_TOKENS"
  --bf16
)

# The evaluator receives --use_lora as a bare flag, added only when USE_LORA
# is "true" (any letter case) or "1".
if [[ "${USE_LORA,,}" == "true" || "$USE_LORA" == "1" ]]; then
  eval_cmd+=(--use_lora)
fi

# The attention implementation is forwarded to both stages only when set.
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  attn_args=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
  train_cmd+=("${attn_args[@]}")
  eval_cmd+=("${attn_args[@]}")
fi
# Print a summary of the effective configuration, one setting per line.
if [[ "$SMOKE_TEST" == "1" ]]; then
  run_mode=smoke
else
  run_mode=full
fi
banner_rule="============================================================"
printf '%s\n' \
  "$banner_rule" \
  "ITFormer on Time-MQA TSQA" \
  "Mode: ${run_mode}" \
  "Train samples: ${TRAIN_MAX_SAMPLES} (0 means all)" \
  "Eval samples: ${EVAL_MAX_SAMPLES} (0 means all)" \
  "Checkpoint: ${CHECKPOINT_DIR}" \
  "Evaluation: ${EVAL_OUTPUT_DIR}" \
  "GPUs: ${CUDA_VISIBLE_DEVICES}" \
  "Train batch/GPU: ${TRAIN_BATCH_SIZE}" \
  "Gradient accumulation: ${GRADIENT_ACCUMULATION_STEPS}" \
  "Learning rate / max grad norm: ${LEARNING_RATE} / ${MAX_GRAD_NORM}" \
  "ITFormer hidden size / heads / layers: ${IT_D_MODEL} / ${IT_N_HEADS} / ${IT_LAYERS}" \
  "LLM LoRA: ${USE_LORA} (r=${LORA_R}, alpha=${LORA_ALPHA}, dropout=${LORA_DROPOUT})" \
  "LoRA targets: ${LORA_TARGET_MODULES}" \
  "Gradient checkpointing: ${GRADIENT_CHECKPOINTING}" \
  "Max train sequence length: ${MAX_SEQ_LENGTH} (longer samples are skipped)" \
  "$banner_rule"

# DRY_RUN=1: show both commands, shell-quoted via %q so they can be pasted
# back into a shell, then stop without launching anything.
if [[ "$DRY_RUN" == "1" ]]; then
  printf 'TRAIN: '
  printf '%q ' "${train_cmd[@]}"
  printf '\n'
  printf 'EVAL: '
  printf '%q ' "${eval_cmd[@]}"
  printf '\n'
  exit 0
fi
# ---------------------------------------------------------------------------
# Stage execution: train (unless a final checkpoint already exists), verify
# the checkpoint, evaluate (unless up-to-date results already exist), then
# verify the evaluation outputs.
#
# Resume rules:
#   * Training is skipped when CHECKPOINT_DIR already holds a complete
#     checkpoint, unless FORCE_TRAIN=1.
#   * Evaluation is skipped only when BOTH metrics.json and predictions.jsonl
#     exist AND training did not run in this invocation, unless FORCE_EVAL=1.
#     Results left over from an earlier checkpoint are therefore never
#     reported for a freshly trained one, and a partial evaluation directory
#     is re-evaluated instead of failing the final output check.
# ---------------------------------------------------------------------------

#######################################
# Test whether CHECKPOINT_DIR holds a complete final checkpoint.
# Globals:
#   CHECKPOINT_DIR (read)
# Returns:
#   0 when config.json and at least one model*.safetensors file are present,
#   non-zero otherwise.
#######################################
checkpoint_is_ready() {
  [[ -f "$CHECKPOINT_DIR/config.json" ]] \
    && compgen -G "$CHECKPOINT_DIR/model*.safetensors" >/dev/null
}

# TRAIN_LOG / EVAL_LOG can be overridden to live outside LOG_DIR, so their
# parent directories are created too; otherwise `tee` would fail.
mkdir -p "$RUN_ROOT" "$LOG_DIR" \
  "$(dirname "$TRAIN_LOG")" "$(dirname "$EVAL_LOG")"

# --- Training ---------------------------------------------------------------
trained_now=0
if [[ "$FORCE_TRAIN" != "1" ]] && checkpoint_is_ready; then
  echo "Training skipped: final checkpoint already exists at ${CHECKPOINT_DIR}"
else
  echo "Starting ITFormer TSQA training. Log: ${TRAIN_LOG}"
  # With `pipefail`, a failing trainer aborts the script despite the `tee`.
  "${train_cmd[@]}" 2>&1 | tee "$TRAIN_LOG"
  trained_now=1
fi

if ! checkpoint_is_ready; then
  echo "Training did not produce a loadable final checkpoint: ${CHECKPOINT_DIR}" >&2
  exit 1
fi

# --- Evaluation -------------------------------------------------------------
eval_outputs_ready=0
if [[ -f "$EVAL_OUTPUT_DIR/metrics.json" && -f "$EVAL_OUTPUT_DIR/predictions.jsonl" ]]; then
  eval_outputs_ready=1
fi

if [[ "$FORCE_EVAL" != "1" && "$trained_now" != "1" && "$eval_outputs_ready" == "1" ]]; then
  echo "Evaluation skipped: ${EVAL_OUTPUT_DIR}/metrics.json already exists."
else
  echo "Starting unified TSQA evaluation. Log: ${EVAL_LOG}"
  mkdir -p "$EVAL_OUTPUT_DIR"
  "${eval_cmd[@]}" 2>&1 | tee "$EVAL_LOG"
fi

# --- Final output check -----------------------------------------------------
require_file "$EVAL_OUTPUT_DIR/metrics.json" "unified TSQA metrics"
require_file "$EVAL_OUTPUT_DIR/predictions.jsonl" "TSQA predictions"

echo "Done."
echo "Metrics: ${EVAL_OUTPUT_DIR}/metrics.json"
echo "Predictions: ${EVAL_OUTPUT_DIR}/predictions.jsonl"