#!/usr/bin/env bash
#
# ITFormer on Time-MQA TSQA: data-contract check, training, and evaluation.
#
# Every setting in this script can be overridden through an environment
# variable of the same name; the value after ":-" is the default.
#
# Control flags (the literal value "1" enables, anything else disables):
#   SMOKE_TEST       use the small smoke-test defaults instead of the full run
#   DATA_CHECK_ONLY  stop right after the data-contract check
#   DRY_RUN          print the train/eval commands instead of executing them
#   FORCE_TRAIN      train even if a final checkpoint already exists
#   FORCE_EVAL       evaluate even if evaluation outputs already exist

# -E: ERR traps are inherited by functions, -e: exit on unhandled failure,
# -u: expanding an unset variable is an error, pipefail: a pipeline fails if
# any of its stages fails.
set -Eeuo pipefail

# --- Locations -------------------------------------------------------------
# SCRIPT_DIR is the directory holding this script; ITFORMER_DIR is its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ITFORMER_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
MQA_DIR="${MQA_DIR:-/mnt/share01/sqk/MQA}"
DATA_ROOT="${DATA_ROOT:-/mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp}"
TRAIN_PATH="${TRAIN_PATH:-${DATA_ROOT}/train.jsonl}"
EVAL_PATH="${EVAL_PATH:-${DATA_ROOT}/eval.jsonl}"

# --- Toolchain and model assets --------------------------------------------
PYTHON_BIN="${PYTHON_BIN:-/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# By default accelerate is taken from the same bin/ directory as PYTHON_BIN.
ACCELERATE_BIN="${ACCELERATE_BIN:-$(dirname "$PYTHON_BIN")/accelerate}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-${ITFORMER_DIR}/accelerate_config_2gpu.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
TS_ENCODER_CHECKPOINT="${TS_ENCODER_CHECKPOINT:-${ITFORMER_DIR}/save/pretrain/model.safetensors}"

# --- Run identity and control flags ----------------------------------------
# RUN_ID defaults to a timestamp such as 20240131_235959.
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
SMOKE_TEST="${SMOKE_TEST:-0}"
DATA_CHECK_ONLY="${DATA_CHECK_ONLY:-0}"
DRY_RUN="${DRY_RUN:-0}"
FORCE_TRAIN="${FORCE_TRAIN:-0}"
FORCE_EVAL="${FORCE_EVAL:-0}"
# Mode-dependent defaults. SMOKE_TEST=1 selects a tiny run (8 train / 8 eval
# samples, one epoch, a save every step, no dataloader workers); any other
# value selects the full run, where a sample cap of 0 means "use all samples".
# Values already present in the environment always win over these defaults.
case "${SMOKE_TEST:-0}" in
  1)
    RUN_NAME="${RUN_NAME:-itformer_tsqa_smoke_${RUN_ID}}"
    TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-8}"
    EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-8}"
    NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-1}"
    TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-1}"
    GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-1}"
    SAVE_STEPS="${SAVE_STEPS:-1}"
    MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-64}"
    DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-0}"
    EVAL_NUM_WORKERS="${EVAL_NUM_WORKERS:-0}"
    ;;
  *)
    RUN_NAME="${RUN_NAME:-itformer_tsqa_full_${RUN_ID}}"
    TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-0}"
    EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-0}"
    NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-2}"
    TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-1}"
    GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-2}"
    SAVE_STEPS="${SAVE_STEPS:-500}"
    MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-256}"
    DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-4}"
    EVAL_NUM_WORKERS="${EVAL_NUM_WORKERS:-2}"
    ;;
esac
# --- Output layout ----------------------------------------------------------
# Everything for one run lives under RUN_ROOT unless overridden individually.
RUN_ROOT="${RUN_ROOT:-${ITFORMER_DIR}/runs/${RUN_NAME}}"
CHECKPOINT_DIR="${CHECKPOINT_DIR:-${RUN_ROOT}/checkpoint_final}"
EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-${RUN_ROOT}/eval}"
LOG_DIR="${LOG_DIR:-${RUN_ROOT}/logs}"
TRAIN_LOG="${TRAIN_LOG:-${LOG_DIR}/train.log}"
EVAL_LOG="${EVAL_LOG:-${LOG_DIR}/eval.log}"

# --- Launcher ports (training and evaluation use different ports) -----------
TRAIN_PORT="${TRAIN_PORT:-30610}"
EVAL_PORT="${EVAL_PORT:-30611}"

# --- Model and optimisation settings forwarded to the Python entry points ---
SEED="${SEED:-42}"
INPUT_LEN="${INPUT_LEN:-600}"
PATCH_LEN="${PATCH_LEN:-60}"
STRIDE="${STRIDE:-60}"
PREFIX_NUM="${PREFIX_NUM:-25}"
IT_D_MODEL="${IT_D_MODEL:-896}"
IT_N_HEADS="${IT_N_HEADS:-16}"
IT_LAYERS="${IT_LAYERS:-2}"
LEARNING_RATE="${LEARNING_RATE:-5e-5}"
MAX_GRAD_NORM="${MAX_GRAD_NORM:-1.0}"
WEIGHT_DECAY="${WEIGHT_DECAY:-1e-6}"
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-1}"
MAX_SEQ_LENGTH="${MAX_SEQ_LENGTH:-4096}"
# Empty by default; an empty value is kept as-is (no fallback is substituted).
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-}"
LLM_TORCH_DTYPE="${LLM_TORCH_DTYPE:-bfloat16}"

# --- LoRA -------------------------------------------------------------------
USE_LORA="${USE_LORA:-true}"
LORA_R="${LORA_R:-16}"
LORA_ALPHA="${LORA_ALPHA:-32}"
LORA_DROPOUT="${LORA_DROPOUT:-0.05}"
# Whitespace-separated list of module names.
LORA_TARGET_MODULES="${LORA_TARGET_MODULES:-q_proj k_proj v_proj o_proj gate_proj up_proj down_proj}"
GRADIENT_CHECKPOINTING="${GRADIENT_CHECKPOINTING:-true}"
# Split the list into an array so each module can be passed as its own
# command-line argument (-r keeps backslashes literal).
read -r -a LORA_TARGET_MODULE_ARRAY <<< "$LORA_TARGET_MODULES"
# --- Environment handed to the Python child processes ------------------------
export ITFORMER_DIR
export MQA_DIR
export CUDA_VISIBLE_DEVICES
# Put ITFORMER_DIR and MQA_DIR first on PYTHONPATH; an existing PYTHONPATH is
# appended after them (the ":+" form avoids a trailing ":" when it is empty).
export PYTHONPATH="${ITFORMER_DIR}:${MQA_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
# Cache directories default to locations under /dev/shm and are created
# up front so the child processes find them in place.
export MPLCONFIGDIR="${MPLCONFIGDIR:-/dev/shm/itformer_tsqa_cache/matplotlib}"
export XDG_CACHE_HOME="${XDG_CACHE_HOME:-/dev/shm/itformer_tsqa_cache/xdg}"
mkdir -p "$MPLCONFIGDIR" "$XDG_CACHE_HOME"
#######################################
# Abort the script unless a regular file exists at the given path.
# Arguments:
#   $1 - path that must name an existing regular file
#   $2 - human-readable label used in the error message
#        (optional; defaults to "file")
# Outputs:
#   On failure writes "Missing <label>: <path>" to stderr.
# Returns:
#   0 when the file exists; otherwise exits the shell with status 1.
#######################################
require_file() {
  # Defaults keep a call with a missing argument from tripping `set -u`
  # with an "unbound variable" error instead of the intended message.
  local path="${1:-}"
  local label="${2:-file}"
  if [[ ! -f "${path}" ]]; then
    printf 'Missing %s: %s\n' "${label}" "${path}" >&2
    exit 1
  fi
}
# --- Preflight: fail fast if any required input is missing -------------------
# Each call exits the script with status 1 on the first missing file.
require_file "$PYTHON_BIN" "Python executable"
require_file "$ACCELERATE_BIN" "Accelerate executable"
require_file "$ACCELERATE_CONFIG" "Accelerate config"
require_file "$TRAIN_PATH" "TSQA train split"
require_file "$EVAL_PATH" "TSQA eval split"
require_file "$LLM_MODEL_PATH/config.json" "Qwen2.5-7B config"
require_file "$TS_ENCODER_CHECKPOINT" "ITFormer pretrained time-series encoder"
require_file "$ITFORMER_DIR/dataset/tsqa_dataset.py" "TSQA data adapter"
require_file "$ITFORMER_DIR/train_sft_tsqa.py" "ITFormer TSQA trainer"
require_file "$ITFORMER_DIR/inference_tsqa.py" "ITFormer TSQA evaluator"

# --- Data-contract check -------------------------------------------------------
# Runs the data adapter module as a script on both splits. Under `set -e` a
# non-zero exit from it stops the whole script here.
echo "Running ITFormer TSQA data-contract check."
"$PYTHON_BIN" "$ITFORMER_DIR/dataset/tsqa_dataset.py" \
  --train_path "$TRAIN_PATH" \
  --eval_path "$EVAL_PATH" \
  --model_path "$LLM_MODEL_PATH" \
  --prefix_num "$PREFIX_NUM" \
  --input_len "$INPUT_LEN" \
  --samples 8

if [[ "$DATA_CHECK_ONLY" == "1" ]]; then
  echo "DATA_CHECK_ONLY=1: data adapter check passed; stopping before GPU work."
  exit 0
fi
# --- Command lines -------------------------------------------------------------
# Both commands are kept as arrays so that every argument stays a single word
# regardless of spaces in paths. LORA_TARGET_MODULE_ARRAY expands to one
# argument per module name.

# Training: `accelerate launch` of train_sft_tsqa.py.
train_cmd=(
  "$ACCELERATE_BIN" launch
  --config_file "$ACCELERATE_CONFIG"
  --main_process_port "$TRAIN_PORT"
  "$ITFORMER_DIR/train_sft_tsqa.py"
  --train_path "$TRAIN_PATH"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT"
  --output_dir "$CHECKPOINT_DIR"
  --max_train_samples "$TRAIN_MAX_SAMPLES"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  --max_seq_length "$MAX_SEQ_LENGTH"
  --seed "$SEED"
  --input_len "$INPUT_LEN"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --prefix_num "$PREFIX_NUM"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --llm_torch_dtype "$LLM_TORCH_DTYPE"
  # The trainer receives USE_LORA as a value (the evaluator gets a bare switch).
  --use_lora "$USE_LORA"
  --lora_r "$LORA_R"
  --lora_alpha "$LORA_ALPHA"
  --lora_dropout "$LORA_DROPOUT"
  --lora_target_modules "${LORA_TARGET_MODULE_ARRAY[@]}"
  --gradient_checkpointing "$GRADIENT_CHECKPOINTING"
  --per_device_train_batch_size "$TRAIN_BATCH_SIZE"
  --per_device_eval_batch_size "$EVAL_BATCH_SIZE"
  --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS"
  --learning_rate "$LEARNING_RATE"
  --max_grad_norm "$MAX_GRAD_NORM"
  --weight_decay "$WEIGHT_DECAY"
  --num_train_epochs "$NUM_TRAIN_EPOCHS"
  --save_steps "$SAVE_STEPS"
  --logging_steps 1
  --dataloader_num_workers "$DATALOADER_NUM_WORKERS"
  --report_to none
  --bf16
)

# Evaluation: `accelerate launch` of inference_tsqa.py against CHECKPOINT_DIR.
eval_cmd=(
  "$ACCELERATE_BIN" launch
  --config_file "$ACCELERATE_CONFIG"
  --main_process_port "$EVAL_PORT"
  "$ITFORMER_DIR/inference_tsqa.py"
  --checkpoint "$CHECKPOINT_DIR"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --output_dir "$EVAL_OUTPUT_DIR"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  --seed "$SEED"
  --input_len "$INPUT_LEN"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --prefix_num "$PREFIX_NUM"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --llm_torch_dtype "$LLM_TORCH_DTYPE"
  --lora_r "$LORA_R"
  --lora_alpha "$LORA_ALPHA"
  --lora_dropout "$LORA_DROPOUT"
  --lora_target_modules "${LORA_TARGET_MODULE_ARRAY[@]}"
  --batch_size "$EVAL_BATCH_SIZE"
  --num_workers "$EVAL_NUM_WORKERS"
  --max_new_tokens "$MAX_NEW_TOKENS"
  --bf16
)

# The evaluator takes --use_lora as a bare switch: add it when USE_LORA is
# "true" (compared case-insensitively) or "1".
if [[ "${USE_LORA,,}" == "true" || "$USE_LORA" == "1" ]]; then
  eval_cmd+=(--use_lora)
fi

# Forward the attention implementation to both commands only when one is set.
if [[ -n "$LLM_ATTN_IMPLEMENTATION" ]]; then
  train_cmd+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
  eval_cmd+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi
#######################################
# Print the run configuration banner.
# Globals (read):
#   SMOKE_TEST, TRAIN_MAX_SAMPLES, EVAL_MAX_SAMPLES, CHECKPOINT_DIR,
#   EVAL_OUTPUT_DIR, CUDA_VISIBLE_DEVICES, TRAIN_BATCH_SIZE,
#   GRADIENT_ACCUMULATION_STEPS, LEARNING_RATE, MAX_GRAD_NORM, IT_D_MODEL,
#   IT_N_HEADS, IT_LAYERS, USE_LORA, LORA_R, LORA_ALPHA, LORA_DROPOUT,
#   LORA_TARGET_MODULES, GRADIENT_CHECKPOINTING, MAX_SEQ_LENGTH
# Outputs:
#   Writes the banner to stdout, one setting per line.
#######################################
print_run_summary() {
  local rule="============================================================"
  local mode_label="full"
  if [[ "${SMOKE_TEST:-0}" == "1" ]]; then
    mode_label="smoke"
  fi

  printf '%s\n' \
    "${rule}" \
    "ITFormer on Time-MQA TSQA" \
    "Mode: ${mode_label}" \
    "Train samples: ${TRAIN_MAX_SAMPLES} (0 means all)" \
    "Eval samples: ${EVAL_MAX_SAMPLES} (0 means all)" \
    "Checkpoint: ${CHECKPOINT_DIR}" \
    "Evaluation: ${EVAL_OUTPUT_DIR}" \
    "GPUs: ${CUDA_VISIBLE_DEVICES}" \
    "Train batch/GPU: ${TRAIN_BATCH_SIZE}" \
    "Gradient accumulation: ${GRADIENT_ACCUMULATION_STEPS}" \
    "Learning rate / max grad norm: ${LEARNING_RATE} / ${MAX_GRAD_NORM}" \
    "ITFormer hidden size / heads / layers: ${IT_D_MODEL} / ${IT_N_HEADS} / ${IT_LAYERS}" \
    "LLM LoRA: ${USE_LORA} (r=${LORA_R}, alpha=${LORA_ALPHA}, dropout=${LORA_DROPOUT})" \
    "LoRA targets: ${LORA_TARGET_MODULES}" \
    "Gradient checkpointing: ${GRADIENT_CHECKPOINTING}" \
    "Max train sequence length: ${MAX_SEQ_LENGTH} (longer samples are skipped)" \
    "${rule}"
}

print_run_summary

# DRY_RUN=1: show both commands and stop without creating any run directory.
# %q shell-quotes every argument, so arguments containing spaces stay
# unambiguous in the printed line.
if [[ "${DRY_RUN:-0}" == "1" ]]; then
  printf 'TRAIN: '
  printf '%q ' "${train_cmd[@]}"
  printf '\nEVAL: '
  printf '%q ' "${eval_cmd[@]}"
  printf '\n'
  exit 0
fi
# --- Execution -------------------------------------------------------------------
# Create the run and log directories, including the parents of the log files
# in case TRAIN_LOG / EVAL_LOG were overridden to point outside LOG_DIR.
mkdir -p "$RUN_ROOT" "$LOG_DIR" "$(dirname "$TRAIN_LOG")" "$(dirname "$EVAL_LOG")"

#######################################
# Test whether a directory holds a final checkpoint.
# Arguments:
#   $1 - checkpoint directory
# Returns:
#   0 when <dir>/config.json exists and at least one <dir>/model*.safetensors
#   matches; non-zero otherwise.
#######################################
checkpoint_is_ready() {
  local dir="$1"
  # compgen -G succeeds only when the glob matches at least one path.
  [[ -f "${dir}/config.json" ]] \
    && compgen -G "${dir}/model*.safetensors" >/dev/null
}

checkpoint_ready=0
if checkpoint_is_ready "$CHECKPOINT_DIR"; then
  checkpoint_ready=1
fi

# Train unless a final checkpoint is already present (FORCE_TRAIN=1 overrides).
# With pipefail and -e a failing training command aborts the script even
# though its output is piped through tee.
trained_this_run=0
if [[ "$FORCE_TRAIN" != "1" && "$checkpoint_ready" == "1" ]]; then
  echo "Training skipped: final checkpoint already exists at ${CHECKPOINT_DIR}"
else
  echo "Starting ITFormer TSQA training. Log: ${TRAIN_LOG}"
  "${train_cmd[@]}" 2>&1 | tee "$TRAIN_LOG"
  trained_this_run=1
fi

if ! checkpoint_is_ready "$CHECKPOINT_DIR"; then
  echo "Training did not produce a loadable final checkpoint: ${CHECKPOINT_DIR}" >&2
  exit 1
fi

# Existing evaluation outputs are reused only when they are complete (both
# files this script requires below) and the checkpoint was not retrained in
# this invocation; otherwise the metrics would describe an older checkpoint.
eval_outputs_ready=0
if [[ -f "$EVAL_OUTPUT_DIR/metrics.json" && -f "$EVAL_OUTPUT_DIR/predictions.jsonl" ]]; then
  eval_outputs_ready=1
fi

if [[ "$FORCE_EVAL" != "1" && "$trained_this_run" == "0" && "$eval_outputs_ready" == "1" ]]; then
  echo "Evaluation skipped: ${EVAL_OUTPUT_DIR}/metrics.json already exists."
else
  echo "Starting unified TSQA evaluation. Log: ${EVAL_LOG}"
  mkdir -p "$EVAL_OUTPUT_DIR"
  "${eval_cmd[@]}" 2>&1 | tee "$EVAL_LOG"
fi

# Final contract: both evaluation artefacts must exist.
require_file "$EVAL_OUTPUT_DIR/metrics.json" "unified TSQA metrics"
require_file "$EVAL_OUTPUT_DIR/predictions.jsonl" "TSQA predictions"
echo "Done."
echo "Metrics: ${EVAL_OUTPUT_DIR}/metrics.json"
echo "Predictions: ${EVAL_OUTPUT_DIR}/predictions.jsonl"