#!/usr/bin/env bash
#
# Configuration prelude for the ITFormer-on-Time-MQA-TSQA run script.
#
# Every setting below can be overridden from the environment; the value after
# ":-" is the default used when the variable is unset or empty.
#
# Switches (SMOKE_TEST, DATA_CHECK_ONLY, DRY_RUN, FORCE_TRAIN, FORCE_EVAL)
# accept 1/true/yes/on and 0/false/no/off in any letter case. They are
# normalised to "1" or "0" here so that every later `= "1"` comparison keeps
# working unchanged.

set -Eeuo pipefail

# -E makes functions, command substitutions and subshells inherit this trap,
# so an unexpected failure anywhere reports the command and line that died
# before `set -e` terminates the script.
trap 'printf "%s: line %s: command failed (exit %s): %s\n" "${BASH_SOURCE[0]}" "$LINENO" "$?" "$BASH_COMMAND" >&2' ERR

#######################################
# Normalise a boolean-like switch to "1" or "0".
# Arguments: $1 - variable name (used only in the warning text)
#            $2 - raw value (defaults to 0 when missing or empty)
# Outputs:   "1" or "0" on stdout; a warning on stderr for unknown values
# Returns:   always 0
#######################################
_normalize_flag() {
  local name="$1"
  local value="${2:-0}"
  case "${value,,}" in
    1 | true | yes | on)
      printf '1\n'
      ;;
    0 | false | no | off)
      printf '0\n'
      ;;
    *)
      # Historically anything other than "1" meant "off"; keep that meaning
      # but make the probable typo visible.
      printf 'Warning: %s=%s is not a recognised boolean; treating it as 0.\n' \
        "$name" "$value" >&2
      printf '0\n'
      ;;
  esac
}

# --- Locations -------------------------------------------------------------
# ITFORMER_DIR is the parent of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ITFORMER_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
MQA_DIR="${MQA_DIR:-/mnt/share01/sqk/MQA}"

DATA_ROOT="${DATA_ROOT:-/mnt/share01/sqk/datasets/Time-MQA_TSQA/tmp}"
TRAIN_PATH="${TRAIN_PATH:-${DATA_ROOT}/train.jsonl}"
EVAL_PATH="${EVAL_PATH:-${DATA_ROOT}/eval.jsonl}"

# --- Toolchain and hardware ------------------------------------------------
PYTHON_BIN="${PYTHON_BIN:-/home/suiqk/anaconda3/envs/scalerag-ts-v4/bin/python}"
# By default `accelerate` is taken from the same bin/ directory as PYTHON_BIN.
ACCELERATE_BIN="${ACCELERATE_BIN:-$(dirname "$PYTHON_BIN")/accelerate}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-${ITFORMER_DIR}/accelerate_config_2gpu.yaml}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"

# --- Model inputs ----------------------------------------------------------
LLM_MODEL_PATH="${LLM_MODEL_PATH:-/mnt/share01/sqk/models/qwen2.5-7b-instruct}"
TS_ENCODER_CHECKPOINT="${TS_ENCODER_CHECKPOINT:-${ITFORMER_DIR}/save/pretrain/model.safetensors}"

# --- Run identity and switches ---------------------------------------------
# RUN_ID defaults to a timestamp, so each invocation gets its own RUN_NAME
# unless RUN_ID or RUN_NAME is pinned by the caller.
RUN_ID="${RUN_ID:-$(date +%Y%m%d_%H%M%S)}"
SMOKE_TEST="$(_normalize_flag SMOKE_TEST "${SMOKE_TEST:-0}")"
DATA_CHECK_ONLY="$(_normalize_flag DATA_CHECK_ONLY "${DATA_CHECK_ONLY:-0}")"
DRY_RUN="$(_normalize_flag DRY_RUN "${DRY_RUN:-0}")"
FORCE_TRAIN="$(_normalize_flag FORCE_TRAIN "${FORCE_TRAIN:-0}")"
FORCE_EVAL="$(_normalize_flag FORCE_EVAL "${FORCE_EVAL:-0}")"

# --- Profile-dependent defaults --------------------------------------------
# Explicitly exported values always win; the profile only fills in the gaps.
if [ "$SMOKE_TEST" = "1" ]; then
  # Smoke profile: a handful of samples, one epoch, no worker processes.
  RUN_NAME="${RUN_NAME:-itformer_tsqa_smoke_${RUN_ID}}"
  TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-8}"
  EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-8}"
  NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-1}"
  TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-1}"
  GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-1}"
  SAVE_STEPS="${SAVE_STEPS:-1}"
  MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-64}"
  DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-0}"
  EVAL_NUM_WORKERS="${EVAL_NUM_WORKERS:-0}"
else
  # Full profile: a sample limit of 0 means "use every sample".
  RUN_NAME="${RUN_NAME:-itformer_tsqa_full_${RUN_ID}}"
  TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-0}"
  EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-0}"
  NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-2}"
  TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-1}"
  GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-2}"
  SAVE_STEPS="${SAVE_STEPS:-500}"
  MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-256}"
  DATALOADER_NUM_WORKERS="${DATALOADER_NUM_WORKERS:-4}"
  EVAL_NUM_WORKERS="${EVAL_NUM_WORKERS:-2}"
fi
# ---------------------------------------------------------------------------
# Run layout, hyper-parameters, environment, and the train -> eval pipeline.
#
# Flow:
#   1. Derive the output locations and hyper-parameters (all overridable).
#   2. Export the environment for the Python entry points.
#   3. Verify that every required input file exists.
#   4. Run the TSQA data-contract check (this runs even when DRY_RUN=1).
#   5. Train unless a final checkpoint already exists (FORCE_TRAIN=1 retrains).
#   6. Evaluate unless complete results already exist for that checkpoint
#      (FORCE_EVAL=1 re-evaluates).
# ---------------------------------------------------------------------------

# --- Output locations ------------------------------------------------------
RUN_ROOT="${RUN_ROOT:-${ITFORMER_DIR}/runs/${RUN_NAME}}"
CHECKPOINT_DIR="${CHECKPOINT_DIR:-${RUN_ROOT}/checkpoint_final}"
EVAL_OUTPUT_DIR="${EVAL_OUTPUT_DIR:-${RUN_ROOT}/eval}"
LOG_DIR="${LOG_DIR:-${RUN_ROOT}/logs}"
TRAIN_LOG="${TRAIN_LOG:-${LOG_DIR}/train.log}"
EVAL_LOG="${EVAL_LOG:-${LOG_DIR}/eval.log}"

# Training and evaluation use different rendezvous ports.
TRAIN_PORT="${TRAIN_PORT:-30610}"
EVAL_PORT="${EVAL_PORT:-30611}"

# --- Hyper-parameters ------------------------------------------------------
SEED="${SEED:-42}"
INPUT_LEN="${INPUT_LEN:-600}"
PATCH_LEN="${PATCH_LEN:-60}"
STRIDE="${STRIDE:-60}"
PREFIX_NUM="${PREFIX_NUM:-25}"
IT_D_MODEL="${IT_D_MODEL:-896}"
IT_N_HEADS="${IT_N_HEADS:-16}"
IT_LAYERS="${IT_LAYERS:-2}"
LEARNING_RATE="${LEARNING_RATE:-5e-5}"
MAX_GRAD_NORM="${MAX_GRAD_NORM:-1.0}"
WEIGHT_DECAY="${WEIGHT_DECAY:-1e-6}"
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-1}"
MAX_SEQ_LENGTH="${MAX_SEQ_LENGTH:-4096}"
# Empty means "do not pass --llm_attn_implementation at all".
LLM_ATTN_IMPLEMENTATION="${LLM_ATTN_IMPLEMENTATION:-}"
LLM_TORCH_DTYPE="${LLM_TORCH_DTYPE:-bfloat16}"
USE_LORA="${USE_LORA:-true}"
LORA_R="${LORA_R:-16}"
LORA_ALPHA="${LORA_ALPHA:-32}"
LORA_DROPOUT="${LORA_DROPOUT:-0.05}"
LORA_TARGET_MODULES="${LORA_TARGET_MODULES:-q_proj k_proj v_proj o_proj gate_proj up_proj down_proj}"
GRADIENT_CHECKPOINTING="${GRADIENT_CHECKPOINTING:-true}"

# Split the whitespace-separated module list so that each module reaches the
# Python scripts as its own argument.
read -r -a LORA_TARGET_MODULE_ARRAY <<< "$LORA_TARGET_MODULES"

# --- Environment for the Python processes ----------------------------------
export ITFORMER_DIR
export MQA_DIR
export CUDA_VISIBLE_DEVICES
# Prepend both source trees; keep any PYTHONPATH the caller already had.
export PYTHONPATH="${ITFORMER_DIR}:${MQA_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
export TOKENIZERS_PARALLELISM=false
export WANDB_MODE=offline
export PYTHONWARNINGS="ignore::FutureWarning:transformers.utils.hub"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
export MPLCONFIGDIR="${MPLCONFIGDIR:-/dev/shm/itformer_tsqa_cache/matplotlib}"
export XDG_CACHE_HOME="${XDG_CACHE_HOME:-/dev/shm/itformer_tsqa_cache/xdg}"
mkdir -p "$MPLCONFIGDIR" "$XDG_CACHE_HOME"

#######################################
# Abort the script unless a regular file exists.
# Arguments: $1 - path to check
#            $2 - human-readable label used in the error message
# Outputs:   "Missing <label>: <path>" on stderr when the file is absent
# Returns:   0 when the file exists; otherwise exits the script with status 1
#######################################
require_file() {
  local path="$1"
  local label="$2"
  if [ ! -f "$path" ]; then
    echo "Missing ${label}: ${path}" >&2
    exit 1
  fi
}

#######################################
# Report whether CHECKPOINT_DIR holds a final checkpoint.
# Globals:   CHECKPOINT_DIR (read)
# Returns:   0 when config.json and at least one model*.safetensors file
#            exist in CHECKPOINT_DIR, non-zero otherwise
#######################################
checkpoint_is_ready() {
  [ -f "$CHECKPOINT_DIR/config.json" ] \
    && compgen -G "$CHECKPOINT_DIR/model*.safetensors" >/dev/null
}

# --- Pre-flight: every input must exist before any work starts --------------
require_file "$PYTHON_BIN" "Python executable"
require_file "$ACCELERATE_BIN" "Accelerate executable"
require_file "$ACCELERATE_CONFIG" "Accelerate config"
require_file "$TRAIN_PATH" "TSQA train split"
require_file "$EVAL_PATH" "TSQA eval split"
require_file "$LLM_MODEL_PATH/config.json" "Qwen2.5-7B config"
require_file "$TS_ENCODER_CHECKPOINT" "ITFormer pretrained time-series encoder"
require_file "$ITFORMER_DIR/dataset/tsqa_dataset.py" "TSQA data adapter"
require_file "$ITFORMER_DIR/train_sft_tsqa.py" "ITFormer TSQA trainer"
require_file "$ITFORMER_DIR/inference_tsqa.py" "ITFormer TSQA evaluator"

# --- Data-contract check -----------------------------------------------------
echo "Running ITFormer TSQA data-contract check."
"$PYTHON_BIN" "$ITFORMER_DIR/dataset/tsqa_dataset.py" \
  --train_path "$TRAIN_PATH" \
  --eval_path "$EVAL_PATH" \
  --model_path "$LLM_MODEL_PATH" \
  --prefix_num "$PREFIX_NUM" \
  --input_len "$INPUT_LEN" \
  --samples 8

if [ "$DATA_CHECK_ONLY" = "1" ]; then
  echo "DATA_CHECK_ONLY=1: data adapter check passed; stopping before GPU work."
  exit 0
fi

# --- Command assembly --------------------------------------------------------
# Commands are built as arrays so that every argument survives quoting intact.
# NOTE(review): the trainer receives --use_lora with a value, whereas the
# evaluator receives it as a bare flag (appended below). Presumably this
# matches each script's argument parser - confirm against both scripts.
train_cmd=(
  "$ACCELERATE_BIN" launch
  --config_file "$ACCELERATE_CONFIG"
  --main_process_port "$TRAIN_PORT"
  "$ITFORMER_DIR/train_sft_tsqa.py"
  --train_path "$TRAIN_PATH"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --load_ts_encoder "$TS_ENCODER_CHECKPOINT"
  --output_dir "$CHECKPOINT_DIR"
  --max_train_samples "$TRAIN_MAX_SAMPLES"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  --max_seq_length "$MAX_SEQ_LENGTH"
  --seed "$SEED"
  --input_len "$INPUT_LEN"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --prefix_num "$PREFIX_NUM"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --llm_torch_dtype "$LLM_TORCH_DTYPE"
  --use_lora "$USE_LORA"
  --lora_r "$LORA_R"
  --lora_alpha "$LORA_ALPHA"
  --lora_dropout "$LORA_DROPOUT"
  --lora_target_modules "${LORA_TARGET_MODULE_ARRAY[@]}"
  --gradient_checkpointing "$GRADIENT_CHECKPOINTING"
  --per_device_train_batch_size "$TRAIN_BATCH_SIZE"
  --per_device_eval_batch_size "$EVAL_BATCH_SIZE"
  --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS"
  --learning_rate "$LEARNING_RATE"
  --max_grad_norm "$MAX_GRAD_NORM"
  --weight_decay "$WEIGHT_DECAY"
  --num_train_epochs "$NUM_TRAIN_EPOCHS"
  --save_steps "$SAVE_STEPS"
  --logging_steps 1
  --dataloader_num_workers "$DATALOADER_NUM_WORKERS"
  --report_to none
  --bf16
)

eval_cmd=(
  "$ACCELERATE_BIN" launch
  --config_file "$ACCELERATE_CONFIG"
  --main_process_port "$EVAL_PORT"
  "$ITFORMER_DIR/inference_tsqa.py"
  --checkpoint "$CHECKPOINT_DIR"
  --eval_path "$EVAL_PATH"
  --llm_model_path "$LLM_MODEL_PATH"
  --output_dir "$EVAL_OUTPUT_DIR"
  --max_eval_samples "$EVAL_MAX_SAMPLES"
  --seed "$SEED"
  --input_len "$INPUT_LEN"
  --patch_len "$PATCH_LEN"
  --stride "$STRIDE"
  --prefix_num "$PREFIX_NUM"
  --it_d_model "$IT_D_MODEL"
  --it_n_heads "$IT_N_HEADS"
  --it_layers "$IT_LAYERS"
  --llm_torch_dtype "$LLM_TORCH_DTYPE"
  --lora_r "$LORA_R"
  --lora_alpha "$LORA_ALPHA"
  --lora_dropout "$LORA_DROPOUT"
  --lora_target_modules "${LORA_TARGET_MODULE_ARRAY[@]}"
  --batch_size "$EVAL_BATCH_SIZE"
  --num_workers "$EVAL_NUM_WORKERS"
  --max_new_tokens "$MAX_NEW_TOKENS"
  --bf16
)

if [ "${USE_LORA,,}" = "true" ] || [ "$USE_LORA" = "1" ]; then
  eval_cmd+=(--use_lora)
fi

if [ -n "$LLM_ATTN_IMPLEMENTATION" ]; then
  train_cmd+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
  eval_cmd+=(--llm_attn_implementation "$LLM_ATTN_IMPLEMENTATION")
fi

# --- Run summary -------------------------------------------------------------
if [ "$SMOKE_TEST" = "1" ]; then
  run_mode="smoke"
else
  run_mode="full"
fi

echo "============================================================"
echo "ITFormer on Time-MQA TSQA"
echo "Mode: ${run_mode}"
echo "Train samples: ${TRAIN_MAX_SAMPLES} (0 means all)"
echo "Eval samples: ${EVAL_MAX_SAMPLES} (0 means all)"
echo "Checkpoint: ${CHECKPOINT_DIR}"
echo "Evaluation: ${EVAL_OUTPUT_DIR}"
echo "GPUs: ${CUDA_VISIBLE_DEVICES}"
echo "Train batch/GPU: ${TRAIN_BATCH_SIZE}"
echo "Gradient accumulation: ${GRADIENT_ACCUMULATION_STEPS}"
echo "Learning rate / max grad norm: ${LEARNING_RATE} / ${MAX_GRAD_NORM}"
echo "ITFormer hidden size / heads / layers: ${IT_D_MODEL} / ${IT_N_HEADS} / ${IT_LAYERS}"
echo "LLM LoRA: ${USE_LORA} (r=${LORA_R}, alpha=${LORA_ALPHA}, dropout=${LORA_DROPOUT})"
echo "LoRA targets: ${LORA_TARGET_MODULES}"
echo "Gradient checkpointing: ${GRADIENT_CHECKPOINTING}"
echo "Max train sequence length: ${MAX_SEQ_LENGTH} (longer samples are skipped)"
echo "============================================================"

# --- Dry run: print both commands, shell-quoted, and stop --------------------
if [ "$DRY_RUN" = "1" ]; then
  printf 'TRAIN: '
  printf '%q ' "${train_cmd[@]}"
  printf '\nEVAL: '
  printf '%q ' "${eval_cmd[@]}"
  printf '\n'
  exit 0
fi

# TRAIN_LOG and EVAL_LOG can be overridden to live outside LOG_DIR, so their
# parent directories are created as well. Otherwise `tee` fails to open the
# log and, with pipefail, the run is reported as failed only after the whole
# job has already finished.
mkdir -p "$RUN_ROOT" "$LOG_DIR" "$(dirname "$TRAIN_LOG")" "$(dirname "$EVAL_LOG")"

# --- Training ----------------------------------------------------------------
trained_this_run=0
if [ "$FORCE_TRAIN" != "1" ] && checkpoint_is_ready; then
  echo "Training skipped: final checkpoint already exists at ${CHECKPOINT_DIR}"
else
  echo "Starting ITFormer TSQA training. Log: ${TRAIN_LOG}"
  # With `set -e` and pipefail a failing trainer aborts the script here.
  "${train_cmd[@]}" 2>&1 | tee "$TRAIN_LOG"
  trained_this_run=1
fi

if ! checkpoint_is_ready; then
  echo "Training did not produce a loadable final checkpoint: ${CHECKPOINT_DIR}" >&2
  exit 1
fi

# --- Evaluation --------------------------------------------------------------
# Results count as reusable only when both output files are present; both are
# required at the end of the script.
eval_outputs_ready=0
if [ -f "$EVAL_OUTPUT_DIR/metrics.json" ] && [ -f "$EVAL_OUTPUT_DIR/predictions.jsonl" ]; then
  eval_outputs_ready=1
fi

if [ "$FORCE_EVAL" != "1" ] && [ "$trained_this_run" != "1" ] && [ "$eval_outputs_ready" = "1" ]; then
  echo "Evaluation skipped: ${EVAL_OUTPUT_DIR}/metrics.json already exists."
else
  if [ "$trained_this_run" = "1" ] && [ "$eval_outputs_ready" = "1" ]; then
    # Results left by an earlier checkpoint must not be reported for the
    # model that was just trained.
    echo "Re-running evaluation: existing results predate the checkpoint trained in this run."
  fi
  echo "Starting unified TSQA evaluation. Log: ${EVAL_LOG}"
  mkdir -p "$EVAL_OUTPUT_DIR"
  "${eval_cmd[@]}" 2>&1 | tee "$EVAL_LOG"
fi

require_file "$EVAL_OUTPUT_DIR/metrics.json" "unified TSQA metrics"
require_file "$EVAL_OUTPUT_DIR/predictions.jsonl" "TSQA predictions"

echo "Done."
echo "Metrics: ${EVAL_OUTPUT_DIR}/metrics.json"
echo "Predictions: ${EVAL_OUTPUT_DIR}/predictions.jsonl"