File size: 8,320 Bytes
08ff31f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env bash
# Equivalent of:
#   python scripts/run_speed_embedding_ablation.py \
#     --data-root /robby/share/Robotics/zhangtianqi/datasets/lerobot/libero \
#     --pi05-base /robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch \
#     --batch-size 512 --lr 1e-4 --num-gpus 8 --num-workers 2 \
#     --num-train-steps 30000 --eval-speeds 0.75 1.0 1.25 1.5 --num-trials 50
#
# Runs the speed-integration ablations listed in EXPERIMENTS end-to-end:
#   norm stats -> train (8-GPU torchrun) -> serve + eval at 4 speeds.

# Trace every command so the job log records exactly what was executed.
set -x

# New files/dirs: full access for owner and group, none for others.
umask 007

# ---------------------------------------------------------------- config
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"

ASSET_ID="online_sliding_speed_embed_0p75_1p0_1p25_1p5_pi05"
SPEEDS=(0.75 1 1.25 1.5)         # used in CLI args
EVAL_SPEEDS=(0.75 1 1.25 1.5)    # eval rollouts
# Parallel to EVAL_SPEEDS (same index): used to name the per-speed results dir.
EVAL_SPEED_TAGS=(0p75x 1x 1p25x 1p5x)

NUM_GPUS=8
BATCH_SIZE=512
LR=1e-4
NUM_WORKERS=2
NUM_TRAIN_STEPS=30000
LOG_INTERVAL=100
SAVE_INTERVAL=5000
COMPILE_MODE="None"
NUM_TRIALS=50
BASE_PORT=8000
HOST="0.0.0.0"
SERVER_WAIT_SECONDS=120

LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"

# experiment name | train-config | exp-name | extra train args (space-separated)
EXPERIMENTS=(
  "modulation|pi05_libero_speed_embed_modulation|pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation"
  "soft_prompt|pi05_libero_speed_embed_softprompt_p8|pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
)

# ---------------------------------------------------------------- env
# Every later stage invokes scripts/... relative to PROJECT_ROOT, so a failed
# cd must abort instead of silently running from the caller's directory.
cd "${PROJECT_ROOT}" || {
  echo "ERROR: cannot cd to PROJECT_ROOT: ${PROJECT_ROOT}" >&2
  exit 1
}
# export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export WANDB__SERVICE_WAIT="${WANDB__SERVICE_WAIT:-300}"
# Drop any W&B credentials inherited from the caller's environment.
unset WANDB_API_KEY WANDB_API_KEY_FILE || true

# All log/result directories are created up front; without them the server
# log redirections in the eval stage would fail.
mkdir -p "${LOG_DIR}" "${TORCHRUN_LOG_DIR}" "${SERVER_LOG_DIR}" "${RESULTS_DIR}" || {
  echo "ERROR: cannot create log/result directories under ${PROJECT_ROOT}" >&2
  exit 1
}

# ---------------------------------------------------------------- helpers
# PIDs of the background policy servers currently running; filled by the eval
# stage and drained by cleanup_servers.
SERVER_PIDS=()

#######################################
# Stop every policy server recorded in SERVER_PIDS.
# Sends SIGTERM to each PID, polls once per second for up to 30 seconds, then
# sends SIGKILL to whatever is still running.
# Globals:   SERVER_PIDS (read, then reset to an empty array)
# Arguments: none
# Outputs:   one status line on stdout when there is at least one PID
# Returns:   0
#######################################
cleanup_servers() {
  local pid alive i
  if [ "${#SERVER_PIDS[@]}" -eq 0 ]; then
    return 0
  fi
  echo "Stopping policy servers: ${SERVER_PIDS[*]}"
  for pid in "${SERVER_PIDS[@]}"; do
    # The process may already be gone; that is not an error here.
    kill -TERM "${pid}" 2>/dev/null || true
  done
  # wait up to 30s, then SIGKILL
  for ((i = 0; i < 30; i++)); do
    alive=0
    for pid in "${SERVER_PIDS[@]}"; do
      if kill -0 "${pid}" 2>/dev/null; then alive=1; fi
    done
    if [ "${alive}" -eq 0 ]; then
      break
    fi
    sleep 1
  done
  for pid in "${SERVER_PIDS[@]}"; do
    if kill -0 "${pid}" 2>/dev/null; then
      kill -KILL "${pid}" 2>/dev/null || true
    fi
  done
  SERVER_PIDS=()
  return 0
}

# Cleanup runs from the EXIT trap only. INT/TERM must terminate the script
# (which then fires EXIT); a handler that merely returns would let the script
# resume with the next command after the servers were killed.
trap cleanup_servers EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Print the checkpoint directory to evaluate for one experiment.
# Picks the numerically largest all-digit subdirectory of
# ${PROJECT_ROOT}/checkpoints/<train_config>/<exp_name>; when that directory
# is missing or has no such subdirectory, prints the path for step
# NUM_TRAIN_STEPS - 1 instead (which may not exist).
# Globals:   PROJECT_ROOT (read), NUM_TRAIN_STEPS (read)
# Arguments: $1 - train_config, $2 - exp_name
# Outputs:   the chosen path on stdout
#######################################
latest_ckpt_dir() {
  local ckpt_root="${PROJECT_ROOT}/checkpoints/$1/$2"
  local newest=""

  if [ -d "${ckpt_root}" ]; then
    # Only direct child directories whose name is purely numeric count as
    # step checkpoints; sort them as numbers and keep the highest.
    newest="$(find "${ckpt_root}" -mindepth 1 -maxdepth 1 -type d \
                -regex '.*/[0-9]+' -printf '%f\n' 2>/dev/null \
                | sort -n | tail -n 1 || true)"
  fi

  if [ -n "${newest}" ]; then
    printf '%s\n' "${ckpt_root}/${newest}"
    return
  fi

  # Nothing found: fall back to the final training step.
  printf '%s\n' "${ckpt_root}/$((NUM_TRAIN_STEPS - 1))"
}

# ---------------------------------------------------------------- 1) norm
echo "=========================================================="
echo "Stage: compute norm stats"
echo "=========================================================="
# One norm-stats run per experiment config, skipped when the stats file is
# already present.
for entry in "${EXPERIMENTS[@]}"; do
  # Only the first two '|' fields are needed here; the rest land in `_`.
  IFS='|' read -r name cfg _ <<<"${entry}"
  # Presumably the file compute_norm_stats.py writes for this config/asset id
  # — confirm against that script.
  stats="${PROJECT_ROOT}/assets/${cfg}/${ASSET_ID}/norm_stats.json"
  if [ -f "${stats}" ]; then
    echo "[skip norm] ${name}: ${stats}"
    continue
  fi
  echo "========== norm: ${name} =========="
  # Abort on failure rather than starting a long training run after a failed
  # norm-stats step.
  if ! python scripts/compute_norm_stats.py \
    --config-name "${cfg}" \
    --repo-id "${DATA_ROOT}" \
    --asset-id "${ASSET_ID}" \
    --online-sliding-chunks \
    --online-sliding-speeds "${SPEEDS[@]}"; then
    echo "ERROR: compute_norm_stats.py failed for ${name} (config ${cfg})" >&2
    exit 1
  fi
done


########### Multinode setting ###########
# Launcher inputs; each one can be overridden from the environment.
NGPU=${NGPU:-"8"}
PORT=${PORT:-"1106"}
LOG_RANK=${LOG_RANK:-"0"}
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

sleep 1

## node setting
num_gpu=${NGPU}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
# WORKER_NUM / RANK / WORLD_SIZE are read from the environment (presumably
# injected by the cluster scheduler — confirm). Default to a single node so
# the launcher never receives an empty --nnodes value.
job_num=${WORKER_NUM:-1}
job_id=${RANK:-0}

echo "multiple nodes training: ${WORLD_SIZE:-1}, ${job_id}"

## ib setting
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=bond1
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_bond
export NCCL_IB_TIMEOUT=20
export NCCL_NET_PLUGIN=none
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_IB_SPLIT_DATA_ON_QPS=1
export NCCL_MIN_NCHANNELS=4
export GLOO_SOCKET_IFNAME=bond1
export TOKENIZERS_PARALLELISM=false

## wandb setting
# NOTE(review): this API key is committed in plain text; consider loading it
# from a protected file or the job environment instead.
# xtrace is paused around the export so the key is not copied into the trace
# log, then restored to whatever state it had before.
case "$-" in
  *x*) _restore_xtrace=1 ;;
  *) _restore_xtrace=0 ;;
esac
{ set +x; } 2>/dev/null
export WANDB_API_KEY=local-73813ba405c87d3b1ad539b4d31124351374cdb6
if [ "${_restore_xtrace}" -eq 1 ]; then
  set -x
fi
export WANDB_BASE_URL=http://33.180.4.104

# ---------------------------------------------------------------- 2) train
echo "=========================================================="
echo "Stage: train"
echo "=========================================================="
# Train every experiment in EXPERIMENTS, one after another, via dlrover-run.
# Reads num_gpu / job_num / log_rank / torchft_lighthouse from the node
# settings above and the hyper-parameters from the config section.
for entry in "${EXPERIMENTS[@]}"; do
  IFS='|' read -r name cfg exp extra <<<"${entry}"
  echo "========== train: ${name} =========="
  # Split the per-experiment flag string into separate words without exposing
  # it to glob expansion.
  read -r -a extra_args <<<"${extra}"
  # Launcher logs go under TORCHRUN_LOG_DIR, one subdirectory per node
  # (the previously used ${log_dir} was never defined, which pointed the
  # launcher at the filesystem root).
  if ! PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
    TORCHFT_LIGHTHOUSE="${torchft_lighthouse}" \
    dlrover-run --network-check --max_restarts=3 \
    --rdzv_conf join_timeout=10800 \
    --log_dir "${TORCHRUN_LOG_DIR}/${NODE_ID:-0}" -r=3 \
    --nproc_per_node="${num_gpu}" \
    --nnodes="${job_num}" \
    --local-ranks-filter="${log_rank}" \
    --tee 3 \
    scripts/train_pytorch.py "${cfg}" \
    --exp-name "${exp}" \
    --pytorch-weight-path "${PI05_BASE}" \
    --batch-size "${BATCH_SIZE}" \
    --num-workers "${NUM_WORKERS}" \
    --num-train-steps "${NUM_TRAIN_STEPS}" \
    --log-interval "${LOG_INTERVAL}" \
    --save-interval "${SAVE_INTERVAL}" \
    --lr-schedule.peak-lr "${LR}" \
    --lr-schedule.decay-lr "${LR}" \
    --eval-speed-set "${SPEEDS[@]}" \
    --data.repo-id "${DATA_ROOT}" \
    --data.assets.asset-id "${ASSET_ID}" \
    --data.online-sliding-chunks \
    --data.online-sliding-speeds "${SPEEDS[@]}" \
    --model.pytorch-compile-mode "${COMPILE_MODE}" \
    "${extra_args[@]}" \
    --overwrite; then
    # Do not continue to evaluation with a missing or partial checkpoint.
    echo "ERROR: training failed for ${name} (config ${cfg})" >&2
    exit 1
  fi
done

# ---------------------------------------------------------------- 3) eval
echo "=========================================================="
echo "Stage: eval"
echo "=========================================================="
# For each experiment: start one policy server per GPU on its latest
# checkpoint, run the eval script once per entry in EVAL_SPEEDS, then stop
# the servers.
for entry in "${EXPERIMENTS[@]}"; do
  IFS='|' read -r name cfg exp extra <<<"${entry}"
  ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
  if [ ! -d "${ckpt_dir}" ]; then
    echo "ERROR: checkpoint for eval does not exist: ${ckpt_dir}" >&2
    exit 1
  fi
  echo "========== eval: ${name} ckpt=${ckpt_dir} =========="

  # spin up NUM_GPUS policy servers, one per GPU, on ports
  # BASE_PORT..BASE_PORT+NUM_GPUS-1
  SERVER_PIDS=()
  srv_log_dir="${SERVER_LOG_DIR}/${name}"
  mkdir -p "${srv_log_dir}"
  for ((rank = 0; rank < NUM_GPUS; rank++)); do
    port=$((BASE_PORT + rank))
    log_file="${srv_log_dir}/gpu${rank}.log"
    echo "  -> server gpu${rank} port=${port} log=${log_file}"
    CUDA_VISIBLE_DEVICES="${rank}" \
      python scripts/serve_policy.py policy:checkpoint \
        --policy.config "${cfg}" \
        --policy.dir "${ckpt_dir}" \
        --port "${port}" \
        >"${log_file}" 2>&1 &
    SERVER_PIDS+=("$!")
  done

  echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
  sleep "${SERVER_WAIT_SECONDS}"

  # A server that died while loading would leave every rollout on its port
  # failing; stop here and point at its log. SERVER_PIDS was filled in rank
  # order, so the array index is the GPU rank.
  for idx in "${!SERVER_PIDS[@]}"; do
    if ! kill -0 "${SERVER_PIDS[$idx]}" 2>/dev/null; then
      echo "ERROR: policy server gpu${idx} exited during startup; see ${srv_log_dir}/gpu${idx}.log" >&2
      exit 1
    fi
  done

  # run eval for each speed
  for i in "${!EVAL_SPEEDS[@]}"; do
    speed="${EVAL_SPEEDS[$i]}"
    tag="${EVAL_SPEED_TAGS[$i]}"
    results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
    echo "  -> eval speed=${speed} tag=${tag} -> ${results_dir}"
    # A failed run is reported but does not stop the remaining speeds.
    if ! SPEED="${speed}" \
      BASE_PORT="${BASE_PORT}" \
      HOST="${HOST}" \
      NUM_TRIALS="${NUM_TRIALS}" \
      SAVE_VIDEOS="0" \
      PYTHON_CMD="python" \
      RESULTS_DIR="${results_dir}" \
      ./scripts/eval_libero_8gpu.sh; then
      echo "WARNING: eval failed for ${name} at speed=${speed}" >&2
    fi
  done

  cleanup_servers
done

echo "All stages completed."