#!/usr/bin/env bash
# Equivalent of:
#   python scripts/run_speed_embedding_ablation.py \
#     --data-root /robby/share/Robotics/zhangtianqi/datasets/lerobot/libero \
#     --pi05-base /robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch \
#     --batch-size 512 --lr 1e-4 --num-gpus 8 --num-workers 2 \
#     --num-train-steps 30000 --eval-speeds 0.75 1.0 1.25 1.5 --num-trials 50
#
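# The original pipeline ran three speed-integration ablations end-to-end:
#   norm stats -> train (8-GPU torchrun) -> serve + eval at 4 speeds.
# NOTE: this file currently contains only the eval stage. For each active
# EXPERIMENTS entry it serves the latest checkpoint and evaluates it at every
# speed listed in EVAL_SPEEDS.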

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# ---------------------------------------------------------------- config
# Repository root. The script cd's here, and checkpoints/, logs/ and results/
# are all resolved underneath it.
PROJECT_ROOT="/robby/share/Robotics/zhangtianqi/code/VLAwithVariousSpeed"
# Dataset and base-model locations. Neither is referenced by the eval stage
# in this file.
DATA_ROOT="/robby/share/Robotics/zhangtianqi/cache/huggingface/lerobot/your_hf_username/libero"
PI05_BASE="/robby/share/Robotics/zhangtianqi/model/pi_base_models_torch/pi05_base_torch"

# Speed presets: keep exactly one ASSET_ID / SPEEDS / EVAL_SPEEDS /
# EVAL_SPEED_TAGS group uncommented. The commented-out groups are earlier
# presets kept for reference.

# ASSET_ID="online_sliding_speed_embed_0p5_1p0_1p5_2p0_pi05"
# SPEEDS=(0.5 1 1.5 2.0)         # used in CLI args
# EVAL_SPEEDS=(0.5 1 1.5 2.0)    # eval rollouts
# EVAL_SPEED_TAGS=(0p5x 1x 1p5x 2p0x)

# ASSET_ID="online_sliding_speed_embed_1p0_pi05"
# SPEEDS=(1)         # used in CLI args
# EVAL_SPEEDS=(1)    # eval rollouts
# EVAL_SPEED_TAGS=(1x)

# ASSET_ID="online_sliding_speed_embed_0p25_0p5_1p0_2p0_4p0_pi05"
# SPEEDS=(0.25 0.5 1.0 2.0 4.0)         # used in CLI args
# EVAL_SPEEDS=(0.25 0.5 1.0 2.0 4.0)    # eval rollouts
# EVAL_SPEED_TAGS=(0p25x 0p5x 1p0x 2p0x 4p0x)

# Active preset.
#   ASSET_ID        - passed to every policy server as --policy.asset-id.
#   SPEEDS          - not read by the eval stage in this file.
#   EVAL_SPEEDS     - element i is handed to the eval script as SPEED.
#   EVAL_SPEED_TAGS - element i names the results subdirectory (speed_<tag>)
#                     for EVAL_SPEEDS[i]; the two arrays are indexed in
#                     parallel and must have the same length and order.
ASSET_ID="online_sliding_speed_embed_0p5_0p75_1p0_1p25_1p5_1p75_2p0_pi05"
SPEEDS=(0.5 0.75 1.0 1.25 1.5 1.75 2.0)         # used in CLI args
EVAL_SPEEDS=(0.5 0.75 1.0 1.25 1.5 1.75 2.0)    # eval rollouts
EVAL_SPEED_TAGS=(0p5x 0p75x 1x 1p25x 1p5x 1p75x 2p0x)


# Number of policy servers to launch; server k is pinned to GPU index k.
NUM_GPUS=8
# Only used to build the fallback checkpoint step (NUM_TRAIN_STEPS - 1) when
# no numbered checkpoint directory is found.
NUM_TRAIN_STEPS=30000
# Forwarded to the eval script as NUM_TRIALS.
NUM_TRIALS=50
# Server k listens on BASE_PORT + k; also forwarded to the eval script.
BASE_PORT=8020
# Forwarded to the eval script as HOST.
HOST="localhost"
# Fixed sleep after launching the servers, before the first eval starts.
SERVER_WAIT_SECONDS=120

# Output locations; all four directories are created up front with mkdir -p.
LOG_DIR="${PROJECT_ROOT}/logs/speed_embedding_ablation"
RESULTS_DIR="${PROJECT_ROOT}/results/speed_embedding_ablation"
TORCHRUN_LOG_DIR="${LOG_DIR}/torchrun"
SERVER_LOG_DIR="${LOG_DIR}/servers"

# One '|'-separated entry per experiment:
#   experiment name | train-config | exp-name | extra train args (space-separated)
# As used by the eval stage:
#   experiment name - banner label and server-log subdirectory
#   train-config    - --policy.config value and first checkpoint path component
#   exp-name        - second checkpoint path component and results subdirectory
#   extra train args- parsed but not used by the eval stage
# Commented-out entries are earlier runs kept for reference.
EXPERIMENTS=(
  #"text|pi05_libero_speed_embed_text|0510_pi05_online_sliding_speed_embed_text_bs512_lr1e4|--data.speed-integration text"
  #"modulation|pi05_libero_speed_embed_modulation|0510_pi05_online_sliding_speed_embed_modulation_bs512_lr1e4|--data.speed-integration modulation --model.speed-modulation"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0510_pi05_online_sliding_speed_embed_softprompt_p8_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 8 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p4_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 4 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p16_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 16 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"soft_prompt|pi05_libero_speed_embed_softprompt_p8|0511_pi05_online_sliding_speed_embed_softprompt_p32_bs512_lr1e4|--data.speed-integration soft_prompt --model.soft-prompt-p 32 --model.soft-prompt-speeds 0.75 1 1.25 1.5"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p5_1p0_1p5_2p0|--data.speed-integration text"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_1p0|--data.speed-integration text"
  #"text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p25_0p5_1p0_2p0_4p0|--data.speed-integration text"
  "text|pi05_libero_speed_embed_text|0513_pi05_online_sliding_speed_embed_text_0p5_0p75_1p0_1p25_1p5_1p75_2p0|--data.speed-integration text"
)

# ---------------------------------------------------------------- env
# Everything below uses paths relative to the repository root.
cd "${PROJECT_ROOT}"

# GPUs visible to child processes by default; each policy server later
# narrows this to its own single GPU.
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export CUDA_VISIBLE_DEVICES

# Render MuJoCo offscreen on the GPU via EGL unless the caller already chose
# a backend. Without this the LIBERO sim falls back to CPU software
# rendering, and 8 concurrent envs peg the CPU while the GPUs sit idle.
: "${MUJOCO_GL:=egl}"
: "${PYOPENGL_PLATFORM:=egl}"
export MUJOCO_GL PYOPENGL_PLATFORM

# Turn torch.compile off for eval unless the caller set it explicitly. The
# train config ships with pytorch_compile_mode='max-autotune', whose
# first-call codegen + ptxas pass eats 5-15 min per server and stalls the
# eval clients waiting on the first infer response. Eval does not need the
# extra throughput.
: "${TORCH_COMPILE_DISABLE:=1}"
export TORCH_COMPILE_DISABLE

# Make sure every log/result directory exists before anything writes to it.
mkdir -p \
  "${LOG_DIR}" \
  "${TORCHRUN_LOG_DIR}" \
  "${SERVER_LOG_DIR}" \
  "${RESULTS_DIR}"

# ---------------------------------------------------------------- helpers
# Intentionally no trap and no pkill-style (match-by-name) cleanup: we
# sometimes run several instances of this script concurrently against
# different checkpoints, and matching on the process name would tear down a
# sibling run's servers/clients. Manage stragglers manually
# (e.g. ps -ef | grep serve_policy) when needed.

# Prints the checkpoint directory to evaluate for one experiment.
# Globals:   PROJECT_ROOT (read), NUM_TRAIN_STEPS (read)
# Arguments: $1 - train config name
#            $2 - experiment name
# Outputs:   ${PROJECT_ROOT}/checkpoints/$1/$2/<step> on stdout, where <step>
#            is the numerically largest all-digit subdirectory name. When the
#            experiment directory is missing or holds no such subdirectory,
#            <step> is NUM_TRAIN_STEPS - 1. The printed path is NOT checked
#            for existence; callers must do that themselves.
latest_ckpt_dir() {
  local exp_root="${PROJECT_ROOT}/checkpoints/$1/$2"
  local newest_step=""

  if [[ -d "${exp_root}" ]]; then
    # Step directories are named with digits only; keep the largest one.
    # find's errors are discarded and treated as "nothing found".
    newest_step="$(find "${exp_root}" -mindepth 1 -maxdepth 1 -type d \
                     -regex '.*/[0-9]+' -printf '%f\n' 2>/dev/null \
                   | sort -n | tail -n 1 || true)"
  fi

  # Nothing usable on disk: fall back to the nominal final training step.
  if [[ -z "${newest_step}" ]]; then
    newest_step="$((NUM_TRAIN_STEPS - 1))"
  fi

  printf '%s\n' "${exp_root}/${newest_step}"
}

# ---------------------------------------------------------------- 3) eval
# For each EXPERIMENTS entry: resolve its newest checkpoint, start one policy
# server per GPU, wait for the servers to load, run the LIBERO eval once per
# speed in EVAL_SPEEDS, then stop the servers so the next entry can bind the
# same ports with its own checkpoint.

# Terminates the policy servers with the given PIDs and reaps them.
# Only PIDs recorded by this script instance are ever passed in, so servers
# belonging to concurrently running sibling instances are never touched
# (there is no pkill-style matching on process names).
# Arguments: $@ - PIDs of background serve_policy.py processes
# Returns:   0 always; servers that already exited are ignored
stop_policy_servers() {
  local pid
  for pid in "$@"; do
    kill "${pid}" 2>/dev/null || true   # may already be gone
  done
  for pid in "$@"; do
    # Reap so the port is free again; a terminated server exits non-zero.
    wait "${pid}" 2>/dev/null || true
  done
}

echo "=========================================================="
echo "Stage: eval"
echo "=========================================================="

# EVAL_SPEEDS and EVAL_SPEED_TAGS are indexed in parallel. Catch a mismatch
# now instead of as an unbound-variable abort after the servers have loaded.
if (( ${#EVAL_SPEEDS[@]} != ${#EVAL_SPEED_TAGS[@]} )); then
  echo "ERROR: EVAL_SPEEDS has ${#EVAL_SPEEDS[@]} entries but EVAL_SPEED_TAGS has ${#EVAL_SPEED_TAGS[@]}" >&2
  exit 1
fi

for entry in "${EXPERIMENTS[@]}"; do
  # Fields: name | train-config | exp-name | extra train args (unused here).
  IFS='|' read -r name cfg exp _ <<<"${entry}"
  ckpt_dir="$(latest_ckpt_dir "${cfg}" "${exp}")"
  if [[ ! -d "${ckpt_dir}" ]]; then
    echo "ERROR: checkpoint for eval does not exist: ${ckpt_dir}" >&2
    exit 1
  fi
  echo "========== eval: ${name} ckpt=${ckpt_dir} =========="

  # Spin up NUM_GPUS policy servers, one per GPU, on ports
  # BASE_PORT..BASE_PORT+NUM_GPUS-1, remembering each PID.
  srv_log_dir="${SERVER_LOG_DIR}/${name}"
  mkdir -p "${srv_log_dir}"
  server_pids=()
  for ((rank = 0; rank < NUM_GPUS; rank++)); do
    port=$((BASE_PORT + rank))
    log_file="${srv_log_dir}/gpu${rank}.log"
    echo "  -> server gpu${rank} port=${port} log=${log_file}"
    CUDA_VISIBLE_DEVICES="${rank}" \
      python scripts/serve_policy.py \
        --port "${port}" \
        policy:checkpoint \
        --policy.config "${cfg}" \
        --policy.dir "${ckpt_dir}" \
        --policy.asset-id "${ASSET_ID}" \
        >"${log_file}" 2>&1 &
    server_pids+=("$!")
  done

  echo "Waiting ${SERVER_WAIT_SECONDS}s for policy servers to load..."
  sleep "${SERVER_WAIT_SECONDS}"

  # A server that died during startup (bad checkpoint, port already bound,
  # ...) would otherwise leave the eval clients talking to nothing, or to a
  # stale server still holding the port. Fail loudly instead.
  for rank in "${!server_pids[@]}"; do
    if ! kill -0 "${server_pids[$rank]}" 2>/dev/null; then
      echo "ERROR: policy server gpu${rank} exited during startup; see ${srv_log_dir}/gpu${rank}.log" >&2
      stop_policy_servers "${server_pids[@]}"
      exit 1
    fi
  done

  # Run the eval once per speed; results go to <RESULTS_DIR>/<exp>/speed_<tag>.
  for i in "${!EVAL_SPEEDS[@]}"; do
    speed="${EVAL_SPEEDS[$i]}"
    tag="${EVAL_SPEED_TAGS[$i]}"
    results_dir="${RESULTS_DIR}/${exp}/speed_${tag}"
    echo "  -> eval speed=${speed} tag=${tag} -> ${results_dir}"
    eval_status=0
    SPEED="${speed}" \
      BASE_PORT="${BASE_PORT}" \
      HOST="${HOST}" \
      NUM_TRIALS="${NUM_TRIALS}" \
      SAVE_VIDEOS="1" \
      PYTHON_CMD="python" \
      RESULTS_DIR="${results_dir}" \
      ./scripts/eval_libero_8gpu.sh || eval_status=$?
    if (( eval_status != 0 )); then
      echo "ERROR: eval failed (exit ${eval_status}) for ${exp} at speed=${speed}" >&2
      stop_policy_servers "${server_pids[@]}"
      exit "${eval_status}"
    fi
  done

  # Free the GPUs and ports before the next experiment launches its servers.
  echo "  -> stopping policy servers for ${name}"
  stop_policy_servers "${server_pids[@]}"
done

echo "All stages completed."