VLAwithVariousSpeed / scripts /eval_libero_8gpu.sh

Upload folder using huggingface_hub

08ff31f verified about 2 months ago

6.08 kB

	#!/usr/bin/env bash
	# Run LIBERO eval on an 8-GPU server, partitioned for time balance:
	#
	# GPU 0 -> libero_spatial (full suite, 10 tasks x 50 trials = 500 episodes)
	# GPU 1 -> libero_goal (full suite, 10 tasks x 50 trials = 500 episodes)
	# GPU 2 -> libero_object (full suite, 10 tasks x 50 trials = 500 episodes)
	# GPU 3 -> libero_10 t0..1 (2 tasks x 50 trials = 100 episodes, longer rollouts)
	# GPU 4 -> libero_10 t2..3
	# GPU 5 -> libero_10 t4..5
	# GPU 6 -> libero_10 t6..7
	# GPU 7 -> libero_10 t8..9
	#
	# Each eval client connects to its own websocket policy server (one server
	# per GPU, listening on ports $BASE_PORT+0 .. $BASE_PORT+7). You must start
	# the servers separately -- this script prints the canonical commands under
	# "Servers" when it runs.
	#
	# Required env:
	# SPEED target speed for this eval pass (e.g., 1.0, 1.5, 2.0)
	#
	# Optional env (defaults shown):
	# BASE_PORT=8000 ports BASE_PORT..BASE_PORT+7 (one per client)
	# HOST=0.0.0.0 server hostname
	# RESULTS_DIR=results/libero_eval_<SPEED>x_<timestamp>
	# VIDEO_DIR=$RESULTS_DIR/videos
	# LOG_DIR=$RESULTS_DIR/logs
	# NUM_TRIALS=50 trials per task
	# SAVE_VIDEOS=1 set to 0 to skip mp4 saves (faster)
	# PYTHON_CMD="uv run python"
	#
	# Example:
	# SPEED=1.5 BASE_PORT=8000 ./scripts/eval_libero_8gpu.sh
	#
	# Strict mode: stop on unhandled errors, on unset variables, and on a failure
	# in any stage of a pipeline.
	set -o errexit -o nounset -o pipefail

	# Required input: abort with a usage hint when SPEED is unset or empty.
	: "${SPEED:?SPEED is required (e.g., SPEED=1.0)}"

	# Optional knobs: a non-empty value supplied by the caller wins, otherwise
	# the default below is assigned.
	: "${BASE_PORT:=8000}"
	: "${HOST:=0.0.0.0}"
	: "${NUM_TRIALS:=50}"
	: "${SAVE_VIDEOS:=1}"
	: "${PYTHON_CMD:=uv run python}"

	# Timestamp that makes the default results directory unique per run.
	TS="$(date +%Y%m%d_%H%M%S)"

	# Filename-friendly speed tag: every "." becomes "p" and an "x" is appended
	# (e.g. "1.5" -> "1p5x"). Done with parameter expansion so no subshell or
	# pipeline is involved.
	SPEED_TAG="${SPEED//./p}x"

	# Output locations; each one can be overridden from the environment.
	RESULTS_DIR="${RESULTS_DIR:-results/libero_eval_${SPEED_TAG}_${TS}}"
	VIDEO_DIR="${VIDEO_DIR:-$RESULTS_DIR/videos}"
	LOG_DIR="${LOG_DIR:-$RESULTS_DIR/logs}"
	mkdir -p -- "$RESULTS_DIR" "$VIDEO_DIR" "$LOG_DIR"

	# Partition table, stored as parallel arrays that are indexed in lockstep:
	# entry i describes one eval client (rank, suite, task ids, output label).
	# The rank is also used as the GPU number in the printout and as the offset
	# added to BASE_PORT.
	# GPU 0/1/2 -> spatial/goal/object full; GPU 3..7 -> libero_10 split 5 ways.
	RANKS=(0 1 2 3 4 5 6 7)
	SUITES=(libero_spatial libero_goal libero_object libero_10 libero_10 libero_10 libero_10 libero_10)
	TASK_IDS=(all all all "0,1" "2,3" "4,5" "6,7" "8,9")
	LABELS=(spatial goal object long_t0_1 long_t2_3 long_t4_5 long_t6_7 long_t8_9)

	if [[ "${#RANKS[@]}" -ne 8 ]]; then
	  echo "Hardcoded for 8 ranks; edit the partition arrays to change." >&2
	  exit 2
	fi

	# Catch a half-edited table up front: with mismatched lengths the launch loop
	# would otherwise hit a missing index only after some ranks had started.
	if (( ${#SUITES[@]} != ${#RANKS[@]} \
	      || ${#TASK_IDS[@]} != ${#RANKS[@]} \
	      || ${#LABELS[@]} != ${#RANKS[@]} )); then
	  echo "Partition arrays differ in length (RANKS=${#RANKS[@]} SUITES=${#SUITES[@]} TASK_IDS=${#TASK_IDS[@]} LABELS=${#LABELS[@]})." >&2
	  exit 2
	fi

	# Print the run configuration, the partition table, and the commands for
	# starting the policy servers.
	#
	# Globals read: SPEED SPEED_TAG RESULTS_DIR BASE_PORT HOST NUM_TRIALS
	#               SAVE_VIDEOS PYTHON_CMD RANKS SUITES TASK_IDS LABELS
	# Outputs:      the banner on stdout.
	#
	# The here-documents use "<<-" so that their tab-indented bodies and
	# terminators are recognised; "<<-" strips leading tabs (only tabs) from
	# every line.
	print_banner() {
	  local i
	  cat <<-EOF
	====================================================================
	LIBERO 8-GPU eval driver
	speed = $SPEED ($SPEED_TAG)
	results_dir = $RESULTS_DIR
	base_port = $BASE_PORT (clients hit $HOST:$((BASE_PORT))..$HOST:$((BASE_PORT+7)))
	num_trials = $NUM_TRIALS per task
	save_videos = $SAVE_VIDEOS

	Partition:
	EOF
	  for i in "${!RANKS[@]}"; do
	    printf " rank=%d gpu=%d port=%d suite=%-15s task_ids=%-7s -> %s\n" \
	      "${RANKS[$i]}" "${RANKS[$i]}" "$((BASE_PORT + RANKS[$i]))" \
	      "${SUITES[$i]}" "${TASK_IDS[$i]}" "${LABELS[$i]}"
	  done
	  # The port expression embeds the effective base port as a number, so the
	  # printed loop still works when pasted into a shell where BASE_PORT is not
	  # set. \$g and the outer \$(( stay literal for the user's shell to expand.
	  cat <<-EOF

	Servers (one per GPU, you must start these separately):
	for g in 0 1 2 3 4 5 6 7; do
	CUDA_VISIBLE_DEVICES=\$g $PYTHON_CMD scripts/serve_policy.py \\
	policy:checkpoint --policy.config=<your_config> \\
	--policy.dir=<your_ckpt_dir> --port=\$(($BASE_PORT + g)) &
	done

	====================================================================
	EOF
	}

	print_banner

	# Launch one eval client per partition entry, each in the background.
	#
	# Globals read:    RANKS SUITES TASK_IDS LABELS BASE_PORT HOST SPEED SPEED_TAG
	#                  NUM_TRIALS SAVE_VIDEOS PYTHON_CMD RESULTS_DIR VIDEO_DIR
	#                  LOG_DIR
	# Globals written: pids -- reset, then the background PID of every client is
	#                  appended in partition order (same index as RANKS/LABELS).
	# Outputs:         one "Launching ..." line per rank on stdout; each client's
	#                  stdout and stderr go to its own file under LOG_DIR.
	launch_ranks() {
	  local i rank port suite ids label results_json log_path
	  local -a cmd
	  pids=()
	  for i in "${!RANKS[@]}"; do
	    rank="${RANKS[$i]}"
	    port=$((BASE_PORT + rank))
	    suite="${SUITES[$i]}"
	    ids="${TASK_IDS[$i]}"
	    label="${LABELS[$i]}"
	    results_json="$RESULTS_DIR/${label}_${SPEED_TAG}.json"
	    log_path="$LOG_DIR/${label}_${SPEED_TAG}.log"

	    echo "Launching rank=$rank ($label, suite=$suite, port=$port) -> $log_path"

	    # PYTHON_CMD is split into words on purpose so that multi-word launchers
	    # such as "uv run python" work; every other argument is quoted.
	    # shellcheck disable=SC2206
	    cmd=(
	      $PYTHON_CMD scripts/eval_libero_speed.py
	      --task-suite-name "$suite"
	      --task-ids "$ids"
	      --host "$HOST" --port "$port"
	      --speed "$SPEED"
	      --num-trials-per-task "$NUM_TRIALS"
	      --rank "$rank"
	      --video-out-path "$VIDEO_DIR/${label}_${SPEED_TAG}"
	      --results-json "$results_json"
	    )
	    # Videos are saved only when SAVE_VIDEOS is exactly "1"; any other value
	    # adds the opt-out flag.
	    if [[ "$SAVE_VIDEOS" != "1" ]]; then
	      cmd+=(--no-save-videos)
	    fi

	    "${cmd[@]}" >"$log_path" 2>&1 &
	    pids+=("$!")
	  done
	}

	launch_ranks

	printf '\n'
	printf '%s\n' "All 8 ranks launched. Waiting..."

	# Reap the clients in launch order. pids, LABELS and RANKS share indices, so
	# one index names the PID, its label, and its rank. A failing client is
	# reported on stderr together with its log path and flips status to 1.
	status=0
	for i in "${!pids[@]}"; do
	  pid="${pids[$i]}"
	  label="${LABELS[$i]}"
	  if ! wait "$pid"; then
	    printf '%s\n' "[FAIL] rank=${RANKS[$i]} $label (see $LOG_DIR/${label}_${SPEED_TAG}.log)" >&2
	    status=1
	    continue
	  fi
	  printf '%s\n' "[done] rank=${RANKS[$i]} $label"
	done

	# Aggregate every *.json file in RESULTS_DIR and print per-rank lines, a
	# per-suite rollup, and a global rollup.
	#
	# Globals read: PYTHON_CMD (word-split on purpose, e.g. "uv run python"),
	#               RESULTS_DIR, SPEED
	# Outputs:      the summary on stdout.
	#
	# RESULTS_DIR and SPEED are handed to Python as arguments and the here-doc
	# delimiter is quoted, so the shell never rewrites the Python source and
	# quotes or backslashes in the path cannot break it. "<<-" strips the leading
	# tab from each line; Python's own indentation below uses spaces.
	print_summary() {
	  $PYTHON_CMD - "$RESULTS_DIR" "$SPEED" <<-'PYEOF'
	import json
	import pathlib
	import sys

	results_dir = pathlib.Path(sys.argv[1])
	speed = sys.argv[2]

	files = sorted(results_dir.glob("*.json"))
	if not files:
	    print("No result JSONs found in", results_dir)
	    raise SystemExit(0)

	# Parse each result file once; the per-rank lines and all rollups reuse it.
	results = []
	for fp in files:
	    with fp.open() as f:
	        results.append(json.load(f))

	# Per-rank lines
	for d in results:
	    print(d["summary"]["summary_line"])


	# Cross-rank rollups
	def _agg(rows, keep_suite=None):
	    """Pool the episodes of `rows` (optionally one suite only) into totals.

	    Returns None when no episode matches, otherwise a dict with the episode
	    count, success count, success rate and mean step counts.
	    """
	    eps = []
	    for d in rows:
	        if keep_suite is None or d["summary"]["suite"] == keep_suite:
	            eps.extend(d["episodes"])
	    if not eps:
	        return None
	    succ = [e for e in eps if e["success"]]
	    return {
	        "n": len(eps),
	        "n_succ": len(succ),
	        "sr": len(succ) / len(eps),
	        "mean_steps_succ": (sum(e["steps"] for e in succ) / len(succ)) if succ else float("nan"),
	        "mean_steps_all": sum(e["steps"] for e in eps) / len(eps),
	    }


	print()
	print("--- per-suite rollup ---")
	for suite in ("libero_spatial", "libero_goal", "libero_object", "libero_10"):
	    r = _agg(results, keep_suite=suite)
	    if r:
	        print(f" {suite:16s} success={r['n_succ']}/{r['n']} ({r['sr']*100:.1f}%) "
	              f"mean_steps_success={r['mean_steps_succ']:.1f} mean_steps_all={r['mean_steps_all']:.1f}")

	g = _agg(results)
	if g:
	    print()
	    print(f"GLOBAL (speed={speed}): success={g['n_succ']}/{g['n']} ({g['sr']*100:.1f}%) "
	          f"mean_steps_success={g['mean_steps_succ']:.1f} mean_steps_all={g['mean_steps_all']:.1f}")
	PYEOF
	}

	echo
	echo "=================== Aggregated summary ==================="
	print_summary

	# Exit with the status collected while waiting on the ranks: 0 when every
	# rank succeeded, 1 when at least one failed.
	exit "$status"