opd_zt / scripts /launch_opd.sh
sdzt's picture
Add files using upload-large-folder tool
bf46e5d verified
Raw
History Blame Contribute Delete
1.55 kB
#!/usr/bin/env bash
# Launch (CD-)OPD training detached so it survives SSH disconnect.
# Usage:
# scripts/launch_opd.sh [extra hydra overrides ...] # vanilla OPD
# CONFIG=cd_opd scripts/launch_opd.sh [extra hydra overrides ...] # CD-OPD (+Mi/+Pa)
#
# Ablation env vars consumed by configs/cd_opd_qwen25vl.sh:
# CFG_ENABLED, CFG_ALPHA, CFG_PERTURBATION, CFG_PERTURBATIONS, CFG_AGGREGATION
# TOTAL_EPOCHS, ACTOR_LR, EXPERIMENT_NAME
set -euo pipefail

# Fixed install location; every other path below is derived from it.
ROOT=/mnt/local-fast/opd_zt
cd "$ROOT"
source "$ROOT/.env"
source "$ROOT/.venv/bin/activate"

# Pick the training recipe. CONFIG defaults to vanilla OPD; TAG names the
# log, pid and symlink files so the two recipes do not collide.
CONFIG=${CONFIG:-opd}
case "$CONFIG" in
  opd)
    CONFIG_FILE="$ROOT/configs/opd_qwen25vl.sh"
    TAG=opd
    ;;
  cd_opd)
    CONFIG_FILE="$ROOT/configs/cd_opd_qwen25vl.sh"
    TAG=cd_opd
    ;;
  *)
    echo "Unknown CONFIG=$CONFIG (expected opd|cd_opd)" >&2
    exit 1
    ;;
esac

# Fail before touching the pid file or the latest-log symlink: a missing
# config would otherwise die in the background while we report "launched".
if [[ ! -r "$CONFIG_FILE" ]]; then
  echo "Config script not found or unreadable: $CONFIG_FILE" >&2
  exit 1
fi

LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR"
STAMP=$(date +%Y%m%d_%H%M%S)
LOG="$LOG_DIR/${TAG}_${STAMP}.log"
PID_FILE="$LOG_DIR/${TAG}.pid"

# Refuse to start a second run of the same TAG while the recorded pid is
# still alive. The pid is read once and must be a positive integer: values
# such as "0" or "-1" would make `kill -0` probe a process group / every
# process and succeed, wrongly blocking all future launches.
if [[ -f "$PID_FILE" ]]; then
  OLD_PID=$(cat "$PID_FILE" 2>/dev/null || true)
  if [[ "$OLD_PID" =~ ^[1-9][0-9]*$ ]] && kill -0 "$OLD_PID" 2>/dev/null; then
    echo "$TAG training already running, pid=$OLD_PID" >&2
    echo "Tail log: tail -f $(readlink -f "$LOG_DIR/${TAG}.latest.log")" >&2
    exit 1
  fi
fi

# setsid + nohup + closed stdin detach the job from this terminal/session so
# it keeps running after the SSH connection drops. Extra arguments are
# forwarded verbatim to the config script (hydra overrides).
setsid nohup bash "$CONFIG_FILE" "$@" \
  >"$LOG" 2>&1 </dev/null &
PID=$!
echo "$PID" >"$PID_FILE"
ln -sfn "$LOG" "$LOG_DIR/${TAG}.latest.log"
# Best effort: drop the job from this shell's job table; a failure here
# (e.g. the job already exited) must not abort the launcher under `set -e`.
disown "$PID" || true

echo "launched $TAG, pid=$PID"
echo "log: $LOG (symlinked at $LOG_DIR/${TAG}.latest.log)"
echo
echo "Watch with: tail -f $LOG_DIR/${TAG}.latest.log"
# NOTE(review): this signals only the recorded pid; whether processes spawned
# by the config script also stop depends on that script -- confirm.
echo "Stop with: kill $PID"