TeddyBearKinova / bundle /code /openpi /scripts /stop_train_at_checkpoint.sh
lsnu's picture
Upload folder using huggingface_hub
d93804e verified
#!/usr/bin/env bash
#
# Watch a process and signal it to stop once the marker files of a target
# checkpoint step exist under the checkpoint root.
#
# Usage:
#   <script> <pid> <checkpoint_root> <target_step> [status_log] [interval_s]
#
# Arguments:
#   pid              PID of the process to watch and signal; must be a
#                    positive integer.
#   checkpoint_root  Directory containing one subdirectory per step.
#   target_step      Name of the step subdirectory to wait for.
#   status_log       File that status lines are appended to
#                    (default: /workspace/openpi/stop_at_checkpoint.log).
#   interval_s       Value passed to sleep(1) between polls (default: 15).
#
# Exit status: 1 on a usage or argument error.
set -euo pipefail

if [[ $# -lt 3 ]]; then
  echo "usage: $0 <pid> <checkpoint_root> <target_step> [status_log] [interval_s]" >&2
  exit 1
fi

PID="$1"
CHECKPOINT_ROOT="$2"
TARGET_STEP="$3"
STATUS_LOG="${4:-/workspace/openpi/stop_at_checkpoint.log}"
INTERVAL_S="${5:-15}"

# kill treats 0 and negative values as process-group / broadcast targets, so a
# pid such as "0" or "-1" would make the later TERM hit many processes.
# Accept only a strictly positive integer.
if [[ ! "$PID" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "error: <pid> must be a positive integer, got '${PID}'" >&2
  exit 1
fi

# `--` keeps a log path that starts with '-' from being parsed as an option.
mkdir -p -- "$(dirname -- "$STATUS_LOG")"
# Append one timestamped status line to the status log.
# Globals:   STATUS_LOG (read) - file the line is appended to
# Arguments: all arguments are joined into a single message
# Outputs:   "<UTC ISO-8601 timestamp> | <message>" appended to STATUS_LOG
log() {
  local message="$*"
  printf '%s | %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "${message}" >> "${STATUS_LOG}"
}
# Report whether the marker files of a checkpoint step are present.
# Globals:   CHECKPOINT_ROOT (read) - directory holding the step directories
# Arguments: $1 - step name (subdirectory of CHECKPOINT_ROOT)
# Returns:   0 when both <step>/_CHECKPOINT_METADATA and
#            <step>/params/_METADATA exist as regular files, 1 otherwise
checkpoint_ready() {
  local step_dir="${CHECKPOINT_ROOT}/$1"
  local marker
  for marker in "_CHECKPOINT_METADATA" "params/_METADATA"; do
    [[ -f "${step_dir}/${marker}" ]] || return 1
  done
  return 0
}
# Main watch loop: poll while the target process is alive. Once the target
# checkpoint is ready, send TERM, wait 5 seconds, send INT if the process is
# still there, then exit 0. If the process disappears first, record that and
# fall off the end of the script.
log "watch_start pid=${PID} target_step=${TARGET_STEP} checkpoint_root=${CHECKPOINT_ROOT}"
while kill -0 "$PID" 2>/dev/null; do
  if checkpoint_ready "$TARGET_STEP"; then
    log "checkpoint_ready step=${TARGET_STEP}"
    # The process can exit between the liveness probe above and this signal.
    # Under `set -e` an unguarded failing kill would abort the script with a
    # non-zero status and no log entry, so treat a failure as "already gone"
    # and fall through to the watch_end line below.
    if ! kill -TERM "$PID" 2>/dev/null; then
      break
    fi
    log "signal_sent signal=TERM pid=${PID}"
    sleep 5
    # Follow up with INT only if the process is still there. Sending the
    # signal directly (instead of probing first) avoids a second race; a
    # failure here just means the process has already exited.
    if kill -INT "$PID" 2>/dev/null; then
      log "signal_sent signal=INT pid=${PID}"
    fi
    exit 0
  fi
  sleep "$INTERVAL_S"
done
log "watch_end pid=${PID} process_not_running"