#!/usr/bin/env bash
# Listing metadata from the original page: file size 1,066 bytes; id d93804e.
# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# Three positional arguments are mandatory; the status log path and the
# polling interval are optional.
if (( $# < 3 )); then
  echo "usage: $0 <pid> <checkpoint_root> <target_step> [status_log] [interval_s]" >&2
  exit 1
fi

PID="$1"                 # process to watch and eventually signal
CHECKPOINT_ROOT="$2"     # directory that contains one sub-directory per step
TARGET_STEP="$3"         # name of the step sub-directory to wait for
STATUS_LOG="${4:-/workspace/openpi/stop_at_checkpoint.log}"
INTERVAL_S="${5:-15}"    # passed verbatim to `sleep` between polls

# The PID is handed straight to `kill`. A value of 0 or a negative number
# would address a whole process group (or every reachable process) rather
# than a single process, so only a plain positive integer is accepted.
if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: <pid> must be a positive integer, got '${PID}'" >&2
  exit 1
fi

# Empty strings satisfy the argument-count check above but would make the
# readiness probe look at a meaningless path.
if [[ -z "$CHECKPOINT_ROOT" || -z "$TARGET_STEP" ]]; then
  echo "error: <checkpoint_root> and <target_step> must not be empty" >&2
  exit 1
fi

# Make sure the directory that holds the status log exists before the first
# log entry is appended. `--` protects against paths that start with a dash.
mkdir -p -- "$(dirname -- "$STATUS_LOG")"
# Append one entry to the status log.
# Globals:   STATUS_LOG (read) - file the entry is appended to
# Arguments: any number of words; they are joined into a single message
# Outputs:   one line of the form "<UTC timestamp> | <message>" appended to
#            STATUS_LOG, with the timestamp formatted as YYYY-MM-DDTHH:MM:SSZ
log() {
  local entry
  # Build the full entry first, then write it with a single append.
  printf -v entry '%s | %s' "$(date -u +%FT%TZ)" "$*"
  printf '%s\n' "$entry" >>"$STATUS_LOG"
}
# Report whether the checkpoint directory for a step contains both marker
# files this script looks for.
# Globals:   CHECKPOINT_ROOT (read) - parent directory of the step directories
# Arguments: $1 - step name, used as the sub-directory under CHECKPOINT_ROOT
# Returns:   0 when "<root>/<step>/_CHECKPOINT_METADATA" and
#            "<root>/<step>/params/_METADATA" are both regular files,
#            1 otherwise
checkpoint_ready() {
  local ckpt_dir="${CHECKPOINT_ROOT}/$1"
  local marker
  for marker in _CHECKPOINT_METADATA params/_METADATA; do
    # Bail out on the first marker that is missing or not a regular file.
    [[ -f "${ckpt_dir}/${marker}" ]] || return 1
  done
  return 0
}
# Send a signal to the watched process and record the outcome.
# Globals:   PID (read)
# Arguments: $1 - signal name without the leading dash (e.g. TERM)
# Outputs:   a "signal_sent" or "signal_failed" entry in the status log
# Exits:     with status 1 when the signal cannot be delivered. The script
#            runs under `set -e`, so a failing `kill` would abort the script
#            anyway; logging first keeps the status log from ending without
#            any trace of what happened (for example when the process exits
#            between the liveness check and the signal).
send_signal() {
  local sig="$1"
  if kill "-${sig}" "$PID"; then
    log "signal_sent signal=${sig} pid=${PID}"
  else
    log "signal_failed signal=${sig} pid=${PID}"
    exit 1
  fi
}

log "watch_start pid=${PID} target_step=${TARGET_STEP} checkpoint_root=${CHECKPOINT_ROOT}"

# Poll for as long as the watched process can be signalled (`kill -0` probes
# it without delivering a signal).
while kill -0 "$PID" 2>/dev/null; do
  if checkpoint_ready "$TARGET_STEP"; then
    log "checkpoint_ready step=${TARGET_STEP}"
    send_signal TERM
    # Give the process five seconds to react to TERM, then follow up with
    # INT if it is still there.
    sleep 5
    if kill -0 "$PID" 2>/dev/null; then
      send_signal INT
    fi
    # The watcher's job ends once the signals have been sent; it does not
    # wait for the process to actually terminate.
    exit 0
  fi
  sleep "$INTERVAL_S"
done

# Reached only when the liveness probe fails before the checkpoint appeared.
log "watch_end pid=${PID} process_not_running"