# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Three positional arguments are mandatory; the last two are optional.
if [[ $# -lt 3 ]]; then
  echo "usage: $0 <pid> <checkpoint_root> <target_step> [status_log] [interval_s]" >&2
  exit 1
fi

PID="$1"              # process to watch and eventually signal
CHECKPOINT_ROOT="$2"  # directory that contains one sub-directory per step
TARGET_STEP="$3"      # sub-directory name to wait for under CHECKPOINT_ROOT
STATUS_LOG="${4:-/workspace/openpi/stop_at_checkpoint.log}"
INTERVAL_S="${5:-15}" # value handed to `sleep` between polls

# `kill` treats 0 and negative values as process-group targets, which would
# signal far more than one process (including this watcher). A non-numeric
# value would make the liveness probe fail and be misreported as
# "process not running". Accept only a strictly positive integer.
if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: <pid> must be a positive integer, got '${PID}'" >&2
  exit 1
fi

# Make sure the directory that will hold the status log exists.
# `--` keeps a path that starts with '-' from being parsed as an option.
mkdir -p -- "$(dirname -- "$STATUS_LOG")"
#######################################
# Append one status line to the status log.
# Line format: "<UTC timestamp, e.g. 2024-01-31T12:00:00Z> | <message>".
# Globals:   STATUS_LOG (read) - file the line is appended to
# Arguments: $@ - message words; joined into one string via "$*"
# Outputs:   nothing on stdout; appends to STATUS_LOG
#######################################
log() {
  printf '%s | %s\n' "$(date -u +%FT%TZ)" "$*" >> "$STATUS_LOG"
}
#######################################
# Report whether the checkpoint for a given step looks complete.
# A step counts as ready when both marker files exist as regular files:
#   <CHECKPOINT_ROOT>/<step>/_CHECKPOINT_METADATA
#   <CHECKPOINT_ROOT>/<step>/params/_METADATA
# NOTE(review): presumably the checkpoint writer creates these markers only
# after the payload is fully written - confirm against the writer.
# Globals:   CHECKPOINT_ROOT (read)
# Arguments: $1 - step name (sub-directory under CHECKPOINT_ROOT)
# Returns:   0 if both marker files exist, non-zero otherwise
#######################################
checkpoint_ready() {
  local step="$1"
  local src="${CHECKPOINT_ROOT}/${step}"
  [[ -f "${src}/_CHECKPOINT_METADATA" && -f "${src}/params/_METADATA" ]]
}
# Main watch loop: poll until the target checkpoint is ready, then ask the
# watched process to stop (TERM first, INT five seconds later if it is still
# alive). The loop ends on its own if the process disappears first.
log "watch_start pid=${PID} target_step=${TARGET_STEP} checkpoint_root=${CHECKPOINT_ROOT}"

# `kill -0` delivers no signal; it only tests whether PID can be signalled.
while kill -0 "$PID" 2>/dev/null; do
  if checkpoint_ready "$TARGET_STEP"; then
    log "checkpoint_ready step=${TARGET_STEP}"

    # The process may exit between the liveness probe above and this signal.
    # Test the result explicitly so `set -e` does not abort the script
    # silently with nothing written to the log.
    if kill -TERM "$PID" 2>/dev/null; then
      log "signal_sent signal=TERM pid=${PID}"
    else
      log "signal_failed signal=TERM pid=${PID}"
      # Still alive means the signal genuinely could not be delivered.
      if kill -0 "$PID" 2>/dev/null; then
        exit 1
      fi
      # Otherwise the process is already gone, which is the desired end state.
      exit 0
    fi

    # Grace period before escalating to INT.
    sleep 5
    if kill -0 "$PID" 2>/dev/null; then
      # Same race as above: the process may vanish right after the probe.
      if kill -INT "$PID" 2>/dev/null; then
        log "signal_sent signal=INT pid=${PID}"
      else
        log "signal_failed signal=INT pid=${PID}"
      fi
    fi
    exit 0
  fi

  sleep "$INTERVAL_S"
done

# Reached only when the process went away before the checkpoint was ready.
log "watch_end pid=${PID} process_not_running"