File size: 1,066 Bytes
d93804e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env bash
#
# Watch a running process and signal it to stop once the checkpoint for a
# target step is present on disk.
#
# Usage:
#   <script> <pid> <checkpoint_root> <target_step> [status_log] [interval_s]
#
# Arguments:
#   pid              Positive PID of the process to watch and signal.
#   checkpoint_root  Directory under which <target_step>/ is looked up.
#   target_step      Name of the step sub-directory to wait for.
#   status_log       File that receives status lines
#                    (default: /workspace/openpi/stop_at_checkpoint.log).
#   interval_s       Delay between polls, passed to sleep (default: 15).
set -euo pipefail

if [[ $# -lt 3 ]]; then
  echo "usage: $0 <pid> <checkpoint_root> <target_step> [status_log] [interval_s]" >&2
  exit 1
fi

PID="$1"
CHECKPOINT_ROOT="$2"
TARGET_STEP="$3"
STATUS_LOG="${4:-/workspace/openpi/stop_at_checkpoint.log}"
INTERVAL_S="${5:-15}"

# Reject anything but a strictly positive integer: `kill` interprets 0 and
# negative values as process groups / "every process", which would make the
# later TERM/INT hit far more than the intended target.
if [[ ! "${PID}" =~ ^0*[1-9][0-9]*$ ]]; then
  printf 'error: pid must be a positive integer, got %q\n' "${PID}" >&2
  exit 1
fi

# An empty root or step would make the readiness check probe the wrong
# directory (e.g. "/<step>" or "<root>/").
if [[ -z "${CHECKPOINT_ROOT}" || -z "${TARGET_STEP}" ]]; then
  printf 'error: checkpoint_root and target_step must be non-empty\n' >&2
  exit 1
fi

# Fail fast on a malformed interval: otherwise `sleep` fails inside the
# polling loop and `set -e` terminates the watcher without any log entry.
# Accepts integers and decimals with an optional s/m/h/d suffix.
interval_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'
if [[ ! "${INTERVAL_S}" =~ ${interval_re} ]]; then
  printf 'error: interval_s must be a non-negative number, got %q\n' "${INTERVAL_S}" >&2
  exit 1
fi

# Make sure the log file's parent directory exists before the first write.
mkdir -p -- "$(dirname -- "${STATUS_LOG}")"

#######################################
# Append one timestamped status line to the status log.
# Globals:   STATUS_LOG (read) - file the line is appended to
# Arguments: message words; joined into one string via "$*"
# Outputs:   "<UTC ISO-8601 timestamp> | <message>" appended to STATUS_LOG
#######################################
log() {
  local message="$*"
  printf '%s | %s\n' "$(date -u +%FT%TZ)" "${message}" >> "${STATUS_LOG}"
}

#######################################
# Report whether the checkpoint directory for a step holds both marker files.
# Globals:   CHECKPOINT_ROOT (read) - parent directory of the step directories
# Arguments: $1 - step name (sub-directory of CHECKPOINT_ROOT)
# Returns:   0 if both <root>/<step>/_CHECKPOINT_METADATA and
#            <root>/<step>/params/_METADATA are regular files, 1 otherwise
#######################################
checkpoint_ready() {
  local ckpt_dir="${CHECKPOINT_ROOT}/$1"
  local marker
  for marker in "_CHECKPOINT_METADATA" "params/_METADATA"; do
    # Bail out on the first missing marker.
    if [[ ! -f "${ckpt_dir}/${marker}" ]]; then
      return 1
    fi
  done
  return 0
}

log "watch_start pid=${PID} target_step=${TARGET_STEP} checkpoint_root=${CHECKPOINT_ROOT}"

# Poll for as long as the watched process can be signalled. As soon as the
# target checkpoint is on disk, ask the process to stop: TERM first, then INT
# if it is still alive five seconds later.
while kill -0 "${PID}" 2>/dev/null; do
  if checkpoint_ready "${TARGET_STEP}"; then
    log "checkpoint_ready step=${TARGET_STEP}"

    # The process may exit between the liveness probe above and this signal.
    # An unguarded failing `kill` would make `set -e` abort the script with no
    # trace in the log, so record the failure and keep the non-zero status.
    if ! kill -TERM "${PID}" 2>/dev/null; then
      log "signal_failed signal=TERM pid=${PID}"
      exit 1
    fi
    log "signal_sent signal=TERM pid=${PID}"

    # Grace period before the follow-up signal.
    sleep 5
    if kill -0 "${PID}" 2>/dev/null; then
      # Same race as above: the process can die right after the probe.
      if ! kill -INT "${PID}" 2>/dev/null; then
        log "signal_failed signal=INT pid=${PID}"
        exit 1
      fi
      log "signal_sent signal=INT pid=${PID}"
    fi
    exit 0
  fi
  sleep "${INTERVAL_S}"
done

# Reached only when the liveness probe failed before the checkpoint appeared.
log "watch_end pid=${PID} process_not_running"