TeddyBearKinova / bundle /code /openpi /scripts /watch_teddybear_train.sh
lsnu's picture
Upload folder using huggingface_hub
d93804e verified
#!/usr/bin/env bash
#
# Watch a running process and append periodic status lines to a log file.
#
# Usage: <script> <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]
#   pid             PID of the process to watch (positive integer)
#   train_log       log file searched for "Step N: ..." lines
#   checkpoint_dir  directory whose immediate subdirectories are listed
#   status_log      file that status lines are appended to
#   interval_s      argument handed to `sleep` between polls (default: 60)

# errexit (-e) is not enabled, so a failing probe does not end the watcher;
# failures that matter are checked explicitly below.
set -uo pipefail

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]" >&2
  exit 1
fi

PID="$1"
TRAIN_LOG="$2"
CHECKPOINT_DIR="$3"
STATUS_LOG="$4"
INTERVAL_S="${5:-60}"

# Accept only a plain positive PID. `kill -0` with 0 or a negative value
# addresses a process group instead of one process, which could keep the
# watch loop alive indefinitely.
if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: <pid> must be a positive integer, got '$PID'" >&2
  exit 1
fi

# Without the status log's directory every later append would fail, so stop
# here instead of running a watcher that records nothing.
if ! mkdir -p -- "$(dirname -- "$STATUS_LOG")"; then
  echo "error: cannot create directory for status log '$STATUS_LOG'" >&2
  exit 1
fi
#######################################
# Print the most recent "Step N: ..." entry found in the training log.
# Globals:   TRAIN_LOG (read)
# Arguments: none
# Outputs:   the last match (from "Step" to the end of its line) on stdout;
#            nothing when the log is unreadable or contains no match
# Returns:   always 0
#######################################
latest_step_line() {
  # The log may not exist yet when the watcher starts; stay silent until it
  # does rather than emitting a grep error on every poll.
  [[ -r "$TRAIN_LOG" ]] || return 0
  # -a forces text mode: if the log contains NUL or undecodable bytes, GNU
  # grep would otherwise print "Binary file ... matches" instead of the line.
  grep -aoE -- 'Step [0-9]+:.*' "$TRAIN_LOG" | tail -n 1 || true
}
#######################################
# Print the first line of nvidia-smi's CSV utilization/memory report.
# Arguments: none
# Outputs:   first output line of the query (fields: utilization.gpu,
#            utilization.memory, memory.used, memory.free) on stdout
# Returns:   always 0
#######################################
latest_gpu_line() {
  # Build the query as an argv array so each option stays one word.
  local -a smi_query=(
    nvidia-smi
    --query-gpu=utilization.gpu,utilization.memory,memory.used,memory.free
    --format=csv,noheader,nounits
  )
  # Only the first reported line is kept; failures are tolerated.
  "${smi_query[@]}" | head -n 1 || true
}
#######################################
# Print the names of up to five checkpoint directories, comma-separated.
# Globals:   CHECKPOINT_DIR (read)
# Arguments: none
# Outputs:   names of the immediate subdirectories of $CHECKPOINT_DIR, ordered
#            by `sort -n`, last five kept, joined with ','; an empty line when
#            there are none
# Returns:   exit status of the find | sort | tail pipeline
#######################################
latest_ckpts() {
  local newline=$'\n'
  local dir_names status
  # find's own diagnostics (e.g. directory missing) are discarded.
  dir_names="$(
    find "$CHECKPOINT_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null \
      | sort -n \
      | tail -n 5
  )"
  status=$?
  # Join the remaining lines with commas and terminate with one newline.
  printf '%s\n' "${dir_names//"$newline"/,}"
  return "$status"
}
# ---------------------------------------------------------------------------
# Watch loop.
#
# Appends a "watch start" marker to $STATUS_LOG, then, for as long as
# `kill -0 "$PID"` succeeds, appends one status line per poll:
#   <UTC timestamp> | <latest step line> | gpu=<gpu line> | ckpts=<dir names>
# Empty probe results are replaced by "no_step_yet", "na" and "none".
# When the loop ends, a "watch end" marker and the last 40 lines of
# $TRAIN_LOG are appended.
# ---------------------------------------------------------------------------
printf 'watch start %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"

while kill -0 "$PID" 2>/dev/null; do
  # Each probe is best-effort: a failure leaves the variable empty.
  STEP_LINE="$(latest_step_line || true)"
  GPU_LINE="$(latest_gpu_line || true)"
  CKPTS="$(latest_ckpts || true)"

  printf '%s | %s | gpu=%s | ckpts=%s\n' \
    "$(date -u +%FT%TZ)" \
    "${STEP_LINE:-no_step_yet}" \
    "${GPU_LINE:-na}" \
    "${CKPTS:-none}" >> "$STATUS_LOG"

  # If sleep fails (e.g. an interval it cannot parse), fall back to a short
  # fixed pause so the loop cannot spin and flood the status log.
  sleep "$INTERVAL_S" || sleep 1 || true
done

printf 'watch end %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"
# Keep the tail of the training log next to the end marker; best-effort.
tail -n 40 -- "$TRAIN_LOG" >> "$STATUS_LOG" || true