#!/usr/bin/env bash
#
# Job wrapper: runs command.sh from the directory this script lives in,
# captures its stdout/stderr in train.log, samples GPU telemetry with
# nvidia-smi (when available) while the command runs, and records the
# outcome in status.json plus a DONE or FAILED marker file.
#
# Exit status: the exit status of command.sh.
set -Eeuo pipefail

job_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
status_json="$job_dir/status.json"
log_path="$job_dir/train.log"
telemetry_path="$job_dir/gpu_telemetry_nvidia_smi.csv"

# Initialised up front so cleanup() is safe under `set -u` even when no
# telemetry monitor is ever started.
monitor_pid=""

#######################################
# Write the job status as a single JSON object to $status_json.
# Globals:   status_json (read)
# Arguments: $1 - state label (e.g. "completed", "failed")
#            $2 - optional raw JSON fragment appended before the closing
#                 brace; must start with a comma (e.g. ',"exit_code":0')
# Outputs:   replaces $status_json
#######################################
write_status() {
  local state="$1"
  local extra="${2:-}"
  local now
  now="$(date -Iseconds)"

  # NOTE(review): the original here-doc body was lost in this copy of the
  # file; the "state"/"updated_at" field names are a reconstruction and must
  # be confirmed against whatever consumes status.json.
  # Write to a temp file and rename so readers never see a partial file.
  local tmp_path="$status_json.tmp.$$"
  printf '{"state":"%s","updated_at":"%s"%s}\n' "$state" "$now" "$extra" \
    > "$tmp_path"
  mv -f -- "$tmp_path" "$status_json"
}

#######################################
# Stop the background telemetry monitor, if one was started.
# Globals:   monitor_pid (read)
#######################################
cleanup() {
  if [[ -n "$monitor_pid" ]]; then
    # Best effort: the monitor may already have exited on its own.
    kill "$monitor_pid" 2>/dev/null || true
    wait "$monitor_pid" 2>/dev/null || true
  fi
}
# Installed before the monitor starts so every exit path reaps it.
trap cleanup EXIT

# NOTE(review): any statements that sat between write_status and the
# nvidia-smi guard in the original (for example an initial status write)
# were lost with the here-doc; restore them from the upstream copy if any.

# Sample GPU telemetry every 5 seconds in the background, only when
# nvidia-smi is installed. Each sample is a `date` line followed by the
# nvidia-smi CSV rows; stderr goes to the same file.
if command -v nvidia-smi >/dev/null 2>&1; then
  (
    while true; do
      date -Iseconds
      nvidia-smi \
        --query-gpu=timestamp,name,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,temperature.gpu \
        --format=csv,noheader,nounits
      sleep 5
    done
  ) > "$telemetry_path" 2>&1 &
  monitor_pid="$!"
fi

# Run the job; capture its exit status without tripping `set -e`.
exit_code=0
"$job_dir/command.sh" > "$log_path" 2>&1 || exit_code="$?"

end_time="$(date -Iseconds)"
if [[ "$exit_code" -eq 0 ]]; then
  touch "$job_dir/DONE"
  write_status "completed" ",\"exit_code\":0,\"end_time\":\"$end_time\""
else
  printf '%s\n' "$exit_code" > "$job_dir/FAILED"
  write_status "failed" ",\"exit_code\":$exit_code,\"end_time\":\"$end_time\""
fi

exit "$exit_code"