StarMist0012's picture
Add files using upload-large-folder tool
e2bfccc verified
#!/usr/bin/env bash
#
# Job wrapper. Runs command.sh from the directory this script lives in,
# capturing its output in train.log, publishing progress in status.json and
# (when nvidia-smi is available) GPU samples in gpu_telemetry_nvidia_smi.csv.

# -E: ERR traps are inherited by functions; -e: abort on unhandled failure;
# -u: unset variables are errors; pipefail: a pipeline fails if any stage does.
set -Eeuo pipefail

# Absolute path of the directory containing this script. CDPATH is cleared for
# the cd so a relative directory name is never resolved through an inherited
# CDPATH (which would also make cd print the directory into the captured
# output); `--` protects against paths that begin with a dash.
job_dir="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Files produced next to the script.
status_json="$job_dir/status.json"
log_path="$job_dir/train.log"
telemetry_path="$job_dir/gpu_telemetry_nvidia_smi.csv"
#######################################
# Replace the status file with a one-line JSON object describing the job.
# Globals:
#   status_json (read) - path of the file to write
#   job_dir     (read) - recorded in the "job_dir" field
# Arguments:
#   $1 - state string stored in the "state" field
#   $2 - optional raw JSON fragment appended verbatim before the closing
#        brace; it must begin with a comma (e.g. ',"exit_code":0') and is
#        NOT escaped, so the caller is responsible for its validity
# Returns:
#   0 on success, non-zero if the file could not be written or renamed
#######################################
write_status() {
  local state="$1"
  local extra="${2:-}"
  local now tmp_path field
  local -a escaped=()
  now="$(date -Iseconds)"

  # JSON-escape the interpolated string values so that a state or directory
  # name containing quotes, backslashes or control characters still yields a
  # parseable document. Backslashes must be handled first.
  for field in "$state" "$job_dir"; do
    field=${field//\\/\\\\}
    field=${field//\"/\\\"}
    field=${field//$'\n'/\\n}
    field=${field//$'\r'/\\r}
    field=${field//$'\t'/\\t}
    escaped+=("$field")
  done

  # Write to a sibling temp file and rename it over the target: the rename is
  # atomic within one directory, so a concurrent reader never observes an
  # empty or half-written status file.
  tmp_path="${status_json}.tmp.$$"
  if ! printf '{"state":"%s","updated_at":"%s","job_dir":"%s"%s}\n' \
    "${escaped[0]}" "$now" "${escaped[1]}" "$extra" > "$tmp_path"; then
    rm -f -- "$tmp_path"
    return 1
  fi
  if ! mv -f -- "$tmp_path" "$status_json"; then
    rm -f -- "$tmp_path"
    return 1
  fi
}
# Publish the "running" state together with the launch timestamp.
write_status "running" ",\"start_time\":\"$(date -Iseconds)\""

# Drop completion markers left behind by an earlier run in this directory.
rm -f "$job_dir"/{DONE,FAILED}

# PID of the background GPU sampler; stays empty when nvidia-smi is absent.
monitor_pid=""
if command -v nvidia-smi &>/dev/null; then
  # Every 5 seconds append a wall-clock line followed by one nvidia-smi CSV
  # row per GPU. stdout and stderr both go to the telemetry file.
  {
    while :; do
      date -Iseconds
      nvidia-smi \
        --query-gpu=timestamp,name,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,temperature.gpu \
        --format=csv,noheader,nounits
      sleep 5
    done
  } &>"$telemetry_path" &
  monitor_pid=$!
fi
#######################################
# Stop the background process recorded in monitor_pid, if there is one.
# Globals:
#   monitor_pid (read) - PID to terminate; empty means nothing was started
# Returns:
#   always 0; failures of kill/wait are deliberately ignored because the
#   process may already have exited
#######################################
cleanup() {
  if [[ -z "${monitor_pid}" ]]; then
    return 0
  fi
  kill "${monitor_pid}" 2>/dev/null || :
  # Reap the child so it does not linger as a zombie.
  wait "${monitor_pid}" 2>/dev/null || :
}
# Run cleanup on every exit path of this script.
trap cleanup EXIT

# Run the job with stdout and stderr captured in the log. Evaluating the
# command as an `if` condition keeps errexit from aborting the script on a
# non-zero status, so the exit code can be recorded below.
if "$job_dir/command.sh" &>"$log_path"; then
  exit_code=0
else
  exit_code=$?
fi

# Leave a marker file and a final status record matching the outcome.
case "$exit_code" in
  0)
    touch "$job_dir/DONE"
    write_status "completed" ",\"exit_code\":0,\"end_time\":\"$(date -Iseconds)\""
    ;;
  *)
    # The FAILED marker carries the job's exit code as its only line.
    printf '%s\n' "$exit_code" > "$job_dir/FAILED"
    write_status "failed" ",\"exit_code\":$exit_code,\"end_time\":\"$(date -Iseconds)\""
    ;;
esac

# Propagate the job's exit code to whoever launched this wrapper.
exit "$exit_code"