TaoNet-mini-T2 / code /TaoTrain /scripts /remote /submit_detached_job.sh
StarMist0012's picture
Add files using upload-large-folder tool
e2bfccc verified
#!/usr/bin/env bash
# Submit a shell command as a detached tmux job with its own job directory.
#
# Environment:
#   RUN_ID          (required) run identifier; every character outside
#                   [A-Za-z0-9_.-] is replaced with "_".
#   JOB_COMMAND     (required) shell text to run as the job.
#   JOB_ROOT        parent directory of the per-run job directories.
#   SESSION_NAME    tmux session name; defaults to taotern_<run id>.
#   OUTPUT_DIR      defaults to <job dir>/outputs.
#   CHECKPOINT_DIR  defaults to <job dir>/checkpoints.
#
# Exit status: 2 on missing/invalid input or when tmux is unavailable,
# 3 when the tmux session already exists.
set -euo pipefail

RUN_ID="${RUN_ID:-}"
JOB_COMMAND="${JOB_COMMAND:-}"
JOB_ROOT="${JOB_ROOT:-/home/student/YouZheng/jobs/taotern}"
SESSION_NAME="${SESSION_NAME:-}"

if [[ -z "$RUN_ID" ]]; then
  echo "RUN_ID is required" >&2
  exit 2
fi
if [[ -z "$JOB_COMMAND" ]]; then
  echo "JOB_COMMAND is required" >&2
  exit 2
fi
if ! command -v tmux >/dev/null 2>&1; then
  echo "tmux is required for detached jobs" >&2
  exit 2
fi

safe_run_id="$(printf '%s' "$RUN_ID" | tr -c 'A-Za-z0-9_.-' '_')"
# "." and ".." pass the character filter but would make the job directory
# resolve to JOB_ROOT itself or to its parent.
if [[ "$safe_run_id" == "." || "$safe_run_id" == ".." ]]; then
  echo "RUN_ID must not be '.' or '..'" >&2
  exit 2
fi

session="${SESSION_NAME:-taotern_${safe_run_id}}"
# tmux uses ':' and '.' as separators in targets (session:window.pane), so a
# session name containing them cannot be addressed under the name we report.
session="${session//[.:]/_}"

job_dir="${JOB_ROOT%/}/${safe_run_id}"
output_dir="${OUTPUT_DIR:-$job_dir/outputs}"
checkpoint_dir="${CHECKPOINT_DIR:-$job_dir/checkpoints}"

# Refuse to continue while a session with exactly this name is alive, before
# any file of that job is touched. The names are captured first so that
# `grep -q` closing the pipe early cannot trip pipefail; a missing tmux
# server simply yields an empty list.
existing_sessions="$(tmux list-sessions -F '#{session_name}' 2>/dev/null || true)"
if grep -Fxq -- "$session" <<<"$existing_sessions"; then
  echo "tmux session already exists: $session" >&2
  echo "$job_dir"
  exit 3
fi

mkdir -p "$job_dir" "$output_dir" "$checkpoint_dir"
# Generate command.sh: exports the output/checkpoint directories, changes to
# the directory the job was submitted from, then runs JOB_COMMAND verbatim.
# The paths are emitted with %q so that spaces, quotes, '$' or backticks in
# them reach the job literally instead of being re-interpreted when
# command.sh is executed.
{
  printf '%s\n' '#!/usr/bin/env bash' 'set -euo pipefail'
  printf 'export REPOBRIDGE_OUTPUT_DIR=%q\n' "$output_dir"
  printf 'export TAOTERN_CHECKPOINT_DIR=%q\n' "$checkpoint_dir"
  printf 'cd %q\n' "$(pwd)"
  # JOB_COMMAND is shell source by contract, so it is written unmodified.
  printf '%s\n' "$JOB_COMMAND"
} > "$job_dir/command.sh"
chmod +x "$job_dir/command.sh"
# Generate run.sh, the supervisor that tmux executes. It runs command.sh with
# output captured in train.log, samples nvidia-smi every 5 seconds when it is
# available, and records the outcome in status.json plus a DONE or FAILED
# marker file. The heredoc delimiter is quoted, so everything below is
# written literally and only expanded when run.sh itself runs.
cat > "$job_dir/run.sh" <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail

job_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
status_json="$job_dir/status.json"
log_path="$job_dir/train.log"
telemetry_path="$job_dir/gpu_telemetry_nvidia_smi.csv"

# Escape backslashes and double quotes so a value is safe inside a JSON string.
json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

# write_status STATE [EXTRA]
# Replace status.json with a one-line JSON object. EXTRA is a raw JSON
# fragment starting with a comma. The file is written to a temporary name and
# renamed into place so a reader never sees a truncated document.
write_status() {
  local state="$1"
  local extra="${2:-}"
  local now tmp
  now="$(date -Iseconds)"
  tmp="$status_json.tmp.$$"
  printf '{"state":"%s","updated_at":"%s","job_dir":"%s"%s}\n' \
    "$state" "$now" "$(json_escape "$job_dir")" "$extra" > "$tmp"
  mv -f -- "$tmp" "$status_json"
}

# Clear markers left by a previous run before announcing the new one.
rm -f "$job_dir/DONE" "$job_dir/FAILED"
write_status "running" ",\"start_time\":\"$(date -Iseconds)\""

monitor_pid=""
if command -v nvidia-smi >/dev/null 2>&1; then
  (
    while true; do
      date -Iseconds
      # The subshell inherits `set -e`; tolerate a failing sample so one bad
      # nvidia-smi call does not end telemetry for the rest of the job.
      nvidia-smi --query-gpu=timestamp,name,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,temperature.gpu --format=csv,noheader,nounits || true
      sleep 5
    done
  ) > "$telemetry_path" 2>&1 &
  monitor_pid="$!"
fi

# Set to 1 once command.sh has returned and its result is being recorded.
finalized=0

# EXIT handler: stop the telemetry loop, and if the supervisor is leaving
# before the job's result was recorded (for example because it was
# terminated), record a failure so status.json does not stay at "running".
cleanup() {
  local rc=$?
  if [[ -n "$monitor_pid" ]]; then
    kill "$monitor_pid" 2>/dev/null || true
    wait "$monitor_pid" 2>/dev/null || true
  fi
  if [[ "$finalized" -eq 0 ]]; then
    if [[ "$rc" -eq 0 ]]; then
      rc=1
    fi
    echo "$rc" > "$job_dir/FAILED" || true
    write_status "failed" ",\"exit_code\":$rc,\"end_time\":\"$(date -Iseconds)\"" || true
  fi
}
trap cleanup EXIT

# Run the job with errexit off so its exit code can be captured and reported.
set +e
"$job_dir/command.sh" > "$log_path" 2>&1
exit_code="$?"
set -e
finalized=1

if [[ "$exit_code" -eq 0 ]]; then
  touch "$job_dir/DONE"
  write_status "completed" ",\"exit_code\":0,\"end_time\":\"$(date -Iseconds)\""
else
  echo "$exit_code" > "$job_dir/FAILED"
  write_status "failed" ",\"exit_code\":$exit_code,\"end_time\":\"$(date -Iseconds)\""
fi
exit "$exit_code"
EOF
chmod +x "$job_dir/run.sh"
# Final phase: refuse duplicates, publish the initial status, start the tmux
# session and print a summary.

# Check for a live session BEFORE touching status.json so the status of a
# job that is still running is never overwritten with "submitted". Names are
# compared exactly: `has-session -t NAME` also accepts NAME as a prefix of
# another session's name. The list is captured first so that `grep -q`
# closing the pipe early cannot trip pipefail.
live_sessions="$(tmux list-sessions -F '#{session_name}' 2>/dev/null || true)"
if grep -Fxq -- "$session" <<<"$live_sessions"; then
  echo "tmux session already exists: $session" >&2
  echo "$job_dir"
  exit 3
fi

# Escape backslashes and double quotes so a value is safe inside a JSON string.
json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

printf '{"state":"submitted","updated_at":"%s","job_dir":"%s","session":"%s","output_dir":"%s","checkpoint_dir":"%s"}\n' \
  "$(date -Iseconds)" \
  "$(json_escape "$job_dir")" \
  "$(json_escape "$session")" \
  "$(json_escape "$output_dir")" \
  "$(json_escape "$checkpoint_dir")" \
  > "$job_dir/status.json"

# Hand the command to tmux as separate arguments so it is executed directly
# rather than through `sh -c`; the job directory then needs no shell quoting.
tmux new-session -d -s "$session" bash "$job_dir/run.sh"

echo "Submitted detached job"
echo " run_id: $safe_run_id"
echo " session: $session"
echo " job_dir: $job_dir"
echo " output_dir: $output_dir"
echo " checkpoint_dir: $checkpoint_dir"