#!/usr/bin/env bash
#
# Submit a shell command as a detached tmux job with its own job directory.
#
# Environment:
#   RUN_ID          (required) run identifier. Every character outside
#                   [A-Za-z0-9_.-] is replaced with '_' before use.
#   JOB_COMMAND     (required) shell command text to run inside the job.
#   JOB_ROOT        parent directory for job directories
#                   (default: /home/student/YouZheng/jobs/taotern).
#   SESSION_NAME    tmux session name (default: taotern_<sanitized RUN_ID>).
#   OUTPUT_DIR      output directory (default: <job_dir>/outputs).
#   CHECKPOINT_DIR  checkpoint directory (default: <job_dir>/checkpoints).
#
# Files written under <job_dir>:
#   command.sh   the user command
#   run.sh       wrapper executed inside tmux
#   status.json  job state, rewritten by run.sh as the job progresses
#   train.log    combined stdout/stderr of command.sh
#   gpu_telemetry_nvidia_smi.csv  periodic nvidia-smi samples (if available)
#   DONE / FAILED  completion markers (FAILED contains the exit code)
#
# Exit status:
#   2  RUN_ID or JOB_COMMAND missing, or tmux not installed
#   3  a tmux session with the requested name already exists
#
# NOTE(review): the copy of this file under review had all newlines collapsed
# and three here-document bodies stripped. Sections marked "reconstructed"
# below are best-effort and must be confirmed against the original.
set -euo pipefail

RUN_ID="${RUN_ID:-}"
JOB_COMMAND="${JOB_COMMAND:-}"
JOB_ROOT="${JOB_ROOT:-/home/student/YouZheng/jobs/taotern}"
SESSION_NAME="${SESSION_NAME:-}"

if [[ -z "$RUN_ID" ]]; then
  echo "RUN_ID is required" >&2
  exit 2
fi

if [[ -z "$JOB_COMMAND" ]]; then
  echo "JOB_COMMAND is required" >&2
  exit 2
fi

if ! command -v tmux >/dev/null 2>&1; then
  echo "tmux is required for detached jobs" >&2
  exit 2
fi

# Sanitize the run id so it is safe as a directory and session name component.
safe_run_id="$(printf '%s' "$RUN_ID" | tr -c 'A-Za-z0-9_.-' '_')"

session="${SESSION_NAME:-taotern_${safe_run_id}}"
# tmux does not keep '.' or ':' in session names (they are target separators),
# so normalize them here; otherwise the name we report and test for could
# differ from the session that actually gets created.
session="${session//[.:]/_}"

job_dir="${JOB_ROOT%/}/${safe_run_id}"
output_dir="${OUTPUT_DIR:-$job_dir/outputs}"
checkpoint_dir="${CHECKPOINT_DIR:-$job_dir/checkpoints}"

mkdir -p "$job_dir" "$output_dir" "$checkpoint_dir"

# Refuse to resubmit BEFORE touching any job files, so a rejected submission
# cannot overwrite command.sh / run.sh / status.json of a job that is still
# running. The '=' prefix requests an exact session-name match; a bare name
# would also match by prefix (e.g. taotern_exp1 vs. taotern_exp10).
if tmux has-session -t "=$session" 2>/dev/null; then
  echo "tmux session already exists: $session" >&2
  echo "$job_dir"
  exit 3
fi

# --- command.sh (reconstructed) ---------------------------------------------
# NOTE(review): the original body was missing from the reviewed copy. This
# version exports the resolved directories and then runs JOB_COMMAND verbatim.
# printf is used instead of an expanding here-document so that '$', backticks
# and backslashes in JOB_COMMAND reach the job unmodified.
{
  printf '%s\n' '#!/usr/bin/env bash'
  printf 'export OUTPUT_DIR=%q\n' "$output_dir"
  printf 'export CHECKPOINT_DIR=%q\n' "$checkpoint_dir"
  printf '%s\n' "$JOB_COMMAND"
} > "$job_dir/command.sh"
# run.sh executes command.sh directly, so it must be executable.
chmod +x "$job_dir/command.sh"

# --- run.sh -------------------------------------------------------------------
# Quoted delimiter: nothing below is expanded at submission time.
cat > "$job_dir/run.sh" <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail

# Resolve the job directory from this script's own location.
job_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
status_json="$job_dir/status.json"
log_path="$job_dir/train.log"
telemetry_path="$job_dir/gpu_telemetry_nvidia_smi.csv"

# Rewrite status.json with the given state.
#   $1 - state string
#   $2 - optional raw JSON fragment appended inside the object; it must start
#        with a comma (e.g. ',"exit_code":0').
# NOTE(review): the JSON layout is reconstructed; confirm the key names.
write_status() {
  local state="$1"
  local extra="${2:-}"
  local now
  now="$(date -Iseconds)"
  printf '{"state":"%s","updated_at":"%s"%s}\n' \
    "$state" "$now" "$extra" > "$status_json"
}

# NOTE(review): reconstructed -- presumably the job is marked running here.
write_status "running" ",\"start_time\":\"$(date -Iseconds)\""

# Sample GPU telemetry every 5 seconds in the background when nvidia-smi
# is available. monitor_pid stays empty otherwise (cleanup checks for that).
monitor_pid=""
if command -v nvidia-smi >/dev/null 2>&1; then
  (
    while true; do
      date -Iseconds
      nvidia-smi --query-gpu=timestamp,name,utilization.gpu,utilization.memory,memory.used,memory.total,power.draw,temperature.gpu --format=csv,noheader,nounits
      sleep 5
    done
  ) > "$telemetry_path" 2>&1 &
  monitor_pid="$!"
fi

# Stop the telemetry sampler on every exit path.
cleanup() {
  if [[ -n "$monitor_pid" ]]; then
    kill "$monitor_pid" 2>/dev/null || true
    wait "$monitor_pid" 2>/dev/null || true
  fi
}
trap cleanup EXIT

# Run the user command with errexit off so its exit status can be recorded
# instead of aborting this wrapper.
set +e
"$job_dir/command.sh" > "$log_path" 2>&1
exit_code="$?"
set -e

if [[ "$exit_code" -eq 0 ]]; then
  touch "$job_dir/DONE"
  write_status "completed" ",\"exit_code\":0,\"end_time\":\"$(date -Iseconds)\""
else
  echo "$exit_code" > "$job_dir/FAILED"
  write_status "failed" ",\"exit_code\":$exit_code,\"end_time\":\"$(date -Iseconds)\""
fi

exit "$exit_code"
EOF
chmod +x "$job_dir/run.sh"

# --- initial status.json (reconstructed) --------------------------------------
# NOTE(review): the original body was missing from the reviewed copy; confirm
# the state name and fields. Only safe_run_id is interpolated because it is
# restricted to characters that need no JSON escaping.
printf '{"state":"submitted","run_id":"%s","updated_at":"%s"}\n' \
  "$safe_run_id" "$(date -Iseconds)" > "$job_dir/status.json"

tmux new-session -d -s "$session" "bash '$job_dir/run.sh'"

echo "Submitted detached job"
echo " run_id: $safe_run_id"
echo " session: $session"
echo " job_dir: $job_dir"
echo " output_dir: $output_dir"
echo " checkpoint_dir: $checkpoint_dir"