#!/usr/bin/env bash
#
# Print a status report for a training sweep directory.
#
# Usage: <script> [SWEEP_DIR]
#
# SWEEP_DIR defaults to the most recently modified directory matching
# /home/ubuntu/curriculum_cot/_runs/baseline_1p5b_v4_*.
#
# Report sections (stdout):
#   * nvidia-smi  - per-GPU utilisation, memory and power draw
#   * pids        - liveness of every process listed in SWEEP_DIR/PIDS.txt
#   * per-variant - for each SWEEP_DIR/pipe_* directory: the last phase (in
#                   pipeline order) whose directory exists, the tail of
#                   PIPELINE.log, and the most recent train/eval lines found
#                   in each phase's train.log
#
# Exit status: 0 when a report was produced, 1 when no sweep directory exists.
#
# `set -e` is deliberately NOT enabled: this is a best-effort reporter, and
# several commands (grep without matches, kill -0 on a dead pid) return
# non-zero in the normal course of events.

readonly RUNS_ROOT="/home/ubuntu/curriculum_cot/_runs"
readonly SWEEP_PREFIX="baseline_1p5b_v4_"
# Phase sub-directories of a variant, in the order they are checked/reported.
readonly -a PHASES=(s2_sft_extra s2_grpo s3_sft s3_grpo)

#######################################
# Print the most recently modified sweep directory, if any.
# Only directories are considered, so a newer plain file that happens to share
# the prefix cannot shadow a real sweep.
# Globals:   RUNS_ROOT, SWEEP_PREFIX (read)
# Arguments: $1 - directory to search (optional, defaults to RUNS_ROOT)
# Outputs:   the chosen path on stdout; nothing when no directory matches
# Returns:   0 always
#######################################
find_latest_sweep() {
  local root="${1:-${RUNS_ROOT}}"
  local candidate newest=""
  for candidate in "${root}/${SWEEP_PREFIX}"*; do
    # Also filters out the literal pattern left behind by an unmatched glob.
    [[ -d "${candidate}" ]] || continue
    if [[ -z "${newest}" || "${candidate}" -nt "${newest}" ]]; then
      newest="${candidate}"
    fi
  done
  if [[ -n "${newest}" ]]; then
    printf '%s\n' "${newest}"
  fi
  return 0
}

#######################################
# Report whether a process exists.
# Arguments: $1 - process id
# Outputs:   "ALIVE" or "DEAD" on stdout
# Returns:   0 always
#######################################
pid_state() {
  local pid="${1:-}"
  # Only positive integers are accepted: `kill -0 0` and negative values
  # address process groups and would wrongly report ALIVE.
  # kill -0 fails with EPERM for a live process owned by another user, so
  # `ps -p` is consulted before declaring the process dead.
  if [[ "${pid}" =~ ^[1-9][0-9]*$ ]] \
    && { kill -0 "${pid}" 2>/dev/null || ps -p "${pid}" >/dev/null 2>&1; }; then
    echo ALIVE
  else
    echo DEAD
  fi
  return 0
}

#######################################
# Print one status line per entry of a PIDS file.
# Each non-blank line is split on whitespace into "pid gpu name", where name
# receives the remainder of the line.
# Arguments: $1 - path to the PIDS file
# Outputs:   status lines on stdout; a note on stderr if the file is unreadable
# Returns:   0 always (an unreadable file does not abort the report)
#######################################
report_pids() {
  local pids_file="$1"
  local pid gpu name
  if [[ ! -r "${pids_file}" ]]; then
    printf ' (cannot read %s)\n' "${pids_file}" >&2
    return 0
  fi
  # `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while read -r pid gpu name || [[ -n "${pid}" ]]; do
    [[ -n "${pid}" ]] || continue # skip blank lines
    printf ' pid=%-6s gpu=%s %-30s %s\n' \
      "${pid}" "${gpu}" "${name}" "$(pid_state "${pid}")"
  done < "${pids_file}"
  return 0
}

#######################################
# Print the last phase (in PHASES order) whose directory exists.
# Globals:   PHASES (read)
# Arguments: $1 - variant directory
# Outputs:   phase name, or "(starting)" when no phase directory exists
# Returns:   0 always
#######################################
current_phase() {
  local variant_dir="$1"
  local phase current="(starting)"
  for phase in "${PHASES[@]}"; do
    if [[ -d "${variant_dir}/${phase}" ]]; then
      current="${phase}"
    fi
  done
  printf '%s\n' "${current}"
  return 0
}

#######################################
# Print every line of a text with a fixed prefix; prints nothing for empty text.
# Arguments: $1 - prefix, $2 - (possibly multi-line) text
# Outputs:   prefixed lines on stdout
# Returns:   0 always
#######################################
prefix_lines() {
  local prefix="$1" text="$2"
  local line
  [[ -n "${text}" ]] || return 0
  while IFS= read -r line; do
    printf '%s%s\n' "${prefix}" "${line}"
  done <<< "${text}"
  return 0
}

#######################################
# Print the most recent train/eval lines of one phase's train log.
# Shows the last train/final line, then up to three SFT eval lines and up to
# three GRPO eval lines. Prints nothing when the log is missing or contains
# none of them.
# Arguments: $1 - phase name (used as the heading), $2 - path to train.log
# Outputs:   heading plus prefixed log lines on stdout
# Returns:   0 always
#######################################
report_phase_log() {
  local phase="$1" log="$2"
  local last_sft last_grpo last_train
  [[ -f "${log}" ]] || return 0

  last_sft="$(grep -E '\[baseline sft eval\] ' "${log}" 2>/dev/null | tail -n 3)"
  last_grpo="$(grep -E '\[baseline grpo (custom )?eval' "${log}" 2>/dev/null | tail -n 3)"
  last_train="$(grep -E '\[baseline (sft|grpo) (train|final)' "${log}" 2>/dev/null | tail -n 1)"

  [[ -n "${last_sft}${last_grpo}${last_train}" ]] || return 0
  printf ' [%s]\n' "${phase}"
  prefix_lines ' tr: ' "${last_train}"
  prefix_lines ' ev: ' "${last_sft}"
  prefix_lines ' ev: ' "${last_grpo}"
  return 0
}

#######################################
# Print the report for a single variant directory: header with current phase,
# last three lines of PIPELINE.log (if present), then each phase's log summary.
# Globals:   PHASES (read)
# Arguments: $1 - variant directory
# Outputs:   report on stdout
# Returns:   0 always
#######################################
report_variant() {
  local variant_dir="$1"
  local phase
  printf '\n--- %s (phase=%s) ---\n' \
    "${variant_dir##*/}" "$(current_phase "${variant_dir}")"

  if [[ -f "${variant_dir}/PIPELINE.log" ]]; then
    tail -n 3 "${variant_dir}/PIPELINE.log" | sed 's/^/ PL: /'
  fi

  for phase in "${PHASES[@]}"; do
    report_phase_log "${phase}" "${variant_dir}/${phase}/train.log"
  done
  return 0
}

#######################################
# Entry point: resolve the sweep directory and print the full report.
# Arguments: $1 - sweep directory (optional; newest matching sweep otherwise)
# Outputs:   report on stdout, diagnostics on stderr
# Returns:   0 on success, 1 when no sweep directory could be found
#######################################
main() {
  local sweep="${1:-}"
  local variant_dir found=0

  if [[ -z "${sweep}" ]]; then
    sweep="$(find_latest_sweep)"
  fi
  if [[ -z "${sweep}" || ! -d "${sweep}" ]]; then
    echo "no sweep" >&2
    return 1
  fi

  echo "=== sweep: ${sweep} ==="
  echo "=== nvidia-smi ==="
  if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi \
      --query-gpu=index,utilization.gpu,memory.used,memory.total,power.draw \
      --format=csv,noheader
  else
    echo " (nvidia-smi not found)" >&2
  fi
  echo

  echo "=== pids ==="
  report_pids "${sweep}/PIDS.txt"
  echo

  echo "=== per-variant phase + best/last eval ==="
  for variant_dir in "${sweep}"/pipe_*; do
    # Skips the literal pattern of an unmatched glob and stray non-directories.
    [[ -d "${variant_dir}" ]] || continue
    found=1
    report_variant "${variant_dir}"
  done
  if (( ! found )); then
    echo " (no pipe_* variants found)"
  fi
  return 0
}

# Keep this as the last command: main's return status becomes the script's
# exit status (1 when no sweep was found, 0 otherwise).
main "$@"