File size: 1,788 Bytes
48c96cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env bash
# One-shot snapshot of the active sweep.
#
# Usage: <this script> [SWEEP_DIR]
#   SWEEP_DIR  sweep directory to report on. When omitted (or empty), the most
#              recently modified directory matching
#              /home/ubuntu/curriculum_cot/_runs/baseline_1p5b_v4_* is used.
# Exits 1 with "no sweep" on stderr when no sweep directory can be resolved.

#######################################
# Prints the most recently modified directory whose path starts with a prefix.
# Arguments: $1 - path prefix; candidates are the expansions of "$1"*
# Outputs:   the newest matching directory on stdout (no trailing newline),
#            or nothing when no directory matches
#######################################
newest_dir_with_prefix() {
  local prefix=$1 candidate newest=""
  for candidate in "${prefix}"*; do
    # Skips plain files sharing the prefix, and the literal pattern that an
    # unmatched glob leaves behind.
    [[ -d "$candidate" ]] || continue
    # Strict -nt keeps the first candidate seen when modification times tie.
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

SWEEP="${1:-$(newest_dir_with_prefix /home/ubuntu/curriculum_cot/_runs/baseline_1p5b_v4_)}"
if [[ -z "${SWEEP}" || ! -d "${SWEEP}" ]]; then
  # Diagnostics go to stderr so stdout carries only the report itself.
  echo "no sweep" >&2
  exit 1
fi
# Banner naming the sweep directory this snapshot describes.
printf '=== sweep: %s ===\n' "${SWEEP}"

# GPU section: one headerless CSV row per GPU with the queried fields
# (index, utilization, memory used/total, power draw), then a blank separator.
printf '%s\n' "=== nvidia-smi ==="
nvidia-smi \
  --query-gpu=index,utilization.gpu,memory.used,memory.total,power.draw \
  --format=csv,noheader
printf '\n'
#######################################
# Prints one liveness line per record of a PIDS file.
# Each non-blank line is split on whitespace into "<pid> <gpu> <name...>"
# (the name takes the rest of the line). A pid counts as ALIVE when
# `kill -0` succeeds for it and DEAD otherwise.
# Arguments: $1 - path to the PIDS file
# Outputs:   one "  pid=… gpu=… <name> ALIVE|DEAD" line per record on stdout;
#            a notice on stderr when the file cannot be read
# Returns:   0 when the file cannot be read (best-effort, not fatal)
#######################################
report_pids() {
  local pids_file=$1 pid gpu name alive
  if [[ ! -r "$pids_file" ]]; then
    printf '  (cannot read %s)\n' "$pids_file" >&2
    return 0
  fi
  # `|| [[ -n "$pid" ]]` keeps the final record when the file lacks a
  # trailing newline (read fills the variables but returns non-zero at EOF).
  while read -r pid gpu name || [[ -n "$pid" ]]; do
    # Blank lines carry no record; skip them instead of printing "pid= DEAD".
    [[ -n "$pid" ]] || continue
    # Signal 0 delivers nothing; it only checks that the pid can be signalled.
    if kill -0 "$pid" 2>/dev/null; then alive=ALIVE; else alive=DEAD; fi
    printf '  pid=%-6s gpu=%s %-30s %s\n' "$pid" "$gpu" "$name" "$alive"
  done < "$pids_file"
}

echo "=== pids ==="
report_pids "${SWEEP}/PIDS.txt"
echo
#######################################
# Prints the status section for one variant directory: a header with the
# current phase, the tail of PIPELINE.log, and for each phase that has a
# train.log the most recent train/final line and the last three eval lines.
# Arguments: $1 - variant directory (a pipe_* directory inside the sweep)
# Outputs:   the section on stdout
#######################################
report_variant() {
  local variant_dir=$1
  local -a phases=(s2_sft_extra s2_grpo s3_sft s3_grpo)
  local ph log last_sft last_grpo last_train
  local current_phase="(starting)"

  # The last phase in the list whose directory exists is reported as current
  # (presumably the list is in pipeline order -- confirm against the launcher).
  for ph in "${phases[@]}"; do
    if [[ -d "$variant_dir/$ph" ]]; then
      current_phase=$ph
    fi
  done
  printf '\n--- %s (phase=%s) ---\n' \
    "$(basename -- "$variant_dir")" "$current_phase"

  # Pipeline log tail
  if [[ -f "$variant_dir/PIPELINE.log" ]]; then
    tail -n 3 "$variant_dir/PIPELINE.log" | sed 's/^/    PL: /'
  fi

  # Phase-specific evals
  for ph in "${phases[@]}"; do
    log="$variant_dir/$ph/train.log"
    [[ -f "$log" ]] || continue
    # grep finding nothing is expected here; an empty result just means the
    # phase has not logged that kind of line yet.
    last_sft="$(grep -E '\[baseline sft eval\] ' "$log" 2>/dev/null | tail -n 3)"
    last_grpo="$(grep -E '\[baseline grpo (custom )?eval' "$log" 2>/dev/null | tail -n 3)"
    last_train="$(grep -E '\[baseline (sft|grpo) (train|final)' "$log" 2>/dev/null | tail -n 1)"
    # Phases with no matching lines get no sub-header at all.
    [[ -n "$last_sft$last_grpo$last_train" ]] || continue
    printf '  [%s]\n' "$ph"
    if [[ -n "$last_train" ]]; then
      printf '%s\n' "$last_train" | sed 's/^/      tr: /'
    fi
    if [[ -n "$last_sft" ]]; then
      printf '%s\n' "$last_sft" | sed 's/^/      ev: /'
    fi
    if [[ -n "$last_grpo" ]]; then
      printf '%s\n' "$last_grpo" | sed 's/^/      ev: /'
    fi
  done
}

#######################################
# Prints a status section for every pipe_* directory inside a sweep.
# Arguments: $1 - sweep directory
# Outputs:   one section per variant on stdout; nothing when there are none
#######################################
report_variants() {
  local sweep_dir=$1 variant_dir
  for variant_dir in "$sweep_dir"/pipe_*; do
    # An unmatched glob stays as the literal "pipe_*" pattern; skip it (and
    # any stray non-directory entry) rather than reporting a bogus variant.
    [[ -d "$variant_dir" ]] || continue
    report_variant "$variant_dir"
  done
}

echo "=== per-variant phase + best/last eval ==="
report_variants "${SWEEP}"