File size: 1,599 Bytes
370f342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
#
# AMP Flow training monitor.
# Prints a status report in four numbered sections: training process, GPU
# usage, recent log output, and checkpoint files.
# Exits with status 1 right after section 1 if no training process matches.

echo "=== AMP Flow Training Monitor ==="
echo "Timestamp: $(date)"
echo ""

# Check if training process is running
# pgrep is queried exactly once so the liveness check and the reported PIDs
# cannot disagree (the process may exit between two separate calls).
echo "1. Process Status:"
TRAIN_PATTERN="amp_flow_training_single_gpu_full_data.py"
if PIDS=$(pgrep -f "$TRAIN_PATTERN"); then
    echo "✓ Training process is running"
    # pgrep prints one PID per line and several processes can match the
    # pattern; the unquoted expansion is deliberate so each PID is reported
    # on its own and ps always receives a single, well-formed PID.
    for PID in $PIDS; do
        echo "  PID: $PID"
        echo "  Runtime: $(ps -o etime= -p "$PID")"
    done
else
    echo "❌ Training process not found"
    exit 1
fi

echo ""

# Check GPU usage

# Format nvidia-smi CSV rows read from stdin, one output line per GPU.
# Expected columns: index, name, utilization.gpu, memory.used, memory.total.
_print_gpu_rows() {
    local idx name util mem_used mem_total
    while IFS=, read -r idx name util mem_used mem_total || [ -n "$idx" ]; do
        # nvidia-smi separates CSV fields with ", ", and splitting on ","
        # alone leaves a leading space on every later field; strip leading
        # spaces so the report does not read "( NAME):  45% |  1234MB".
        name=${name#"${name%%[! ]*}"}
        util=${util#"${util%%[! ]*}"}
        mem_used=${mem_used#"${mem_used%%[! ]*}"}
        mem_total=${mem_total#"${mem_total%%[! ]*}"}
        echo "  GPU $idx ($name): $util% | ${mem_used}MB/${mem_total}MB"
    done
}

echo "2. GPU Usage:"
if command -v nvidia-smi > /dev/null 2>&1; then
    nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits | _print_gpu_rows
else
    # Report the missing tool in the same style as the other sections
    # instead of letting the shell emit a bare "command not found".
    echo "  ❌ nvidia-smi not found"
fi

echo ""

# Section 3: report the size and the last five lines of the training log.
# The log is looked up relative to the current working directory.
printf '%s\n' "3. Recent Log Output:"
log_path="overnight_training.log"
if [ ! -f "$log_path" ]; then
    printf '%s\n' "  ❌ Log file not found"
else
    # du prints "<size><TAB><path>"; keep only the size column.
    log_size=$(du -h "$log_path" | cut -f1)
    printf '  Log file size: %s\n' "$log_size"
    printf '%s\n' "  Last 5 lines:"
    # Indent every tailed line by four spaces.
    tail -n 5 "$log_path" | sed 's/^/    /'
fi

printf '\n'

# Check for checkpoint files

# Print the number of *.pth files in a directory and the name of the newest.
# Arguments: $1 - directory to scan
# Outputs:   three report lines on stdout; the last is "    None yet" when
#            the directory holds no checkpoints
_report_checkpoints() {
    local dir=$1
    local f latest=""
    local count=0

    # Glob directly instead of parsing `ls`. When nothing matches, bash
    # leaves the pattern as a literal string, so skip entries that do not
    # exist rather than counting the pattern itself.
    for f in "$dir"/*.pth; do
        [ -e "$f" ] || continue
        count=$((count + 1))
        # -nt compares modification times; keep the newest file seen so far.
        if [ -z "$latest" ] || [ "$f" -nt "$latest" ]; then
            latest=$f
        fi
    done

    echo "  Number of checkpoints: $count"
    echo "  Latest checkpoint:"
    if [ -n "$latest" ]; then
        # Indented to line up with the "None yet" fallback.
        echo "    ${latest##*/}"
    else
        # An explicit branch is needed: a pipeline ending in `xargs ... ||`
        # never reaches its fallback, because xargs exits 0 on empty input.
        echo "    None yet"
    fi
}

echo "4. Checkpoint Files:"
ckpt_dir="/data2/edwardsun/flow_checkpoints"
if [ -d "$ckpt_dir" ]; then
    echo "  Checkpoint directory: $ckpt_dir"
    _report_checkpoints "$ckpt_dir"
else
    echo "  ❌ Checkpoint directory not found"
fi

echo ""
echo "=== End Monitor ==="