File size: 1,599 Bytes
370f342 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
#!/bin/bash
# AMP Flow training monitor.
# Emits the report banner, the current time as printed by `date`, and a
# blank separator line ahead of the numbered status sections.
printf '%s\n' "=== AMP Flow Training Monitor ==="
printf 'Timestamp: %s\n' "$(date)"
printf '\n'
# Check if training process is running
#
# pgrep is queried exactly once: running it a second time to fetch the PID
# (as a separate call) races with the process exiting in between. `pgrep -f`
# matches against the full command line and may report several PIDs, one per
# line, so each PID is reported on its own instead of passing a multi-line
# value to a single `ps -p` call.
# Exits with status 1 when no matching process exists.
echo "1. Process Status:"
training_pids=$(pgrep -f "amp_flow_training_single_gpu_full_data.py")
if [[ -n "$training_pids" ]]; then
  echo "✓ Training process is running"
  while IFS= read -r PID; do
    echo " PID: $PID"
    # `ps -o etime=` right-aligns the elapsed time in a fixed-width column;
    # the value itself ([[dd-]hh:]mm:ss) has no spaces, so strip the padding.
    echo " Runtime: $(ps -o etime= -p "$PID" | tr -d '[:space:]')"
  done <<< "$training_pids"
else
  echo "❌ Training process not found"
  exit 1
fi
echo ""
# Check GPU usage
echo "2. GPU Usage:"

#######################################
# Format nvidia-smi CSV rows as one summary line per GPU.
# Input (stdin): rows of index,name,utilization.gpu,memory.used,memory.total
# Outputs: " GPU <index> (<name>): <util>% | <used>MB/<total>MB" per row
#######################################
print_gpu_rows() {
  local idx name util mem_used mem_total
  while IFS=, read -r idx name util mem_used mem_total; do
    # nvidia-smi separates CSV fields with ", ", so splitting on "," alone
    # leaves a leading space on every field after the first; remove it so
    # the output does not read "( NAME):  45% |  1234MB/ 40960MB".
    name=${name# }
    util=${util# }
    mem_used=${mem_used# }
    mem_total=${mem_total# }
    echo " GPU $idx ($name): $util% | ${mem_used}MB/${mem_total}MB"
  done
}

# Report a readable message instead of a raw "command not found" error when
# the NVIDIA tooling is not installed on this machine.
if command -v nvidia-smi > /dev/null 2>&1; then
  nvidia-smi \
    --query-gpu=index,name,utilization.gpu,memory.used,memory.total \
    --format=csv,noheader,nounits \
    | print_gpu_rows
else
  echo " ❌ nvidia-smi not found"
fi
echo ""
# Check log file
echo "3. Recent Log Output:"

#######################################
# Print the size of a log file followed by its last lines, each prefixed
# with one space.
# Arguments:
#   $1 - path to the log file
#   $2 - number of trailing lines to show (default: 5)
# Outputs: the report on stdout, or a "not found" line when $1 is not a
#          regular file
#######################################
show_recent_log() {
  local log_file=$1
  local num_lines=${2:-5}

  if [[ ! -f "$log_file" ]]; then
    echo " ❌ Log file not found"
    return 0
  fi

  echo " Log file size: $(du -h -- "$log_file" | cut -f1)"
  echo " Last $num_lines lines:"
  # `tail -n N` is the standard spelling; the bare `-N` form is obsolescent.
  tail -n "$num_lines" -- "$log_file" | sed 's/^/ /'
}

# The log path defaults to overnight_training.log in the current directory;
# AMP_FLOW_LOG_FILE may be set to point at a log stored elsewhere.
show_recent_log "${AMP_FLOW_LOG_FILE:-overnight_training.log}"
echo ""
# Check for checkpoint files
echo "4. Checkpoint Files:"

#######################################
# Report how many *.pth files a directory holds and which one was modified
# most recently.
# Arguments:
#   $1 - checkpoint directory
# Outputs: the report on stdout; " None yet" when the directory holds no
#          *.pth files; a "not found" line when $1 is not a directory
#######################################
summarize_checkpoints() {
  local ckpt_dir=$1
  local ckpt latest=""
  local count=0

  if [[ ! -d "$ckpt_dir" ]]; then
    echo " ❌ Checkpoint directory not found"
    return 0
  fi

  echo " Checkpoint directory: $ckpt_dir"

  # Walk the glob directly instead of parsing `ls` output. An unmatched
  # glob stays as the literal pattern, which the -e test filters out.
  for ckpt in "$ckpt_dir"/*.pth; do
    [[ -e "$ckpt" ]] || continue
    count=$((count + 1))
    # -nt compares modification times; keep the newest file seen so far.
    if [[ -z "$latest" || "$ckpt" -nt "$latest" ]]; then
      latest=$ckpt
    fi
  done

  echo " Number of checkpoints: $count"
  echo " Latest checkpoint:"
  # An explicit branch is required here: a `... | xargs basename || echo`
  # fallback never fires, because xargs exits 0 when it receives no input.
  if [[ -n "$latest" ]]; then
    echo "${latest##*/}"
  else
    echo " None yet"
  fi
}

# The directory defaults to the original hard-coded location;
# AMP_FLOW_CHECKPOINT_DIR may be set to inspect a different one.
summarize_checkpoints "${AMP_FLOW_CHECKPOINT_DIR:-/data2/edwardsun/flow_checkpoints}"
echo ""
echo "=== End Monitor ==="