#!/bin/bash
#
# monitor_training.sh - one-shot status report for the AMP flow training run.
#
# Prints, in order: whether the training process is running, per-GPU
# utilisation, the tail of the training log, and the state of the checkpoint
# directory.
#
# Listing text that previously sat above the shebang. It is kept as comments
# because, as bare text, its apostrophe opened an unterminated quote and the
# shebang was not on the first line:
#   FlowAMP / monitor_training.sh
#   esunAI's picture
#   Initial FlowAMP upload: Complete project with all essential files
#   370f342

# Banner plus a timestamp so successive reports can be told apart.
echo "=== AMP Flow Training Monitor ==="
echo "Timestamp: $(date)"
echo ""
# Check if training process is running
echo "1. Process Status:"
# Query pgrep exactly once: running it a second time to fetch the PID can
# disagree with the first call if the process exits in between.
training_pids=$(pgrep -f "amp_flow_training_single_gpu_full_data.py")
if [ -n "$training_pids" ]; then
  echo "✓ Training process is running"
  # pgrep prints one PID per line and may match several processes, so report
  # each PID with its own elapsed time instead of passing a multi-line value
  # to ps.
  while IFS= read -r PID; do
    echo " PID: $PID"
    echo " Runtime: $(ps -o etime= -p "$PID")"
  done <<< "$training_pids"
else
  echo "❌ Training process not found"
  # Nothing else is reported when the training process is absent.
  exit 1
fi
echo ""
# Check GPU usage

#######################################
# Turn nvidia-smi CSV rows into one human-readable line per GPU.
# Arguments: none
# Inputs:    stdin - rows of "index, name, utilization, mem_used, mem_total"
# Outputs:   one " GPU <idx> (<name>): <util>% | <used>MB/<total>MB" line
#            per input row on stdout
#######################################
format_gpu_rows() {
  local idx name util mem_used mem_total
  while IFS=, read -r idx name util mem_used mem_total; do
    # nvidia-smi separates CSV fields with ", ", so every field after the
    # first arrives with a leading space; drop it so the output reads
    # "(NVIDIA A100): 35%" rather than "( NVIDIA A100):  35%".
    name=${name# }
    util=${util# }
    mem_used=${mem_used# }
    mem_total=${mem_total# }
    echo " GPU $idx ($name): $util% | ${mem_used}MB/${mem_total}MB"
  done
}

echo "2. GPU Usage:"
if command -v nvidia-smi > /dev/null 2>&1; then
  nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits \
    | format_gpu_rows
else
  # Say so explicitly instead of leaving only the shell's "command not found".
  echo " ❌ nvidia-smi not found"
fi
echo ""
# Report on the training log in the current directory: its size on disk and
# its last five lines, each prefixed with a space.
printf '%s\n' "3. Recent Log Output:"
log_file="overnight_training.log"
if [ ! -f "$log_file" ]; then
  printf '%s\n' " ❌ Log file not found"
else
  # du -h prints "<size><TAB><path>"; keep only the size column.
  log_size=$(du -h "$log_file" | cut -f1)
  printf '%s\n' " Log file size: $log_size"
  printf '%s\n' " Last 5 lines:"
  tail -n 5 "$log_file" | sed 's/^/ /'
fi
printf '\n'
# Check for checkpoint files

#######################################
# Summarise the *.pth checkpoints in a directory.
# Arguments: $1 - checkpoint directory to inspect
# Outputs:   the directory path, the number of *.pth entries and the name of
#            the most recently modified one (or " None yet") on stdout; a
#            single "not found" line if the directory does not exist
#######################################
report_checkpoints() {
  local dir=$1
  local f newest="" count=0

  if [ ! -d "$dir" ]; then
    echo " ❌ Checkpoint directory not found"
    return
  fi

  echo " Checkpoint directory: $dir"
  # Glob instead of parsing ls output. An unmatched glob stays as the literal
  # pattern, which the -e test skips.
  for f in "$dir"/*.pth; do
    [ -e "$f" ] || continue
    count=$((count + 1))
    # -nt is true when $f was modified more recently than $newest.
    if [ -z "$newest" ] || [ "$f" -nt "$newest" ]; then
      newest=$f
    fi
  done

  echo " Number of checkpoints: $count"
  echo " Latest checkpoint:"
  # Decide explicitly: "ls | head | xargs -I {} ... || echo" never reached the
  # fallback, because xargs exits 0 when it is given no input.
  if [ -n "$newest" ]; then
    echo "${newest##*/}"
  else
    echo " None yet"
  fi
}

echo "4. Checkpoint Files:"
report_checkpoints "/data2/edwardsun/flow_checkpoints"
echo ""
echo "=== End Monitor ==="