#!/bin/bash
#
# AMP Flow training monitor.
#
# Prints a one-shot status report to stdout:
#   1. whether the training process is running (exits 1 if it is not),
#   2. per-GPU utilisation and memory as reported by nvidia-smi,
#   3. the size and tail of overnight_training.log (looked up relative to
#      the current working directory),
#   4. the checkpoints present in the checkpoint directory.

echo "=== AMP Flow Training Monitor ==="
echo "Timestamp: $(date)"
echo ""

# --- 1. Process status -------------------------------------------------------
# Name of the training entry point; matched against full command lines.
TRAINING_SCRIPT="amp_flow_training_single_gpu_full_data.py"

echo "1. Process Status:"

# Query the process table once so the "is it running" check and the PID list
# cannot disagree. pgrep -f may return several PIDs (one per line) when more
# than one process carries the script name on its command line.
training_pids=$(pgrep -f "$TRAINING_SCRIPT")

if [[ -z "$training_pids" ]]; then
  echo "❌ Training process not found"
  # Nothing else in the report is meaningful without a running job.
  exit 1
fi

echo "✓ Training process is running"
while IFS= read -r training_pid; do
  [[ -n "$training_pid" ]] || continue
  # etime= prints the elapsed time without a header; ps pads the column, so
  # strip whitespace. The process may have exited since pgrep ran.
  training_runtime=$(ps -o etime= -p "$training_pid" 2>/dev/null | tr -d '[:space:]')
  echo " PID: $training_pid"
  echo " Runtime: ${training_runtime:-unknown}"
done <<< "$training_pids"

echo ""

# --- 2. GPU usage ------------------------------------------------------------

#######################################
# Format nvidia-smi CSV rows as human-readable report lines.
# Input (stdin): rows of "index, name, utilization, mem_used, mem_total"
#   as produced with --format=csv,noheader,nounits.
# Outputs: one " GPU <idx> (<name>): <util>% | <used>MB/<total>MB" line per
#   row. Whitespace around the comma separators is dropped; rows with fewer
#   than five fields are skipped.
#######################################
format_gpu_rows() {
  awk -F '[ \t]*,[ \t]*' '
    NF >= 5 {
      sub(/^[ \t]+/, "", $1)
      sub(/[ \t]+$/, "", $5)
      printf " GPU %s (%s): %s%% | %sMB/%sMB\n", $1, $2, $3, $4, $5
    }'
}

echo "2. GPU Usage:"
if ! command -v nvidia-smi > /dev/null 2>&1; then
  echo " ❌ nvidia-smi not found"
elif gpu_csv=$(nvidia-smi \
  --query-gpu=index,name,utilization.gpu,memory.used,memory.total \
  --format=csv,noheader,nounits); then
  printf '%s\n' "$gpu_csv" | format_gpu_rows
else
  # Keep going: the remaining sections do not depend on the GPU query.
  echo " ❌ nvidia-smi query failed"
fi

echo ""

# --- 3. Recent log output ----------------------------------------------------

#######################################
# Print the size and the last lines of a log file.
# Arguments:
#   $1 - log file path (default: overnight_training.log, relative to cwd)
#   $2 - number of trailing lines to show (default: 5)
# Outputs: size line, a "Last N lines:" heading and the trailing lines each
#   prefixed with one space; or a not-found message if $1 is not a regular
#   file.
#######################################
show_recent_log() {
  local log_file=${1:-overnight_training.log}
  local line_count=${2:-5}

  if [[ ! -f "$log_file" ]]; then
    echo " ❌ Log file not found"
    return 0
  fi

  # du separates size and path with a tab; keep only the size column.
  echo " Log file size: $(du -h -- "$log_file" | cut -f1)"
  echo " Last $line_count lines:"
  tail -n "$line_count" -- "$log_file" | sed 's/^/ /'
}

echo "3. Recent Log Output:"
show_recent_log "overnight_training.log" 5

echo ""

# --- 4. Checkpoint files -----------------------------------------------------

#######################################
# Report how many *.pth checkpoints a directory holds and which is newest.
# Arguments:
#   $1 - checkpoint directory (default: /data2/edwardsun/flow_checkpoints)
# Outputs: directory, checkpoint count, and the basename of the most recently
#   modified checkpoint (or "None yet" when there are none); or a not-found
#   message if $1 is not a directory.
#######################################
show_checkpoints() {
  local ckpt_dir=${1:-/data2/edwardsun/flow_checkpoints}
  local ckpt_count=0
  local latest=""
  local ckpt

  if [[ ! -d "$ckpt_dir" ]]; then
    echo " ❌ Checkpoint directory not found"
    return 0
  fi

  echo " Checkpoint directory: $ckpt_dir"

  # Glob instead of parsing ls output; an unmatched pattern stays literal,
  # so skip entries that do not exist.
  for ckpt in "$ckpt_dir"/*.pth; do
    [[ -e "$ckpt" ]] || continue
    ckpt_count=$((ckpt_count + 1))
    # -nt compares modification times.
    if [[ -z "$latest" || "$ckpt" -nt "$latest" ]]; then
      latest=$ckpt
    fi
  done

  echo " Number of checkpoints: $ckpt_count"
  echo " Latest checkpoint:"
  # Explicit branch: an `|| echo` after a pipeline ending in xargs never
  # fires, because xargs exits 0 when it receives no input.
  if [[ -n "$latest" ]]; then
    echo " ${latest##*/}"
  else
    echo " None yet"
  fi
}

echo "4. Checkpoint Files:"
show_checkpoints "/data2/edwardsun/flow_checkpoints"

# Closing banner marking the end of the report.
echo ""
echo "=== End Monitor ==="