#!/bin/bash
#
# AMP Flow training monitor.
#
# Prints a one-shot status report for the training run:
#   1. whether the training process is alive (PID and elapsed time),
#   2. per-GPU utilization and memory as reported by nvidia-smi,
#   3. size and last lines of the training log in the current directory,
#   4. number of checkpoints and the most recently modified one.
#
# Exit status: 1 if no training process is found (the remaining sections are
# skipped in that case), 0 otherwise.

# Catch typos in variable names. `set -e` is deliberately not used: this is a
# best-effort report and one failing section should not hide the others.
set -u

readonly TRAINING_SCRIPT="amp_flow_training_single_gpu_full_data.py"
readonly LOG_FILE="overnight_training.log"
readonly CHECKPOINT_DIR="/data2/edwardsun/flow_checkpoints"

echo "=== AMP Flow Training Monitor ==="
echo "Timestamp: $(date)"
echo ""

# Check if training process is running
echo "1. Process Status:"
# pgrep -f matches against the full command line and may return several PIDs
# (one per line), so collect them into an array instead of a single scalar.
mapfile -t training_pids < <(pgrep -f "$TRAINING_SCRIPT")
if (( ${#training_pids[@]} > 0 )); then
  echo "✓ Training process is running"
  for pid in "${training_pids[@]}"; do
    echo " PID: $pid"
    echo " Runtime: $(ps -o etime= -p "$pid")"
  done
else
  echo "❌ Training process not found"
  exit 1
fi
echo ""

# Check GPU usage
echo "2. GPU Usage:"
if command -v nvidia-smi > /dev/null 2>&1; then
  nvidia-smi \
    --query-gpu=index,name,utilization.gpu,memory.used,memory.total \
    --format=csv,noheader,nounits \
    | while IFS=, read -r idx name util mem_used mem_total; do
      # Fields are separated by ", ", so every field after the first carries
      # one leading space; strip it. Splitting on whitespace is not an option
      # because GPU names contain spaces.
      name=${name# }
      util=${util# }
      mem_used=${mem_used# }
      mem_total=${mem_total# }
      echo " GPU $idx ($name): $util% | ${mem_used}MB/${mem_total}MB"
    done
else
  echo " ❌ nvidia-smi not found"
fi
echo ""

# Check log file
echo "3. Recent Log Output:"
if [[ -f "$LOG_FILE" ]]; then
  echo " Log file size: $(du -h -- "$LOG_FILE" | cut -f1)"
  echo " Last 5 lines:"
  tail -n 5 -- "$LOG_FILE" | sed 's/^/ /'
else
  echo " ❌ Log file not found"
fi
echo ""

# Check for checkpoint files
echo "4. Checkpoint Files:"
if [[ -d "$CHECKPOINT_DIR" ]]; then
  echo " Checkpoint directory: $CHECKPOINT_DIR"

  # Glob into an array rather than parsing `ls`; nullglob makes an unmatched
  # pattern expand to an empty array instead of the literal pattern.
  shopt -s nullglob
  checkpoints=("$CHECKPOINT_DIR"/*.pth)
  shopt -u nullglob

  echo " Number of checkpoints: ${#checkpoints[@]}"
  echo " Latest checkpoint:"
  if (( ${#checkpoints[@]} > 0 )); then
    # Pick the most recently modified checkpoint via -nt comparisons.
    latest=${checkpoints[0]}
    for ckpt in "${checkpoints[@]}"; do
      if [[ "$ckpt" -nt "$latest" ]]; then
        latest=$ckpt
      fi
    done
    printf '%s\n' "${latest##*/}"
  else
    # Branch explicitly: a `pipeline || echo` fallback never fires here
    # because xargs exits 0 on empty input.
    echo " None yet"
  fi
else
  echo " ❌ Checkpoint directory not found"
fi
echo ""

echo "=== End Monitor ==="