#!/usr/bin/env bash
# Early-stopping wrapper for opf train.
# Monitors val_loss in a training log and kills training once val_loss has
# not improved for PATIENCE consecutive epochs.
#
# Usage: ./early_stop_train.sh [train_log_file] [patience] [pid]
#   train_log_file  log to watch (default: train.log)
#   patience        consecutive non-improving epochs tolerated (default: 3)
#   pid             optional PID of the opf train process; when omitted the
#                   process is looked up with `pkill -f "opf train"`
#
# Run opf train with `2>&1 | tee <train_log_file>` in the background first,
# then run this script pointing at the log.
#
# Exit status: 0 = early stop triggered, 1 = log unreadable or the log stream
# ended without a stop, 2 = invalid arguments.

set -u

# Unanchored ERE for a decimal number with optional sign and exponent.
readonly NUMBER_RE='[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?'

# Succeeds when $1 is, in its entirety, a decimal number (e.g. 0.126, 1e-05).
is_number() {
  local anchored_re="^${NUMBER_RE}\$"
  [[ $1 =~ $anchored_re ]]
}

# Succeeds when val_loss $1 is strictly lower than the best value so far $2.
# An empty $2 means "no best yet", so any numeric $1 is an improvement.
# A non-numeric $1 (nan, inf, garbage) is never an improvement.
is_improvement() {
  local candidate=$1 best=$2
  is_number "$candidate" || return 1
  [[ -n $best ]] || return 0
  # Values are passed with -v (never spliced into the program text) and the
  # C locale keeps "." as the decimal separator.
  LC_ALL=C awk -v new="$candidate" -v best="$best" \
    'BEGIN { exit !((new + 0) < (best + 0)) }'
}

# Sends SIGTERM to the training process: PID $1 when given, otherwise every
# process whose command line matches "opf train". Failures are reported on
# stderr but are not fatal.
stop_training() {
  local pid=${1:-}
  if [[ -n $pid ]]; then
    kill "$pid" 2>/dev/null \
      || echo "[early_stop] WARNING: could not signal PID $pid" >&2
  else
    pkill -f "opf train" 2>/dev/null \
      || echo "[early_stop] WARNING: no running 'opf train' process found" >&2
  fi
  return 0
}

# Reads log lines from stdin and tracks val_loss across epoch summary lines.
# Arguments: $1 - patience (positive integer), $2 - optional training PID
# Returns:   0 once patience is exhausted and training has been signalled,
#            1 if stdin ends first.
monitor_val_loss() {
  local patience=$1 pid=${2:-}
  # Matches: epoch 4/15: train_loss=0.096 val_loss=0.126 ...
  # The optional prefix group requires a non-word character before
  # "val_loss=", so keys such as best_val_loss= are not picked up.
  local epoch_re='^epoch ([0-9]+)/[0-9]+:(.*[^[:alnum:]_])?val_loss=([^[:space:]]*)'
  local number_prefix_re="^${NUMBER_RE}"
  local best='' stale=0 line epoch val_loss

  # No IFS= here on purpose: leading/trailing whitespace is stripped so an
  # indented summary line still matches the ^epoch anchor.
  while read -r line; do
    [[ $line =~ $epoch_re ]] || continue
    epoch=${BASH_REMATCH[1]}
    val_loss=${BASH_REMATCH[3]}
    # Keep only the leading number so trailing punctuation ("0.126,") is
    # tolerated; tokens with no leading number (nan, inf) are left as-is and
    # are then counted as "no improvement".
    if [[ $val_loss =~ $number_prefix_re ]]; then
      val_loss=${BASH_REMATCH[0]}
    fi
    echo "[early_stop] Epoch $epoch: val_loss=$val_loss (best=${best:-none}, patience=$stale/$patience)"

    if is_improvement "$val_loss" "$best"; then
      best=$val_loss
      stale=0
      echo "[early_stop] New best val_loss: $best"
    else
      stale=$((stale + 1))
      echo "[early_stop] No improvement for $stale epochs"
    fi

    if ((stale >= patience)); then
      echo "[early_stop] STOPPING: No improvement for $patience epochs. Best val_loss=$best"
      stop_training "$pid"
      return 0
    fi
  done
  return 1
}

# Entry point: validates arguments, follows the log and runs the monitor.
main() {
  local log=${1:-train.log} patience=${2:-3} pid=${3:-}
  local tail_pid status

  if ! [[ $patience =~ ^[0-9]+$ ]] || ((10#$patience < 1)); then
    echo "[early_stop] ERROR: patience must be a positive integer, got '$patience'" >&2
    return 2
  fi
  patience=$((10#$patience)) # force base 10 so "08" is not read as octal

  # Reject 0 and negative values: kill would treat them as process groups.
  if [[ -n $pid && ! $pid =~ ^[1-9][0-9]*$ ]]; then
    echo "[early_stop] ERROR: pid must be a positive integer, got '$pid'" >&2
    return 2
  fi
  if [[ ! -r $log ]]; then
    echo "[early_stop] ERROR: cannot read log file: $log" >&2
    return 1
  fi

  echo "Early stopping monitor: watching $log, patience=$patience"

  # Follow the log on a dedicated descriptor so tail's PID is known. A tail
  # left in a pipeline may only notice the closed pipe on its next write,
  # which never happens once training is dead, so it is killed explicitly.
  exec 9< <(exec tail -f -- "$log")
  tail_pid=$!
  monitor_val_loss "$patience" "$pid" <&9
  status=$?
  kill "$tail_pid" 2>/dev/null
  exec 9<&-
  return "$status"
}

main "$@"