#!/bin/bash
# Early-stopping wrapper for opf train.
# Monitors val_loss in a training log and kills training if there is no
# improvement for PATIENCE consecutive epoch summaries.
#
# Usage: ./early_stop_train.sh [LOG] [PATIENCE] [PID]
#   LOG       log file to watch (default: train.log)
#   PATIENCE  epochs without improvement tolerated before stopping (default: 3)
#   PID       optional PID of the opf train process; when omitted, the
#             process is looked up with: pkill -f "opf train"
#
# Run opf train with `2>&1 | tee LOG` in the background first, then run this
# script pointing at the log.

# Returns 0 when $1 is numerically smaller than $2, non-zero otherwise.
# Values are passed with -v (never spliced into program text) and forced to
# numbers with "+ 0", so forms such as 1e-05 compare correctly.
is_improvement() {
  awk -v cur="$1" -v best="$2" 'BEGIN { exit !((cur + 0) < (best + 0)) }'
}

# Sends SIGTERM to the training process: the given PID if $1 is non-empty,
# otherwise any process whose command line matches "opf train".
stop_training() {
  local pid=$1
  if [[ -n "$pid" ]]; then
    kill "$pid" 2>/dev/null ||
      echo "[early_stop] WARNING: could not signal PID $pid" >&2
  else
    pkill -f "opf train" 2>/dev/null ||
      echo "[early_stop] WARNING: no 'opf train' process found" >&2
  fi
}

# Entry point. Arguments: [LOG] [PATIENCE] [PID] (see header).
# Returns 0 after an early stop or when the log stream ends, 1 if the log is
# unreadable, 2 on invalid arguments.
main() {
  local log=${1:-train.log}
  local patience=${2:-3}
  local pid=${3:-}

  if ! [[ "$patience" =~ ^[0-9]+$ ]]; then
    echo "[early_stop] ERROR: PATIENCE must be a non-negative integer, got '$patience'" >&2
    return 2
  fi
  patience=$((10#$patience)) # force base 10 so "08" is not read as octal
  if [[ -n "$pid" ]] && ! [[ "$pid" =~ ^[0-9]+$ ]]; then
    echo "[early_stop] ERROR: PID must be numeric, got '$pid'" >&2
    return 2
  fi
  if [[ ! -r "$log" ]]; then
    echo "[early_stop] ERROR: cannot read log file '$log'" >&2
    return 1
  fi

  echo "Early stopping monitor: watching $log, patience=$patience"

  # Epoch summary lines look like:
  #   epoch 4/15: train_loss=0.096 val_loss=0.126 ...
  local epoch_re='^epoch ([0-9]+)/[0-9]+:.*val_loss=([^[:space:]]*)'
  local num_re='[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?'
  local best="" stale=0
  local line epoch token val_loss

  # Read tail through a dedicated descriptor instead of `tail -f | while`:
  # the loop then runs in this shell, and tail's PID is known so it can be
  # terminated once we stop. In a pipeline, `break` would leave tail -f
  # blocked forever because the log receives no further writes.
  # `exec` inside the substitution makes $! the PID of tail itself.
  exec 3< <(exec tail -f -- "$log")
  local tail_pid=$!

  while IFS= read -r line <&3; do
    [[ "$line" =~ $epoch_re ]] || continue
    epoch=${BASH_REMATCH[1]}
    token=${BASH_REMATCH[2]}

    # Keep the leading numeric part of the token (handles 0.126, 1e-05,
    # -0.3, trailing punctuation). Non-numeric values such as nan or inf
    # leave val_loss empty and are counted as "no improvement".
    val_loss=""
    if [[ "$token" =~ ^$num_re ]]; then
      val_loss=${BASH_REMATCH[0]}
    fi

    echo "[early_stop] Epoch $epoch: val_loss=${val_loss:-$token} (best=${best:-none}, patience=$stale/$patience)"

    # The first numeric value always becomes the best; no magic sentinel.
    if [[ -n "$val_loss" ]] &&
      { [[ -z "$best" ]] || is_improvement "$val_loss" "$best"; }; then
      best=$val_loss
      stale=0
      echo "[early_stop] New best val_loss: $best"
    else
      stale=$((stale + 1))
      echo "[early_stop] No improvement for $stale epochs"
    fi

    if ((stale >= patience)); then
      echo "[early_stop] STOPPING: No improvement for $patience epochs. Best val_loss=${best:-none}"
      stop_training "$pid"
      break
    fi
  done

  # Shut down the follower so the monitor exits instead of lingering.
  if [[ -n "$tail_pid" ]]; then
    kill "$tail_pid" 2>/dev/null
  fi
  exec 3<&-
  return 0
}

main "$@"