arcspan / scripts /early_stop_monitor.sh
chairulridjal's picture
Add files using upload-large-folder tool
3dac39e verified
#!/bin/bash
# Early-stopping wrapper for opf train
# Monitors val_loss and kills training if no improvement for PATIENCE epochs
#
# Usage: ./early_stop_monitor.sh <train_log_file> <patience> [train_pid]
# Run opf train with 2>&1 | tee <train_log_file> in background first,
# then run this script pointing at the log.
#######################################
# Decide whether a validation loss beats the best one seen so far.
# The comparison is done by awk with values passed via -v, so text taken
# from the log is never interpolated into interpreter source code.
# Arguments:
#   $1 - candidate val_loss as text (plain or scientific notation)
#   $2 - best val_loss so far; empty string means "nothing seen yet"
# Returns:
#   0 if $1 is a number strictly below $2 (or $2 is empty), 1 otherwise.
#   Non-numeric candidates (empty, nan, inf, ...) never count as improvement.
#######################################
is_improvement() {
  local -r candidate=${1:-}
  local -r best=${2:-}
  local -r number_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'

  [[ "$candidate" =~ $number_re ]] || return 1
  # First usable value: always an improvement, whatever its magnitude.
  [[ -n "$best" ]] || return 0
  awk -v a="$candidate" -v b="$best" 'BEGIN { exit !(a + 0 < b + 0) }'
}

#######################################
# Read training log lines from stdin and track val_loss across epochs.
# Only epoch summary lines are considered, e.g.
#   epoch 4/15: train_loss=0.096 val_loss=0.126 ...
# Arguments:
#   $1 - patience: number of consecutive non-improving epochs tolerated
# Outputs:
#   Progress messages prefixed with [early_stop] on stdout.
# Returns:
#   0 as soon as patience is exhausted (caller should stop training),
#   1 if stdin ends first.
#######################################
monitor_stream() {
  local -r patience=$1
  # Group 1: epoch number. Group 3: the val_loss text (may be empty).
  # The optional middle group requires "val_loss=" to be a standalone key so
  # that keys such as "best_val_loss=" are not mistaken for it.
  local -r epoch_re='^epoch ([0-9]+)/[0-9]+:(.*[^[:alnum:]_])?val_loss=([-+]?[0-9.]*([eE][-+]?[0-9]+)?)'
  local best_val_loss=''
  local epochs_without_improvement=0
  local line epoch val_loss

  # No IFS= here on purpose: surrounding whitespace is stripped so indented
  # summary lines still match the ^epoch anchor.
  while read -r line || [[ -n "$line" ]]; do
    [[ "$line" =~ $epoch_re ]] || continue
    epoch=${BASH_REMATCH[1]}
    val_loss=${BASH_REMATCH[3]}
    printf '[early_stop] Epoch %s: val_loss=%s (best=%s, patience=%s/%s)\n' \
      "$epoch" "$val_loss" "${best_val_loss:-inf}" \
      "$epochs_without_improvement" "$patience"

    if is_improvement "$val_loss" "$best_val_loss"; then
      best_val_loss=$val_loss
      epochs_without_improvement=0
      printf '[early_stop] New best val_loss: %s\n' "$best_val_loss"
    else
      epochs_without_improvement=$((epochs_without_improvement + 1))
      printf '[early_stop] No improvement for %s epochs\n' \
        "$epochs_without_improvement"
    fi

    # 10# forces base 10 so a patience like "08" is not read as octal.
    if ((epochs_without_improvement >= 10#$patience)); then
      printf '[early_stop] STOPPING: No improvement for %s epochs. Best val_loss=%s\n' \
        "$patience" "${best_val_loss:-inf}"
      return 0
    fi
  done
  return 1
}

#######################################
# Ask the training process to terminate (SIGTERM).
# Arguments:
#   $1 - optional PID; when empty, every process whose command line matches
#        "opf train" is signalled instead.
# Outputs:
#   A warning on stderr if nothing could be signalled.
#######################################
stop_training() {
  local -r pid=${1:-}

  if [[ -n "$pid" ]]; then
    kill "$pid" 2>/dev/null \
      || printf '[early_stop] WARNING: could not signal PID %s\n' "$pid" >&2
  else
    pkill -f "opf train" 2>/dev/null \
      || printf '[early_stop] WARNING: no "opf train" process found\n' >&2
  fi
}

#######################################
# Entry point: follow the log and stop training when patience runs out.
# Arguments:
#   $1 - training log file (default: train.log)
#   $2 - patience in epochs (default: 3)
#   $3 - optional PID of the training process
# Returns:
#   0 after monitoring ends, 1 if the log is unreadable, 2 on bad patience.
#######################################
main() {
  local -r log="${1:-train.log}"
  local -r patience="${2:-3}"
  local -r pid="${3:-}"

  if ! [[ "$patience" =~ ^[0-9]+$ ]]; then
    printf 'early_stop: patience must be a non-negative integer, got "%s"\n' \
      "$patience" >&2
    return 2
  fi
  if [[ ! -r "$log" ]]; then
    printf 'early_stop: cannot read log file "%s"\n' "$log" >&2
    return 1
  fi

  printf 'Early stopping monitor: watching %s, patience=%s\n' "$log" "$patience"

  # -n +1 replays the log from its first line, so epochs that finished before
  # the monitor was started still count toward best val_loss and patience.
  local -a tail_args=(-n +1 -f)
  if [[ "$pid" =~ ^[0-9]+$ ]]; then
    # GNU tail: stop following once the training process is gone, so the
    # monitor does not wait forever after a normal end of training.
    tail_args+=("--pid=$pid")
  fi

  # tail runs behind a dedicated descriptor (not a plain pipeline) so its PID
  # is known and it can be terminated explicitly; otherwise it would linger
  # until the next write to the log, which never comes once training is dead.
  local tail_fd tail_pid
  exec {tail_fd}< <(exec tail "${tail_args[@]}" -- "$log")
  tail_pid=$!

  local stop_requested=1
  monitor_stream "$patience" <&"$tail_fd" && stop_requested=0

  if ((stop_requested == 0)); then
    stop_training "$pid"
  else
    printf '[early_stop] Log stream ended before patience was exhausted\n'
  fi

  kill "$tail_pid" 2>/dev/null
  exec {tail_fd}<&-
  return 0
}

main "$@"