File size: 1,910 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
# Early-stopping wrapper for opf train
# Monitors val_loss in a training log and kills training once it has not
# improved for PATIENCE consecutive epoch summaries.
#
# Usage: ./early_stop_train.sh [train_log_file] [patience] [pid]
#   train_log_file  log file to follow (default: train.log)
#   patience        non-improving epochs tolerated before stopping (default: 3)
#   pid             optional PID of the opf train process; when omitted the
#                   process is looked up with `pkill -f "opf train"`
#
# Run opf train with 2>&1 | tee <train_log_file> in background first,
# then run this script pointing at the log.
#
# Exit status: 0 when training was stopped, 1 when the log is unreadable or
# the log stream ended before patience ran out, 2 on invalid arguments.

# Succeeds when $1 is a finite decimal number (optional sign and exponent).
# Tokens such as "", ".", "0.1.2", "nan" or "inf" are rejected.
is_number() {
    local number_re='^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
    [[ $1 =~ $number_re ]]
}

# Succeeds when candidate loss $1 is strictly lower than best-so-far $2.
# An empty $2 means "no best recorded yet", so any numeric candidate wins.
# A non-numeric candidate never counts as an improvement.
is_improvement() {
    local candidate=$1 best=$2
    is_number "$candidate" || return 1
    [[ -z $best ]] && return 0
    # Values are passed with -v so they are data, never program text.
    awk -v a="$candidate" -v b="$best" 'BEGIN { exit !((a + 0) < (b + 0)) }'
}

# Reads log lines from stdin and tracks val_loss over epoch summary lines.
# Arguments: $1 - patience (non-negative integer)
# Outputs:   progress messages on stdout
# Returns:   0 as soon as patience is exhausted, 1 if stdin ends first
monitor_stream() {
    local patience=$1
    local best_val_loss='' epochs_without_improvement=0
    local line epoch val_loss
    # Match epoch summary lines like: epoch 4/15: train_loss=0.096 val_loss=0.126 ...
    # Group 1 is the epoch number, group 2 the (possibly empty) numeric loss.
    local epoch_re='^epoch ([0-9]+)/[0-9]+:.*val_loss=([-+]?[0-9.]+([eE][-+]?[0-9]+)?)?'

    while read -r line; do
        [[ $line =~ $epoch_re ]] || continue
        epoch=${BASH_REMATCH[1]}
        val_loss=${BASH_REMATCH[2]:-}

        echo "[early_stop] Epoch $epoch: val_loss=$val_loss (best=${best_val_loss:-n/a}, patience=$epochs_without_improvement/$patience)"

        if is_improvement "$val_loss" "$best_val_loss"; then
            best_val_loss=$val_loss
            epochs_without_improvement=0
            echo "[early_stop] New best val_loss: $best_val_loss"
        else
            epochs_without_improvement=$((epochs_without_improvement + 1))
            echo "[early_stop] No improvement for $epochs_without_improvement epochs"
        fi

        if [ "$epochs_without_improvement" -ge "$patience" ]; then
            echo "[early_stop] STOPPING: No improvement for $patience epochs. Best val_loss=${best_val_loss:-n/a}"
            return 0
        fi
    done
    return 1
}

# Sends the default signal (TERM) to the training process.
# Arguments: $1 - PID to signal; when empty, falls back to
#                 `pkill -f "opf train"`
# Outputs:   a warning on stderr when nothing could be signalled
stop_training() {
    local pid=$1
    if [ -n "$pid" ]; then
        kill "$pid" 2>/dev/null \
            || echo "[early_stop] WARNING: could not signal PID $pid" >&2
    else
        # Try to find opf train process
        pkill -f "opf train" 2>/dev/null \
            || echo "[early_stop] WARNING: no 'opf train' process could be signalled" >&2
    fi
}

# Entry point: validates arguments, follows the log and stops training when
# monitor_stream reports that patience is exhausted.
main() {
    local log="${1:-train.log}"
    local patience="${2:-3}"
    local pid="${3:-}"  # optional: PID of the opf train process
    local tail_pid status

    if ! [[ $patience =~ ^[0-9]+$ ]]; then
        echo "[early_stop] ERROR: patience must be a non-negative integer, got '$patience'" >&2
        return 2
    fi
    if [[ ! -r $log ]]; then
        echo "[early_stop] ERROR: cannot read log file '$log'" >&2
        return 1
    fi

    echo "Early stopping monitor: watching $log, patience=$patience"

    # tail runs on its own descriptor instead of in a pipeline so it can be
    # killed explicitly: once training is dead the log stops growing, and a
    # piped `tail -f` would never receive SIGPIPE and would block forever.
    # `exec` makes the recorded PID that of tail itself.
    exec 3< <(exec tail -f -- "$log")
    tail_pid=$!

    monitor_stream "$patience" <&3
    status=$?

    kill "$tail_pid" 2>/dev/null
    exec 3<&-

    if [ "$status" -eq 0 ]; then
        # Kill the training process
        stop_training "$pid"
    fi
    return "$status"
}

main "$@"