#!/bin/bash
# Round 5: train on ENRICHED data with per-epoch saving + early stopping.
#   5a = 5-class, 5b = 13-class
# Uses patch_epoch_saving.py to save checkpoints every epoch (not invoked here;
# presumably applied to `opf` beforehand — confirm before running).
# The early stopping monitor can safely kill training since per-epoch saves exist.
set -euo pipefail

readonly PATIENCE=3
readonly DATA=data/processed
readonly LABELS=data/label_spaces

# Checkpoint chosen by select_checkpoint; read by run_eval.
CKPT=""
# Scratch directory holding the per-round FIFOs; created in main.
WORK_DIR=""

die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory on any exit path.
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Print the epoch whose val_loss is lowest in a training log.
# Lines starting with "epoch" are split on space, ':', '/' and '='; the field
# after "val_loss" is the loss and field 2 is taken as the epoch number.
# Prints nothing (and still succeeds) when the log is missing or has no
# usable val_loss entries, so callers can test for an empty result under
# `set -e`. Ties resolve to the earliest epoch.
# Arguments: $1 - training log path
# Outputs:   best epoch on stdout, or nothing
#######################################
best_epoch_from_log() {
  local log=$1
  [[ -f "$log" ]] || return 0
  # Compare numerically inside awk: `sort -n` would misorder losses printed
  # in scientific notation (e.g. 1e-05), and a `grep | ...` pipeline would
  # abort the script under pipefail when no epoch line exists yet.
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        if ($i != "val_loss") continue
        if ($(i + 1) !~ /^[-+]?\.?[0-9]/) continue   # skip nan/inf/empty
        loss = $(i + 1) + 0
        if (!found || loss < best) { best = loss; epoch = $2; found = 1 }
      }
    }
    END { if (found) print epoch }
  ' "$log"
}

#######################################
# Run `opf train` in the background with the early-stop monitor attached.
# Output is mirrored to the console and to the log through a FIFO rather
# than `opf train | tee &`: for a backgrounded pipeline `$!` is the PID of
# the LAST stage (tee), so the monitor would be handed the wrong process.
# Globals:   PATIENCE, WORK_DIR (read)
# Arguments: $1 - log file; remaining args - dataset/options for `opf train`
# Returns:   0 always; a non-zero trainer status is reported on stderr only,
#            because being killed by the monitor is an expected outcome.
#######################################
train_with_early_stopping() {
  local log=$1
  shift
  local fifo="$WORK_DIR/${log##*/}.fifo"
  local tee_pid train_pid monitor_pid status=0

  mkfifo "$fifo"
  # Start from an empty log so the monitor never reads a previous run's file.
  : >"$log"

  tee "$log" <"$fifo" &
  tee_pid=$!

  opf train "$@" \
    --overwrite-output \
    --epochs 20 --batch-size 4 --grad-accum-steps 2 \
    --learning-rate 5e-5 --warmup-fraction 0 --lr-schedule flat \
    --device cuda >"$fifo" 2>&1 &
  train_pid=$!

  # Early stopping monitor — safe to kill training because per-epoch
  # checkpoints exist.
  bash scripts/early_stop_monitor.sh "$log" "$PATIENCE" "$train_pid" &
  monitor_pid=$!

  wait "$train_pid" || status=$?
  kill "$monitor_pid" 2>/dev/null || true   # may already have exited
  wait "$monitor_pid" 2>/dev/null || true
  # tee exits once every writer has closed the FIFO; waiting for it
  # guarantees the log is complete before it is parsed.
  wait "$tee_pid" || true
  rm -f -- "$fifo"

  if (( status != 0 )); then
    printf 'note: opf train exited with status %d (expected if stopped by the early-stop monitor)\n' \
      "$status" >&2
  fi
}

#######################################
# Choose the checkpoint to evaluate and store it in CKPT.
# The main output dir is used when it contains model.safetensors; otherwise
# (training killed early) the per-epoch dir output-dir/epoch_N of the best
# epoch from the log is used.
# Globals:   CKPT (written)
# Arguments: $1 - training output dir, $2 - training log
#######################################
select_checkpoint() {
  local out_dir=$1 log=$2
  local best_epoch
  best_epoch=$(best_epoch_from_log "$log")
  echo "Best epoch: $best_epoch"

  CKPT=$out_dir
  if [[ ! -f "$CKPT/model.safetensors" ]]; then
    if [[ -n "$best_epoch" ]]; then
      echo "Training killed early — using epoch $best_epoch checkpoint"
      CKPT="$out_dir/epoch_${best_epoch}"
    else
      printf 'warning: no model.safetensors in %s and no epoch found in %s\n' \
        "$out_dir" "$log" >&2
    fi
  fi
}

#######################################
# Evaluate the checkpoint in CKPT on one dataset, teeing output to a log.
# Globals:   CKPT (read)
# Arguments: $1 - banner title, $2 - dataset path, $3 - log file
#######################################
run_eval() {
  local title=$1 dataset=$2 log=$3
  echo "===== $title ====="
  opf eval "$dataset" \
    --checkpoint "$CKPT" \
    --decode-mode viterbi --per-class --label-counts \
    --device cuda 2>&1 | tee "$log"
}

main() {
  cd ~/alkyline || die "cannot cd to ~/alkyline"
  export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

  WORK_DIR=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT

  echo "===== Round 5a: Enriched 5-class ====="
  train_with_early_stopping train_r5a.log \
    "$DATA/enriched_5class_train.jsonl" \
    --validation-dataset "$DATA/enriched_5class_valid.jsonl" \
    --label-space-json "$LABELS/cyner_5class.json" \
    --output-dir checkpoints/r5a_enriched_5class
  select_checkpoint checkpoints/r5a_enriched_5class train_r5a.log

  run_eval "Eval R5a on enriched test" \
    "$DATA/enriched_5class_test.jsonl" eval_r5a_enriched.log
  run_eval "Eval R5a on CyNER test" \
    "$DATA/cyner_test.jsonl" eval_r5a_cyner.log
  run_eval "Eval R5a on SecureBERT2 test" \
    "$DATA/securebert2_test.jsonl" eval_r5a_sb2.log

  echo "===== Round 5b: Enriched 13-class ====="
  train_with_early_stopping train_r5b.log \
    "$DATA/enriched_13class_train.jsonl" \
    --validation-dataset "$DATA/enriched_13class_valid.jsonl" \
    --label-space-json "$LABELS/cyber_13class.json" \
    --output-dir checkpoints/r5b_enriched_13class
  select_checkpoint checkpoints/r5b_enriched_13class train_r5b.log

  run_eval "Eval R5b on enriched test" \
    "$DATA/enriched_13class_test.jsonl" eval_r5b_enriched.log
  run_eval "Eval R5b on CyNER test (5-class labels, 13-class model)" \
    "$DATA/cyner_test.jsonl" eval_r5b_cyner.log

  echo "===== ALL ROUND 5 DONE ====="
}

main "$@"