File size: 3,957 Bytes
038e086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Round 5: Train on ENRICHED data with per-epoch saving + early stopping
# 5a = 5-class, 5b = 13-class
# Uses patch_epoch_saving.py to save checkpoints every epoch
# Early stopping monitor can safely kill training since per-epoch saves exist
# Strict mode: stop on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -e -u -o pipefail

# Relative paths used below (data/, checkpoints/, scripts/, *.log) resolve
# against this directory.
cd ~/alkyline

# Dataset and label-space directories, relative to the directory entered above.
DATA="data/processed"
LABELS="data/label_spaces"

# Forwarded to scripts/early_stop_monitor.sh as its second argument.
PATIENCE="3"

# Exported so that child processes inherit the setting.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export PYTORCH_CUDA_ALLOC_CONF

# ---------------------------------------------------------------------------
# Round 5a: train on the enriched 5-class data, pick a checkpoint, evaluate it
# on the enriched, CyNER and SecureBERT2 test sets.
# Reads DATA, LABELS and PATIENCE set earlier in this script.
# ---------------------------------------------------------------------------
echo "===== Round 5a: Enriched 5-class ====="
# Output is sent to tee through a process substitution instead of a pipeline.
# For a backgrounded pipeline `$!` is the PID of the LAST stage (tee), so the
# early-stop monitor would be handed tee's PID rather than the trainer's.
# With the redirection form `$!` is the `opf train` process itself.
# Note: tee is no longer part of the waited-on job, so the final log lines can
# land a moment after `wait` returns.
opf train "$DATA/enriched_5class_train.jsonl" \
  --validation-dataset "$DATA/enriched_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir checkpoints/r5a_enriched_5class \
  --overwrite-output \
  --epochs 20 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 --warmup-fraction 0 --lr-schedule flat \
  --device cuda > >(tee train_r5a.log) 2>&1 &
TRAIN_PID=$!

# Early stopping monitor — safe to kill now because per-epoch checkpoints exist
bash scripts/early_stop_monitor.sh train_r5a.log "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# The monitor is meant to kill the trainer, so a non-zero status here is not
# fatal; it is reported on stderr instead of being discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?
if [[ "$TRAIN_STATUS" -ne 0 ]]; then
  echo "opf train (r5a) exited with status $TRAIN_STATUS" >&2
fi
# The monitor may already be gone; a failed kill/wait is harmless.
kill "$MONITOR_PID" 2>/dev/null || true
wait "$MONITOR_PID" 2>/dev/null || true

# Find the best epoch checkpoint (lowest val_loss from log).
# One awk pass replaces grep|awk|sort|head|awk:
#   * with `set -o pipefail`, a log without any "epoch" line made grep exit 1
#     and aborted the script before the empty-BEST_EPOCH guard below ran;
#   * awk compares the loss numerically, so exponent notation (e.g. 9e-05)
#     orders correctly, which `sort -n` does not guarantee;
#   * a trailing "val_loss" with no value is skipped (i < NF) instead of being
#     ranked as 0.
# Fields are split on space, ':', '/' and '='; $2 is taken as the epoch number.
# On equal losses the earliest matching line wins.
BEST_EPOCH=$(awk -F'[ :/=]' '
  /^epoch/ {
    for (i = 1; i < NF; i++) {
      if ($i == "val_loss") {
        loss = $(i + 1) + 0
        if (best == "" || loss < min) { min = loss; best = $2 }
      }
    }
  }
  END { print best }
' train_r5a.log)
echo "Best epoch: $BEST_EPOCH"

# The main output-dir has the final best checkpoint from opf train
# Per-epoch checkpoints are in output-dir/epoch_N/
# If training was killed by early stopping, use the best epoch checkpoint
CKPT=checkpoints/r5a_enriched_5class
if [[ ! -f "$CKPT/model.safetensors" ]]; then
  if [[ -n "$BEST_EPOCH" ]]; then
    echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
    CKPT="checkpoints/r5a_enriched_5class/epoch_${BEST_EPOCH}"
  else
    # Keep the original fall-through (evaluate the main dir) but say why the
    # evals that follow are likely to fail.
    echo "WARNING: $CKPT/model.safetensors not found and no val_loss epoch in train_r5a.log; evaluating $CKPT as-is" >&2
  fi
fi

echo "===== Eval R5a on enriched test ====="
opf eval "$DATA/enriched_5class_test.jsonl" \
  --checkpoint "$CKPT" \
  --decode-mode viterbi --per-class --label-counts \
  --device cuda 2>&1 | tee eval_r5a_enriched.log

echo "===== Eval R5a on CyNER test ====="
opf eval "$DATA/cyner_test.jsonl" \
  --checkpoint "$CKPT" \
  --decode-mode viterbi --per-class --label-counts \
  --device cuda 2>&1 | tee eval_r5a_cyner.log

echo "===== Eval R5a on SecureBERT2 test ====="
opf eval "$DATA/securebert2_test.jsonl" \
  --checkpoint "$CKPT" \
  --decode-mode viterbi --per-class --label-counts \
  --device cuda 2>&1 | tee eval_r5a_sb2.log

# ---------------------------------------------------------------------------
# Round 5b: train on the enriched 13-class data, pick a checkpoint, evaluate it
# on the enriched and CyNER test sets.
# Reads DATA, LABELS and PATIENCE set earlier in this script.
# ---------------------------------------------------------------------------
echo "===== Round 5b: Enriched 13-class ====="
# Output is sent to tee through a process substitution instead of a pipeline.
# For a backgrounded pipeline `$!` is the PID of the LAST stage (tee), so the
# early-stop monitor would be handed tee's PID rather than the trainer's.
# With the redirection form `$!` is the `opf train` process itself.
# Note: tee is no longer part of the waited-on job, so the final log lines can
# land a moment after `wait` returns.
opf train "$DATA/enriched_13class_train.jsonl" \
  --validation-dataset "$DATA/enriched_13class_valid.jsonl" \
  --label-space-json "$LABELS/cyber_13class.json" \
  --output-dir checkpoints/r5b_enriched_13class \
  --overwrite-output \
  --epochs 20 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 --warmup-fraction 0 --lr-schedule flat \
  --device cuda > >(tee train_r5b.log) 2>&1 &
TRAIN_PID=$!

# Early stopping monitor; it receives the log path, the patience and the
# trainer's PID.
bash scripts/early_stop_monitor.sh train_r5b.log "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# The monitor is meant to kill the trainer, so a non-zero status here is not
# fatal; it is reported on stderr instead of being discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?
if [[ "$TRAIN_STATUS" -ne 0 ]]; then
  echo "opf train (r5b) exited with status $TRAIN_STATUS" >&2
fi
# The monitor may already be gone; a failed kill/wait is harmless.
kill "$MONITOR_PID" 2>/dev/null || true
wait "$MONITOR_PID" 2>/dev/null || true

# Find the best epoch checkpoint (lowest val_loss from log).
# One awk pass replaces grep|awk|sort|head|awk:
#   * with `set -o pipefail`, a log without any "epoch" line made grep exit 1
#     and aborted the script before the empty-BEST_EPOCH guard below ran;
#   * awk compares the loss numerically, so exponent notation (e.g. 9e-05)
#     orders correctly, which `sort -n` does not guarantee;
#   * a trailing "val_loss" with no value is skipped (i < NF) instead of being
#     ranked as 0.
# Fields are split on space, ':', '/' and '='; $2 is taken as the epoch number.
# On equal losses the earliest matching line wins.
BEST_EPOCH=$(awk -F'[ :/=]' '
  /^epoch/ {
    for (i = 1; i < NF; i++) {
      if ($i == "val_loss") {
        loss = $(i + 1) + 0
        if (best == "" || loss < min) { min = loss; best = $2 }
      }
    }
  }
  END { print best }
' train_r5b.log)
echo "Best epoch: $BEST_EPOCH"

# Prefer the model in the main output dir; if it is absent, fall back to the
# per-epoch directory of the best epoch parsed from the log.
CKPT=checkpoints/r5b_enriched_13class
if [[ ! -f "$CKPT/model.safetensors" ]]; then
  if [[ -n "$BEST_EPOCH" ]]; then
    echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
    CKPT="checkpoints/r5b_enriched_13class/epoch_${BEST_EPOCH}"
  else
    # Keep the original fall-through (evaluate the main dir) but say why the
    # evals that follow are likely to fail.
    echo "WARNING: $CKPT/model.safetensors not found and no val_loss epoch in train_r5b.log; evaluating $CKPT as-is" >&2
  fi
fi

echo "===== Eval R5b on enriched test ====="
opf eval "$DATA/enriched_13class_test.jsonl" \
  --checkpoint "$CKPT" \
  --decode-mode viterbi --per-class --label-counts \
  --device cuda 2>&1 | tee eval_r5b_enriched.log

echo "===== Eval R5b on CyNER test (5-class labels, 13-class model) ====="
opf eval "$DATA/cyner_test.jsonl" \
  --checkpoint "$CKPT" \
  --decode-mode viterbi --per-class --label-counts \
  --device cuda 2>&1 | tee eval_r5b_cyner.log

echo "===== ALL ROUND 5 DONE ====="