File size: 5,980 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
# =============================================================================
# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
#
# Key differences from R6:
#   - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
#   - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
#     This should boost entity recall given ~85-90% O-token prevalence
#   - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
#   - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
#
# Expect slightly worse val_loss vs R6 (honest eval on clean data).
# GPU: RTX PRO 6000 96GB VRAM
# =============================================================================
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Every relative path used below is resolved from the project checkout.
cd ~/alkyline

# CUDA allocator configuration, passed to child processes via the environment.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_CUDA_ALLOC_CONF

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
DATA=data/processed              # directory holding the train/valid/test .jsonl files
LABELS=data/label_spaces         # directory holding the label-space JSON
CKPT_DIR=checkpoints/r8_5class   # trainer --output-dir
TRAIN_LOG=train_r8.log           # combined stdout/stderr of the training run
PATIENCE=3                       # handed to scripts/early_stop_monitor.sh

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
echo "Start time: $(date)"

# The trainer's output reaches tee through a named pipe rather than through a
# backgrounded `opf train ... | tee ... &` pipeline. After a backgrounded
# pipeline, $! is the PID of the LAST stage (tee), so the early-stop monitor
# would be handed tee's PID and `wait` could return while the trainer is still
# running. With the FIFO, $! after the trainer line is the trainer itself.
LOG_PIPE_DIR=$(mktemp -d)
LOG_PIPE="$LOG_PIPE_DIR/train_output"
mkfifo "$LOG_PIPE"

# Reader side: mirror everything to the terminal and to $TRAIN_LOG.
tee "$TRAIN_LOG" <"$LOG_PIPE" &
TEE_PID=$!

# Writer side: the actual training process (stdout and stderr combined).
opf train "$DATA/r8_5class_train.jsonl" \
  --validation-dataset "$DATA/r8_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs 15 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample 0.7 \
  --device cuda >"$LOG_PIPE" 2>&1 &
TRAIN_PID=$!

# Early stopping monitor — kills training if val_loss hasn't improved for $PATIENCE epochs
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero status is expected when the monitor kills the trainer, so it must
# not abort the script; it is recorded and reported instead of being discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?

# The monitor may already have exited on its own; ignore that case.
kill "$MONITOR_PID" 2>/dev/null || true

# Let tee reach end-of-file so $TRAIN_LOG is complete before it is parsed.
wait "$TEE_PID" 2>/dev/null || true
rm -rf -- "$LOG_PIPE_DIR"

if (( TRAIN_STATUS != 0 )); then
  echo "Trainer exited with status $TRAIN_STATUS (expected if early stopping fired)" >&2
fi

echo "Training finished: $(date)"

# ---------------------------------------------------------------------------
# Best-epoch checkpoint selection
#
# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
# The epoch with the lowest val_loss in the training log is preferred:
#   - if the final model.safetensors is missing (e.g. early stopping killed
#     training), the best epoch's directory is selected;
#   - if the final model exists, the best epoch's directory is still selected
#     as long as that directory exists;
#   - otherwise the final model in $CKPT_DIR is used.
# ---------------------------------------------------------------------------

#######################################
# Print the epoch with the lowest val_loss found in a training log.
# Only lines starting with "epoch" are scanned. Each line is split on space,
# ':', '/' and '='; the field following a "val_loss" field is the loss and the
# second field of the line is taken as the epoch id. Values that do not start
# like a number are skipped. On ties the earliest line wins.
# Arguments: $1 - path to the training log
# Outputs:   the epoch id on stdout, or nothing if no usable val_loss exists
# Returns:   0 also when the log is missing or has no epoch lines, so a caller
#            running under `set -e -o pipefail` is not aborted by "no match"
#######################################
find_best_epoch() {
  local log_file=${1:-}
  [[ -f "$log_file" && -r "$log_file" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        if ($i != "val_loss" || $(i + 1) !~ /^[-+]?\.?[0-9]/) continue
        loss = $(i + 1) + 0   # numeric compare, handles 1e-05 style values
        if (!seen || loss < best_loss) {
          best_loss = loss
          best_epoch = $2
          seen = 1
        }
      }
    }
    END { if (seen) print best_epoch }
  ' "$log_file"
}

BEST_EPOCH=$(find_best_epoch "$TRAIN_LOG")
echo "Best epoch by val_loss: $BEST_EPOCH"

CKPT="$CKPT_DIR"
BEST_CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
if [[ -n "$BEST_EPOCH" && ! -f "$CKPT_DIR/model.safetensors" ]]; then
  echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
  CKPT="$BEST_CKPT"
elif [[ -n "$BEST_EPOCH" && -d "$BEST_CKPT" ]]; then
  # Even if training ran to completion, prefer the best-epoch checkpoint
  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  CKPT="$BEST_CKPT"
fi

# Surface a missing checkpoint here instead of leaving it to a later failure.
if [[ ! -d "$CKPT" ]]; then
  echo "WARNING: checkpoint directory '$CKPT' does not exist" >&2
fi
echo "Selected checkpoint: $CKPT"

# ---------------------------------------------------------------------------
# Evaluation — four test sets
# ---------------------------------------------------------------------------
# Flags shared by every evaluation run. Held in an array rather than a flat
# string so each flag stays exactly one argument even when $CKPT contains
# whitespace or glob characters.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device cuda
)

#######################################
# Evaluate one test set with the shared flags and keep a copy of the output.
# Globals:   EVAL_FLAGS (read)
# Arguments: $1 - title shown in the banner
#            $2 - path to the test-set .jsonl file
#            $3 - log file that receives the combined stdout/stderr
# Outputs:   banner and evaluation output on stdout
# Returns:   status of the `opf eval | tee` pipeline
#######################################
run_eval() {
  local title=$1 dataset=$2 log_file=$3
  echo ""
  echo "===== Eval R8: ${title} ====="
  opf eval "$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$log_file"
}

echo ""
echo "================================================================"
echo "  EVALUATION PHASE"
echo "================================================================"

# 1) Primary benchmark: enriched 5-class test
run_eval "Enriched 5-class test (primary benchmark)" \
  "$DATA/enriched_5class_test.jsonl" eval_r8_enriched.log

# 2) CyNER test (supplementary — 98% overlap with enriched)
run_eval "CyNER test (supplementary, ~98% overlap)" \
  "$DATA/cyner_test.jsonl" eval_r8_cyner.log

# 3) SecureBERT2 test (supplementary — 96% overlap)
run_eval "SecureBERT2 5-class test (supplementary, ~96% overlap)" \
  "$DATA/securebert2_5class_test.jsonl" eval_r8_sb2.log

# 4) APTNER — independent benchmark (ZERO leakage, most important)
run_eval "APTNER 5-class test (INDEPENDENT — zero leakage)" \
  "$DATA/aptner_5class_test_clean.jsonl" eval_r8_aptner.log

# ---------------------------------------------------------------------------
# Viterbi grid search on validation set
# Find optimal Viterbi transition penalties for inference
# ---------------------------------------------------------------------------
printf '\n'
printf '%s\n' "===== Viterbi Grid Search (on validation set) ====="

# Full grid-search command line, one argument per array element. It is run
# against the validation split with the checkpoint selected earlier.
GRID_SEARCH_CMD=(
  python3 scripts/viterbi_grid_search.py
  --checkpoint "$CKPT"
  --val-data "$DATA/r8_5class_valid.jsonl"
  --output results/viterbi_r8_best.json
  --device cuda
)

# Combined stdout/stderr goes to the terminal and to viterbi_r8.log.
"${GRID_SEARCH_CMD[@]}" 2>&1 | tee viterbi_r8.log

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
SUMMARY_RULE="================================================================"

# Print at most 20 lines of the given log that mention "micro" or "macro" or
# start with two spaces. A missing log or no match is tolerated silently.
print_key_metrics() {
  grep -E '(micro|macro|^  )' "$1" 2>/dev/null | head -20 || true
}

printf '\n'
printf '%s\n' \
  "$SUMMARY_RULE" \
  "  R8 COMPLETE — $(date)" \
  "$SUMMARY_RULE" \
  "Checkpoint:  $CKPT" \
  "Logs:        $TRAIN_LOG, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log" \
  "" \
  "Key results:" \
  "--- Enriched (primary) ---"
print_key_metrics eval_r8_enriched.log
printf '\n%s\n' "--- APTNER (independent) ---"
print_key_metrics eval_r8_aptner.log
printf '%s\n' "$SUMMARY_RULE"