#!/bin/bash
# =============================================================================
# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
#
# Key differences from R6:
# - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
# - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
# This should boost entity recall given ~85-90% O-token prevalence
# - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
# - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
#
# Expect slightly worse val_loss vs R6 (honest eval on clean data).
# GPU: RTX PRO 6000 96GB VRAM
# =============================================================================
# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Every relative path used below (data/, checkpoints/, scripts/, results/ and
# the log files) is resolved from this project directory.
cd ~/alkyline
# Exported so the processes launched by this script inherit it.
# NOTE(review): presumably enables PyTorch's expandable-segments CUDA allocator
# mode to limit fragmentation on long runs — confirm against the PyTorch docs.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Handed to scripts/early_stop_monitor.sh: number of epochs without val_loss
# improvement that are tolerated before training is stopped.
PATIENCE=3
# Directory holding the train/valid/test JSONL files.
DATA=data/processed
# Directory holding the label-space JSON passed to the trainer.
LABELS=data/label_spaces
# Trainer output is captured here; it is also what the early-stop monitor and
# the best-epoch selection read.
TRAIN_LOG=train_r8.log
# Trainer --output-dir; per-epoch checkpoints are looked up beneath it.
CKPT_DIR=checkpoints/r8_5class
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
echo "Start time: $(date)"

# Trainer output is copied to the terminal and to $TRAIN_LOG through a named
# pipe instead of a backgrounded `opf train ... | tee ... &` pipeline.
# For a backgrounded pipeline, $! is the PID of the LAST stage (tee), so the
# early-stop monitor and `wait` would have targeted tee rather than the
# trainer, and the script could move on while the trainer was still running.
# With the FIFO, tee and the trainer are separate background jobs and $! after
# the trainer line really is the trainer's PID.
TRAIN_FIFO_DIR=$(mktemp -d)
TRAIN_FIFO="$TRAIN_FIFO_DIR/train_output.fifo"
mkfifo "$TRAIN_FIFO"

tee "$TRAIN_LOG" < "$TRAIN_FIFO" &
TEE_PID=$!

opf train "$DATA/r8_5class_train.jsonl" \
  --validation-dataset "$DATA/r8_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs 15 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample 0.7 \
  --device cuda > "$TRAIN_FIFO" 2>&1 &
TRAIN_PID=$!

# Early stopping monitor — kills training if val_loss hasn't improved for
# $PATIENCE epochs. It receives the trainer's own PID.
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero trainer status is expected when the monitor stops the run, so it
# must not trip `set -e`; it is recorded and reported instead of discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?

# The monitor may already have exited on its own; ignore that case.
kill "$MONITOR_PID" 2>/dev/null || true

# Let tee drain the pipe so $TRAIN_LOG is complete before anything parses it.
wait "$TEE_PID" 2>/dev/null || true
rm -rf -- "${TRAIN_FIFO_DIR:?}"

echo "Training finished: $(date)"
if [[ "$TRAIN_STATUS" -ne 0 ]]; then
  echo "Trainer exited with status $TRAIN_STATUS (expected if early stopping ended the run)" >&2
fi
# ---------------------------------------------------------------------------
# Best-epoch checkpoint selection
#
# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
# If early stopping killed training, the final model.safetensors may not
# exist. In that case, find the epoch with the lowest val_loss and use it.
# ---------------------------------------------------------------------------
#######################################
# Print the epoch with the lowest val_loss recorded in a training log.
#
# Only lines starting with "epoch" are considered. Each is split on
# space, ':', '/' and '='; the field after "val_loss" is the loss and the
# second field is the epoch label. Values without a numeric prefix
# (e.g. "nan", "inf") are skipped, and scientific notation is compared
# numerically. On ties the earliest logged epoch wins.
#
# Arguments: $1 - path to the training log
# Outputs:   the best epoch on stdout; nothing if the log is missing or
#            contains no usable val_loss
# Returns:   0 when the log is missing; otherwise awk's exit status
#######################################
best_epoch_from_log() {
  local log=$1
  [[ -f "$log" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        if ($i != "val_loss") continue
        raw = $(i + 1)
        if (!match(raw, /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?/)) continue
        loss = substr(raw, RSTART, RLENGTH) + 0
        if (!found || loss < best_loss) {
          best_loss = loss
          best_epoch = $2
          found = 1
        }
      }
    }
    END { if (found) print best_epoch }
  ' "$log"
}

# Must not abort under `set -e` when the log has no epoch lines: an empty
# BEST_EPOCH is a valid outcome that the branches below handle.
BEST_EPOCH=$(best_epoch_from_log "$TRAIN_LOG") || BEST_EPOCH=""
echo "Best epoch by val_loss: $BEST_EPOCH"

CKPT="$CKPT_DIR"
if [[ ! -f "$CKPT/model.safetensors" && -n "$BEST_EPOCH" ]]; then
  # No final model was written, so fall back to the best per-epoch checkpoint.
  echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
elif [[ -n "$BEST_EPOCH" && -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]]; then
  # Even if training ran to completion, prefer the best-epoch checkpoint
  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
fi

# Say so up front if the choice cannot work, rather than leaving it to the
# first evaluation command to fail with a less obvious error.
if [[ ! -d "$CKPT" ]] \
  || [[ "$CKPT" == "$CKPT_DIR" && ! -f "$CKPT/model.safetensors" ]]; then
  echo "WARNING: no usable checkpoint found at '$CKPT'" >&2
fi
echo "Selected checkpoint: $CKPT"
# ---------------------------------------------------------------------------
# Evaluation — four test sets
# ---------------------------------------------------------------------------
# Flags shared by every `opf eval` call. Kept in an array (not a string that
# is expanded unquoted) so each flag, and the checkpoint path in particular,
# reaches opf as exactly one argument even if the path ever contains
# whitespace or glob characters.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device cuda
)

echo ""
echo "================================================================"
echo " EVALUATION PHASE"
echo "================================================================"

#######################################
# Evaluate the selected checkpoint on one dataset.
# Globals:   EVAL_FLAGS (read)
# Arguments: $1 - banner title, $2 - dataset path, $3 - log file
# Outputs:   banner and opf output on stdout; opf output also saved to $3
# Returns:   status of the opf | tee pipeline
#######################################
run_eval() {
  local title=$1
  local dataset=$2
  local logfile=$3
  echo ""
  echo "===== Eval R8: ${title} ====="
  opf eval "$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$logfile"
}

# 1) Primary benchmark: enriched 5-class test
run_eval "Enriched 5-class test (primary benchmark)" \
  "$DATA/enriched_5class_test.jsonl" eval_r8_enriched.log

# 2) CyNER test (supplementary — 98% overlap with enriched)
run_eval "CyNER test (supplementary, ~98% overlap)" \
  "$DATA/cyner_test.jsonl" eval_r8_cyner.log

# 3) SecureBERT2 test (supplementary — 96% overlap)
run_eval "SecureBERT2 5-class test (supplementary, ~96% overlap)" \
  "$DATA/securebert2_5class_test.jsonl" eval_r8_sb2.log

# 4) APTNER — independent benchmark (ZERO leakage, most important)
run_eval "APTNER 5-class test (INDEPENDENT — zero leakage)" \
  "$DATA/aptner_5class_test_clean.jsonl" eval_r8_aptner.log
# ---------------------------------------------------------------------------
# Viterbi grid search on validation set
# Find optimal Viterbi transition penalties for inference
# ---------------------------------------------------------------------------
echo ""
echo "===== Viterbi Grid Search (on validation set) ====="

# Where the grid search writes its result.
VITERBI_OUT=results/viterbi_r8_best.json

# Nothing earlier in this script creates the output directory. Create it here
# so the final step of a long run cannot fail on a missing directory
# (harmless if it already exists).
mkdir -p -- "$(dirname -- "$VITERBI_OUT")"

python3 scripts/viterbi_grid_search.py \
  --checkpoint "$CKPT" \
  --val-data "$DATA/r8_5class_valid.jsonl" \
  --output "$VITERBI_OUT" \
  --device cuda 2>&1 | tee viterbi_r8.log
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Print up to 20 headline lines from an eval log: those mentioning micro or
# macro averages, or starting with a space. A missing log or no match is not
# an error.
print_key_metrics() {
  grep -E '(micro|macro|^ )' "$1" 2>/dev/null | head -20 || true
}

# Closing banner: which checkpoint was evaluated and where the logs are.
printf '\n'
printf '%s\n' "================================================================"
printf ' R8 COMPLETE — %s\n' "$(date)"
printf '%s\n' "================================================================"
printf 'Checkpoint: %s\n' "$CKPT"
printf 'Logs: %s, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log\n' "$TRAIN_LOG"
printf '\n'

# Headline numbers for the primary and the independent benchmark.
printf '%s\n' "Key results:"
printf '%s\n' "--- Enriched (primary) ---"
print_key_metrics eval_r8_enriched.log
printf '\n'
printf '%s\n' "--- APTNER (independent) ---"
print_key_metrics eval_r8_aptner.log
printf '%s\n' "================================================================"
# End of R8 training/evaluation script.