File size: 5,980 Bytes

3dac39e

#!/bin/bash
# =============================================================================
# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
#
# Key differences from R6:
#   - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
#   - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
#     This should boost entity recall given ~85-90% O-token prevalence
#   - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
#   - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
#
# Expect slightly worse val_loss vs R6 (honest eval on clean data).
# GPU: RTX PRO 6000 96GB VRAM
# =============================================================================
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Every relative path used below resolves against this directory.
cd ~/alkyline

# Exported so that the processes launched by this script inherit it.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_CUDA_ALLOC_CONF

# ---------------------------------------------------------------------------
# Run settings
# ---------------------------------------------------------------------------
# Input locations (datasets and label-space definitions).
DATA="data/processed"
LABELS="data/label_spaces"

# Where this round writes its checkpoints and its training log.
CKPT_DIR="checkpoints/r8_5class"
TRAIN_LOG="train_r8.log"

# Passed to the early-stopping monitor as its patience argument.
PATIENCE=3

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

#######################################
# Start `opf train` in the background and mirror its combined stdout/stderr
# to both the console and $TRAIN_LOG.
#
# The trainer is deliberately NOT run as `opf train ... | tee ... &`: for a
# backgrounded pipeline, `$!` is the PID of the LAST stage (tee), so the
# early-stop monitor would be given the wrong process to kill. Instead the
# trainer writes into a FIFO that a separately tracked tee reads from, which
# makes `$!` after the trainer launch the trainer's own PID.
#
# Globals:
#   DATA, LABELS, CKPT_DIR, TRAIN_LOG  (read)
#   TRAIN_PID       (written) PID of the `opf train` process
#   TEE_PID         (written) PID of the tee that writes $TRAIN_LOG
#   TRAIN_FIFO_DIR  (written) temp dir holding the FIFO; caller removes it
# Arguments: none
#######################################
launch_training() {
  TRAIN_FIFO_DIR=$(mktemp -d)
  local fifo="$TRAIN_FIFO_DIR/train_output.fifo"
  mkfifo "$fifo"

  # Reader side: copies everything the trainer prints to console + log.
  tee "$TRAIN_LOG" < "$fifo" &
  TEE_PID=$!

  opf train "$DATA/r8_5class_train.jsonl" \
    --validation-dataset "$DATA/r8_5class_valid.jsonl" \
    --label-space-json "$LABELS/cyner_5class.json" \
    --output-dir "$CKPT_DIR" \
    --overwrite-output \
    --epochs 15 --batch-size 4 --grad-accum-steps 2 \
    --learning-rate 5e-5 \
    --warmup-fraction 0.1 --lr-schedule cosine \
    --loss-fn focal --focal-gamma 2.0 \
    --llrd-factor 0.9 \
    --o-downsample 0.7 \
    --device cuda > "$fifo" 2>&1 &
  TRAIN_PID=$!
}

echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
echo "Start time: $(date)"

launch_training

# Early stopping monitor — kills training if val_loss hasn't improved for $PATIENCE epochs
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero status is expected when the monitor stops training early, so it
# is recorded and reported rather than allowed to abort the script.
train_rc=0
wait "$TRAIN_PID" 2>/dev/null || train_rc=$?
kill "$MONITOR_PID" 2>/dev/null || true
# Let tee drain so $TRAIN_LOG is complete before anything parses it.
wait "$TEE_PID" 2>/dev/null || true
rm -rf -- "${TRAIN_FIFO_DIR:?}"

if (( train_rc != 0 )); then
  echo "Trainer exited with status $train_rc (expected if early stopping fired)" >&2
fi
echo "Training finished: $(date)"

# ---------------------------------------------------------------------------
# Best-epoch checkpoint selection
#
# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
# If early stopping killed training, the final model.safetensors may not
# exist. In that case, find the epoch with the lowest val_loss and use it.
# ---------------------------------------------------------------------------

#######################################
# Print the epoch number with the lowest val_loss found in a training log.
#
# Only lines beginning with "epoch" are considered. Each such line is split
# on space, ':', '/' and '='; field 2 is taken as the epoch number and the
# field right after the token "val_loss" as the loss, e.g.
#   epoch 3/15: train_loss=0.21 val_loss=0.1234
# (NOTE(review): the exact trainer log format is presumed — confirm.)
#
# Losses are compared numerically, so scientific notation such as 9e-02 is
# ordered correctly; non-numeric losses (nan, inf, empty) and non-integer
# epoch fields are skipped. On a tie the earliest epoch wins.
#
# Arguments: $1 - path to the training log
# Outputs:   the best epoch number on stdout, or nothing if none was found
# Returns:   0 even when the log is missing or has no usable lines, so that
#            callers under `set -e` can fall back instead of aborting
#######################################
best_epoch_from_log() {
  local log=$1
  [[ -r "$log" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ && $2 ~ /^[0-9]+$/ {
      for (i = 1; i < NF; i++) {
        if ($i == "val_loss" &&
            $(i + 1) ~ /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/) {
          loss = $(i + 1) + 0
          if (!seen || loss < best) { best = loss; epoch = $2; seen = 1 }
        }
      }
    }
    END { if (seen) print epoch }
  ' "$log"
}

BEST_EPOCH=$(best_epoch_from_log "$TRAIN_LOG")
echo "Best epoch by val_loss: ${BEST_EPOCH:-<none found>}"

CKPT="$CKPT_DIR"
if [[ ! -f "$CKPT/model.safetensors" && -n "$BEST_EPOCH" ]]; then
  echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
elif [[ -n "$BEST_EPOCH" && -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]]; then
  # Even if training ran to completion, prefer the best-epoch checkpoint
  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
fi
if [[ ! -d "$CKPT" ]]; then
  echo "WARNING: selected checkpoint directory does not exist: $CKPT" >&2
fi
echo "Selected checkpoint: $CKPT"

# ---------------------------------------------------------------------------
# Evaluation — four test sets
# ---------------------------------------------------------------------------
# Kept as an array so every flag and value reaches `opf eval` as exactly one
# argument, whatever characters the checkpoint path contains.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device cuda
)

#######################################
# Evaluate one test set and mirror the output to a log file.
# Globals:   DATA, EVAL_FLAGS (read)
# Arguments: $1 - human-readable title printed in the section banner
#            $2 - dataset file name, relative to $DATA
#            $3 - log file to write (overwritten)
# Outputs:   banner and `opf eval` output (stdout+stderr) on stdout
# Returns:   status of the `opf eval | tee` pipeline
#######################################
run_eval() {
  local title=$1 dataset=$2 log_file=$3
  echo ""
  echo "===== Eval R8: ${title} ====="
  opf eval "$DATA/$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$log_file"
}

echo ""
echo "================================================================"
echo "  EVALUATION PHASE"
echo "================================================================"

# 1) Primary benchmark: enriched 5-class test
run_eval "Enriched 5-class test (primary benchmark)" \
  enriched_5class_test.jsonl eval_r8_enriched.log

# 2) CyNER test (supplementary — 98% overlap with enriched)
run_eval "CyNER test (supplementary, ~98% overlap)" \
  cyner_test.jsonl eval_r8_cyner.log

# 3) SecureBERT2 test (supplementary — 96% overlap)
run_eval "SecureBERT2 5-class test (supplementary, ~96% overlap)" \
  securebert2_5class_test.jsonl eval_r8_sb2.log

# 4) APTNER — independent benchmark (ZERO leakage, most important)
run_eval "APTNER 5-class test (INDEPENDENT — zero leakage)" \
  aptner_5class_test_clean.jsonl eval_r8_aptner.log

# ---------------------------------------------------------------------------
# Viterbi grid search on validation set
# Find optimal Viterbi transition penalties for inference
# ---------------------------------------------------------------------------
echo ""
echo "===== Viterbi Grid Search (on validation set) ====="
VITERBI_OUT=results/viterbi_r8_best.json
# Nothing earlier in this script creates the output directory, and whether the
# search script creates it itself is not visible here. Create it up front so
# the last step of a long run cannot fail on a missing directory.
mkdir -p -- "${VITERBI_OUT%/*}"
python3 scripts/viterbi_grid_search.py \
  --checkpoint "$CKPT" \
  --val-data "$DATA/r8_5class_valid.jsonl" \
  --output "$VITERBI_OUT" \
  --device cuda 2>&1 | tee viterbi_r8.log

# ---------------------------------------------------------------------------
# Wrap-up report
# ---------------------------------------------------------------------------
banner_rule="================================================================"

# Print at most 20 lines of an eval log that mention "micro" or "macro" or
# start with two spaces. Prints nothing when the log is missing or no line
# matches; never fails.
show_key_results() {
  grep -E '(micro|macro|^  )' "$1" 2>/dev/null | head -20 || true
}

printf '\n%s\n' "$banner_rule"
printf '  R8 COMPLETE — %s\n' "$(date)"
printf '%s\n' "$banner_rule"
printf 'Checkpoint:  %s\n' "$CKPT"
# The brace list is printed literally as shorthand for the four eval logs.
printf 'Logs:        %s, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log\n' "$TRAIN_LOG"
printf '\nKey results:\n'
printf '%s\n' "--- Enriched (primary) ---"
show_key_results eval_r8_enriched.log
printf '\n%s\n' "--- APTNER (independent) ---"
show_key_results eval_r8_aptner.log
printf '%s\n' "$banner_rule"