#!/bin/bash
# =============================================================================
# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
#
# Key differences from R6:
#   - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
#   - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
#     This should boost entity recall given ~85-90% O-token prevalence
#   - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
#   - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
#
# Expect slightly worse val_loss vs R6 (honest eval on clean data).
# GPU: RTX PRO 6000 96GB VRAM
#
# Pipeline: train (with external early-stop monitor) -> pick best-epoch
# checkpoint from the training log -> evaluate on four test sets -> Viterbi
# grid search on the validation set -> print a short summary.
# =============================================================================
set -euo pipefail

cd ~/alkyline
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
PATIENCE=3
DATA=data/processed
LABELS=data/label_spaces
TRAIN_LOG=train_r8.log
CKPT_DIR=checkpoints/r8_5class

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the epoch with the lowest val_loss found in a training log.
# Lines are split on space, ':', '/' and '='; for lines starting with
# "epoch", field 2 is taken as the epoch number and the first non-empty field
# after "val_loss" as the loss.
# Losses are compared numerically (so 9e-05 < 0.1), non-numeric losses such
# as "nan" are skipped, and ties keep the earliest epoch.
# Arguments: $1 - path to the training log
# Outputs:   the best epoch on stdout, or nothing if no usable line exists
# Returns:   0 even when nothing is found, so callers under `set -e` can
#            test for an empty result instead of being aborted
#######################################
best_epoch_from_log() {
  local log=$1
  [[ -r "$log" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i <= NF; i++) {
        if ($i != "val_loss") continue
        # Skip empty fields produced by adjacent separators ("val_loss: 0.2").
        j = i + 1
        while (j <= NF && $j == "") j++
        loss = $j
        if ($2 == "") continue
        if (loss !~ /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/) continue
        if (!seen || loss + 0 < best) {
          best = loss + 0
          epoch = $2
          seen = 1
        }
      }
    }
    END { if (seen) print epoch }
  ' "$log"
}

#######################################
# Run one evaluation and tee its output to a log file.
# Globals:   EVAL_FLAGS (read) - shared `opf eval` flags
# Arguments: $1 - banner title, $2 - test-set path, $3 - log file
# Returns:   the pipeline status (a failing eval aborts the script via set -e)
#######################################
run_eval() {
  local title=$1 dataset=$2 log=$3
  echo ""
  echo "===== Eval R8: ${title} ====="
  opf eval "$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$log"
}

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
echo "Start time: $(date)"

# Create/truncate the log up front so the followers below never race the
# trainer's own redirection.
: >"$TRAIN_LOG"

# The trainer writes straight to the log instead of through `| tee`: for a
# backgrounded pipeline `$!` is the PID of the LAST stage (tee), which would
# make the early-stop monitor kill tee and let `wait` return while the
# trainer is still running.
opf train "$DATA/r8_5class_train.jsonl" \
  --validation-dataset "$DATA/r8_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs 15 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample 0.7 \
  --device cuda >"$TRAIN_LOG" 2>&1 &
TRAIN_PID=$!

# Mirror the log to the terminal. --pid (GNU tail) makes tail exit on its own
# shortly after the trainer is gone; if unsupported, only the live display is
# lost — the log file itself is unaffected.
tail -n +1 -f --pid="$TRAIN_PID" "$TRAIN_LOG" &
TAIL_PID=$!

# Early stopping monitor — kills training if val_loss hasn't improved for $PATIENCE epochs
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero status is expected when the monitor kills the trainer, so record
# it instead of aborting; checkpoint validation below catches real failures.
train_status=0
wait "$TRAIN_PID" || train_status=$?
kill "$MONITOR_PID" 2>/dev/null || true
# The trainer has been reaped above, so tail --pid can now notice it is gone.
wait "$TAIL_PID" 2>/dev/null || true

if ((train_status != 0)); then
  echo "Training exited with status $train_status (expected if early stopping fired)" >&2
fi
echo "Training finished: $(date)"

# ---------------------------------------------------------------------------
# Best-epoch checkpoint selection
#
# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
# If early stopping killed training, the final model.safetensors may not
# exist. Prefer the epoch with the lowest val_loss whenever its checkpoint
# exists; otherwise fall back to the final model; otherwise fail loudly.
# ---------------------------------------------------------------------------
BEST_EPOCH=$(best_epoch_from_log "$TRAIN_LOG")
echo "Best epoch by val_loss: $BEST_EPOCH"

CKPT="$CKPT_DIR"
if [[ -n "$BEST_EPOCH" && -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]]; then
  if [[ ! -f "$CKPT_DIR/model.safetensors" ]]; then
    echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
  else
    # Even if training ran to completion, prefer the best-epoch checkpoint
    echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  fi
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
elif [[ ! -f "$CKPT_DIR/model.safetensors" ]]; then
  die "no usable checkpoint: neither ${CKPT_DIR}/epoch_${BEST_EPOCH:-?} nor $CKPT_DIR/model.safetensors exists (training exit status: $train_status)"
fi
echo "Selected checkpoint: $CKPT"

# ---------------------------------------------------------------------------
# Evaluation — four test sets
# ---------------------------------------------------------------------------
# An array keeps each flag a separate word without relying on word splitting.
EVAL_FLAGS=(--checkpoint "$CKPT" --decode-mode viterbi --per-class --label-counts --device cuda)

echo ""
echo "================================================================"
echo " EVALUATION PHASE"
echo "================================================================"

# 1) Primary benchmark: enriched 5-class test
run_eval "Enriched 5-class test (primary benchmark)" \
  "$DATA/enriched_5class_test.jsonl" eval_r8_enriched.log

# 2) CyNER test (supplementary — 98% overlap with enriched)
run_eval "CyNER test (supplementary, ~98% overlap)" \
  "$DATA/cyner_test.jsonl" eval_r8_cyner.log

# 3) SecureBERT2 test (supplementary — 96% overlap)
run_eval "SecureBERT2 5-class test (supplementary, ~96% overlap)" \
  "$DATA/securebert2_5class_test.jsonl" eval_r8_sb2.log

# 4) APTNER — independent benchmark (ZERO leakage, most important)
run_eval "APTNER 5-class test (INDEPENDENT — zero leakage)" \
  "$DATA/aptner_5class_test_clean.jsonl" eval_r8_aptner.log

# ---------------------------------------------------------------------------
# Viterbi grid search on validation set
# Find optimal Viterbi transition penalties for inference
# ---------------------------------------------------------------------------
echo ""
echo "===== Viterbi Grid Search (on validation set) ====="
# Make sure the output directory exists before the search tries to write to it.
mkdir -p results
python3 scripts/viterbi_grid_search.py \
  --checkpoint "$CKPT" \
  --val-data "$DATA/r8_5class_valid.jsonl" \
  --output results/viterbi_r8_best.json \
  --device cuda 2>&1 | tee viterbi_r8.log

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo "================================================================"
echo " R8 COMPLETE — $(date)"
echo "================================================================"
echo "Checkpoint: $CKPT"
echo "Logs: $TRAIN_LOG, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log"
echo ""
echo "Key results:"
echo "--- Enriched (primary) ---"
# Best-effort excerpt: a missing log or no matching lines must not fail the run.
grep -E '(micro|macro|^ )' eval_r8_enriched.log 2>/dev/null | head -20 || true
echo ""
echo "--- APTNER (independent) ---"
grep -E '(micro|macro|^ )' eval_r8_aptner.log 2>/dev/null | head -20 || true
echo "================================================================"